In [None]:
import constants
import pandas  as pd
from tqdm import tqdm
import hashlib
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
import scipy.stats

In [None]:
# Use seaborn's "whitegrid" theme for every figure drawn in this notebook.
sns.set_style("whitegrid")

In [None]:
# Register tqdm's pandas integration so `progress_apply` is available on pandas objects.
tqdm.pandas()

In [None]:
# Select the extended crawl as the active crawl source.
# NOTE(review): presumably this decides which files the constants.* paths used in
# the loading cells resolve to — confirm in the constants module.
constants.set_crawl_source(constants.CRAWL_SOURCE_EXTENDED)

In [None]:
# Load the pickled inputs for the stratification analysis.
# The files are only read, so they are opened read-only ("rb"); "rb+" would also
# request write access and fail on read-only data directories.
# NOTE(review): unpickling executes arbitrary code — only load trusted files.
with open(constants.ZIPCODE_TO_STRATA, "rb") as fp:
    zc_to_strata = pickle.load(fp)

zipcodes = constants.EXTENDED_ZIPCODES

with open(constants.CENSUS_STRATA_DATA, "rb") as fp:
    df_strata = pd.read_pickle(fp)

# Keep only the rows whose index label is one of the extended zip codes.
df_strata = df_strata.loc[zipcodes]

business_df = pd.read_pickle(constants.BUSINESS_DATA_FILE)

In [None]:
# Load the pickled reclassification statistics used by the "Reclassification" cells.
# NOTE(review): the constant name is spelled "RECALSSIFICATION" here; it must match
# the definition in the constants module, so it is left as-is.
stats_df = pd.read_pickle(constants.RECALSSIFICATION_DATA_FILE)

In [None]:
# Load the main long-format frame; later cells read its crawl_id, experiment,
# stratum, business_id, date and flagged columns.
df = pd.read_pickle(constants.LONG_DATA_FILE)

In [None]:
df["crawl_number"] = df["crawl_id"].apply(lambda x: constants.CRAWL_NUMBER[x])
df = df.sort_values("crawl_number")

In [None]:
# Replace the experiment identifiers with display names in both frames.
experiment_names = {
    constants.CrawlExperiment.DENSITY: "Density",
    constants.CrawlExperiment.INCOME: "Income",
}
for frame in (df, business_df):
    frame["experiment"] = frame["experiment"].replace(experiment_names)

In [None]:
# Display label for each stratum index.
stratum_labels = {0: "Top 20%", 1: "20-40%", 2: "40-60%", 3: "60-80%", 4: "Bot. 20%"}
# Reverse lookup: display label -> stratum index.
stratum_sort_key = {label: index for index, label in stratum_labels.items()}


def stratum_sort(s):
    """Sort key for `sort_values`: map stratum labels back to their index."""
    return s.replace(stratum_sort_key)


# Order rows by stratum index first, then swap the indices for their labels.
df = df.sort_values(by="stratum")
df["stratum"] = df["stratum"].replace(stratum_labels)

# How does the number of reviews interact with the strata?

In [None]:
# display("Experiment: Density")
# sns.barplot(x="stratum",y=0,data=df[df.experiment == 2].groupby("stratum").size().to_frame().reset_index()).set(title="Number of reviews (Density)")
# plt.show()
# sns.barplot(x="stratum",y=0,data=business_df[business_df.experiment == 2].groupby("stratum").size().to_frame().reset_index()).set(title="Number of businesses (Density)")
# plt.show()
# # Number of reviews in each strata
# data=df[df.experiment == 2].groupby("business_id").apply(lambda subdf: None if len(subdf) == 0 else pd.Series([len(subdf),subdf.stratum.iloc[0]],["count","stratum"])).dropna()
# sns.boxplot(x="count",y="stratum",data=data,orient="h").set(title="Number of reviews per business (Density)")
# plt.xscale('log')
# plt.show()

def count_rows(frame, keys):
    """Count the rows of ``frame`` for every observed combination of ``keys``.

    ``keys`` must include "stratum". Returns a flat DataFrame holding the key
    columns plus an integer "count" column. Rows are ordered with
    ``stratum_sort`` using a stable sort, so the grouped key order is kept
    within each stratum and the plots list strata in the same order every run.
    """
    counts = frame.groupby(keys, observed=True).size().rename("count").reset_index()
    return counts.sort_values("stratum", key=stratum_sort, kind="mergesort")


# Only the rows of the "crawl_x0" crawl are counted for the review plots.
crawl_reviews = df[df.crawl_id == "crawl_x0"]

# Number of reviews per experiment and stratum.
data = count_rows(crawl_reviews, ["experiment", "stratum"])
sns.barplot(x="stratum", y="count", hue="experiment", data=data).set(title="Number of reviews")
plt.savefig("../../graphs/number_of_reviews_extended.pdf", bbox_inches='tight')
plt.show()

# Number of businesses per experiment and stratum.
data = count_rows(business_df, ["experiment", "stratum"])
sns.barplot(x="stratum", y="count", hue="experiment", data=data).set(title="Number of businesses")
plt.savefig("../../graphs/number_of_businesses_extended.pdf", bbox_inches='tight')
plt.show()

# Distribution of the number of reviews per business within each stratum.
data = count_rows(crawl_reviews, ["experiment", "stratum", "business_id"])
sns.boxplot(x="count", y="experiment", hue="stratum", data=data, orient="h").set(xlabel="Number of reviews", ylabel="Experiment")
#sns.histplot(x="stratum",hue="experiment",data=df[(df.crawl_id == "crawl_x0")],discrete=True,multiple="dodge").set(title="Number of reviews per business")
# Review counts span orders of magnitude, hence the log axis.
plt.xscale('log')
plt.legend(loc="lower left")

plt.savefig("../../graphs/reviews_per_business_stratified.pdf", bbox_inches='tight')
plt.show()

In [None]:
# Age in whole days of each business's oldest review, measured from the moment
# this cell runs (so the numbers drift between runs).
now = pd.Timestamp.now()
first_review_dates = df.groupby(["experiment", "stratum", "business_id"])["date"].min()
earliest_reviews = (
    (now - first_review_dates)
    .rename("earliest_review")
    .dt.days
    .to_frame()
    .reset_index()  # a single reset is enough; a second one only adds a junk "index" column
    .sort_values("stratum", key=stratum_sort)
)
sns.barplot(x="experiment", y="earliest_review", hue="stratum", data=earliest_reviews)
# Median age per experiment and stratum, shown as the cell output.
earliest_reviews.groupby(["experiment", "stratum"]).earliest_review.median()

In [None]:
# Per business in the "crawl_x0" crawl: number of reviews ("count") and the share
# of them with flagged == False ("percent_recommended", a fraction in [0, 1]).
# "count" is the number of rows; DataFrame.size would be rows * columns.
crawl_reviews = df[df.crawl_id == "crawl_x0"]
data = (
    crawl_reviews
    .assign(recommended=crawl_reviews.flagged.eq(False))
    .groupby(["experiment", "stratum", "business_id"], observed=True)["recommended"]
    .agg(count="size", percent_recommended="mean")
    .reset_index()
    # Keep the same columns, in the same order, that downstream cells read.
    [["count", "percent_recommended", "stratum", "experiment"]]
    .sort_values("stratum", key=stratum_sort, kind="mergesort")
)
sns.boxplot(x="percent_recommended", y="experiment", hue="stratum", data=data, orient="h").set(xlabel="Percentage recommended", ylabel="Experiment")#.set(title="Percentage recommended by business")
plt.legend(loc="upper left")
plt.savefig("../../graphs/percentage_recommended_per_businesses_extended.pdf", bbox_inches='tight')
plt.show()

In [None]:
# Bar height is seaborn's default estimate (the mean) of percent_recommended per stratum.
recommended_ax = sns.barplot(
    data=data,
    x="stratum",
    y="percent_recommended",
    hue="experiment",
)
recommended_ax.set(title="Percentage recommended by business")

In [None]:
# Rank correlation between share recommended and review count over all businesses.
overall_correlation = scipy.stats.spearmanr(data["percent_recommended"], data["count"])
display(overall_correlation)


def spearman_by_group(group):
    """Spearman correlation and p-value between percent_recommended and count for one group."""
    result = scipy.stats.spearmanr(group["percent_recommended"], group["count"])
    return pd.Series(result, index=["correlation", "p_value"])


# The same correlation, computed separately per experiment and stratum.
data.reset_index(drop=True).groupby(["experiment", "stratum"]).apply(spearman_by_group)

In [None]:
# Median share of recommended reviews per experiment and stratum.
per_business = data.reset_index(drop=True)
per_business.groupby(["experiment", "stratum"]).apply(
    lambda group: group["percent_recommended"].median()
)

In [None]:
# Show the per-business frame (count, percent_recommended, stratum, experiment) for inspection.
data

In [None]:
%%script false --no-raise-error
# Disabled cell (the body is handed to `false`, so nothing below is executed).
# Pairwise Epps-Singleton two-sample tests on the per-business review counts,
# comparing every pair of strata within the same experiment.
for experiment in ["Income","Density"]:
    print(f"{experiment}")
    in_experiment = data[data.experiment == experiment]
    for i in range(0,4):
        for j in range(i+1,5):
            # data.stratum holds the display labels, so translate the indices;
            # both samples must come from the experiment being iterated.
            sample_i = in_experiment[in_experiment.stratum == stratum_labels[i]]["count"].to_numpy(dtype=float)
            sample_j = in_experiment[in_experiment.stratum == stratum_labels[j]]["count"].to_numpy(dtype=float)
            result = scipy.stats.epps_singleton_2samp(sample_i, sample_j)
            print(f"\t{i},{j}: {result}")

# Reclassification

In [None]:
def get_experiment_and_strata(row):
    """Return the experiment and stratum of the first member listed in ``row``.

    ``row.members[0]`` is used as an index label into the global ``df``; the
    result is a two-element Series ``[experiment, stratum]``.
    """
    first_member = row.members[0]
    member_record = df.loc[first_member]
    return pd.Series([member_record.experiment, member_record.stratum])

#experiment_strata = stats_df.progress_apply(get_experiment_and_strata,axis=1)

In [None]:
#stats_df = pd.concat([stats_df,experiment_strata.rename({0:"Experiment",1:"stratum"},axis=1)],axis=1)

In [None]:
#stats_df["Stratum"] = stats_df.stratum.progress_apply(lambda x: constants.COMBINED_STRATUM_COMMON_NAMES[x].replace("$","\\$"))

In [None]:
# Swap the stratum indices for their display labels, then order rows by stratum.
relabelled_strata = stats_df["stratum"].replace(stratum_labels)
stats_df = stats_df.assign(stratum=relabelled_strata).sort_values(by="stratum", key=stratum_sort)

In [None]:
# Replace the raw experiment codes in stats_df with display names.
# NOTE(review): the other frames are relabelled via constants.CrawlExperiment.DENSITY /
# INCOME; confirm 2 and 3 are the values of those members.
experiment_code_names = {2: "Density", 3: "Income"}
stats_df["experiment"] = stats_df["experiment"].replace(experiment_code_names)

In [None]:
# Reclassification swaps per experiment, one bar per stratum (bar height is
# seaborn's default estimate, the mean). The stratum-sorted frame is what gets
# plotted, so the hue order follows stratum_sort.
# NOTE: this rebinds `data`, which earlier cells use for the per-business frame.
data = stats_df.sort_values("stratum", key=stratum_sort)
reclass_ax = sns.barplot(x="experiment", y="reclassification_swaps", hue="stratum", data=data)
reclass_ax.set(xlabel="Experiment", ylabel="Average number of reclassifications")
plt.savefig(f"../../graphs/stratified_reclass_swaps_{constants.CRAWL_SOURCE}.pdf", bbox_inches='tight')