In [None]:
import constants
import pandas  as pd
from tqdm import tqdm
import hashlib
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
import scipy.stats

In [None]:
# Register tqdm's pandas integration so the `progress_apply` calls used later
# in this notebook are available on DataFrame and groupby objects.
tqdm.pandas()

In [None]:
# Select the extended crawl as the active data source.
# NOTE(review): presumably this changes what the constants.*_FILE paths read
# below resolve to — confirm in the constants module.
constants.set_crawl_source(constants.CRAWL_SOURCE_EXTENDED)

In [None]:
# NOTE: pickle can execute arbitrary code while loading; only read files that
# come from a trusted source.
# The files are only read, so open them read-only ("rb"); "rb+" also requests
# write access and fails on read-only data files.
with open(constants.ZIPCODE_TO_STRATA, "rb") as fp:
    zc_to_strata = pickle.load(fp)

zipcodes = constants.EXTENDED_ZIPCODES

with open(constants.CENSUS_STRATA_DATA, "rb") as fp:
    df_strata = pd.read_pickle(fp)
# Keep only the rows whose index label is one of the extended-crawl zipcodes.
df_strata = df_strata.loc[zipcodes]

business_df = pd.read_pickle(constants.BUSINESS_DATA_FILE)

In [None]:
# Load the pickled frame referenced by constants.RECALSSIFICATION_DATA_FILE.
# The constant's spelling ("RECALSSIFICATION") must match the name defined in
# the constants module, so it is left as is here.
stats_df = pd.read_pickle(constants.RECALSSIFICATION_DATA_FILE)

In [None]:
# Main frame for the extended crawl; later cells read its crawl_id, date,
# flagged, experiment and stratum columns.
df = pd.read_pickle(constants.LONG_DATA_FILE)

In [None]:
# Look up each row's crawl number in constants.CRAWL_NUMBER, then order the
# rows by that number.
df["crawl_number"] = df["crawl_id"].map(lambda crawl_id: constants.CRAWL_NUMBER[crawl_id])
df = df.sort_values(by="crawl_number")

In [None]:
# Replace the constants.CrawlExperiment values with readable labels; the same
# mapping is applied to the review frame and the business frame.
experiment_labels = {
    constants.CrawlExperiment.DENSITY: "Density",
    constants.CrawlExperiment.INCOME: "Income",
}
df["experiment"] = df["experiment"].replace(experiment_labels)
business_df["experiment"] = business_df["experiment"].replace(experiment_labels)

# Divide reviews by whether or not they're new for that crawl

In [None]:
# Latest review date observed within each crawl.
crawl_max_dates = df.groupby("crawl_id")["date"].max()

# For each crawl, the latest review date of the crawl that precedes it in
# constants.CRAWL_ORDER.
# NOTE(review): for the first entry of CRAWL_ORDER the index is 0 - 1 = -1, so
# the lookup wraps around to the last crawl — confirm this is intended.
crawl_min_dates = pd.Series(
    {
        crawl_id: crawl_max_dates.loc[
            constants.CRAWL_ORDER[constants.CRAWL_ORDER.index(crawl_id) - 1]
        ]
        for crawl_id in crawl_max_dates.index
    }
)

In [None]:
def is_new(row):
    """Return True when a review row is new for the crawl it belongs to.

    A row is new when its ``date`` is strictly after ``crawl_min_dates`` for
    its ``crawl_id`` and not after ``crawl_max_dates`` for the same crawl.
    Both bounds are read from the module-level Series of those names.

    Kept as the row-wise reference; the ``is_new`` column below is computed
    with the vectorised equivalent.
    """
    return row.date > crawl_min_dates.loc[row.crawl_id] and row.date <= crawl_max_dates.loc[row.crawl_id]


# Vectorised equivalent of df.progress_apply(is_new, axis=1): look up each
# row's bounds by crawl_id once and compare whole columns, instead of building
# a Series object for every row.
lower_bound = df["crawl_id"].map(crawl_min_dates)
upper_bound = df["crawl_id"].map(crawl_max_dates)
df["is_new"] = (df["date"] > lower_bound) & (df["date"] <= upper_bound)

In [None]:
# 1 when a review is not flagged, 0 when it is (truthiness of `flagged`).
df["recommended_discrete"] = (~df["flagged"].astype(bool)).astype("int64")

In [None]:
# Bar chart of recommended_discrete per crawl, split by is_new; rows whose
# crawl_number is 0 are left out.
later_crawls = df[df["crawl_number"] != 0]
sns.barplot(data=later_crawls, x="crawl_id", y="recommended_discrete", hue="is_new")

In [None]:
#Heatmap
def get_perc_recommended(subdf):
    """Summarise the share of unflagged (recommended) reviews in one group.

    Parameters
    ----------
    subdf : pandas.DataFrame
        Group of rows with a ``flagged`` column.

    Returns
    -------
    pandas.Series
        ``recommend_percent``: fraction (0-1) of rows whose ``flagged`` equals
        False; ``count``: number of rows; ``ci``: (low, high) tuple of a 95%
        Student-t interval centred on ``recommend_percent``.
    """
    n_reviews = len(subdf)
    # Count unflagged rows directly: value_counts().loc[False] raises KeyError
    # for a group in which every review is flagged.
    rec_perc = subdf.flagged.eq(False).sum() / n_reviews
    recommended = subdf.flagged.apply(lambda flagged: 0 if flagged else 1)
    # The confidence level is passed positionally because SciPy renamed the
    # keyword from `alpha` to `confidence`. A t interval around a sample mean
    # uses n - 1 degrees of freedom.
    ci = scipy.stats.t.interval(
        0.95,
        df=n_reviews - 1,
        loc=rec_perc,
        scale=scipy.stats.sem(recommended),
    )
    return pd.Series([rec_perc, n_reviews, ci], index=["recommend_percent", "count", "ci"])

# One row per (experiment, is_new, crawl_id, stratum) group holding whatever
# get_perc_recommended returns for that group.
heatmap_data = (
    df.groupby(["experiment", "is_new", "crawl_id", "stratum"])
    .progress_apply(get_perc_recommended)
    .reset_index()
)

for experiment in ["Density", "Income"]:
    in_experiment = heatmap_data.experiment == experiment
    # Old reviews first, then new ones.
    for new_flag, label in [(False, "Old"), (True, "New")]:
        subset = heatmap_data[in_experiment & (heatmap_data.is_new == new_flag)]
        # DataFrame.pivot accepts keyword arguments only in pandas 2.x. The
        # values are cast to float because the per-group Series also carries
        # the CI tuple, which can leave this column with object dtype.
        grid = subset.pivot(index="stratum", columns="crawl_id", values="recommend_percent").astype(float)
        sns.heatmap(data=grid, vmin=0.7, vmax=1).set(title=f"{label} reviews percentage recommended ({experiment})")
        plt.show()



In [None]:
#Heatmap
def get_perc_recommended(subdf):
    """Return the fraction (0-1) of rows in ``subdf`` whose ``flagged`` is False.

    NOTE: this reuses the name of the Series-returning summary function
    defined earlier in the notebook and replaces it once this cell has run;
    re-run that earlier definition before re-running the cell that depends
    on it.
    """
    # Count unflagged rows directly: value_counts().loc[False] raises KeyError
    # for a group in which every review is flagged.
    return subdf.flagged.eq(False).sum() / len(subdf)

In [None]:
# Number of new reviews per stratum within each crawl.
new_reviews = df.loc[df["is_new"]]
new_reviews.groupby("crawl_id")["stratum"].value_counts()

In [None]:
# Show only the heatmap rows that describe new reviews.
heatmap_data.loc[heatmap_data["is_new"]]

##### It looks like everything is within the margin of error

# Chicago 

In [None]:
# Switch the active data source to the Chicago crawl before re-reading the
# same constants.*_FILE names below.
# NOTE(review): presumably those paths now resolve to Chicago data — confirm
# in the constants module.
constants.set_crawl_source(constants.CRAWL_SOURCE_CHICAGO)

In [None]:
# Business frame read after the crawl source was switched to Chicago.
chicago_business_df = pd.read_pickle(constants.BUSINESS_DATA_FILE)

In [None]:
# Load the frame behind constants.RECALSSIFICATION_DATA_FILE and move the
# second index level (level 1) out of the index into a regular column.
chicago_stats_df = (
    pd.read_pickle(constants.RECALSSIFICATION_DATA_FILE)
    .reset_index(level=[1])
)

In [None]:
# Main frame for the Chicago crawl; later cells read its crawl_id, date and
# flagged columns.
chicago_df = pd.read_pickle(constants.LONG_DATA_FILE)

In [None]:
# Look up each row's crawl number in constants.CRAWL_NUMBER, then order the
# rows by that number.
chicago_df["crawl_number"] = chicago_df["crawl_id"].map(
    lambda crawl_id: constants.CRAWL_NUMBER[crawl_id]
)
chicago_df = chicago_df.sort_values(by="crawl_number")

In [None]:
# Display the Chicago frame after crawl_number was added and rows were sorted.
chicago_df

# Divide reviews by whether or not they're new for that crawl

In [None]:
# Latest review date observed within each Chicago crawl.
chicago_crawl_max_dates = chicago_df.groupby("crawl_id")["date"].max()

# For each crawl, the latest review date of the crawl that precedes it in
# constants.CRAWL_ORDER.
# NOTE(review): for the first entry of CRAWL_ORDER the index is 0 - 1 = -1, so
# the lookup wraps around to the last crawl — confirm this is intended.
chicago_crawl_min_dates = pd.Series(
    {
        crawl_id: chicago_crawl_max_dates.loc[
            constants.CRAWL_ORDER[constants.CRAWL_ORDER.index(crawl_id) - 1]
        ]
        for crawl_id in chicago_crawl_max_dates.index
    }
)

In [None]:
def is_new(row):
    """Return True when a Chicago review row is new for its crawl.

    A row is new when its ``date`` is strictly after
    ``chicago_crawl_min_dates`` for its ``crawl_id`` and not after
    ``chicago_crawl_max_dates`` for the same crawl. Both bounds are read from
    the module-level Series of those names.

    NOTE: this replaces the earlier ``is_new`` defined for the extended crawl,
    which reads different bounds. Kept as the row-wise reference; the
    ``is_new`` column below is computed with the vectorised equivalent.
    """
    return row.date > chicago_crawl_min_dates.loc[row.crawl_id] and row.date <= chicago_crawl_max_dates.loc[row.crawl_id]


# Vectorised equivalent of chicago_df.progress_apply(is_new, axis=1): look up
# each row's bounds by crawl_id once and compare whole columns, instead of
# building a Series object for every row.
chicago_lower_bound = chicago_df["crawl_id"].map(chicago_crawl_min_dates)
chicago_upper_bound = chicago_df["crawl_id"].map(chicago_crawl_max_dates)
chicago_df["is_new"] = (chicago_df["date"] > chicago_lower_bound) & (chicago_df["date"] <= chicago_upper_bound)

In [None]:
# Count of new Chicago reviews per crawl, with flagged and unflagged reviews
# drawn as side-by-side bars.
new_chicago_reviews = chicago_df[chicago_df["is_new"]]
sns.histplot(
    data=new_chicago_reviews,
    x="crawl_id",
    hue="flagged",
    discrete=True,
    multiple="dodge",
)

In [None]:
# Recommended share for every (is_new, crawl_id) group of Chicago reviews.
# This rebinds the `heatmap_data` name used earlier for the extended crawl.
heatmap_data = (
    chicago_df.groupby(["is_new", "crawl_id"])
    .progress_apply(get_perc_recommended)
    .rename("recommend_percent")
    .to_frame()
    .reset_index()
)

In [None]:
# Element-wise negation of the flagged column.
chicago_df["notflagged"] = ~chicago_df["flagged"]

In [None]:
# Bar chart of notflagged per crawl, split by is_new; crawl "crawl_10" is
# left out.
chicago_without_crawl_10 = chicago_df[chicago_df["crawl_id"] != "crawl_10"]
sns.barplot(data=chicago_without_crawl_10, x="crawl_id", y="notflagged", hue="is_new")

In [None]:
# Bar chart of the per-group recommended share, split by is_new.
sns.barplot(
    data=heatmap_data,
    x="crawl_id",
    y="recommend_percent",
    hue="is_new",
)

In [None]:
# Display the latest review date per Chicago crawl.
chicago_crawl_max_dates

In [None]:
# Display the current contents of heatmap_data.
heatmap_data

In [None]:
# Display the Chicago frame, now including the is_new and notflagged columns.
chicago_df