In [None]:
import constants
import pandas  as pd
from tqdm import tqdm
import hashlib
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
import nltk
import random
from nltk.corpus import wordnet
import datetime
import math
import scipy.stats

In [None]:
# Apply seaborn's "whitegrid" style to every figure drawn in this notebook.
sns.set_style("whitegrid")

In [None]:
# One-time NLTK resource download. The guard is hard-coded to False, so this
# never runs on its own; flip it to True once on a machine that is missing
# the 'wordnet', 'punkt' or 'averaged_perceptron_tagger' resources.
if False:
    nltk.download('wordnet')
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')

In [None]:
# Register tqdm with pandas so `progress_apply` (used when flagging reviews
# below) is available.
tqdm.pandas()

# Load data

In [None]:
# Select which crawl source the `constants` module should use: the extended
# source when `extended` is True, otherwise the Chicago source.
extended = True
constants.set_crawl_source(constants.CRAWL_SOURCE_EXTENDED if extended else constants.CRAWL_SOURCE_CHICAGO)

In [None]:
# Crawl id that all analysis below is restricted to; depends on `extended`.
target_crawl = "crawl_x3" if extended else "crawl_17" 

In [None]:
# NOTE: pickle can execute arbitrary code while loading; only point these
# constants at files you trust.
# The files are only read, so open them read-only ("rb"). The previous "rb+"
# mode requested write access and fails on read-only data.
with open(constants.ZIPCODE_TO_STRATA, "rb") as fp:
    zc_to_strata = pickle.load(fp)

# NOTE(review): the extended zipcode list is used even when `extended` is
# False — confirm this is intended.
zipcodes = constants.EXTENDED_ZIPCODES

with open(constants.CENSUS_STRATA_DATA, "rb") as fp:
    df_strata = pd.read_pickle(fp)
    # Keep only the rows whose index label is one of the zipcodes.
    df_strata = df_strata.loc[zipcodes]

business_df = pd.read_pickle(constants.BUSINESS_DATA_FILE)

In [None]:
# Main long-format frame; later cells read its crawl_id, business_id, content,
# date, rating, experiment, stratum and flagged columns.
df = pd.read_pickle(constants.LONG_DATA_FILE)

In [None]:
# Per-business crawl data; source of the "ammenity_*" columns joined below.
crawled_business_data = pd.read_pickle(constants.CRAWLED_BUSINESS_DATA_FILE)

In [None]:
# Sanity check: compare the business ids in the crawled business data (set_a)
# with those reviewed in the target crawl (set_b).
# Prints: |set_a|, |set_b|, |intersection|, |union|.
set_a = set(crawled_business_data.business_id)
set_b = set(df[df.crawl_id == target_crawl].business_id)
print(f"{len(set_a)}, {len(set_b)}, {len(set_a & set_b)}, {len(set_a | set_b)}")

In [None]:
# Copy the selected amenity columns from the crawled business data onto the
# review rows of the target crawl. Rows of other crawls get NaN because the
# assignment aligns on the index.
# The list previously named "ammenity_employees_wear_masks" twice; the second
# occurrence was always skipped by the `not in df` guard.
ammenity_columns = [
    "num_ammenities",
    "ammenity_customers_must_wear_masks",
    "ammenity_employees_wear_masks",
    "ammenity_restaurants_attire",
    "ammenity_Caters",
    "ammenity_dogs_allowed",
]
# The lambda only reads `business_id`, so the crawl filter is loop-invariant
# and is computed once instead of once per amenity.
target_rows = df[df.crawl_id == target_crawl]
for ammenity in tqdm(ammenity_columns):
    if ammenity not in df:
        # Looked up with (crawl_id, business_id) labels.
        ammenity_lookup = crawled_business_data[ammenity]
        df[ammenity] = target_rows.apply(lambda row: ammenity_lookup.loc[(target_crawl,row.business_id)], axis=1)


In [None]:
# Reviews belonging to the target crawl only.
df_target = df[df.crawl_id == target_crawl]

# Lemmatize

In [None]:
# Shared WordNet lemmatizer used for the flagged words and for review tokens.
lemmatizer = nltk.stem.WordNetLemmatizer()

In [None]:
# Words to look for in review text, and the set of their lemmas that review
# tokens are compared against.
flagged_words = ["mask","vaccine"]
flagged_lemmas = {lemmatizer.lemmatize(word) for word in flagged_words}

In [None]:
# Adapted from:
# https://gaurav5430.medium.com/using-nltk-for-lemmatizing-sentences-c1bfff963258
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Map an NLTK POS tag to the matching WordNet POS constant.

    Tags starting with J, V, N or R map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively. Any other tag returns None.
    """
    prefix_to_pos = (
        ("J", wordnet.ADJ),
        ("V", wordnet.VERB),
        ("N", wordnet.NOUN),
        ("R", wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    return None

def get_word_flags(review):
    """Report which flagged lemmas occur in a review.

    The review is tokenized and POS-tagged with NLTK, each token is
    lemmatized, and the lemma is compared against `flagged_lemmas`.

    Parameters
    ----------
    review : str
        Review text. Any non-string value (e.g. NaN for missing content)
        yields all-False flags instead of raising inside the tokenizer.

    Returns
    -------
    pd.Series
        One boolean per flagged lemma, indexed by "flag_<lemma>".
    """
    flags = {f"flag_{fl}": False for fl in flagged_lemmas}
    if not isinstance(review, str):
        return pd.Series(flags)
    # Tag the original-case tokens, since the tagger may use capitalisation.
    for token, tag in nltk.pos_tag(nltk.word_tokenize(review)):
        tag = nltk_tag_to_wordnet_tag(tag)
        if tag is None:
            # Tags with no WordNet equivalent are lemmatized as nouns.
            tag = "n"
        # The lemmatizer lookup is case-sensitive, so "Masks" at the start of
        # a sentence would come back unchanged and never match "mask".
        # Lower-case the token for the lookup only.
        lemma = lemmatizer.lemmatize(token.lower(), pos=tag)
        if lemma in flagged_lemmas:
            flags[f"flag_{lemma}"] = True
    return pd.Series(flags)

In [None]:
# Append one boolean "flag_<lemma>" column per flagged lemma to the target
# crawl reviews. get_word_flags runs once per review with a progress bar.
df_flagged = pd.concat([df_target, df_target.content.progress_apply(get_word_flags)],axis=1)

In [None]:
# Work on an explicit copy: later cells assign new columns into df_target, and
# doing that on a filtered slice of df_flagged triggers SettingWithCopyWarning
# and has ambiguous write-through semantics.
df_target = df_flagged[df_flagged.crawl_id == target_crawl].copy()

# Let's look at amenities

In [None]:
def get_proportion_matching(subdf,parent_df=None,ammenity=None):
    """Return len(subdf) divided by the number of comparable rows in parent_df.

    A parent row is comparable when its `experiment` is one of the experiments
    present in `subdf` and its `ammenity` value equals the value of the first
    row of `subdf`. Missing values (NaN / None / pd.NA) match other missing
    values.

    Parameters
    ----------
    subdf : pd.DataFrame
        Group of rows that share one amenity value.
    parent_df : pd.DataFrame
        Frame the group is compared against.
    ammenity : str
        Name of the amenity column.

    Returns
    -------
    float

    Raises
    ------
    ZeroDivisionError
        If no parent row is comparable.
    """
    try:
        ammenity_value = subdf[ammenity].iloc[0]
    except KeyError:
        # The grouping column may be absent from the group frame; fall back to
        # the parent rows carrying the same index labels.
        ammenity_value = parent_df.loc[subdf.index][ammenity].iloc[0]
    experiments = subdf.experiment.unique()
    # pd.isna also recognises numpy.float64 NaN, which the earlier
    # `type(value) == float` test missed. Missed NaNs were compared with `==`
    # and therefore matched no parent rows at all.
    if pd.isna(ammenity_value):
        value_matches = parent_df[ammenity].isna()
    else:
        value_matches = parent_df[ammenity] == ammenity_value
    all_with_params = parent_df[parent_df.experiment.isin(experiments) & value_matches]

    return len(subdf)/len(all_with_params)

def fill_proportion_matching(subdf,parent_df=None,ammenity=None):
    """Return a 0/1 Series whose mean is the share of comparable rows in subdf.

    A parent row is comparable when its `experiment` is one of the experiments
    present in `subdf` and its `ammenity` value equals the value of the first
    row of `subdf`. Missing values (NaN / None / pd.NA) match other missing
    values. The result holds one 1 per row of `subdf`, followed by one 0 per
    comparable parent row beyond that count.

    Parameters
    ----------
    subdf : pd.DataFrame
        Group of rows that share one amenity value.
    parent_df : pd.DataFrame
        Frame the group is compared against.
    ammenity : str
        Name of the amenity column.

    Returns
    -------
    pd.Series
    """
    try:
        ammenity_value = subdf[ammenity].iloc[0]
    except KeyError:
        # The grouping column may be absent from the group frame; fall back to
        # the parent rows carrying the same index labels.
        ammenity_value = parent_df.loc[subdf.index][ammenity].iloc[0]
    experiments = subdf.experiment.unique()
    # pd.isna also recognises numpy.float64 NaN, which the earlier
    # `type(value) == float` test missed. Missed NaNs matched no parent rows,
    # so the zero padding was silently dropped and the proportion became 1.
    if pd.isna(ammenity_value):
        value_matches = parent_df[ammenity].isna()
    else:
        value_matches = parent_df[ammenity] == ammenity_value
    all_with_params = parent_df[parent_df.experiment.isin(experiments) & value_matches]
    return pd.Series([1] * len(subdf) + [0] * (len(all_with_params) - len(subdf)))

def get_significance(subdf,parent_df=None,base_trait=None,ammenity=None):
    """Compare the `base_trait` distribution of subdf with that of parent_df.

    Runs a two-sample Kolmogorov-Smirnov test and an Epps-Singleton test on
    the `base_trait` columns of both frames. `ammenity` is accepted for call
    compatibility but is not used.

    Returns a Series with "ks_stat", "ks_p_value", "es_stat", "es_p_value"
    and "size" (the number of rows in subdf).
    """
    sample = subdf[base_trait]
    reference = parent_df[base_trait]
    ks_statistic, ks_pvalue = scipy.stats.ks_2samp(sample, reference)
    es_statistic, es_pvalue = scipy.stats.epps_singleton_2samp(sample, reference)
    labels = ["ks_stat","ks_p_value","es_stat","es_p_value","size"]
    values = [ks_statistic, ks_pvalue, es_statistic, es_pvalue, len(subdf)]
    return pd.Series(values, index=labels)

def plot_ammenity_rating(subdf, ammenity, parent_df=None, base_trait="rating",suffix="",plot_kwargs=None):
    """Bar-plot, per `base_trait` value, the proportion of rows for each `ammenity` value.

    Before plotting, two best-effort diagnostics are shown: the per-amenity
    significance table from `get_significance`, and the Spearman correlation
    between `base_trait` and `ammenity` in `subdf`. A failure in either is
    reported and does not stop the plot.

    Parameters
    ----------
    subdf : pd.DataFrame
        Rows to plot. Must contain `ammenity`, `base_trait` and `experiment`.
    ammenity : str
        Column used as the hue of the bar plot.
    parent_df : pd.DataFrame, optional
        Frame the proportions are computed against. Defaults to `subdf`.
    base_trait : str
        Column shown on the x axis.
    suffix : str
        Optional file-name suffix; a leading "_" is added when missing.
    plot_kwargs : dict, optional
        Extra keyword arguments forwarded to `sns.barplot`.

    Side effects
    ------------
    Draws on the current matplotlib axes, sets the y range to [0, 1] and saves
    the figure as a PDF under ../../graphs/.

    Raises
    ------
    KeyError
        Re-raised, after printing the columns of `subdf`, when the proportion
        computation hits a missing column.
    """
    if parent_df is None:
        parent_df = subdf
    # None instead of a shared mutable `{}` default.
    if plot_kwargs is None:
        plot_kwargs = {}

    if len(suffix) != 0 and suffix[0] != "_":
        suffix = f"_{suffix}"

    # Best effort: the tests can fail for very small groups or for
    # non-numeric traits. Report the reason instead of swallowing it.
    try:
        display(subdf.groupby([ammenity], dropna=False).apply(get_significance, parent_df=parent_df,ammenity=ammenity,base_trait=base_trait))
    except Exception as err:
        print(f"Unable to compute significance tests: {err!r}")
    # Correlate the data that was actually passed in. This previously read
    # the global df_target, so filtered or renamed frames reported the wrong
    # correlation, or none at all.
    try:
        print(f"Correlation: {scipy.stats.spearmanr(subdf[base_trait], subdf[ammenity])}")
    except Exception:
        print("Unable to perform correlation check")
    try:
        # Each (ammenity, base_trait) group becomes a run of 1s and 0s in
        # column 0; the bar plot's mean of that column is the proportion.
        df_rating_masked  = subdf.groupby([ammenity,base_trait], dropna=False).apply(fill_proportion_matching, parent_df=parent_df, ammenity=ammenity).reset_index()
    except KeyError:
        print(list(subdf))
        raise
    df_rating_masked_filled = df_rating_masked.fillna("Not listed")
    # Label the x axis with the trait that is plotted; "Rating" is kept for
    # the rating columns so existing figures are unchanged.
    x_label = "Rating" if base_trait.lower() == "rating" else base_trait
    sns.barplot(x=base_trait,y=0,hue=ammenity,data=df_rating_masked_filled,**plot_kwargs).set(xlabel=x_label,ylabel="Proportion")
    # File name (including its spelling) is kept as-is so existing references
    # to the saved graphs keep working.
    fn = f"proprotion_{ammenity}_by_{base_trait}_{constants.CRAWL_SOURCE}{suffix}.pdf".replace("/","_")
    plt.ylim(0,1)
    plt.savefig(f"../../graphs/{fn}", bbox_inches = 'tight')
    print(f"Saved to ../../graphs/{fn}")
    

In [None]:
# Summary counts:
#  - mask mentions among target-crawl reviews dated on/after 2020-03-01,
#  - the mask-requirement amenity per crawled business (NaN included),
#  - the same amenity per review dated on/after 2020-08-06 (NaN included).
# NOTE(review): later cells use 2021-08-06 as the cutoff for this amenity —
# confirm that 2020-08-06 is intended here.
print(f"""Stats:
Mask mentions:
{df_target[(df_target.date >= pd.to_datetime("2020-03-01"))].flag_mask.value_counts()}
Mask requirements by business:
{crawled_business_data['ammenity_customers_must_wear_masks'].value_counts(dropna=False)}
Mask requirements by review:
{df_target[df_target.date >= pd.to_datetime("2020-08-06")]['ammenity_customers_must_wear_masks'].value_counts(dropna=False)}
""")

In [None]:
# Spearman rank correlation between rating and number of amenities over all
# target-crawl reviews.
scipy.stats.spearmanr(df_target.rating, df_target.num_ammenities)

In [None]:
# Same correlation, computed separately for every (experiment, stratum) group;
# one row per group with the correlation and its p-value.
df_target.groupby(["experiment","stratum"]).apply(lambda subdf: pd.Series(list(scipy.stats.spearmanr(subdf.rating, subdf.num_ammenities)),index=["correlation","p_value"]))

In [None]:
# Rating proportions split by the number of amenities.
plot_ammenity_rating(df_target, "num_ammenities")

In [None]:
# Rating proportions split by the "ammenity_restaurants_attire" amenity.
plot_ammenity_rating(df_target, "ammenity_restaurants_attire")

In [None]:
# Rating proportions split by the "ammenity_Caters" amenity.
plot_ammenity_rating(df_target, "ammenity_Caters")

In [None]:
# Rating proportions split by the "ammenity_dogs_allowed" amenity.
plot_ammenity_rating(df_target, "ammenity_dogs_allowed")

In [None]:
# Rating proportions split by the "ammenity_employees_wear_masks" amenity.
plot_ammenity_rating(df_target, "ammenity_employees_wear_masks")

In [None]:
# Customer-mask requirement against the `flagged` column instead of rating.
# NOTE(review): plot titles further down describe `flagged` as
# "not recommended" — confirm that reading.
plot_ammenity_rating(df_target, "ammenity_customers_must_wear_masks", base_trait="flagged")

In [None]:
# Customer-mask requirement against rating, restricted to reviews dated on or
# after 2021-08-01.
plot_ammenity_rating(df_target[df_target.date >= pd.to_datetime("2021-08-01")], "ammenity_customers_must_wear_masks")

In [None]:
# Replace missing mask-requirement values with the label "Not listed".
# This mutates df_target, so every later cell sees the filled column.
df_target['ammenity_customers_must_wear_masks'] = df_target['ammenity_customers_must_wear_masks'].fillna("Not listed")

In [None]:
# Two-sample KS tests on rating:
#  1) reviews whose mask requirement is "Not listed" vs. all other reviews,
#  2) reviews with the requirement True vs. False.
display(scipy.stats.ks_2samp(df_target[df_target.ammenity_customers_must_wear_masks == "Not listed"].rating, df_target[df_target.ammenity_customers_must_wear_masks != "Not listed"].rating))
display(scipy.stats.ks_2samp(df_target[df_target.ammenity_customers_must_wear_masks == True].rating, df_target[df_target.ammenity_customers_must_wear_masks == False].rating))

In [None]:
# Only reviews that mention masks and are dated on or after 2021-07-01.
plot_ammenity_rating(df_target[(df_target.flag_mask) & (df_target.date >= pd.to_datetime("2021-07-01"))], "ammenity_customers_must_wear_masks")

In [None]:
# Combine "mask mentioned" and "mask required" into one "<flag>/<requirement>"
# label per review, then plot rating proportions per label for reviews dated
# on or after 2021-07-06.
# NOTE(review): the printed heading says "Reviews that mention masks" but no
# flag_mask filter is applied, and the frame is named df_march_2020 although
# the cutoff is 2021-07-06 — confirm which is intended.
print("Reviews that mention masks")
df_target["Masks mentions/masks required"] = df_target.apply(lambda row: f"{row['flag_mask']}/{row['ammenity_customers_must_wear_masks']}",axis=1)
df_march_2020 = df_target[df_target.date >= pd.to_datetime("2021-07-06")].copy()
plot_ammenity_rating(df_march_2020, "Masks mentions/masks required")
# Drop the reference to the filtered copy.
df_march_2020 = None

In [None]:
print("Reviews that mention masks")
df_target['ammenity_customers_must_wear_masks'] = df_target['ammenity_customers_must_wear_masks'].fillna("Not listed")
df_march_2020 = df_target[df_target.date >= pd.to_datetime("2020-03-01")].copy()
df_august_2021 = df_target[df_target.date >= pd.to_datetime("2021-08-06")].copy()
index=["flagged", "Masks:","Rating"]
results_df1 = df_march_2020.apply(lambda row: pd.Series([row.flagged,"Mentioned" if row['flag_mask'] else "Not mentioned", row.rating],index=index),axis=1)
results_df2 = df_august_2021.apply(lambda row: pd.Series([row.flagged,"Required" if row['ammenity_customers_must_wear_masks'] == True else ("Not required" if row['ammenity_customers_must_wear_masks'] == False else row['ammenity_customers_must_wear_masks']), row.rating],index=index),axis=1)
results_df = pd.concat([results_df1,results_df2])
results_df["experiment"] = -1

plot_kwargs = {"hue_order":["Required","Not required", "Not listed", "Mentioned", "Not mentioned"]}

plot_ammenity_rating(results_df, ammenity="Masks:",base_trait="Rating",plot_kwargs=plot_kwargs)
plt.show()
plot_ammenity_rating(results_df[results_df.flagged == False], ammenity="Masks:",base_trait="Rating",plot_kwargs=plot_kwargs)

In [None]:
print("Recommended")
display(scipy.stats.spearmanr(df_august_2021[(df_august_2021.flagged == False) & (df_august_2021.ammenity_customers_must_wear_masks != "Not listed")].ammenity_customers_must_wear_masks, df_august_2021[(df_august_2021.flagged == False) & (df_august_2021.ammenity_customers_must_wear_masks != "Not listed")].rating))
display(results_df[(results_df.flagged == False)].groupby("Masks:").mean())
display(results_df[(results_df.flagged == False)].groupby("Masks:").std())
print("All")
display(scipy.stats.spearmanr(df_august_2021[(df_august_2021.ammenity_customers_must_wear_masks != "Not listed")].ammenity_customers_must_wear_masks, df_august_2021[(df_august_2021.ammenity_customers_must_wear_masks != "Not listed")].rating))
display(results_df.groupby("Masks:").mean())
display(results_df.groupby("Masks:").std())


In [None]:
# Drop the references to the intermediate result frames.
results_df1 = None
results_df2 = None
results_df = None

In [None]:
print("Reviews that mention masks")
df_target['ammenity_customers_must_wear_masks'] = df_target['ammenity_customers_must_wear_masks'].fillna("Not listed")
df_target["Masks mentions/masks required"] = df_target.apply(lambda row: f"{row['flag_mask']}/{row['ammenity_customers_must_wear_masks']}",axis=1)
df_march_2020 = df_target[df_target.date >= pd.to_datetime("2021-08-06")].copy()
plot_ammenity_rating(df_march_2020, "Masks mentions/masks required")
# subdf = df_march_2020
# parent_df = subdf
# base_trait = "rating"
# ammenity = "masks_mentioned_masks_required"
# try:
#     df_rating_masked  = subdf.groupby([ammenity,base_trait]).apply(fill_proportion_matching, parent_df=parent_df, ammenity=ammenity).reset_index()
# except KeyError:
#     print(list(subdf))
#     raise
# df_rating_masked_filled = df_rating_masked.fillna("Not listed")
# sns.barplot(x=base_trait,y=0,hue=ammenity,data=df_rating_masked_filled).set(title=f"Proprotion of reviews with each {base_trait}")
df_march_2020 = None

In [None]:
# Reviews that do NOT mention masks, with proportions computed against the
# full df_target; saved with the "_not_mentions_masks" file suffix.
plot_ammenity_rating(df_target[~df_target.flag_mask], "ammenity_customers_must_wear_masks", parent_df=df_target, suffix="not_mentions_masks")

In [None]:
# Share of mask-mentioning reviews dated before 2020-03-01. The message used
# to say 2021 while the filter (like the other cells) uses 2020-03-01.
print(f'Percentage of reviews that mention masks that occur before March 01, 2020 {len(df_flagged[df_flagged.flag_mask & (df_flagged.date < pd.to_datetime("2020-03-01"))]) / len(df_flagged[df_flagged.flag_mask]):%}')

In [None]:
# Empirical CDF of the dates of reviews that mention masks.
sns.ecdfplot(x="date",data=df_flagged[df_flagged.flag_mask])

In [None]:
# Show the text of one randomly chosen review that mentions masks and is dated
# on or after 2020-03-01. No random_state is passed, so the pick changes on
# every run.
df_flagged[df_flagged.flag_mask & (df_flagged.date >= pd.to_datetime("2020-03-01"))].sample(1).content.iloc[0]

In [None]:
# Window lengths for the sliding-window plots. A "month" is a fixed 30 days;
# three_months is not used by the functions in this cell.
one_month = pd.Timedelta("30 days")
three_months = pd.Timedelta("90 days")

def get_windowed_view(df,field,start_time=None):
    """Yield per-business, per-stratum windowed proportions of `field`.

    For every stratum in `df` the window end `t` advances in steps of
    `one_month`, from `start_time` until it reaches 1 January of the year
    after the latest review. For every business, `windowed_proportion` is
    evaluated at each `t`, and windows with no reviews are skipped.

    Parameters
    ----------
    df : pd.DataFrame
        Needs `date`, `stratum`, `business_id` and `field` columns.
    field : str
        Boolean column whose share is computed.
    start_time : datetime.datetime, optional
        Defaults to 1 January of the year of the earliest review.

    Yields
    ------
    pd.Series
        With "business_id", "stratum", "date" and "proportion".
    """
    if start_time is None:
        start_time = datetime.datetime(year=df.date.min().year,month=1,day=1)
    end_time = datetime.datetime(year=df.date.max().year+1,month=1,day=1)
    # Use the strata of the frame that was passed in. This previously read
    # the notebook global df_flagged_crawl, which only worked because callers
    # happened to pass that same frame.
    for stratum in df.stratum.unique():
        df_s = df[df.stratum == stratum]
        # The per-business split does not depend on the window, so build it
        # once per stratum instead of once per window step.
        per_business = list(df_s.groupby("business_id"))
        t = start_time
        while t < end_time:
            t += one_month
            for business_id, subdf in per_business:
                prop = windowed_proportion(subdf,t,field)
                if prop is not None:
                    yield pd.Series({"business_id": business_id, "stratum": stratum, "date": t, "proportion": prop})

def windowed_proportion(df,date,field):
    """Share of rows with a true `field` in the month ending at `date`.

    The window is [date - one_month, date). Returns None when no row falls
    inside the window.
    """
    window_start = date - one_month
    in_window = (df.date < date) & (df.date >= window_start)
    df_window = df[in_window]
    total = len(df_window)
    if total == 0:
        return None
    hits = len(df_window[df_window[field]])
    return hits/total

# For each experiment (2 = "Density", 3 = "Income") and the last crawl in
# constants.CRAWL_ORDER, plot the per-stratum share of reviews mentioning
# masks in 30-day windows starting from 2020-01-01.
for experiment, experiment_name in [(2,"Density"),(3,"Income")]:
    for crawl_id in constants.CRAWL_ORDER[-1:]:
        df_flagged_crawl = df_flagged[(df_flagged.experiment == experiment) & (df_flagged.crawl_id == crawl_id)]
        data = pd.DataFrame(get_windowed_view(df_flagged_crawl,"flag_mask",start_time=datetime.datetime(year=2020,month=1,day=1)))
        sns.lineplot(x="date",y="proportion",hue="stratum",palette="tab10",data=data).set(title=f"Proportion of reviews that mention mask, 1 month sliding window ({experiment_name}/{crawl_id})")
        plt.show()

In [None]:
# Window lengths for the sliding-window plots. A "month" is a fixed 30 days;
# three_months is not used by the functions in this cell.
one_month = pd.Timedelta("30 days")
three_months = pd.Timedelta("90 days")

def get_windowed_view(df,field):
    """Yield per-stratum windowed proportions of `field`, starting in 2020.

    For every stratum in `df` the window centre `t` advances in steps of
    `one_month`, from 2020-01-01 until it reaches 1 January of the year after
    the latest review. `windowed_proportion` is evaluated at each `t`, and
    windows with no reviews are skipped.

    Parameters
    ----------
    df : pd.DataFrame
        Needs `date`, `stratum` and `field` columns.
    field : str
        Boolean column whose share is computed.

    Yields
    ------
    pd.Series
        With "stratum", "date" and "proportion".
    """
    start_time = datetime.datetime(year=2020,month=1,day=1)
    end_time = datetime.datetime(year=df.date.max().year+1,month=1,day=1)
    # Use the strata of the frame that was passed in. This previously read
    # the notebook global df_flagged_crawl, which only worked because callers
    # happened to pass that same frame.
    for stratum in df.stratum.unique():
        df_s = df[df.stratum == stratum]
        t = start_time
        while t < end_time:
            t += one_month
            prop = windowed_proportion(df_s,t,field)
            if prop is not None:
                yield pd.Series({"stratum": stratum, "date": t, "proportion": prop})

def windowed_proportion(df,date,field):
    """Share of rows with a true `field` in the two months centred on `date`.

    The window is [date - one_month, date + one_month). Returns None when no
    row falls inside the window.
    """
    window_start = date - one_month
    window_end = date + one_month
    in_window = (df.date < window_end) & (df.date >= window_start)
    df_window = df[in_window]
    total = len(df_window)
    if total == 0:
        return None
    hits = len(df_window[df_window[field]])
    return hits/total

# Restrict to reviews that mention masks, then plot the per-stratum share of
# those reviews with `flagged` set (titled "not recommended") in windows of
# +/- 30 days. X tick labels are rotated 45 degrees.
for experiment, experiment_name in [(2,"Density"),(3,"Income")]:
    for crawl_id in constants.CRAWL_ORDER[-1:]:
        df_flagged_crawl = df_flagged[(df_flagged.experiment == experiment) & (df_flagged.crawl_id == crawl_id) & (df_flagged.flag_mask)]
        data = pd.DataFrame(get_windowed_view(df_flagged_crawl,"flagged"))
        ax = sns.lineplot(x="date",y="proportion",hue="stratum",palette="tab10",data=data)
        ax.set(title=f"Proportion of reviews that mention mask that were not recommended, 2 month sliding window ({experiment_name}/{crawl_id})")
        for item in ax.get_xticklabels():
            item.set_rotation(45)
        plt.show()

In [None]:
# Step size (30 days) and half-window (90 days) for the 6-month plots below.
one_month = pd.Timedelta("30 days")
three_months = pd.Timedelta("90 days")

def get_windowed_view(df,field):
    """Yield per-stratum windowed proportions of `field` over the whole range.

    For every stratum in `df` the window centre `t` advances in steps of
    `one_month`, from 1 January of the year of the earliest review until it
    reaches 1 January of the year after the latest review.
    `windowed_proportion` is evaluated at each `t`, and windows with no
    reviews are skipped.

    Parameters
    ----------
    df : pd.DataFrame
        Needs `date`, `stratum` and `field` columns.
    field : str
        Boolean column whose share is computed.

    Yields
    ------
    pd.Series
        With "stratum", "date" and "proportion".
    """
    start_time = datetime.datetime(year=df.date.min().year,month=1,day=1)
    end_time = datetime.datetime(year=df.date.max().year+1,month=1,day=1)
    # Use the strata of the frame that was passed in. This previously read
    # the notebook global df_flagged_crawl, which only worked because callers
    # happened to pass that same frame.
    for stratum in df.stratum.unique():
        df_s = df[df.stratum == stratum]
        t = start_time
        while t < end_time:
            t += one_month
            prop = windowed_proportion(df_s,t,field)
            if prop is not None:
                yield pd.Series({"stratum": stratum, "date": t, "proportion": prop})

def windowed_proportion(df,date,field):
    """Share of rows with a true `field` in the six months centred on `date`.

    The window is [date - three_months, date + three_months). Returns None
    when no row falls inside the window.
    """
    window_start = date - three_months
    window_end = date + three_months
    in_window = (df.date < window_end) & (df.date >= window_start)
    df_window = df[in_window]
    total = len(df_window)
    if total == 0:
        return None
    hits = len(df_window[df_window[field]])
    return hits/total

# Per-stratum share of all reviews with `flagged` set (titled
# "not recommended") in windows of +/- 90 days, over the full date range.
for experiment, experiment_name in [(2,"Density"),(3,"Income")]:
    for crawl_id in constants.CRAWL_ORDER[-1:]:
        df_flagged_crawl = df_flagged[(df_flagged.experiment == experiment) & (df_flagged.crawl_id == crawl_id)]
        data = pd.DataFrame(get_windowed_view(df_flagged_crawl,"flagged"))
        sns.lineplot(x="date",y="proportion",hue="stratum",palette="tab10",data=data).set(title=f"Proportion of reviews that were not recommended, 6 month sliding window ({experiment_name}/{crawl_id})")
        plt.show()

In [None]:
# Cumulative counts over time, per stratum, for the last crawl:
#  1) reviews that mention masks and have `flagged` set,
#  2) all reviews that mention masks.
for experiment, experiment_name in [(2,"Density"),(3,"Income")]:
    for crawl_id in constants.CRAWL_ORDER[-1:]:
        df_flagged_crawl = df_flagged[(df_flagged.experiment == experiment) & (df_flagged.crawl_id == crawl_id)]
        sns.ecdfplot(x="date",hue="stratum",data=df_flagged_crawl[df_flagged_crawl.flag_mask & df_flagged_crawl.flagged],stat="count").set(title=f"Number of not recommended reviews mentioning masks ({experiment_name}/{crawl_id})")
        plt.show()
        sns.ecdfplot(x="date",hue="stratum",data=df_flagged_crawl[df_flagged_crawl.flag_mask],stat="count").set(title=f"Number of reviews mentioning masks ({experiment_name}/{crawl_id})")
        plt.show()

In [None]:
# Cumulative counts over time, per stratum, for the last crawl:
#  1) reviews that mention masks (same plot as in the previous cell),
#  2) all reviews.
for experiment, experiment_name in [(2,"Density"),(3,"Income")]:
    for crawl_id in constants.CRAWL_ORDER[-1:]:
        df_flagged_crawl = df_flagged[(df_flagged.experiment == experiment) & (df_flagged.crawl_id == crawl_id)]
        sns.ecdfplot(x="date",hue="stratum",data=df_flagged_crawl[df_flagged_crawl.flag_mask],stat="count").set(title=f"Number of reviews mentioning masks ({experiment_name}/{crawl_id})")
        plt.show()
        sns.ecdfplot(x="date",hue="stratum",data=df_flagged_crawl,stat="count").set(title=f"All reviews ({experiment_name}/{crawl_id})")
        plt.show()

In [None]:
# For every experiment and crawl, draw two stratum x rating heatmaps:
#  1) among reviews that mention masks, the share of each rating within the
#     stratum,
#  2) among all reviews, the share of each rating within the stratum.
print("All proportions normalized by stratum")

for experiment, experiment_name in [(2,"Density"),(3,"Income")]:
    for crawl_id in constants.CRAWL_ORDER:

        df_flagged_crawl = df_flagged[df_flagged.crawl_id == crawl_id]

        # Group sizes per (flag_mask, stratum): the denominators for heatmap 1.
        flagged_counts = df_flagged_crawl[df_flagged_crawl.experiment==experiment].groupby(["flag_mask","stratum"]).size()
        print(f"{experiment_name} strata sizes")
        display(flagged_counts)
        def relative_size(subdf):
            """Group size relative to its (flag_mask, stratum) total."""
            if len(subdf) == 0:
                return None
            return len(subdf)/flagged_counts.loc[(subdf.flag_mask.iloc[0],subdf.stratum.iloc[0])]

        heatmap_data = df_flagged_crawl[df_flagged_crawl.experiment==experiment].groupby(["stratum","flag_mask","rating"]).apply(relative_size).rename("count").to_frame().reset_index().dropna()
        # DataFrame.pivot takes keyword arguments only in pandas 2.x, and the
        # keyword form also works on older versions.
        heatmap_data_flagged = heatmap_data[heatmap_data.flag_mask].pivot(index="stratum",columns="rating",values="count")
        sns.heatmap(data=heatmap_data_flagged).set(title=f"Proportion of reviews mentioning masks with a rating in each strata ({experiment_name}/{crawl_id})")
        plt.show()

        # Group sizes per stratum: the denominators for heatmap 2.
        flagged_counts = df_flagged_crawl[df_flagged_crawl.experiment==experiment].groupby(["stratum"]).size()
        def relative_size(subdf):
            """Group size relative to its stratum total."""
            if len(subdf) == 0:
                return None
            return len(subdf)/flagged_counts.loc[subdf.stratum.iloc[0]]

        heatmap_data = df_flagged_crawl[df_flagged_crawl.experiment==experiment].groupby(["stratum","rating"]).apply(relative_size).rename("count").to_frame().reset_index().dropna()
        heatmap_data_no_flag = heatmap_data.pivot(index="stratum",columns="rating",values="count")
        sns.heatmap(data=heatmap_data_no_flag,vmin=0,vmax=1).set(title=f"Proportion of reviews with a rating in each strata ({experiment_name}/{crawl_id})")
        plt.show()

In [None]:
# Raw counts of experiment-2 reviews per stratum and `flagged` value, drawn as
# two heatmaps: reviews that mention masks, then reviews that do not.
# NOTE(review): flagged_counts and relative_size are defined here but the
# heatmaps below use plain group sizes — confirm whether normalisation was
# intended.
flagged_counts = df_flagged.flag_mask.value_counts()
def relative_size(subdf):
    """Group size relative to the total for the group's flag_mask value."""
    if len(subdf) == 0:
        return None
    return len(subdf)/flagged_counts.loc[subdf.flag_mask.iloc[0]]

heatmap_data = df_flagged[df_flagged.experiment==2].groupby(["stratum","flag_mask","flagged"]).size().rename("count").to_frame().reset_index().dropna()
# DataFrame.pivot takes keyword arguments only in pandas 2.x, and the keyword
# form also works on older versions.
heatmap_data_flagged = heatmap_data[heatmap_data.flag_mask].pivot(index="stratum",columns="flagged",values="count")
sns.heatmap(data=heatmap_data_flagged)
plt.show()
heatmap_data_no_flag = heatmap_data[~heatmap_data.flag_mask].pivot(index="stratum",columns="flagged",values="count")
sns.heatmap(data=heatmap_data_no_flag)