In [None]:
import json
import os
import re
import traceback
import datetime

import math
import scipy
import scipy.stats
import numpy as np
import random

import pyzipcode
import hashlib

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from IPython.display import Markdown
from IPython.display import HTML
from tqdm import tqdm

import sqlite3
import pandas as pd
import nltk

import statsmodels.stats.multitest as multitest
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression,LogisticRegression
import sklearn.preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.model_selection import cross_val_score, cross_validate, LeaveOneOut, KFold
from sklearn.metrics import r2_score

import itertools
import collections
import functools
import collections


In [None]:
# Use seaborn's "whitegrid" style for the figures drawn in the cells below.
sns.set_style("whitegrid")

In [None]:
# Load the pickled YelpCHI frames (reviews, updated reviews, businesses).
# NOTE: unpickling can execute code stored in the file, so only trusted
# pickles should ever be placed under ../data/pickles.
yc_reviews, yc_updated_reviews, yc_businesses = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "../data/pickles/yelpchi_reviews.pkl",
        "../data/pickles/yelpchi_updated_reviews.pkl",
        "../data/pickles/yelpchi_businesses_with_chain.pkl",
    )
)

# Load the pickled Chicago frames (reviews, businesses).
chicago_reviews, chicago_businesses = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "../data/pickles/chicago_reviews.pkl",
        "../data/pickles/chicago_businesses.pkl",
    )
)

In [None]:
# Latest review date present in the original YelpCHI frame.
max_yc = yc_reviews["date"].max()
# Reviews in the updated frame dated strictly after that cutoff.
yc_new_reviews = yc_updated_reviews.loc[yc_updated_reviews["date"] > max_yc]

In [None]:
# Number of reviews at each distinct rating value in the original frame.
yc_reviews["rating"].groupby(yc_reviews["rating"]).size()

# Define distance metrics

In [None]:
def total_variational_distance(sample_a, sample_b):
    """
    Total variation distance between the empirical distributions of two samples.

    Based on the L1 distance here: https://en.wikipedia.org/wiki/Total_variation_distance_of_probability_measures

    Each sample is treated as draws from a discrete distribution. The distance
    is half the L1 norm of the difference between the two relative-frequency
    vectors, taken over the *union* of the values seen in either sample.
    (Aligning on the values of `sample_a` only would silently discard any
    value that occurs solely in `sample_b`.)

    sample_a, sample_b: any input accepted by `pd.Series` (list, array, Series).
        Missing values are ignored when counting.

    Returns a float in [0, 1]; NaN if either sample has no countable values.

    TODO switch to log probabilities if needed
    """
    counts_a = pd.Series(sample_a).value_counts()
    counts_b = pd.Series(sample_b).value_counts()

    # Every value observed in either sample gets a slot; absent values count 0.
    support = counts_a.index.union(counts_b.index)
    freq_a = counts_a.reindex(support, fill_value=0) / counts_a.sum()
    freq_b = counts_b.reindex(support, fill_value=0) / counts_b.sum()

    l1 = np.abs(freq_a.to_numpy() - freq_b.to_numpy())
    return np.sum(l1) * 0.5

In [None]:
# Distance between the rating distributions of the original and updated review frames.
total_variational_distance(
    sample_a=yc_reviews.rating,
    sample_b=yc_updated_reviews.rating,
)

#### Get the ratio of filtered:total reviews

In [None]:
def get_filtered_ratio(bid, reviews=None):
    """
    Fraction of one business's reviews that are flagged ('Y').

    bid: businessID to look up.
    reviews: frame with `businessID` and `flagged` columns. Defaults to
        `yc_new_reviews`, which is what a bare `get_filtered_ratio(bid)`
        resolved to after the three former same-named definitions had run.

    Returns None when the business has no rows in `reviews` (previously only
    the `yc_new_reviews` variant guarded against this; the other two raised
    ZeroDivisionError).
    """
    if reviews is None:
        reviews = yc_new_reviews

    business_reviews = reviews[reviews.businessID == bid]
    if len(business_reviews) == 0:
        return None

    business_filtered_reviews = business_reviews[business_reviews.flagged == 'Y']
    return len(business_filtered_reviews) / len(business_reviews)

# One ratio column per review frame, computed for every business in yc_businesses.
business_ids = pd.Series(yc_businesses.index, index=yc_businesses.index)
for ratio_column, review_frame in [
    ("filtered_ratio", yc_updated_reviews),
    ("yc_filtered_ratio", yc_reviews),
    ("yc_new_filtered_ratio", yc_new_reviews),
]:
    # Extra keyword arguments to Series.apply are forwarded to the function.
    yc_businesses[ratio_column] = business_ids.apply(get_filtered_ratio, reviews=review_frame)

In [None]:
# Spot check: how many updated reviews belong to this one business?
len(yc_updated_reviews.loc[yc_updated_reviews.businessID == "tQfLGoolUMu2J0igcWcoZg"])

In [None]:
# Businesses ordered from lowest to highest share of flagged reviews.
yc_businesses.sort_values(by="filtered_ratio")

# Ratings stats

In [None]:
# Bin edges 0.75, 1.25, ..., 5.25: every half-step rating from 1.0 to 5.0 sits mid-bin.
bins = np.arange(start=0.75, stop=5.5, step=0.5)
# Plain histogram (no density curve) of the business-level rating column.
sns.distplot(yc_businesses["rating"], kde=False, bins=bins)

### Compute the divergence between the overall rating distribution and each business's rating distribution

In [None]:
# Bin edges 0.75, 1.25, ..., 5.25 (half-step ratings fall mid-bin).
bins = np.arange(start=0.75, stop=5.5, step=0.5)
# Overlay three histograms on one axes: all reviews, unflagged ('N'), flagged ('Y').
flag_subset_ratings = [
    yc_updated_reviews.rating,
    yc_updated_reviews.loc[yc_updated_reviews.flagged=='N', "rating"],
    yc_updated_reviews.loc[yc_updated_reviews.flagged=='Y', "rating"],
]
for subset_ratings in flag_subset_ratings:
    sns.distplot(subset_ratings, kde=False, bins=bins)
# Show the bin edges as the cell's output.
bins

In [None]:
# Epps-Singleton two-sample test: ratings of unflagged ('N') vs flagged ('Y') reviews.
scipy.stats.epps_singleton_2samp(
    yc_updated_reviews.loc[yc_updated_reviews.flagged=='N', "rating"],
    yc_updated_reviews.loc[yc_updated_reviews.flagged=='Y', "rating"],
)

In [None]:
def _sorted_rating_baselines(reviews):
    """
    Sorted rating lists for one review frame, as a (trunc, mixed, filtd) tuple:
    unflagged ('N') reviews only, all reviews, flagged ('Y') reviews only.
    """
    return (
        sorted(reviews[reviews.flagged=='N'].rating),
        sorted(reviews.rating),
        sorted(reviews[reviews.flagged=='Y'].rating),
    )

# Baselines over the full updated frame.
(ratings_baseline_trunc,
 ratings_baseline_mixed,
 ratings_baseline_filtd) = _sorted_rating_baselines(yc_updated_reviews)

# Baselines over the original frame.
(yc_ratings_baseline_trunc,
 yc_ratings_baseline_mixed,
 yc_ratings_baseline_filtd) = _sorted_rating_baselines(yc_reviews)

# Baselines over the reviews newer than the original frame.
(yc_new_ratings_baseline_trunc,
 yc_new_ratings_baseline_mixed,
 yc_new_ratings_baseline_filtd) = _sorted_rating_baselines(yc_new_reviews)

In [None]:
# Three independent, empty result frames (unflagged / all / flagged reviews).
distance_trunc, distance_mixed, distance_filtd = (pd.DataFrame() for _ in range(3))

In [None]:
#Statistics to use

def get_tvd(baseline,comparison):
    """
    Total variation distance of `comparison` against `baseline`.

    Returns a one-element Series indexed by "statistic", or None when
    `comparison` has fewer than 5 observations.
    """
    if len(comparison) >= 5:
        distance = total_variational_distance(baseline,comparison)
        return pd.Series([distance],index=["statistic"])
    return None

def get_es2(baseline,comparison):
    """
    Epps-Singleton two-sample test of `comparison` against `baseline`.

    Returns a Series indexed by "statistic" and "pvalue", or None when
    `comparison` has fewer than 5 observations.
    """
    if len(comparison) >= 5:
        outcome = scipy.stats.epps_singleton_2samp(baseline,comparison)
        return pd.Series(outcome,index=["statistic","pvalue"])
    return None

def get_ks2(baseline,comparison):
    """
    Two-sample Kolmogorov-Smirnov test of `comparison` against `baseline`.

    Returns a Series indexed by "statistic" and "pvalue", or None when
    `comparison` has fewer than 5 observations.
    """
    if len(comparison) >= 5:
        outcome = scipy.stats.ks_2samp(baseline,comparison)
        return pd.Series(outcome,index=["statistic","pvalue"])
    return None

def metric_wrapper(metric_fxn=None,df_baseline=None,mode=None):
    """
    Bind a two-sample metric to a fixed baseline and a review-flag filter.

    metric_fxn: callable taking (baseline, comparison).
    df_baseline: baseline sample passed as the first argument on every call.
    mode: which rows of each group supply the comparison ratings:
        "N" - rows flagged 'N'; "Y" - rows flagged 'Y'; "A" - all rows.

    Returns a function of one group (a frame with `flagged` and `rating`
    columns) suitable for `groupby(...).apply`.

    Raises ValueError from the returned function when `mode` is not one of
    "N", "Y" or "A" (previously a bare `raise`, which surfaced as an
    unrelated "No active exception to reraise" RuntimeError).
    """
    def wrapped(group):
        if mode == "A":
            ratings = group.rating
        elif mode in ("Y", "N"):
            ratings = group[group.flagged==mode].rating
        else:
            raise ValueError(f"mode must be 'N', 'Y' or 'A', got {mode!r}")
        return metric_fxn(df_baseline,ratings)
    return wrapped

In [None]:
def update_with_metric(metric_fxn=None,metric_name=None,log=False,pvalue=False,df=None,df_name=None,df_baseline=None,mode=None):
    """
    Compute one distance metric per business and return it as a small frame.

    metric_fxn: callable (baseline, comparison) returning a Series with a
        "statistic" entry (and a "pvalue" entry when `pvalue` is True), or None.
    metric_name: short metric label used in the output column names.
    log: also add a `log_<metric_name>_statistic` column (natural log).
    pvalue: also add multiple-testing-adjusted p-values and the reject flags.
    df: review frame with a `businessID` column; grouped per business.
    df_name: optional dataset label; when non-empty it prefixes every column
        as "<df_name>_". None or "" means no prefix.
    df_baseline, mode: forwarded to `metric_wrapper`.

    Returns a DataFrame indexed by businessID with columns, in order:
    `<prefix><metric>_pvalue`, `<prefix><metric>_significant` (only if
    `pvalue`), `<prefix><metric>_statistic`, and
    `<prefix>log_<metric>_statistic` (only if `log`).
    """
    # Truthiness instead of len(): the declared default df_name=None must not crash.
    prefix = f"{df_name}_" if df_name else ""

    distance_addition = pd.DataFrame()

    #Wrap function
    wrapped_metric = metric_wrapper(metric_fxn, df_baseline, mode)

    #Compute statistic
    distance_results = df.groupby("businessID").apply(wrapped_metric)

    #Adjust the p-values to account for multiple hypothesis testing
    if pvalue:
        # Only businesses that actually produced a p-value enter the correction.
        tested = distance_results[distance_results.pvalue.notnull()]
        reject, adjusted = multitest.multipletests(tested["pvalue"])[:2]
        distance_addition[f"{prefix}{metric_name}_pvalue"] = pd.Series(adjusted,index=tested.index)
        distance_addition[f"{prefix}{metric_name}_significant"] = pd.Series(reject,index=tested.index)

    #Rename
    statistic_column = f"{prefix}{metric_name}_statistic"
    distance_addition[statistic_column] = distance_results["statistic"]

    if log:
        distance_addition[f"{prefix}log_{metric_name}_statistic"] = np.log(distance_addition[statistic_column])

    return distance_addition

In [None]:
def get_metric_args(metric_name):
    """
    Keyword arguments for `update_with_metric` that depend on the metric.

    metric_name: "tvd", "es" or "ks" (anything else raises KeyError).

    Returns a fresh dict with keys "metric_fxn", "log", "pvalue" and
    "metric_name".
    """
    # metric -> (function, add log column?, has p-value?)
    catalogue = {
        "tvd": (get_tvd, False, False),
        "es": (get_es2, True, True),
        "ks": (get_ks2, False, True),
    }
    metric_fxn, log, pvalue = catalogue[metric_name]

    return {
        "metric_fxn": metric_fxn,
        "log": log,
        "pvalue": pvalue,
        "metric_name": metric_name,
    }

def get_df_args(df_name,mode):
    """
    Keyword arguments for `update_with_metric` that depend on the dataset.

    df_name: "" (updated reviews), "yc" (original reviews) or "yc_new"
        (reviews newer than the original frame).
    mode: "A" (all reviews), "Y" (flagged) or "N" (unflagged); selects the
        matching module-level baseline.

    Returns a fresh dict with keys "df", "df_name", "df_baseline" and "mode".
    Unknown `df_name` or `mode` raises KeyError.
    """
    reviews = {
        "": yc_updated_reviews,
        "yc": yc_reviews,
        "yc_new": yc_new_reviews,
    }[df_name]

    baseline = {
        "": {
            "A": ratings_baseline_mixed,
            "Y": ratings_baseline_filtd,
            "N": ratings_baseline_trunc,
        },
        "yc": {
            "A": yc_ratings_baseline_mixed,
            "Y": yc_ratings_baseline_filtd,
            "N": yc_ratings_baseline_trunc,
        },
        "yc_new": {
            "A": yc_new_ratings_baseline_mixed,
            "Y": yc_new_ratings_baseline_filtd,
            "N": yc_new_ratings_baseline_trunc,
        },
    }[df_name][mode]

    return {
        "df": reviews,
        "df_name": df_name,
        "df_baseline": baseline,
        "mode": mode,
    }

def get_args(metric_name=None,df_name=None,mode=None):
    """
    Full keyword-argument dict for `update_with_metric`: the metric-specific
    entries merged with the dataset-specific ones (the latter win on a clash).
    """
    return {**get_metric_args(metric_name), **get_df_args(df_name,mode)}

In [None]:
# Start from empty frames so re-running this cell does not duplicate columns.
distance_trunc = pd.DataFrame()
distance_mixed = pd.DataFrame()
distance_filtd = pd.DataFrame()

# Every (metric, dataset) pair, metrics outermost; each pair is evaluated for
# unflagged ("N"), all ("A") and flagged ("Y") reviews and appended column-wise.
for metric_name, df_name in itertools.product(["tvd","ks","es"], ["","yc","yc_new"]):
    distance_trunc_addition = update_with_metric(**get_args(metric_name=metric_name,df_name=df_name,mode="N"))
    distance_trunc = pd.concat([distance_trunc,distance_trunc_addition],axis=1)

    distance_mixed_addition = update_with_metric(**get_args(metric_name=metric_name,df_name=df_name,mode="A"))
    distance_mixed = pd.concat([distance_mixed,distance_mixed_addition],axis=1)

    distance_filtd_addition = update_with_metric(**get_args(metric_name=metric_name,df_name=df_name,mode="Y"))
    distance_filtd = pd.concat([distance_filtd,distance_filtd_addition],axis=1)

#### Total Variation Distance

In [None]:
%%script false --no-raise-error
distance_trunc_addition = pd.DataFrame()
distance_mixed_addition = pd.DataFrame()
distance_filtd_addition = pd.DataFrame()

#Compute statistic

distance_trunc_results = yc_updated_reviews.groupby("businessID").apply(lambda group: get_tvd(ratings_baseline_trunc,group[group.flagged=='N'].rating))
distance_mixed_results = yc_updated_reviews.groupby("businessID").apply(lambda group: get_tvd(ratings_baseline_mixed,group.rating))
distance_filtd_results = yc_updated_reviews.groupby("businessID").apply(lambda group: get_tvd(ratings_baseline_filtd,group[group.flagged=='Y'].rating))

distance_trunc_addition["tvd_statistic"] = distance_trunc_results["statistic"]
distance_mixed_addition["tvd_statistic"] = distance_mixed_results["statistic"]
distance_filtd_addition["tvd_statistic"] = distance_filtd_results["statistic"]

distance_trunc = pd.concat([distance_trunc,distance_trunc_addition],axis=1)
distance_mixed = pd.concat([distance_mixed,distance_mixed_addition],axis=1)
distance_filtd = pd.concat([distance_filtd,distance_filtd_addition],axis=1)

In [None]:
%%script false --no-raise-error
distance_trunc_addition = pd.DataFrame()
distance_mixed_addition = pd.DataFrame()
distance_filtd_addition = pd.DataFrame()

#Compute statistic

distance_trunc_results = yc_reviews.groupby("businessID").apply(lambda group: get_tvd(yc_ratings_baseline_trunc,group[group.flagged=='N'].rating))
distance_mixed_results = yc_reviews.groupby("businessID").apply(lambda group: get_tvd(yc_ratings_baseline_mixed,group.rating))
distance_filtd_results = yc_reviews.groupby("businessID").apply(lambda group: get_tvd(yc_ratings_baseline_filtd,group[group.flagged=='Y'].rating))

distance_trunc_addition["yc_tvd_statistic"] = distance_trunc_results["statistic"]
distance_mixed_addition["yc_tvd_statistic"] = distance_mixed_results["statistic"]
distance_filtd_addition["yc_tvd_statistic"] = distance_filtd_results["statistic"]

distance_trunc = pd.concat([distance_trunc,distance_trunc_addition],axis=1)
distance_mixed = pd.concat([distance_mixed,distance_mixed_addition],axis=1)
distance_filtd = pd.concat([distance_filtd,distance_filtd_addition],axis=1)

In [None]:
%%script false --no-raise-error
distance_trunc_addition = pd.DataFrame()
distance_mixed_addition = pd.DataFrame()
distance_filtd_addition = pd.DataFrame()

#Compute statistic

distance_trunc_results = yc_new_reviews.groupby("businessID").apply(lambda group: get_tvd(yc_new_ratings_baseline_trunc,group[group.flagged=='N'].rating))
distance_mixed_results = yc_new_reviews.groupby("businessID").apply(lambda group: get_tvd(yc_new_ratings_baseline_mixed,group.rating))
distance_filtd_results = yc_new_reviews.groupby("businessID").apply(lambda group: get_tvd(yc_new_ratings_baseline_filtd,group[group.flagged=='Y'].rating))

distance_trunc_addition["yc_new_tvd_statistic"] = distance_trunc_results["statistic"]
distance_mixed_addition["yc_new_tvd_statistic"] = distance_mixed_results["statistic"]
distance_filtd_addition["yc_new_tvd_statistic"] = distance_filtd_results["statistic"]

distance_trunc = pd.concat([distance_trunc,distance_trunc_addition],axis=1)
distance_mixed = pd.concat([distance_mixed,distance_mixed_addition],axis=1)
distance_filtd = pd.concat([distance_filtd,distance_filtd_addition],axis=1)

#### Epps-Singleton statistic

In [None]:
%%script false --no-raise-error
distance_trunc_addition = pd.DataFrame()
distance_mixed_addition = pd.DataFrame()
distance_filtd_addition = pd.DataFrame()

#Compute statistic

distance_trunc_results = yc_updated_reviews.groupby("businessID").apply(lambda group: get_es2(ratings_baseline_trunc,group[group.flagged=='N'].rating))
distance_mixed_results = yc_updated_reviews.groupby("businessID").apply(lambda group: get_es2(ratings_baseline_mixed,group.rating))
distance_filtd_results = yc_updated_reviews.groupby("businessID").apply(lambda group: get_es2(ratings_baseline_filtd,group[group.flagged=='Y'].rating))

#Adjust the p-values to account for multiple hypothesis testing
results_trunc = multitest.multipletests(distance_trunc_results[distance_trunc_results.pvalue.notnull()]["pvalue"])
results_mixed = multitest.multipletests(distance_mixed_results[distance_mixed_results.pvalue.notnull()]["pvalue"])
results_filtd = multitest.multipletests(distance_filtd_results[distance_filtd_results.pvalue.notnull()]["pvalue"])
print(results_trunc[2:4])
print(results_mixed[2:4])
print(results_filtd[2:4])

#Rename
distance_trunc_addition["es_significant"] = pd.Series(results_trunc[0],index=distance_trunc_results[distance_trunc_results.pvalue.notnull()].index)
distance_mixed_addition["es_significant"] = pd.Series(results_mixed[0],index=distance_mixed_results[distance_mixed_results.pvalue.notnull()].index)
distance_filtd_addition["es_significant"] = pd.Series(results_filtd[0],index=distance_filtd_results[distance_filtd_results.pvalue.notnull()].index)

distance_trunc_addition["es_pvalue"] = pd.Series(results_trunc[1],index=distance_trunc_results[distance_trunc_results.pvalue.notnull()].index)
distance_mixed_addition["es_pvalue"] = pd.Series(results_mixed[1],index=distance_mixed_results[distance_mixed_results.pvalue.notnull()].index)
distance_filtd_addition["es_pvalue"] = pd.Series(results_filtd[1],index=distance_filtd_results[distance_filtd_results.pvalue.notnull()].index)

distance_trunc_addition["es_statistic"] = distance_trunc_results["statistic"]
distance_mixed_addition["es_statistic"] = distance_mixed_results["statistic"]
distance_filtd_addition["es_statistic"] = distance_filtd_results["statistic"]

distance_trunc_addition["log_es_statistic"] = np.log(distance_trunc_addition["es_statistic"])
distance_mixed_addition["log_es_statistic"] = np.log(distance_mixed_addition["es_statistic"])
# Log of the filtered frame's own statistic. This line used to read the mixed
# frame's column (copy-paste slip), so the filtered log column held mixed values.
distance_filtd_addition["log_es_statistic"] = np.log(distance_filtd_addition["es_statistic"])

distance_trunc = pd.concat([distance_trunc,distance_trunc_addition],axis=1)
distance_mixed = pd.concat([distance_mixed,distance_mixed_addition],axis=1)
distance_filtd = pd.concat([distance_filtd,distance_filtd_addition],axis=1)

In [None]:
%%script false --no-raise-error
distance_trunc_addition = pd.DataFrame()
distance_mixed_addition = pd.DataFrame()
distance_filtd_addition = pd.DataFrame()

#Compute statistic

distance_trunc_results = yc_reviews.groupby("businessID").apply(lambda group: get_es2(yc_ratings_baseline_trunc,group[group.flagged=='N'].rating))
distance_mixed_results = yc_reviews.groupby("businessID").apply(lambda group: get_es2(yc_ratings_baseline_mixed,group.rating))
distance_filtd_results = yc_reviews.groupby("businessID").apply(lambda group: get_es2(yc_ratings_baseline_filtd,group[group.flagged=='Y'].rating))

#Adjust the p-values to account for multiple hypothesis testing
results_trunc = multitest.multipletests(distance_trunc_results[distance_trunc_results.pvalue.notnull()]["pvalue"])
results_mixed = multitest.multipletests(distance_mixed_results[distance_mixed_results.pvalue.notnull()]["pvalue"])
results_filtd = multitest.multipletests(distance_filtd_results[distance_filtd_results.pvalue.notnull()]["pvalue"])
print(results_trunc[2:4])
print(results_mixed[2:4])
print(results_filtd[2:4])

#Rename
distance_trunc_addition["yc_es_significant"] = pd.Series(results_trunc[0],index=distance_trunc_results[distance_trunc_results.pvalue.notnull()].index)
distance_trunc_addition["yc_es_pvalue"] = pd.Series(results_trunc[1],index=distance_trunc_results[distance_trunc_results.pvalue.notnull()].index)
distance_mixed_addition["yc_es_significant"] = pd.Series(results_mixed[0],index=distance_mixed_results[distance_mixed_results.pvalue.notnull()].index)
distance_mixed_addition["yc_es_pvalue"] = pd.Series(results_mixed[1],index=distance_mixed_results[distance_mixed_results.pvalue.notnull()].index)
distance_filtd_addition["yc_es_significant"] = pd.Series(results_filtd[0],index=distance_filtd_results[distance_filtd_results.pvalue.notnull()].index)
distance_filtd_addition["yc_es_pvalue"] = pd.Series(results_filtd[1],index=distance_filtd_results[distance_filtd_results.pvalue.notnull()].index)

distance_trunc_addition["yc_es_statistic"] = distance_trunc_results["statistic"]
distance_trunc_addition["yc_log_es_statistic"] = np.log(distance_trunc_addition["yc_es_statistic"])
distance_mixed_addition["yc_es_statistic"] = distance_mixed_results["statistic"]
distance_mixed_addition["yc_log_es_statistic"] = np.log(distance_mixed_addition["yc_es_statistic"])
distance_filtd_addition["yc_es_statistic"] = distance_filtd_results["statistic"]
# Log of the filtered frame's own statistic. This line used to read the mixed
# frame's column (copy-paste slip), so the filtered log column held mixed values.
distance_filtd_addition["yc_log_es_statistic"] = np.log(distance_filtd_addition["yc_es_statistic"])

distance_trunc = pd.concat([distance_trunc,distance_trunc_addition],axis=1)
distance_mixed = pd.concat([distance_mixed,distance_mixed_addition],axis=1)
distance_filtd = pd.concat([distance_filtd,distance_filtd_addition],axis=1)

In [None]:
%%script false --no-raise-error
distance_trunc_addition = pd.DataFrame()
distance_mixed_addition = pd.DataFrame()
distance_filtd_addition = pd.DataFrame()

#Compute statistic

distance_trunc_results = yc_new_reviews.groupby("businessID").apply(lambda group: get_es2(yc_new_ratings_baseline_trunc,group[group.flagged=='N'].rating))
distance_mixed_results = yc_new_reviews.groupby("businessID").apply(lambda group: get_es2(yc_new_ratings_baseline_mixed,group.rating))
distance_filtd_results = yc_new_reviews.groupby("businessID").apply(lambda group: get_es2(yc_new_ratings_baseline_filtd,group[group.flagged=='Y'].rating))

#Adjust the p-values to account for multiple hypothesis testing
results_trunc = multitest.multipletests(distance_trunc_results[distance_trunc_results.pvalue.notnull()]["pvalue"])
results_mixed = multitest.multipletests(distance_mixed_results[distance_mixed_results.pvalue.notnull()]["pvalue"])
results_filtd = multitest.multipletests(distance_filtd_results[distance_filtd_results.pvalue.notnull()]["pvalue"])
print(results_trunc[2:4])
print(results_mixed[2:4])
print(results_filtd[2:4])

#Rename
distance_trunc_addition["yc_new_es_significant"] = pd.Series(results_trunc[0],index=distance_trunc_results[distance_trunc_results.pvalue.notnull()].index)
distance_trunc_addition["yc_new_es_pvalue"] = pd.Series(results_trunc[1],index=distance_trunc_results[distance_trunc_results.pvalue.notnull()].index)
distance_mixed_addition["yc_new_es_significant"] = pd.Series(results_mixed[0],index=distance_mixed_results[distance_mixed_results.pvalue.notnull()].index)
distance_mixed_addition["yc_new_es_pvalue"] = pd.Series(results_mixed[1],index=distance_mixed_results[distance_mixed_results.pvalue.notnull()].index)
distance_filtd_addition["yc_new_es_significant"] = pd.Series(results_filtd[0],index=distance_filtd_results[distance_filtd_results.pvalue.notnull()].index)
distance_filtd_addition["yc_new_es_pvalue"] = pd.Series(results_filtd[1],index=distance_filtd_results[distance_filtd_results.pvalue.notnull()].index)

distance_trunc_addition["yc_new_es_statistic"] = distance_trunc_results["statistic"]
distance_trunc_addition["yc_new_log_es_statistic"] = np.log(distance_trunc_addition["yc_new_es_statistic"])
distance_mixed_addition["yc_new_es_statistic"] = distance_mixed_results["statistic"]
distance_mixed_addition["yc_new_log_es_statistic"] = np.log(distance_mixed_addition["yc_new_es_statistic"])
distance_filtd_addition["yc_new_es_statistic"] = distance_filtd_results["statistic"]
# Log of the filtered frame's own statistic. This line used to read the mixed
# frame's column (copy-paste slip), so the filtered log column held mixed values.
distance_filtd_addition["yc_new_log_es_statistic"] = np.log(distance_filtd_addition["yc_new_es_statistic"])

distance_trunc = pd.concat([distance_trunc,distance_trunc_addition],axis=1)
distance_mixed = pd.concat([distance_mixed,distance_mixed_addition],axis=1)
distance_filtd = pd.concat([distance_filtd,distance_filtd_addition],axis=1)

#### KS Statistic

In [None]:
%%script false --no-raise-error
distance_trunc_addition = pd.DataFrame()
distance_mixed_addition = pd.DataFrame()
distance_filtd_addition = pd.DataFrame()

#Compute statistic

distance_trunc_results = yc_updated_reviews.groupby("businessID").apply(lambda group: get_ks2(ratings_baseline_trunc,group[group.flagged=='N'].rating))
distance_mixed_results = yc_updated_reviews.groupby("businessID").apply(lambda group: get_ks2(ratings_baseline_mixed,group.rating))
distance_filtd_results = yc_updated_reviews.groupby("businessID").apply(lambda group: get_ks2(ratings_baseline_filtd,group[group.flagged=='Y'].rating))

#Adjust the p-values to account for multiple hypothesis testing
results_trunc = multitest.multipletests(distance_trunc_results[distance_trunc_results.pvalue.notnull()]["pvalue"])
results_mixed = multitest.multipletests(distance_mixed_results[distance_mixed_results.pvalue.notnull()]["pvalue"])
results_filtd = multitest.multipletests(distance_filtd_results[distance_filtd_results.pvalue.notnull()]["pvalue"])
print(results_trunc[2:4])
print(results_mixed[2:4])
print(results_filtd[2:4])

#Rename
distance_trunc_addition["ks_significant"] = pd.Series(results_trunc[0],index=distance_trunc_results[distance_trunc_results.pvalue.notnull()].index)
distance_trunc_addition["ks_pvalue"] = pd.Series(results_trunc[1],index=distance_trunc_results[distance_trunc_results.pvalue.notnull()].index)
distance_mixed_addition["ks_significant"] = pd.Series(results_mixed[0],index=distance_mixed_results[distance_mixed_results.pvalue.notnull()].index)
distance_mixed_addition["ks_pvalue"] = pd.Series(results_mixed[1],index=distance_mixed_results[distance_mixed_results.pvalue.notnull()].index)
distance_filtd_addition["ks_significant"] = pd.Series(results_filtd[0],index=distance_filtd_results[distance_filtd_results.pvalue.notnull()].index)
distance_filtd_addition["ks_pvalue"] = pd.Series(results_filtd[1],index=distance_filtd_results[distance_filtd_results.pvalue.notnull()].index)

distance_trunc_addition["ks_statistic"] = distance_trunc_results["statistic"]
distance_trunc_addition["log_ks_statistic"] = np.log(distance_trunc_addition["ks_statistic"])
distance_mixed_addition["ks_statistic"] = distance_mixed_results["statistic"]
distance_mixed_addition["log_ks_statistic"] = np.log(distance_mixed_addition["ks_statistic"])
distance_filtd_addition["ks_statistic"] = distance_filtd_results["statistic"]
distance_filtd_addition["log_ks_statistic"] = np.log(distance_filtd_addition["ks_statistic"])

distance_trunc = pd.concat([distance_trunc,distance_trunc_addition],axis=1)
distance_mixed = pd.concat([distance_mixed,distance_mixed_addition],axis=1)
distance_filtd = pd.concat([distance_filtd,distance_filtd_addition],axis=1)

In [None]:
%%script false --no-raise-error
distance_trunc_addition = pd.DataFrame()
distance_mixed_addition = pd.DataFrame()
distance_filtd_addition = pd.DataFrame()

#Compute statistic

distance_trunc_results = yc_reviews.groupby("businessID").apply(lambda group: get_ks2(yc_ratings_baseline_trunc,group[group.flagged=='N'].rating))
distance_mixed_results = yc_reviews.groupby("businessID").apply(lambda group: get_ks2(yc_ratings_baseline_mixed,group.rating))
distance_filtd_results = yc_reviews.groupby("businessID").apply(lambda group: get_ks2(yc_ratings_baseline_filtd,group[group.flagged=='Y'].rating))

#Adjust the p-values to account for multiple hypothesis testing
results_trunc = multitest.multipletests(distance_trunc_results[distance_trunc_results.pvalue.notnull()]["pvalue"])
results_mixed = multitest.multipletests(distance_mixed_results[distance_mixed_results.pvalue.notnull()]["pvalue"])
results_filtd = multitest.multipletests(distance_filtd_results[distance_filtd_results.pvalue.notnull()]["pvalue"])
print(results_trunc[2:4])
print(results_mixed[2:4])
print(results_filtd[2:4])

#Rename
distance_trunc_addition["yc_ks_significant"] = pd.Series(results_trunc[0],index=distance_trunc_results[distance_trunc_results.pvalue.notnull()].index)
distance_trunc_addition["yc_ks_pvalue"] = pd.Series(results_trunc[1],index=distance_trunc_results[distance_trunc_results.pvalue.notnull()].index)
distance_mixed_addition["yc_ks_significant"] = pd.Series(results_mixed[0],index=distance_mixed_results[distance_mixed_results.pvalue.notnull()].index)
distance_mixed_addition["yc_ks_pvalue"] = pd.Series(results_mixed[1],index=distance_mixed_results[distance_mixed_results.pvalue.notnull()].index)
distance_filtd_addition["yc_ks_significant"] = pd.Series(results_filtd[0],index=distance_filtd_results[distance_filtd_results.pvalue.notnull()].index)
distance_filtd_addition["yc_ks_pvalue"] = pd.Series(results_filtd[1],index=distance_filtd_results[distance_filtd_results.pvalue.notnull()].index)

distance_trunc_addition["yc_ks_statistic"] = distance_trunc_results["statistic"]
distance_trunc_addition["yc_log_ks_statistic"] = np.log(distance_trunc_addition["yc_ks_statistic"])
distance_mixed_addition["yc_ks_statistic"] = distance_mixed_results["statistic"]
distance_mixed_addition["yc_log_ks_statistic"] = np.log(distance_mixed_addition["yc_ks_statistic"])
distance_filtd_addition["yc_ks_statistic"] = distance_filtd_results["statistic"]
distance_filtd_addition["yc_log_ks_statistic"] = np.log(distance_filtd_addition["yc_ks_statistic"])

distance_trunc = pd.concat([distance_trunc,distance_trunc_addition],axis=1)
distance_mixed = pd.concat([distance_mixed,distance_mixed_addition],axis=1)
distance_filtd = pd.concat([distance_filtd,distance_filtd_addition],axis=1)

In [None]:
%%script false --no-raise-error
distance_trunc_addition = pd.DataFrame()
distance_mixed_addition = pd.DataFrame()
distance_filtd_addition = pd.DataFrame()

#Compute statistic

distance_trunc_results = yc_new_reviews.groupby("businessID").apply(lambda group: get_ks2(yc_new_ratings_baseline_trunc,group[group.flagged=='N'].rating))
distance_mixed_results = yc_new_reviews.groupby("businessID").apply(lambda group: get_ks2(yc_new_ratings_baseline_mixed,group.rating))
distance_filtd_results = yc_new_reviews.groupby("businessID").apply(lambda group: get_ks2(yc_new_ratings_baseline_filtd,group[group.flagged=='Y'].rating))

#Adjust the p-values to account for multiple hypothesis testing
results_trunc = multitest.multipletests(distance_trunc_results[distance_trunc_results.pvalue.notnull()]["pvalue"])
results_mixed = multitest.multipletests(distance_mixed_results[distance_mixed_results.pvalue.notnull()]["pvalue"])
results_filtd = multitest.multipletests(distance_filtd_results[distance_filtd_results.pvalue.notnull()]["pvalue"])
print(results_trunc[2:4])
print(results_mixed[2:4])
print(results_filtd[2:4])

#Rename
distance_trunc_addition["yc_new_ks_significant"] = pd.Series(results_trunc[0],index=distance_trunc_results[distance_trunc_results.pvalue.notnull()].index)
distance_trunc_addition["yc_new_ks_pvalue"] = pd.Series(results_trunc[1],index=distance_trunc_results[distance_trunc_results.pvalue.notnull()].index)
distance_mixed_addition["yc_new_ks_significant"] = pd.Series(results_mixed[0],index=distance_mixed_results[distance_mixed_results.pvalue.notnull()].index)
distance_mixed_addition["yc_new_ks_pvalue"] = pd.Series(results_mixed[1],index=distance_mixed_results[distance_mixed_results.pvalue.notnull()].index)
distance_filtd_addition["yc_new_ks_significant"] = pd.Series(results_filtd[0],index=distance_filtd_results[distance_filtd_results.pvalue.notnull()].index)
distance_filtd_addition["yc_new_ks_pvalue"] = pd.Series(results_filtd[1],index=distance_filtd_results[distance_filtd_results.pvalue.notnull()].index)

distance_trunc_addition["yc_new_ks_statistic"] = distance_trunc_results["statistic"]
distance_trunc_addition["yc_new_log_ks_statistic"] = np.log(distance_trunc_addition["yc_new_ks_statistic"])
distance_mixed_addition["yc_new_ks_statistic"] = distance_mixed_results["statistic"]
distance_mixed_addition["yc_new_log_ks_statistic"] = np.log(distance_mixed_addition["yc_new_ks_statistic"])
distance_filtd_addition["yc_new_ks_statistic"] = distance_filtd_results["statistic"]
distance_filtd_addition["yc_new_log_ks_statistic"] = np.log(distance_filtd_addition["yc_new_ks_statistic"])

distance_trunc = pd.concat([distance_trunc,distance_trunc_addition],axis=1)
distance_mixed = pd.concat([distance_mixed,distance_mixed_addition],axis=1)
distance_filtd = pd.concat([distance_filtd,distance_filtd_addition],axis=1)

In [None]:
# Display the accumulated per-business distance frame for flagged ('Y') reviews.
distance_filtd

#### Distribution

In [None]:
# One figure per statistic; each overlays the truncated / mixed / filtered frames.
statistic_panels = {
    "log_es_statistic": ("ES truncated", "ES mixed", "ES filtered"),
    "ks_statistic": ("KS truncated", "KS mixed", "KS filtered"),
    "tvd_statistic": ("TVD truncated", "TVD mixed", "TVD filtered"),
}
for statistic_column, panel_labels in statistic_panels.items():
    panel_frames = (distance_trunc, distance_mixed, distance_filtd)
    for panel_frame, panel_label in zip(panel_frames, panel_labels):
        sns.distplot(panel_frame[statistic_column],label=panel_label,rug=True,hist=True)
    plt.legend()
    plt.show()

In [None]:
# 0.01-wide bins over [0, 1) for the adjusted p-values.
bins = np.arange(0,1,0.01)

# Same option for histogram and KDE so both are drawn cumulatively.
kwargs = dict(cumulative=True)

# One figure per test; each overlays the truncated / mixed / filtered frames.
pvalue_panels = (
    ("es_pvalue", ("ES truncated", "ES mixed", "ES filtered")),
    ("ks_pvalue", ("KS truncated", "KS mixed", "KS filtered")),
)
for pvalue_column, curve_labels in pvalue_panels:
    curve_frames = (distance_trunc, distance_mixed, distance_filtd)
    for curve_frame, curve_label in zip(curve_frames, curve_labels):
        sns.distplot(curve_frame[pvalue_column],label=curve_label,rug=True,hist=True,bins=bins,hist_kws=kwargs,kde_kws=kwargs)
    plt.legend()
    plt.show()

#### Bin p-values by significance level

In [None]:
def bin_pvalue(pvalue):
    """Label a p-value with the strictest significance level it beats.

    Returns the smallest of "0.001", "0.005", "0.01", "0.05" that `pvalue` is
    strictly below, or "Not significant" when it is below none of them (this
    includes NaN, which compares False against every threshold).
    """
    thresholds = (0.001, 0.005, 0.01, 0.05)
    level = next((t for t in thresholds if pvalue < t), None)
    if level is None:
        return "Not significant"
    # Three decimals, then drop trailing zeros: 0.010 -> "0.01".
    return ("%0.3f" % level).rstrip("0")
# Add a binned copy of every KS / ES p-value column, for both the current
# ("") and the original YelpChi ("yc_") comparisons, to each variant table.
for _variant_frame in (distance_trunc, distance_mixed, distance_filtd):
    for _prefix in ("", "yc_"):
        for _test in ("ks", "es"):
            _variant_frame[f"{_prefix}{_test}_pvalue_bin"] = (
                _variant_frame[f"{_prefix}{_test}_pvalue"].apply(bin_pvalue)
            )

In [None]:
# Tag every column with its variant so the three tables can be joined side by
# side without name clashes. NOTE: re-running this cell appends the suffix again.
distance_trunc = distance_trunc.add_suffix("_trunc")
distance_mixed = distance_mixed.add_suffix("_mixed")
distance_filtd = distance_filtd.add_suffix("_filtd")

### Construct a dataframe for doing stats on

In [None]:
# Column-wise inner join on the index: only businesses present in yc_businesses
# and in all three distance tables survive.
businesses_distances = pd.concat([yc_businesses,distance_trunc, distance_mixed, distance_filtd], join="inner", axis=1)

#### What would it look like if ratings were drawn randomly

# How do the distances change over time?

In [None]:
# Current statistic (y) against the original YelpChi statistic (x), one figure
# per statistic with the three variants overlaid.
for stat_column, stat_label in (
    ("ks_statistic", "KS"),
    ("log_es_statistic", "ES"),
    ("tvd_statistic", "TVD"),
):
    for suffix, variant_label in (
        ("trunc", "Trunc"),
        ("mixed", "Mixed"),
        ("filtd", "Filtered"),
    ):
        sns.scatterplot(
            y=f"{stat_column}_{suffix}",
            x=f"yc_{stat_column}_{suffix}",
            data=businesses_distances,
            label=f"{stat_label} {variant_label}",
        )
    plt.show()

In [None]:
# Statistic on the post-YelpChi ("yc_new") reviews (y) against the original
# YelpChi statistic (x), one figure per statistic with the variants overlaid.
for stat_column, stat_label in (
    ("ks_statistic", "KS"),
    ("log_es_statistic", "ES"),
    ("tvd_statistic", "TVD"),
):
    for suffix, variant_label in (
        ("trunc", "Trunc"),
        ("mixed", "Mixed"),
        ("filtd", "Filtered"),
    ):
        sns.scatterplot(
            y=f"yc_new_{stat_column}_{suffix}",
            x=f"yc_{stat_column}_{suffix}",
            data=businesses_distances,
            label=f"{stat_label} {variant_label}",
        )
    plt.show()

#### Let's capture the cluster on the right for ES

In [None]:
def grab_bottom_left_outliers(data1, data2, min_data1=7.5, max_data2=8):
    """Yield the index labels of points inside the outlier cluster.

    A label is yielded when its value in `data1` is strictly above `min_data1`
    and its value in `data2` is strictly below `max_data2`. With the defaults
    and `data1` on the x-axis this is the cluster on the right-hand side of the
    log-ES scatterplot (despite the function's name).

    Parameters
    ----------
    data1, data2 : pandas.Series
        Series sharing index labels; `data2` is looked up by `data1`'s labels.
    min_data1, max_data2 : float
        Cluster bounds; the defaults are the thresholds read off the plot.
    """
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for idx, v1 in data1.items():
        v2 = data2[idx]
        if v1 > min_data1 and v2 < max_data2:
            yield idx

# Outlier business IDs for the truncated, mixed and filtered variants: the
# original-YelpChi log-ES statistic is checked against the current one.
t_outliers, m_outliers, f_outliers = (
    pd.Series(
        grab_bottom_left_outliers(
            businesses_distances[f"yc_log_es_statistic_{suffix}"],
            businesses_distances[f"log_es_statistic_{suffix}"],
        )
    )
    for suffix in ("trunc", "mixed", "filtd")
)

In [None]:
# Number of outlier businesses found for the truncated, mixed and filtered variants.
len(t_outliers),len(m_outliers),len(f_outliers)

In [None]:
# How many businesses are outliers in both the truncated and the filtered variants.
len(set(t_outliers) & set(f_outliers))

In [None]:
# Show every column for the businesses that are outliers in both the truncated
# and filtered variants. This changes the display option for the whole session.
pd.options.display.max_columns = None
# `.loc` needs a list-like: pandas >= 2.0 raises TypeError for set indexers.
# Sorting also makes the row order deterministic.
businesses_distances.loc[sorted(set(t_outliers) & set(f_outliers))]

In [None]:
# Split businesses into the truncated-ES outlier cluster and everyone else.
t_outliers_bd = businesses_distances.loc[t_outliers]
# Boolean mask rather than `.loc[set(...)]`: pandas >= 2.0 raises TypeError for
# set indexers, and the mask keeps the original row order.
bd_no_outliers = businesses_distances.loc[~businesses_distances.index.isin(t_outliers)]


def _compare_outliers(title, column, bins, test=scipy.stats.epps_singleton_2samp,
                      fmt="Distance: %f (p=%f)", log_x=False):
    """Compare one business attribute between the outliers and the population.

    Displays a heading, the two-sample test result for outliers vs. all
    businesses, and normalised cumulative histograms of `column` for the
    outliers, all businesses and the non-outliers.

    Parameters
    ----------
    title : str
        Heading text, rendered as a level-4 markdown header.
    column : str
        Column of `businesses_distances` to compare.
    bins : array-like
        Histogram bin edges shared by the three curves.
    test : callable
        Two-sample test returning a (statistic, pvalue) tuple.
    fmt : str
        %-format string with two `%f` slots for the test result.
    log_x : bool
        Put the x-axis on a log scale.
    """
    display(Markdown("#### %s" % title))
    display(Markdown(fmt % test(t_outliers_bd[column], businesses_distances[column])))
    for frame, label in (
        (t_outliers_bd, "Trunc outliers"),
        (businesses_distances, "Trunc"),
        (bd_no_outliers, "Trunc Non-outliers"),
    ):
        sns.distplot(frame[column],label=label,kde=False,norm_hist=True,bins=bins,hist_kws={"cumulative": True})
    if log_x:
        plt.xscale("log")
    plt.legend()
    plt.show()


# Ratings: half-star-wide bins centred on 1, 1.5, ..., 5.5.
bins = np.arange(0.75,6,0.5)
_compare_outliers("New Rating", "rating", bins)
_compare_outliers("Old Rating", "yc_rating", bins)

# Review counts: bin edges at e^1, e^2, ... up to the largest current count.
bins = np.exp(np.arange(1,np.log(businesses_distances.review_count.max()), 1))
_compare_outliers("New Review Count", "review_count", bins, log_x=True)
_compare_outliers("Old Review Count", "yc_reviewCount", bins, log_x=True)

#TODO
# Filtered ratio: 1%-wide bins; compared with the KS test instead of Epps-Singleton.
bins = np.arange(0,1.01,0.01)
_compare_outliers("New Filtered Ratio", "filtered_ratio", bins,
                  test=scipy.stats.ks_2samp, fmt="Distance (KS): %f (p=%f)")
_compare_outliers("Old Filtered Ratio", "yc_filtered_ratio", bins,
                  test=scipy.stats.ks_2samp, fmt="Distance (KS): %f (p=%f)")

#### These outliers have low review counts, but not atypical ratings

In [None]:
# Per-yc_type business counts: column 0 = truncated-ES outliers, column 1 = all
# businesses (outliers included); a type missing from one side counts as 0.
df = pd.concat([t_outliers_bd.groupby("yc_type").size(), businesses_distances.groupby("yc_type").size()],axis=1).fillna(0)
# NOTE(review): scipy's fisher_exact expects a 2x2 table, so this presumably
# relies on yc_type having exactly two categories — confirm.
scipy.stats.fisher_exact(df.to_numpy())

In [None]:
# IDs of the truncated-variant ES outlier businesses.
t_outliers

In [None]:
def is_outlier_type(series,outlier_bids=None):
    """Return True when a row's label is one of the outlier business IDs.

    Intended for `DataFrame.apply(..., axis=1)`, where each row arrives as a
    Series whose `.name` is the row's index label.

    Parameters
    ----------
    series : pandas.Series
        A row; only its `.name` is inspected.
    outlier_bids : collection or None
        Outlier IDs (a set gives O(1) lookups). None means "no outliers" and
        yields False instead of raising TypeError on the membership test.
    """
    if outlier_bids is None:
        return False
    return series.name in outlier_bids
# Preview the per-business outlier flag for the truncated variant (False rows sort first).
businesses_distances.apply(functools.partial(is_outlier_type,outlier_bids=set(t_outliers)),axis=1).sort_values()

In [None]:
# Store one boolean outlier flag per variant on the combined table.
for _suffix, _outlier_ids in (
    ("trunc", t_outliers),
    ("mixed", m_outliers),
    ("filtd", f_outliers),
):
    businesses_distances[f"es_outlier_{_suffix}"] = businesses_distances.apply(
        functools.partial(is_outlier_type, outlier_bids=set(_outlier_ids)),
        axis=1,
    )

In [None]:
# Size of the truncated-variant outlier set.
len(t_outliers)

#### Difference by type is statistically significant

### When we stratify by low review count, do the results hold up?

# How do the distances vary across types?

In [None]:
# KS statistic: truncated vs. mixed variant.
sns.scatterplot(x="ks_statistic_trunc",y="ks_statistic_mixed",data=businesses_distances,label="KS")
plt.show()
# ES statistic: truncated vs. mixed variant, on log-log axes.
sns.scatterplot(x="es_statistic_trunc",y="es_statistic_mixed",data=businesses_distances,label="ES")
plt.xscale("log")
plt.yscale("log")
plt.show()
# KS vs. ES within the truncated variant (ES on a log axis).
sns.scatterplot(x="ks_statistic_trunc",y="es_statistic_trunc",data=businesses_distances,label="Trunc")
plt.yscale("log")
plt.show()
# KS vs. ES within the mixed variant; no plt.show() here, so the figure is
# left for the notebook to render at the end of the cell.
sns.scatterplot(x="ks_statistic_mixed",y="es_statistic_mixed",data=businesses_distances,label="Mixed")
plt.yscale("log")

#### Quantize the `price` column

In [None]:
# Encode string prices by their length (presumably "$", "$$", ... — TODO confirm);
# non-string values (e.g. NaN) pass through unchanged.
businesses_distances["price"] = businesses_distances["price"].apply(lambda x: len(x) if type(x) is str else x)

In [None]:
# Impute missing prices with the mean price rounded down to a whole number.
m = businesses_distances["price"].mean() // 1
businesses_distances["price"] = businesses_distances["price"].apply(lambda x: m if math.isnan(x) else x)

#### Perturb discrete columns for better plotting

In [None]:
# Uniform jitter in [-0.25, 0.25) so the discrete price levels don't overplot.
# `random` is not seeded in this cell, so the jitter differs between runs.
businesses_distances["price_perturbed"] = businesses_distances.price.apply(lambda x: x + random.random() * 0.5 - 0.25)

In [None]:
# Uniform jitter in [-0.125, 0.125) around each rating, for plotting only.
businesses_distances["rating_perturbed"] = businesses_distances.rating.apply(lambda x: x + random.random() * 0.25 - 0.125)

#### Sort by truncated ES p-value and save the combined dataframe

In [None]:
# Order rows by the truncated-variant ES p-value (ascending), then persist the
# combined table, including the jittered plotting columns added above.
businesses_distances = businesses_distances.sort_values("es_pvalue_trunc")
businesses_distances.to_pickle("../data/pickles/yelpchi_business_data_with_distances.pkl")

#### How does the statistic relate to p-value?

In [None]:
# ES statistic vs. its p-value for the truncated variant, statistic on a log axis.
sns.scatterplot(x="es_statistic_trunc",y="es_pvalue_trunc",data=businesses_distances,label="All data")
plt.xscale("log")

In [None]:
# ES statistic vs. its p-value for the mixed variant, statistic on a log axis.
sns.scatterplot(x="es_statistic_mixed",y="es_pvalue_mixed",data=businesses_distances,label="All data")
plt.xscale("log")

## Do comparisons

#### Review Count

In [None]:
# Businesses that have no truncated-variant ES statistic.
businesses_distances[businesses_distances.es_statistic_trunc.isnull()]

In [None]:
# Current review count vs. truncated ES statistic, coloured by p-value bin, on log-log axes.
sns.scatterplot(x="es_statistic_trunc",y="review_count",data=businesses_distances,label="All data",hue="es_pvalue_bin_trunc")
#sns.scatterplot(x="statistic",y="review_count",data=businesses_distances_sig,label="Signficant data",hue="pvalue_bin")
#sns.scatterplot(x="statistic",y="review_count",data=businesses_distances_very_sig,label="Very signficant data")
plt.xscale("log")
plt.yscale("log")
# Pearson correlation on the raw (not log-transformed) values.
# NOTE(review): rows with a null es_statistic_trunc are not dropped here,
# unlike the yc_ version of this comparison — confirm pearsonr copes with them.
scipy.stats.pearsonr(businesses_distances.es_statistic_trunc, businesses_distances.review_count)

In [None]:
# Original YelpChi review count vs. original truncated ES statistic, coloured
# by p-value bin, on log-log axes.
sns.scatterplot(x="yc_es_statistic_trunc",y="yc_reviewCount",data=businesses_distances,label="All data",hue="yc_es_pvalue_bin_trunc")
plt.xscale("log")
plt.yscale("log")
# Pearson r needs complete pairs, so drop businesses without a statistic.
df1 = businesses_distances[businesses_distances.yc_es_statistic_trunc.notnull()]
# Boolean mask instead of `.loc[set(...)]`, which pandas >= 2.0 rejects.
df2 = df1.loc[~df1.index.isin(t_outliers)]
# pearsonr returns (r, p); the two results are concatenated to fill four slots.
print("With outliers: %f (p=%f)\nWithout outliers: %f (p=%f)" % (scipy.stats.pearsonr(df1.yc_es_statistic_trunc, df1.yc_reviewCount) + scipy.stats.pearsonr(df2.yc_es_statistic_trunc, df2.yc_reviewCount)))

In [None]:
# Same comparison for the mixed variant; rows are re-sorted by the mixed
# p-value for plotting only (businesses_distances itself is not reordered).
sns.scatterplot(x="es_statistic_mixed",y="review_count",data=businesses_distances.sort_values("es_pvalue_mixed"),label="All data",hue="es_pvalue_bin_mixed")
#sns.scatterplot(x="statistic",y="review_count",data=businesses_distances_sig,label="Signficant data",hue="pvalue_bin")
#sns.scatterplot(x="statistic",y="review_count",data=businesses_distances_very_sig,label="Very signficant data")
plt.xscale("log")
plt.yscale("log")
# Pearson correlation on the raw (not log-transformed) values.
scipy.stats.pearsonr(businesses_distances.es_statistic_mixed, businesses_distances.review_count)

#### Price

In [None]:
# Jittered price is used for plotting; the correlation uses the un-jittered price.
sns.scatterplot(x="es_statistic_trunc",y="price_perturbed",data=businesses_distances,hue="es_pvalue_bin_trunc")
#sns.scatterplot(x="es_statistic_trunc",y="price_perturbed",data=businesses_distances_sig,label="Signficant data")
plt.xscale("log")
scipy.stats.pearsonr(businesses_distances.es_statistic_trunc, businesses_distances.price)
#plt.yscale("log")

#### Is a chain

In [None]:
# Collapse isChain to a binary flag: codes "1"-"3" become 1, "0" and "None"
# become 0. Any value not in the mapping becomes None (dict.get's default).
businesses_distances["isChainReduced"] = businesses_distances["isChain"].apply({"0": 0, "1": 1, "2": 1, "3": 1, "None": 0}.get)
# Uniform jitter in [-0.25, 0.25) for plotting.
businesses_distances["isChainReducedPerturbed"] = businesses_distances.isChainReduced.apply(lambda x: x + random.random() * 0.5 - 0.25)

In [None]:
# The plot uses the raw ES statistic on a log axis; the correlation below uses
# the precomputed log statistic against the un-jittered chain flag.
sns.scatterplot(x="es_statistic_trunc",y="isChainReducedPerturbed",data=businesses_distances,hue="es_pvalue_bin_trunc")
#sns.scatterplot(x="es_statistic_trunc",y="price_perturbed",data=businesses_distances_sig,label="Signficant data")
plt.xscale("log")
scipy.stats.pearsonr(businesses_distances.log_es_statistic_trunc, businesses_distances.isChainReduced)
#plt.yscale("log")

#### Location

In [None]:
#palette = sns.cubehelix_palette(len(businesses_distances.pvalue_bin.unique()), rot=-.4, light=0.6, dark=0.8)
# One palette colour per distinct p-value bin present in the data.
palette = sns.color_palette("GnBu_d",n_colors=len(businesses_distances.es_pvalue_bin_trunc.unique()))
# Business locations coloured by truncated-variant ES significance bin.
sns.scatterplot(x="coordinates.longitude",y="coordinates.latitude",data=businesses_distances,hue="es_pvalue_bin_trunc",palette=palette)
#sns.scatterplot(x="coordinates.longitude",y="coordinates.latitude",data=businesses_distances_sig,hue="log_statistic",cmap=palette)
# Park the legend outside the axes, to the upper right.
plt.legend(bbox_to_anchor=(1.05, 1.05),loc="upper left")

In [None]:
# Not consumed by the plots in this cell; kept because later cells may read `palette`.
palette = sns.cubehelix_palette(8, start=2, rot=0, dark=0, light=.95, reverse=True)

# Spatial density of ALL businesses. The previous label called this the
# non-significant subset, which it is not.
sns.kdeplot(businesses_distances["coordinates.longitude"],businesses_distances["coordinates.latitude"],label="All points")
plt.legend(bbox_to_anchor=(1.05, 1.05),loc="upper left")
plt.show()

# Density of businesses significant at p < 0.05 under the truncated variant.
sig_trunc = businesses_distances[businesses_distances.es_pvalue_trunc < 0.05]
sns.kdeplot(sig_trunc["coordinates.longitude"],sig_trunc["coordinates.latitude"],label="Significant points (trunc)")
plt.legend(bbox_to_anchor=(1.05, 1.05),loc="upper left")
plt.show()

# Same for the mixed variant; labelled separately so the two figures can be told apart.
sig_mixed = businesses_distances[businesses_distances.es_pvalue_mixed < 0.05]
sns.kdeplot(sig_mixed["coordinates.longitude"],sig_mixed["coordinates.latitude"],label="Significant points (mixed)")
plt.legend(bbox_to_anchor=(1.05, 1.05),loc="upper left")

#### Rating

In [None]:
# Jittered rating for plotting; the correlation uses the un-jittered rating.
sns.scatterplot(x="es_statistic_trunc",y="rating_perturbed",data=businesses_distances)
plt.xscale("log")
scipy.stats.pearsonr(businesses_distances.es_statistic_trunc, businesses_distances.rating)
#plt.yscale("log")

Note: this really isn't that surprising -- is there a correlation between your average rating and the distribution of your ratings? Of course!

In [None]:
# Same rating comparison for the mixed variant.
sns.scatterplot(x="es_statistic_mixed",y="rating_perturbed",data=businesses_distances)
plt.xscale("log")
scipy.stats.pearsonr(businesses_distances.es_statistic_mixed, businesses_distances.rating)
#plt.yscale("log")

#### Filtered ratio

In [None]:
# Filtered-review ratio vs. truncated ES statistic (log x-axis), with Pearson r on raw values.
sns.scatterplot(x="es_statistic_trunc",y="filtered_ratio",data=businesses_distances)
plt.xscale("log")
scipy.stats.pearsonr(businesses_distances.es_statistic_trunc, businesses_distances.filtered_ratio)

In [None]:
# Filtered-review ratio vs. mixed ES statistic (log x-axis), with Pearson r on raw values.
sns.scatterplot(x="es_statistic_mixed",y="filtered_ratio",data=businesses_distances)
plt.xscale("log")
scipy.stats.pearsonr(businesses_distances.es_statistic_mixed, businesses_distances.filtered_ratio)

# Plot each business

In [None]:
# Number of reviews (rows) per business.
review_ct_by_business = yc_updated_reviews.groupby("businessID").content.size()
# Attach each review's business total with a vectorised lookup; this replaces
# a row-by-row DataFrame.apply that performed the same per-row lookup.
yc_updated_reviews["business_review_count"] = yc_updated_reviews.businessID.map(review_ct_by_business)

In [None]:
# Peek at the raw review ratings.
yc_updated_reviews.rating

In [None]:
one_month = pd.Timedelta(days=30)
three_months = pd.Timedelta(days=90)


def reviews_at_time(df, date):
    """Collect the reviews within 90 days either side of `date`.

    The window is half-open: [date - 90 days, date + 90 days).

    Returns an object Series with
      - "binned_rating": the window's mean rating, doubled, rounded and
        floor-divided by 2 (so a whole number),
      - "ratings": list of the window's ratings,
      - "indexes": list of the window's index labels in `df`.

    NOTE(review): the `// 2` floors to whole stars, while the pool keyed by
    this value has half-star keys — confirm whether `/ 2` was intended.
    """
    window_start = date - three_months
    window_end = date + three_months
    in_window = (df.date < window_end) & (df.date >= window_start)
    six_month_window = df[in_window]

    # Diagnostic only: report dates whose window contains no reviews.
    if six_month_window.empty:
        print(date)

    mean_rating = six_month_window.rating.mean()

    s = pd.Series(dtype="object")
    s["binned_rating"] = round(mean_rating * 2) // 2
    s["ratings"] = six_month_window.rating.tolist()
    s["indexes"] = list(six_month_window.index.values)
    return s

In [None]:
# Pool of ratings from every six-month window of every business, keyed by the
# window's binned mean rating (keys 1.0, 1.5, ..., 5.0).
all_six_month_windows_binned = {
    i: [] for i in np.arange(1,5.5,.5)
}

# The loop visits every business, so size the progress bar from all business
# IDs. It used to count only businesses with a recommended (flagged == 'N')
# review, which under-reports the total whenever some business has none.
total_businesses = yc_updated_reviews.businessID.nunique()

for business_id, group in tqdm(yc_updated_reviews.sort_values("business_review_count",ascending=False).groupby("businessID",sort=False),total=total_businesses):

    dates = pd.Series(group.date.unique()).sort_values()

    i = 0 #Before: position of the last review date strictly before `date`
    j = 0 #After
    # NOTE(review): `j` advances to the final review date on the first pass
    # (every later date is > the start date), so the second clause below only
    # fires within 90 days of the last review — confirm this is intended.

    dates_alt = []

    # Walk a 30-day grid from the first to the last review date, keeping the
    # grid points that have a review close enough to anchor a window.
    date = dates.min()
    while date < dates.max():
        while i+1 < len(dates) and dates.iloc[i+1] < date:
            i += 1
        while j+1 < len(dates) and dates.iloc[j+1] > date:
            j += 1

        if date - dates.iloc[i] <= three_months or dates.iloc[j] - date < three_months:
            dates_alt.append(date)
        date += one_month
    # Always evaluate a window centred on the last review date.
    dates_alt.append(dates.max())

    dates = pd.Series(dates_alt)

    # One row per kept date: binned_rating, ratings, indexes.
    windows = dates.apply(functools.partial(reviews_at_time,group))
    for idx, window in windows.iterrows():
        all_six_month_windows_binned[window["binned_rating"]] += list(window.ratings)
    

In [None]:
# three_months = pd.to_timedelta("90 days")

def rating_at_time(df, date):
    """Summarise one business's reviews as of `date`.

    `df` holds the business's reviews (columns used: date, flagged, rating).
    Cumulative fields use every review dated on or before `date`; the
    "6mo" fields use the half-open window [date - 90 days, date + 90 days).

    The returned object Series always has date, perc_filtered, review_count,
    recommended_review_count and filtered_review_count. It additionally has:
      - rating / smooth_avg_6mo when at least one recommended (flagged == "N")
        review exists so far;
      - the +/- 1..3 sigma bands (clipped to [1, 5]) and the `_of_1` rescalings
        ((x - 1) / 4) when at least two exist;
      - es / binned_rating / es_bin when the Epps-Singleton tests succeed.
    """
    s = pd.Series(dtype='object')
    
    # Everything reviewed up to and including `date`.
    window = df[df.date <= date]
    
    s["date"] = date
    s["perc_filtered"] = len(window[window.flagged == "Y"]) / len(window)
    s["review_count"] = len(window)
    s["recommended_review_count"] = len(window[window.flagged == "N"])
    s["filtered_review_count"] = len(window[window.flagged == "Y"])
    
    six_month_window = df[(df.date < date + three_months) & (df.date >= date - three_months)]
    
    if len(window[window.flagged == "N"].rating) > 0:
        # Cumulative mean of recommended ratings, and the mean of recommended
        # ratings inside the six-month window.
        s["rating"] = window[window.flagged == "N"].rating.mean()
        s["smooth_avg_6mo"] = six_month_window[six_month_window.flagged == "N"].rating.mean()    
        
    if len(window[window.flagged == "N"].rating) > 1:
        # A standard deviation needs at least two recommended ratings.
        sigma = window[window.flagged == "N"].rating.std()

        for i in range(1,3+1):
            s[f"rating_plus_{i}_sigma"] = min(s.rating + i * sigma,5)
            s[f"rating_plus_{i}_sigma_of_1"] = (s[f"rating_plus_{i}_sigma"] - 1) / 4

            s[f"rating_minus_{i}_sigma"] = max(1,s.rating - i * sigma)
            s[f"rating_minus_{i}_sigma_of_1"] = (s[f"rating_minus_{i}_sigma"] - 1) / 4


        # Map the 1-5 star scale onto 0-1 so it can share an axis with ratios.
        s["rating_of_1"] = (s.rating - 1) / 4
        s["smooth_avg_6mo_of_1"] = (s.smooth_avg_6mo - 1) / 4
        
    
    try:
        # Element [1] of the Epps-Singleton result is the p-value, so "es" and
        # "es_bin" hold p-values: window vs. all of this business's reviews,
        # and window vs. the pooled windows sharing its binned mean rating.
        s["es"] = scipy.stats.epps_singleton_2samp(six_month_window.rating, df.rating)[1]
        # NOTE(review): `// 2` floors to a whole star although the pool has
        # half-star keys — confirm whether `/ 2` was intended.
        s["binned_rating"] = round(six_month_window.rating.mean() * 2) // 2
        s["es_bin"] = scipy.stats.epps_singleton_2samp(six_month_window.rating, all_six_month_windows_binned[s["binned_rating"]])[1]
    except KeyboardInterrupt:
        raise
    except:
        # Best effort: any failure in the tests above leaves the remaining
        # es fields unset for this date instead of aborting the summary.
        #display(window.rating)
        #display(df.rating)
        #raise
        pass
    
    return s

# (business_id, summary) pairs for businesses whose plot raised.
failures = []
years = mdates.YearLocator()   # every year
months = mdates.MonthLocator(bymonth=None, bymonthday=1, interval=3)  # every 3 months
years_fmt = mdates.DateFormatter('%Y')

# Shared x-range for every figure: 1 Jan of the first review's year through
# 1 Jan of the year after the last review.
start_time = datetime.datetime(year=yc_updated_reviews.date.min().year,month=1,day=1)
end_time = datetime.datetime(year=yc_updated_reviews.date.max().year+1,month=1,day=1)

# One figure per business, visiting businesses with the fewest reviews first.
for business_id, group in yc_updated_reviews.sort_values("business_review_count").groupby("businessID",sort=False):
    # One summary row per distinct review date of this business.
    summary = pd.Series(group.date.unique()).apply(functools.partial(rating_at_time,group))
    # NOTE(review): this runs outside the try below; if no row has a "rating"
    # field the attribute lookup would raise uncaught — confirm such
    # businesses cannot occur.
    summary = summary[summary.rating.notnull()]
    #display(summary)
    try:
        max_reviews = summary.review_count.max()
        
        business = yc_businesses.loc[business_id]
        business_alias = business.alias
        # NOTE(review): if `business` is a row Series, `.name` is the row's
        # index label rather than a "name" column — confirm which is wanted.
        business_name = business.name
        
        print(f"{business_name}")
        
        summary["review_count_of_1"] = (summary.review_count / max_reviews).astype("float")
        
        # Band of +/- one standard deviation of the six-month average around
        # the cumulative rating, rescaled to 0-1 and clipped to [0, 1].
        six_month_sigma = summary["smooth_avg_6mo"].std()
        i = 1
        summary[f"six_month_rating_plus_{i}_sigma_of_1"] = (((summary["rating"] + i * six_month_sigma )- 1) / 4).apply(lambda x: min(x,1))
        summary[f"six_month_rating_minus_{i}_sigma_of_1"] = (((summary["rating"] - i * six_month_sigma )- 1) / 4).apply(lambda x: max(x,0))
        
        
#         for i in range(1,2):
#             sns.lineplot(x="date",y=f"rating_plus_{i}_sigma_of_1",data=summary,label=None,color="black")
#             ax = sns.lineplot(x="date",y=f"rating_minus_{i}_sigma_of_1",data=summary,label=None,color="black")
#             ax.lines[len(ax.lines)-1].set_linestyle("--")
#             ax.lines[len(ax.lines)-2].set_linestyle("--")

        plt.rcParams['figure.figsize'] = 15, 5
        #fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True)
        fig, ax1 = plt.subplots(ncols=1, sharey=True)
        # Both names point at the same axes, so every series shares one plot.
        ax2 = ax1
        
        try:
            # Best effort: the es columns may be missing; any error here is
            # swallowed so the remaining series still get drawn.
            sns.lineplot(x="date",y="es",data=summary,label="Epps-Singleton",ax=ax2)
            ax = sns.lineplot(x="date",y="es_bin",data=summary,label="Epps-Singleton (binned)",ax=ax2)
            # Dash the two most recently added lines.
            ax.lines[len(ax.lines)-1].set_linestyle("--")
            ax.lines[len(ax.lines)-2].set_linestyle("--")
#             ax.tick_params(which="both", bottom=True)
#             ax.xaxis.set_major_locator(years)
#             ax.xaxis.set_major_formatter(years_fmt)
#             ax.xaxis.set_minor_locator(months)
#             ax.figure.autofmt_xdate()
#             ax.set_xlim(start_time,end_time)
        except KeyboardInterrupt:
            raise
        except:
            pass

        for i in range(1,2):
            sns.lineplot(x="date",y=f"six_month_rating_plus_{i}_sigma_of_1",data=summary,label=None,color="black",ax=ax1)
            ax = sns.lineplot(x="date",y=f"six_month_rating_minus_{i}_sigma_of_1",data=summary,label=None,color="black",ax=ax1)
            # Dash the two band lines just added.
            ax.lines[len(ax.lines)-1].set_linestyle("--")
            ax.lines[len(ax.lines)-2].set_linestyle("--")

        sns.lineplot(x="date",y="rating_of_1",data=summary,label="Rating",ax=ax1)
        sns.lineplot(x="date",y="perc_filtered",data=summary,label="Perc filtered",ax=ax1)
        sns.lineplot(x="date",y="review_count_of_1",data=summary,label="Reviews proportion of max",ax=ax1)
        sns.lineplot(x="date",y="smooth_avg_6mo_of_1",data=summary,label="6-month rolling average",ax=ax1)
        
#         ax.tick_params(which="both", bottom=True)
#         ax.xaxis.set_major_locator(years)
#         ax.xaxis.set_major_formatter(years_fmt)
#         ax.xaxis.set_minor_locator(months)
#         ax.figure.autofmt_xdate()
        
#         ax.set_title(f"Max reviews: {max_reviews}\nBusiness alias: {business_alias}")
#         ax.legend(bbox_to_anchor=(0, -0.15), loc=2, borderaxespad=0.)
#         ax.set_xlim(start_time,end_time)
        
        # `ax` is the axes returned by the band plot above (drawn on ax1).
        # Year major ticks, quarterly minor ticks, legend below the plot.
        ax.tick_params(which="both", bottom=True)
        ax.xaxis.set_major_locator(years)
        ax.xaxis.set_major_formatter(years_fmt)
        ax.xaxis.set_minor_locator(months)
        ax.figure.autofmt_xdate()
        
        ax.set_title(f"Max reviews: {max_reviews}\nBusiness alias: {business_alias}")
        ax.legend(bbox_to_anchor=(0, -0.15), loc=2, borderaxespad=0.)
        ax.set_xlim(start_time,end_time)
        
        
        
        
        
        plt.show()
    except KeyboardInterrupt:
        break
    except:
        # Record the failure for inspection, then stop: the `break` below
        # means no further businesses are plotted after the first error.
        traceback.print_exc()
        print(business_id)
        display(summary)
        failures.append((business_id, summary))
        plt.clf()
        break

In [None]:
# Let's see how this does when we look at only the most typical reviews

# What if we look at reviews in unusual activity periods?

In [None]:
# Work on a copy so the per-review window columns added below don't touch yc_updated_reviews.
yc_updated_reviews_rating = yc_updated_reviews.copy()

### TVD

In [None]:
one_month = pd.Timedelta("30 days")

# Per-review counters: how many scored windows contain the review, and how
# many of those windows were atypical (TVD > 0.95).
yc_updated_reviews_rating["tvd_atypical_windows"] = 0
yc_updated_reviews_rating["tvd_windows"] = 0

# review index label -> TVD of every scored window containing that review
tvd_distances = collections.defaultdict(list)

total = len(yc_updated_reviews.businessID.unique())

# (distance, window row) for every scored window
tvd_all_windows = []

# Businesses are visited in shuffled order (unseeded); this affects order only.
for business_id, group in tqdm(yc_updated_reviews.sample(frac=1).groupby("businessID",sort=False),total=total):
    dates = pd.Series(group.date.unique()).sort_values()

    i = 0 #Before: position of the last review date strictly before `date`
    j = 0 #After
    # NOTE(review): `j` runs to the final review date on the first pass, so
    # the second clause below only fires near the last review — confirm.

    dates_alt = []

    # 30-day grid from the first to the last review date, keeping the grid
    # points that have a review close enough to anchor a window.
    date = dates.min()
    while date < dates.max():
        while i+1 < len(dates) and dates.iloc[i+1] < date:
            i += 1
        while j+1 < len(dates) and dates.iloc[j+1] > date:
            j += 1

        if date - dates.iloc[i] <= three_months or dates.iloc[j] - date < three_months:
            dates_alt.append(date)
        date += one_month
    dates_alt.append(dates.max())

    dates = pd.Series(dates_alt)

    windows = dates.apply(functools.partial(reviews_at_time,group))

    for idx, window in windows.iterrows():
        # Too few ratings to say anything about the window's distribution.
        if len(window.ratings) < 5:
            continue
        try:
            distance = total_variational_distance(window.ratings,all_six_month_windows_binned[window["binned_rating"]])
        except np.linalg.LinAlgError:
            continue

        tvd_all_windows.append((distance,window))

        # Single-step `.loc[rows, col]` assignment. The previous chained form
        # `.loc[rows][col] += 1` incremented a temporary copy, so the counters
        # on yc_updated_reviews_rating never changed.
        yc_updated_reviews_rating.loc[window.indexes, "tvd_windows"] += 1

        # Distinct loop variable: `i` is the date pointer used above.
        for review_idx in window.indexes:
            tvd_distances[review_idx].append(distance)

        if distance > 0.95:
            yc_updated_reviews_rating.loc[window.indexes, "tvd_atypical_windows"] += 1

In [None]:
# For every scored window, pair its TVD with the share of its reviews that
# were filtered (flagged == "Y").
distances = []
filtered_ratios = []
for distance,window in tvd_all_windows:
    reviews = yc_updated_reviews_rating.loc[window.indexes]
    # Mean of the boolean mask is count("Y") / len(reviews); an empty window
    # keeps the previous fallback of 0.
    filtered_ratio = (reviews.flagged == "Y").mean() if len(reviews) else 0
    distances.append(distance)
    filtered_ratios.append(filtered_ratio)

distances = pd.Series(distances).rename("Distance")
filtered_ratios = pd.Series(filtered_ratios).rename("Filtered Ratio")

# x/y passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=distances,y=filtered_ratios)
plt.show()

In [None]:
# Quartile boundaries of the per-window filtered ratio.
q1, q2, q3 = (filtered_ratios.quantile(q) for q in (0.25, 0.50, 0.75))


def get_quartile(x):
    """Return "Q1".."Q4" for `x`, using strict `<` against q1, q2 and q3.

    A value equal to a boundary falls into the higher quartile.
    """
    for upper_bound, label in ((q1, "Q1"), (q2, "Q2"), (q3, "Q3")):
        if x < upper_bound:
            return label
    return "Q4"


# Distribution of window TVD within each filtered-ratio quartile.
sns.violinplot(y=distances,x=filtered_ratios.map(get_quartile))

### Epps-Singleton

In [None]:
# Per-review counters: how many tested windows contain the review, and how
# many of those windows were atypical (Epps-Singleton p < 0.01).
yc_updated_reviews_rating["atypical_windows"] = 0
yc_updated_reviews_rating["windows"] = 0

# review index label -> p-value of every tested window containing that review
pvalues = collections.defaultdict(list)

total = len(yc_updated_reviews.businessID.unique())

# (pvalue, window row) for every tested window
all_windows = []

# Businesses are visited in shuffled order (unseeded); this affects order only.
for business_id, group in tqdm(yc_updated_reviews.sample(frac=1).groupby("businessID",sort=False),total=total):
    dates = pd.Series(group.date.unique()).sort_values()

    i = 0 #Before: position of the last review date strictly before `date`
    j = 0 #After
    # NOTE(review): `j` runs to the final review date on the first pass, so
    # the second clause below only fires near the last review — confirm.

    dates_alt = []

    # 30-day grid from the first to the last review date, keeping the grid
    # points that have a review close enough to anchor a window.
    date = dates.min()
    while date < dates.max():
        while i+1 < len(dates) and dates.iloc[i+1] < date:
            i += 1
        while j+1 < len(dates) and dates.iloc[j+1] > date:
            j += 1

        if date - dates.iloc[i] <= three_months or dates.iloc[j] - date < three_months:
            dates_alt.append(date)
        date += one_month
    dates_alt.append(dates.max())

    dates = pd.Series(dates_alt)

    windows = dates.apply(functools.partial(reviews_at_time,group))

    for idx, window in windows.iterrows():
        # Too few ratings for a meaningful two-sample test.
        if len(window.ratings) < 5:
            continue
        try:
            # [1] is the p-value: this window vs. the pooled windows that
            # share its binned mean rating.
            pvalue = scipy.stats.epps_singleton_2samp(window.ratings,all_six_month_windows_binned[window["binned_rating"]])[1]
        except np.linalg.LinAlgError:
            continue

        all_windows.append((pvalue,window))

        # Single-step `.loc[rows, col]` assignment. The previous chained form
        # `.loc[rows][col] += 1` incremented a temporary copy, so the counters
        # on yc_updated_reviews_rating never changed.
        yc_updated_reviews_rating.loc[window.indexes, "windows"] += 1

        # Distinct loop variable: `i` is the date pointer used above.
        for review_idx in window.indexes:
            pvalues[review_idx].append(pvalue)

        if pvalue < 0.01:
            yc_updated_reviews_rating.loc[window.indexes, "atypical_windows"] += 1

In [None]:
# Number of reviews for each value of the atypical-window counter.
yc_updated_reviews_rating.groupby("atypical_windows").size()

In [None]:
# Per review: number of tested windows, number of atypical ones (p < 0.01),
# and their ratio.
index = []
rows = []

for review_idx, review_pvalues in pvalues.items():
    n_windows = len(review_pvalues)
    if n_windows == 0:
        continue
    n_atypical = sum(1 for p in review_pvalues if p < 0.01)
    index.append(review_idx)
    rows.append([n_windows, n_atypical, n_atypical / n_windows])

#Fill with 0s
#0 atypical ratio -- assume not atypical if we don't have enough data
df = pd.DataFrame(rows,index=index,columns=["windows","atypical_windows","atypical_ratio"])

In [None]:
# Create the column first so the three-column assignment below has a target.
yc_updated_reviews_rating["atypical_ratio"] = 0
# Replace the three columns with the per-review table built from `pvalues`.
yc_updated_reviews_rating[["windows","atypical_windows","atypical_ratio"]] = df[["windows","atypical_windows","atypical_ratio"]]
# Any values left missing by the assignment count as 0 windows / ratio 0.
yc_updated_reviews_rating[["windows","atypical_windows","atypical_ratio"]] = yc_updated_reviews_rating[["windows","atypical_windows","atypical_ratio"]].fillna(0)

In [None]:
# Correlation between a review's atypical-window ratio and whether it was filtered (1 = "Y").
scipy.stats.pearsonr(yc_updated_reviews_rating.atypical_ratio, list(map(lambda x: 1 if x else 0, yc_updated_reviews_rating.flagged == "Y")))

In [None]:
# 0/1 encoding of the filter flag.
yc_updated_reviews_rating["flagged_discrete"] = yc_updated_reviews_rating.flagged.apply(lambda x: (1 if x=="Y" else 0))
# Jittered copies for plotting only: [-0.25, 0.25) on the flag and
# [-0.005, 0.005) on the ratio. `random` is not seeded in this cell.
yc_updated_reviews_rating["flagged_perturbed"] = yc_updated_reviews_rating.flagged_discrete.apply(lambda x: x + random.random() * 0.5 - 0.25)
yc_updated_reviews_rating["atypical_ratio_perturbed"] = yc_updated_reviews_rating.atypical_ratio.apply(lambda x: x + random.random() * 0.01 - 0.005)

In [None]:
# Jittered atypical ratio vs. jittered filter flag. x/y are passed by keyword
# because seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x="atypical_ratio_perturbed", y="flagged_perturbed", data=yc_updated_reviews_rating)

In [None]:
# Horizontal violins of the atypical ratio for each filter flag, drawn at equal width.
sns.violinplot(y="flagged", x="atypical_ratio", data=yc_updated_reviews_rating,orient="h",scale="width")

### Try to do ML on it

In [None]:
#366 for leap years
twelve_months = pd.Timedelta("366 days")
#183 days so that it always includes at least half a year, to account for seasonality
six_months = pd.Timedelta("183 days")
three_months = pd.Timedelta("90 days")
one_month = pd.Timedelta("30 days")
half_month = pd.Timedelta("15 days")

col_names = [f"{num}_mo_{rating}_rating_ratio" for (num,rating) in itertools.product(["1","2","6","12"],["1","2","3","4","5"])]
business_oh = set

def get_rating_distribution(df, date, df_name=""):
    """Summarise the rating mix and the filtered share of reviews around ``date``.

    Four windows centred on ``date`` are inspected, labelled "12", "6", "2" and "1" with
    half-widths ``six_months``, ``three_months``, ``one_month`` and ``half_month``. Each
    window is half-open: ``date - half_width <= review date < date + half_width``.

    Parameters
    ----------
    df : pandas.DataFrame
        Reviews with ``date``, ``rating`` and ``flagged`` columns.
    date
        Centre of the windows; must support adding/subtracting ``pd.Timedelta``.
    df_name : str
        Label of the data set; stored under ``"business"`` and also used as the name of an
        indicator entry set to 1.

    Returns
    -------
    pandas.Series
        Named ``"{df_name} {date}"``. Starts with one zero per entry of ``col_names`` and adds,
        per window, ``{num}_mo_filtered_ratio`` plus ``{num}_mo_{rating}_rating`` (count) and
        ``{num}_mo_{rating}_rating_ratio`` (share) for every rating present in the window.

    Raises
    ------
    AssertionError
        If any of the four windows contains no reviews.
    """
    s = pd.Series([0]*len(col_names), index=col_names,name=f"{df_name} {date}")

    s[df_name] = 1
    s["business"] = df_name
    s["date"] = date

    # (label, half-width) pairs, widest first so the widest empty window is reported first.
    windows = [("12", six_months), ("6", three_months), ("2", one_month), ("1", half_month)]

    for num, half_width in windows:
        window = df[(df.date < date + half_width) & (df.date >= date - half_width)]

        assert len(window) > 0, f"{num} {df_name} {date}"

        s[f"{num}_mo_filtered_ratio"] = len(window[window.flagged == "Y"]) / len(window)
        # .items() instead of .iteritems(): the latter was removed in pandas 2.0.
        for rating, count in window.groupby("rating").size().items():
            s[f"{num}_mo_{rating}_rating"] = count
            s[f"{num}_mo_{rating}_rating_ratio"] = count / len(window)

    return s

In [None]:
# Labels of the look-back windows used for the "prior" features.
_PRIOR_WINDOW_LABELS = ("1", "3", "6", "12", "all")

# Columns describing how many reviews existed / were filtered before the current window.
# Ordering: all windows for one metric, then all windows for the next metric.
prior_filtering_col_names = [
    f"{num}_mo_prior_{metric}"
    for metric in ("has_reviews", "filtered_ratio", "review_count")
    for num in _PRIOR_WINDOW_LABELS
]

# Columns describing the recommended-review rating before the current window.
rating_col_names = [
    f"{num}_mo_prior_{metric}"
    for metric in ("has_rec_reviews", "rating")
    for num in _PRIOR_WINDOW_LABELS
]

def get_prior_filtering(df, date, df_name=""):
    """Describe how reviews were filtered before and around ``date``.

    The "current" window is ``date - half_month <= review date < date + half_month``.
    Every "prior" window ends where the current window starts (``date - half_month``) and
    reaches back 30 days ("1"), 90 days ("3"), 183 days ("6"), 366 days ("12") or without
    limit ("all").

    Parameters
    ----------
    df : pandas.DataFrame
        Reviews with ``date``, ``rating`` and ``flagged`` columns.
    date
        Reference date; must support adding/subtracting ``pd.Timedelta``.
    df_name : str
        Label of the data set; stored under ``"business"`` and also used as the name of an
        indicator entry set to 1.

    Returns
    -------
    pandas.Series
        Named ``"{df_name} {date}"``. Contains ``1_mo_filtered_ratio`` and ``1_mo_rating`` for
        the current window (the rating is the mean over ``flagged == "N"`` reviews and is NaN
        when there are none) and, per prior window, ``{num}_mo_prior_has_reviews``,
        ``{num}_mo_prior_has_rec_reviews``, ``{num}_mo_prior_rating``,
        ``{num}_mo_prior_filtered_ratio`` and ``{num}_mo_prior_review_count``. Prior values
        default to 0 when the window is empty.

    Raises
    ------
    ZeroDivisionError
        If the current window contains no reviews.
    """
    # NOTE(review): the series is pre-filled from `col_names` (the rating-ratio columns), not from
    # the prior-filtering column lists; kept so that the returned set of columns does not change.
    s = pd.Series([0]*len(col_names), index=col_names,name=f"{df_name} {date}")

    s[df_name] = 1
    s["business"] = df_name
    s["date"] = date

    # All prior windows share the same (exclusive) upper bound.
    prior_end = date - half_month
    before_current = df.date < prior_end

    all_prior = df[before_current]
    twelve_months_prior = df[before_current & (df.date >= prior_end - twelve_months)]
    six_months_prior = df[before_current & (df.date >= prior_end - six_months)]
    three_months_prior = df[before_current & (df.date >= prior_end - three_months)]
    one_month_prior = df[before_current & (df.date >= prior_end - one_month)]

    one_month_window = df[(df.date < date + half_month) & (df.date >= date - half_month)]

    s[f"1_mo_filtered_ratio"] = len(one_month_window[one_month_window.flagged == "Y"]) / len(one_month_window)
    s[f"1_mo_rating"] = one_month_window[one_month_window.flagged == "N"].rating.mean()

    # Fix: the "1" label now uses one_month_prior (it previously re-used three_months_prior,
    # which made every 1-month prior feature a copy of the 3-month one).
    for window,num in [(all_prior,"all"),(twelve_months_prior,"12"), (six_months_prior,"6"),(three_months_prior,"3"),(one_month_prior,"1")]:

        recommended = window[window.flagged == "N"]

        s[f"{num}_mo_prior_has_reviews"] = 1 if len(window) > 0 else 0
        s[f"{num}_mo_prior_has_rec_reviews"] = 1 if len(recommended) > 0 else 0

        s[f"{num}_mo_prior_rating"] = recommended.rating.mean() if len(recommended) > 0 else 0

        s[f"{num}_mo_prior_filtered_ratio"] = len(window[window.flagged == "Y"]) / len(window) if len(window) > 0 else 0
        s[f"{num}_mo_prior_review_count"] = len(window)

    return s



In [None]:
total = len(yc_updated_reviews.businessID.unique())

all_rating_dfs = []

# Build one feature frame per business. Sample dates step forward in `one_month` increments from
# the first review date; a step is kept only if some review lies close to it, and the last review
# date is always kept.
for business_id, group in tqdm(yc_updated_reviews.sample(frac=1).groupby("businessID",sort=False),total=total):
    dates = pd.Series(group.date.unique()).sort_values()

    i = 0 # position of the latest review date strictly before `date` (or the first date)
    j = 0 # position of the earliest review date on or after `date`

    dates_alt = []

    date = dates.min()
    while date < dates.max():
        while i+1 < len(dates) and dates.iloc[i+1] < date:
            i += 1
        # Fix: advance only while the current candidate is still before `date`. The previous
        # condition (`dates.iloc[j+1] > date`) moved j straight to the last review date, so the
        # "review shortly after" test below could only succeed near the end of the series.
        while j+1 < len(dates) and dates.iloc[j] < date:
            j += 1

        # Keep the step if a review falls within half a month before or after it.
        if date - dates.iloc[i] <= half_month or dates.iloc[j] - date < half_month:
            dates_alt.append(date)
        date += one_month

    dates_alt.append(dates.max())
    dates = pd.Series(dates_alt)

    # One row per sample date for each feature family.
    ratings = dates.apply(functools.partial(get_rating_distribution,group,df_name=business_id))
    prior_filtering = dates.apply(functools.partial(get_prior_filtering,group,df_name=business_id))

    # Side by side; labels produced by both builders appear twice here.
    df = pd.concat([ratings,prior_filtering],axis=1)

    all_rating_dfs.append(df)
    

In [None]:
# Drop repeated column labels from every per-business frame, keeping the first occurrence of each.
all_rating_dfs_dedup = [frame.loc[:, ~frame.columns.duplicated()] for frame in all_rating_dfs]

In [None]:
# Stack all per-business frames, zero-fill columns a frame did not have, and index the rows by
# (business, date) while keeping both as regular columns too.
ratings_df = (
    pd.concat(all_rating_dfs_dedup, axis=0)
    .fillna(0)
    .set_index(["business", "date"], drop=False)
)

In [None]:
# Calendar month and year of each row's sample date.
ratings_df["month"] = ratings_df["date"].map(lambda x: x.month)
ratings_df["year"] = ratings_df["date"].map(lambda x: x.year)

In [None]:
# One 0/1 indicator column per distinct month and per distinct year present in ratings_df.
month_values = ratings_df.month.unique()
year_values = ratings_df.year.unique()
time_cols = [f"month_{m}" for m in month_values] + [f"year_{y}" for y in year_values]

# Month indicators are added first, then year indicators.
for source, values in (("month", month_values), ("year", year_values)):
    for value in values:
        ratings_df[f"{source}_{value}"] = ratings_df[source].map(lambda x, value=value: 1 if x == value else 0)

In [None]:
# Shared 5-fold splitter for all cross-validated scores below.
# random_state is fixed so the shuffled folds, and hence the reported scores, are reproducible.
CV = KFold(5,shuffle=True,random_state=0)

def dfs_to_data(*dfs,columns=None,ret_df=False):
    """Join frames column-wise and return them as a frame or as a matrix.

    Parameters
    ----------
    *dfs : pandas.DataFrame
        Frames concatenated side by side (``axis=1``).
    columns : list, optional
        Column labels to keep, in this order; all columns are kept when ``None``.
    ret_df : bool
        When true, return the DataFrame itself.

    Returns
    -------
    pandas.DataFrame or (numpy.ndarray, list)
        The frame, or its values together with the list of column labels.
    """
    combined = pd.concat(dfs, axis=1)
    selected = combined if columns is None else combined[columns]
    if ret_df:
        return selected
    return selected.to_numpy(), list(selected)

def get_data(*columns,**kwargs):
    """Select feature columns from ``ratings_df``.

    Each positional argument is either a single column name (``str``) or an iterable of column
    names; they are flattened in order into one list. Keyword arguments are passed through to
    ``dfs_to_data``, whose result is returned.
    """
    selected = []
    for entry in columns:
        # Exact-type check: only a plain str counts as a single column name.
        selected.extend([entry] if type(entry) is str else list(entry))
    return dfs_to_data(ratings_df, columns=selected,**kwargs)

def linear_regression_statsmodels(y, *columns):
    """Fit an OLS model (statsmodels) on columns of ``ratings_df`` and return its summary.

    Parameters
    ----------
    y : str or array-like
        Target values, or the name of the ``ratings_df`` column holding them.
    *columns
        Column names or iterables of column names, as accepted by ``get_data``.

    Returns
    -------
    The object returned by ``results.summary()``.
    """
    if isinstance(y, str):
        # Fix: resolve the target in ratings_df, the frame the features come from (and the one the
        # sibling helpers use); the previous lookup went to `business_distances`.
        y = ratings_df[y]
    features = get_data(*columns, ret_df=True)
    features = sm.add_constant(features)  # adds the intercept term
    model = sm.OLS(y, features)
    results = model.fit()
    return results.summary()
    
def linear_regression_sklearn(y, *columns):
    """Fit a scikit-learn linear regression on columns of ``ratings_df``.

    Prints the model's score on the full data and the mean cross-validated R^2 obtained with the
    module-level ``CV`` splitter.

    Parameters
    ----------
    y : str or array-like
        Target values, or the name of the ``ratings_df`` column holding them.
    *columns
        Column names or iterables of column names, as accepted by ``get_data``.

    Returns
    -------
    list of (feature name, coefficient)
        Sorted by absolute coefficient, largest first.
    """
    if isinstance(y, str):
        y = ratings_df[y]
    X, feature_names = get_data(*columns)
    clf = LinearRegression()
    model = clf.fit(X,y)
    scores = cross_val_score(clf, X, y, cv=CV,scoring='r2')
    # Label fixed: the score comes from the `CV` splitter, not from leave-one-out.
    print(f"Model score: {model.score(X,y)}\nCross-val R^2 (mean over CV folds): {scores.mean()}")
    return sorted(zip(feature_names,model.coef_), key=lambda x: abs(x[1]), reverse=True)

def random_forest_regressor_sklearn(y, *columns, **kwargs):
    """Print the mean cross-validated R^2 of a random-forest regressor on ``ratings_df`` columns.

    Parameters
    ----------
    y : str or array-like
        Target values, or the name of the ``ratings_df`` column holding them.
    *columns
        Column names or iterables of column names, as accepted by ``get_data``.
    **kwargs
        Passed to ``RandomForestRegressor``.

    Returns
    -------
    None
        The score is only printed.
    """
    if isinstance(y, str):
        y = ratings_df[y]
    X, _feature_names = get_data(*columns)
    clf = RandomForestRegressor(**kwargs)
    # No separate fit on the full data: its result was never used, and cross_val_score fits its
    # own copies of the estimator for every fold.
    scores = cross_val_score(clf, X, y, cv=CV,scoring='r2')
    print(f"score: {scores.mean()}")

In [None]:
ratings_df

In [None]:
# Sanity check on the feature table: s1 holds the columns actually present in ratings_df, s2 the
# prior-filtering columns, business-id columns and time dummies. The cell shows the names that
# appear on only one side.
s1 = set(ratings_df)
s2 = set(sum([prior_filtering_col_names, list(yc_updated_reviews.businessID.unique()), time_cols],[]))
s1 - s2, s2 - s1

In [None]:
# sklearn linear model of the current one-month filtered ratio on the prior-window filtering
# features, the per-business indicator columns and the month/year dummies.
linear_regression_sklearn(ratings_df["1_mo_filtered_ratio"], prior_filtering_col_names, list(yc_updated_reviews.businessID.unique()), time_cols)


In [None]:
# statsmodels OLS summary for the current one-month filtered ratio, using the prior-window
# filtering features, the per-business indicator columns and the month/year dummies.
linear_regression_statsmodels(ratings_df["1_mo_filtered_ratio"], prior_filtering_col_names, list(yc_updated_reviews.businessID.unique()), time_cols)


In [None]:
# statsmodels OLS summary for the current one-month recommended-review rating, using the
# prior-window rating features, the per-business indicator columns and the month/year dummies.
linear_regression_statsmodels(ratings_df["1_mo_rating"], rating_col_names, list(yc_updated_reviews.businessID.unique()), time_cols)


In [None]:
# statsmodels OLS summary for the current one-month filtered ratio, using the prior-window
# filtering features plus the rating-ratio columns, the per-business indicator columns and the
# month/year dummies.
linear_regression_statsmodels(ratings_df["1_mo_filtered_ratio"], prior_filtering_col_names, list(yc_updated_reviews.businessID.unique()), col_names, time_cols)


In [None]:
# sklearn linear model of the current one-month filtered ratio on the rating-ratio columns, the
# per-business indicator columns and the month/year dummies.
linear_regression_sklearn(ratings_df["1_mo_filtered_ratio"], col_names, list(yc_updated_reviews.businessID.unique()), time_cols)


In [None]:
# Random-forest regression of the current one-month filtered ratio on the rating-ratio columns,
# the prior-window filtering features, the per-business indicator columns and the month/year dummies.
random_forest_regressor_sklearn("1_mo_filtered_ratio", col_names, prior_filtering_col_names, list(yc_updated_reviews.businessID.unique()), time_cols)

In [None]:
# Random-forest regression of the current one-month filtered ratio on the rating-ratio columns,
# the per-business indicator columns and the month/year dummies (no prior-window features).
random_forest_regressor_sklearn("1_mo_filtered_ratio", col_names, list(yc_updated_reviews.businessID.unique()), time_cols)

# Has the ham distribution changed?

### Is the rating distribution across all businesses static?

In [None]:
yc_updated_reviews.date.max()

In [None]:
yc_updated_reviews["year"] = yc_updated_reviews.date.map(lambda x: x.year)

In [None]:
yc_updated_reviews["review_length"] = yc_updated_reviews.content.map(len)

In [None]:
# Per-year mean of every numeric column over reviews with flagged == "N".
# numeric_only=True: the frame also holds text columns (e.g. `content`, `flagged`) that cannot be
# averaged; recent pandas raises on them instead of silently dropping them.
yc_updated_reviews[yc_updated_reviews.flagged=="N"].sort_values("year").groupby("year").mean(numeric_only=True)

In [None]:
# For every year, print the share of each star rating among reviews with flagged == "N".
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    # Group by the scalar label "year" (not ["year"]): with a one-element list pandas >= 2 yields
    # 1-tuples such as (2012,) as group keys, which would be printed instead of the plain year.
    for year, group in yc_updated_reviews[yc_updated_reviews.flagged=="N"].sort_values(["year","rating"]).groupby("year"):
        print(year)
        print(group.groupby("rating").size() / len(group))

In [None]:
# Mean rating of reviews with flagged == "N" per business: original crawl vs. the updated crawl
# restricted to the period the original crawl covers.
max_date = yc_reviews.date.max()

old_recommended = yc_reviews[yc_reviews.flagged=="N"]
old_ratings = old_recommended.groupby("businessID").rating.mean()

in_old_period = yc_updated_reviews.date <= max_date
still_recommended = yc_updated_reviews.flagged=="N"
new_ratings = yc_updated_reviews[in_old_period & still_recommended].groupby("businessID").rating.mean()

rating_diff = new_ratings-old_ratings

rating_comparison = pd.concat(
    [old_ratings.rename("old rating"), new_ratings.rename("new rating"), rating_diff.rename("diff")],
    axis=1,
)
rating_comparison.sort_values("diff")

In [None]:
# Sanity check: every business id in yc_reviews must be resolvable in yc_businesses.
for bid in yc_reviews.businessID.unique():
    try:
        # .loc raises KeyError when `bid` is not an index label; a successful lookup never
        # returns None, so this branch just moves on to the next id.
        if yc_businesses.loc[bid] is not None:
            continue
    except KeyError:
        pass
    # Not an index label: report the id and require exactly one row whose `id` column equals it.
    print(bid)
    assert len(yc_businesses[yc_businesses.id == bid].name) == 1, bid

In [None]:
yc_businesses.loc["g51qDl6fQhgat-kFTrcbug"]

## Match changes over time

In [None]:
# For each review, look up the row of yc_updated_reviews whose index label is the review's
# matchID and record that row's `flagged` value. na_action='ignore' leaves a missing matchID as NaN
# instead of attempting the lookup.
yc_reviews["matchFlagged"] = yc_reviews.matchID.map(lambda x: yc_updated_reviews.loc[x].flagged, na_action='ignore')

In [None]:
yc_reviews["year"] = yc_reviews.date.map(lambda x: x.year)
yc_reviews["month"] = yc_reviews.date.map(lambda x: x.month)

In [None]:
# Use "X" where matchFlagged is missing, so those reviews form their own group in the
# groupby(["flagged", "matchFlagged"]) tables below.
yc_reviews["matchFlagged"] = yc_reviews["matchFlagged"].fillna("X")

In [None]:
# Share of each year's reviews falling into every (flagged, matchFlagged) combination.
reviews_per_year = yc_reviews.groupby("year").size()  # same denominator for every combination

rows = []
labels = []
for flag_pair, pair_reviews in yc_reviews.groupby(["flagged","matchFlagged"]):
    labels.append(",".join(flag_pair))
    rows.append(pair_reviews.groupby("year").size() / reviews_per_year)

pd.DataFrame(rows,index=labels)

In [None]:
yc_reviews.groupby("year").size()

In [None]:
# Number of reviews per year for every flagged -> matchFlagged combination (one table row each).
rows = []
indexes = []
for (flag,mflag), group in yc_reviews.groupby(["flagged", "matchFlagged"]):
    rows.append(group.groupby("year").size())
    indexes.append(f"{flag}->{mflag}")
    
# Caption for the table rendered below.
display("Number in each category by year")
# Years missing for a combination are shown as 0.
pd.DataFrame(rows,index=indexes,dtype="int64").fillna(0).astype("int64")

In [None]:
# For every flagged -> matchFlagged combination and year: the largest and smallest number of
# reviews seen in a single month, formatted as "max/min".
# NOTE(review): months without any review of that combination do not appear in the per-month
# counts, so they are not reflected in the minimum.
rows = []
indexes = []
for (flag,mflag), group in yc_reviews.groupby(["flagged", "matchFlagged"]):
    s = pd.Series(index=sorted(yc_reviews.year.unique()),dtype="object")
    for year, ygroup in group.groupby("year"):
        months = ygroup.groupby("month").size()
        s[year] = f"{months.max()}/{months.min()}"
    rows.append(s)
    indexes.append(f"{flag}->{mflag}")

# Caption matches the "max/min" order of the cell values.
print("Max/Min reviews per month, by year")
# Fix: a stray trailing `2` after this expression made the whole cell a SyntaxError.
pd.DataFrame(rows,index=indexes).fillna("0/0")

In [None]:
# Number of reviews per year for every flagged -> matchFlagged combination (one table row each);
# years missing for a combination are shown as 0.
rows = []
indexes = []
for (flag,mflag), group in yc_reviews.groupby(["flagged", "matchFlagged"]):
    rows.append(group.groupby("year").size())
    indexes.append(f"{flag}->{mflag}")
    
pd.DataFrame(rows,index=indexes,dtype="int64").fillna(0).astype("int64")

In [None]:
# Per year: share of the reviews that carried a given original flag which ended up with each
# matched flag.
rows = []
labels = []
for flag_pair, pair_reviews in yc_reviews.groupby(["flagged","matchFlagged"]):
    original_flag = flag_pair[0]
    # Denominator: all reviews of that year with the same original flag.
    same_flag_per_year = yc_reviews[yc_reviews.flagged == original_flag].groupby("year").size()
    labels.append(",".join(flag_pair))
    rows.append(pair_reviews.groupby("year").size() / same_flag_per_year)

pd.DataFrame(rows,index=labels)

In [None]:
list(yc_updated_reviews)

In [None]:
list(yc_businesses)

In [None]:
# Map every business id used in yc_updated_reviews (an index label of yc_businesses) to the value
# of that business's `id` column, and store the mapped id next to each review.
matched_businesses = yc_businesses.loc[yc_updated_reviews.businessID.unique()]
id_map = dict(zip(matched_businesses.index, matched_businesses["id"]))
yc_updated_reviews["businessID_yc"] = yc_updated_reviews.businessID.map(id_map)

In [None]:
# Per-business mean rating of reviews with flagged == "N": original crawl ("old_rating") next to the
# updated crawl limited to dates up to the original crawl's last review ("new_rating").
old_rating = (
    yc_reviews[yc_reviews.flagged=="N"]
    .groupby("businessID").rating.mean()
    .rename("old_rating")
)
within_old_period = yc_updated_reviews.date <= yc_reviews.date.max()
new_rating = (
    yc_updated_reviews[within_old_period & (yc_updated_reviews.flagged=="N")]
    .groupby("businessID").rating.mean()
    .rename("new_rating")
)
df = pd.concat([old_rating, new_rating], axis=1)

In [None]:
df["diff"] = df.new_rating - df.old_rating
df

In [None]:
# Distribution of the per-business rating change. The series is passed as `x` explicitly:
# current seaborn reads the first positional argument as `data`, not as `x`.
sns.boxplot(x=df["diff"])

In [None]:
display(df[df.new_rating.isnull()])
display(df[df.old_rating.isnull()])

In [None]:
yc_updated_reviews[yc_updated_reviews.businessID == "BISUDalmPulSzHvsO3PhDA"]

In [None]:
yc_reviews[yc_reviews.businessID == "BISUDalmPulSzHvsO3PhDA"]