In [None]:
import constants
import pandas  as pd
from tqdm import tqdm
import hashlib
import collections
import itertools
import functools
import seaborn as sns
import pickle
import safer
import matplotlib.pyplot as plt
import math
import random

In [None]:
sns.set_style(style="whitegrid")

In [None]:
extended = False
constants.set_crawl_source(constants.CRAWL_SOURCE_EXTENDED if extended else constants.CRAWL_SOURCE_CHICAGO)

In [None]:
# The extended crawl is experiment 2; the Chicago-only crawl is experiment 1.
experiment = 2 if extended else 1

In [None]:
tqdm.pandas()

In [None]:
df_all = pd.read_pickle(constants.LONG_DATA_FILE)

In [None]:
df_all["crawl_number"] = df_all["crawl_id"].apply(lambda x: constants.CRAWL_NUMBER[x])
df = df_all[df_all.experiment == experiment]
df = df.sort_values("crawl_number")

In [None]:
# One author identifier per review: rows with a profile URL use the URL with
# its fixed prefix stripped, rows with a hovercard id use that id.
_profile_prefix_len = len("/user_details?userid=")
_profile_ids = df.loc[df.user_page_url.notnull(), "user_page_url"].str.slice(start=_profile_prefix_len)
_hovercard_ids = df.loc[df.data_hovercard_id.notnull(), "data_hovercard_id"]
df["author_id"] = pd.concat([_profile_ids, _hovercard_ids])

In [None]:

# Lookup tables shaped table[crawl number][hovercard id] -> set(...).
# name_matches*   : author ids of recommended reviews that look like the same user.
# matched_*reviews: row labels of those recommended reviews.
name_matches = collections.defaultdict(functools.partial(collections.defaultdict, set))
name_matches_exact = collections.defaultdict(functools.partial(collections.defaultdict, set))

matched_reviews = collections.defaultdict(functools.partial(collections.defaultdict, set))
matched_exact_reviews = collections.defaultdict(functools.partial(collections.defaultdict, set))

# Number of not-recommended reviews to sample per crawl; -1 means a full run.
sample_size = -1

# Largest difference in a profile counter that still counts as a fuzzy match.
counter_tolerance = 1

for crawl_id, crawl_df in df.groupby("crawl_number"):

    print(f"Crawl: {crawl_id}" )

    # Put user_name in front of the row label so all candidates sharing a name
    # can be fetched with a single .xs() lookup.
    username_indexed_df = crawl_df.set_index("user_name",append=True).reorder_levels([1,0]).sort_index(level=[0])

    recommended_reviews = username_indexed_df[username_indexed_df.user_page_url.notnull()]
    not_recommended_reviews = crawl_df[crawl_df.user_page_url.isnull()]

    if sample_size != -1:
        # DataFrame.sample raises when asked for more rows than exist, so cap
        # the request at the number of rows available in this crawl.
        not_recommended_reviews = not_recommended_reviews.sample(min(sample_size, len(not_recommended_reviews)))

    for rowid, row in tqdm(not_recommended_reviews.iterrows(), total=len(not_recommended_reviews)):
        try:
            name_matching_reviews = recommended_reviews.xs(row.user_name,level=0)
        except KeyError:
            # No recommended review in this crawl shares the user name.
            continue

        # Fuzzy match: same location and image, counters off by at most the tolerance.
        m = name_matching_reviews[
            (name_matching_reviews.user_location == row.user_location) &
            ((name_matching_reviews.user_friends - row.user_friends).abs() <= counter_tolerance) &
            ((name_matching_reviews.user_photos - row.user_photos).abs() <= counter_tolerance) &
            ((name_matching_reviews.user_review_count - row.user_review_count).abs() <= counter_tolerance) &
            (name_matching_reviews.user_image_url == row.user_image_url)
        ]

        # Exact match: rows of m already agree on location and image, so only
        # the counters need to be tightened.
        m_exact = m[
            ((m.user_friends - row.user_friends).abs() == 0) &
            ((m.user_photos - row.user_photos).abs() == 0) &
            ((m.user_review_count - row.user_review_count).abs() == 0)
        ]

        hovercard_id = row.data_hovercard_id

        # Updating with an empty selection still creates the key, so authors
        # without any match are recorded with an empty set.
        name_matches[crawl_id][hovercard_id].update(m.author_id)
        if len(m) > 0:
            matched_reviews[crawl_id][hovercard_id].update(list(m.index))

        name_matches_exact[crawl_id][hovercard_id].update(m_exact.author_id)
        if len(m_exact) > 0:
            matched_exact_reviews[crawl_id][hovercard_id].update(list(m_exact.index))

In [None]:
# Re-assert the nested defaultdict(set) factory on both tables, then write
# them out together as a two-element list.
for _table in (name_matches, name_matches_exact):
    _table.default_factory = functools.partial(collections.defaultdict, set)
with safer.open(constants.AUTHOR_MATCH_FILE, "wb+", temp_file=True) as match_file:
    pickle.dump([name_matches, name_matches_exact], match_file)

In [None]:
# Reload the fuzzy and exact match tables from constants.AUTHOR_MATCH_FILE.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# match files that this notebook produced.
with safer.open(constants.AUTHOR_MATCH_FILE, "rb") as f:
    name_matches,name_matches_exact = pickle.load(f)

In [None]:
# For the highest crawl key: size of every author's match set, fuzzy and exact.
last_crawl = max(name_matches)
match_series = pd.Series({author: len(matched) for author, matched in name_matches[last_crawl].items()})
match_series_exact = pd.Series({author: len(matched) for author, matched in name_matches_exact[last_crawl].items()})

In [None]:
match_series.value_counts()

# How many recommended reviews have filtered authors written?

In [None]:
sns.histplot(match_series,discrete=True)
plt.yscale("log")

In [None]:
sns.histplot(match_series_exact,discrete=True)
plt.yscale("log")

# Cases

In [None]:
# Boolean mask (despite the name): True where an author's fuzzy and exact match counts differ.
matches_series = (match_series != match_series_exact)
# Author ids whose fuzzy and exact match counts disagree.
mismatches = pd.Series(list(matches_series[matches_series].index))
# Author ids that have at least one exact match.
positive_matches_exact =  pd.Series(list(match_series_exact[match_series_exact > 0].index))
# Every author id present in the exact-match series.
all_values = pd.Series(list(match_series_exact.index))

In [None]:
len(mismatches), len(matches_series)

In [None]:
# Scan crawl_10 rows that carry a hovercard id and stop at the first author
# with a review whose content hash also appears on a row with a profile URL.
_in_crawl_10 = df.crawl_id == "crawl_10"
for author_id in df[df.data_hovercard_id.notnull() & _in_crawl_10].data_hovercard_id:
    review_hashes = df[(df.data_hovercard_id == author_id) & _in_crawl_10].content_hash
    if df[df.content_hash.isin(review_hashes)]["user_page_url"].notnull().any():
        print("Found")
        break

In [None]:
# Show one randomly chosen author whose fuzzy and exact match counts differ:
# their crawl_10 rows, every row sharing one of those review hashes, and the
# number of distinct profile images among the latter.
try:
    author_id = mismatches.sample(1).iloc[0]
    print(author_id)
    author_rows = df[(df.data_hovercard_id == author_id) & (df.crawl_id == "crawl_10")]
    display(author_rows)
    review_hashes = author_rows.content_hash
    same_content_rows = df[df.content_hash.isin(review_hashes)]
    display(same_content_rows)
    # A bare expression inside `try` is not echoed by the notebook, so the
    # count has to be displayed explicitly.
    display(len(same_content_rows.user_image_url.unique()))
except Exception as exc:
    # Still best-effort (e.g. there may be no mismatches to sample from), but
    # report the reason instead of swallowing it silently.
    print(f"Could not display a mismatch example: {exc!r}")

# What does authorship look like during a reclassification event?

In [None]:
stats_df_all = pd.read_pickle(constants.RECALSSIFICATION_DATA_FILE)

In [None]:
names = ["Experiment", "Stratum", "author_id"]
def get_experiment_and_strata(row):
    """Return experiment, stratum and an author id for one stats row.

    The experiment and stratum are read from the first member row in df_all.
    The author id is the user_page_url of the first member that has no
    hovercard id; if every member has one, it is the last member's hovercard id.
    """
    member_rows = df_all.loc[row.members]
    first_member = member_rows.iloc[0]
    author_id = None
    for _, member in member_rows.iterrows():
        if pd.isna(member.data_hovercard_id):
            author_id = member.user_page_url
            break
        author_id = member.data_hovercard_id
    return pd.Series([first_member.experiment, first_member.stratum, author_id], index=names)

experiment_strata = stats_df_all.progress_apply(get_experiment_and_strata,axis=1)

In [None]:
stats_df_all = pd.concat([stats_df_all,experiment_strata],axis=1)

In [None]:
stats_df = stats_df_all[stats_df_all.Experiment == experiment]

In [None]:
#stats_df = stats_df.reset_index(level=[1])

In [None]:
import re
# Raw string: "\s" inside a plain string literal is an invalid escape sequence.
whitespace_re = re.compile(r"\s+")

def trim(s, l=35):
    """Collapse whitespace runs to single spaces and cap the length at `l`.

    Text longer than `l` is cut and ends in "..." so that the result is
    exactly `l` characters long. When `l` leaves no room for the ellipsis
    (l <= 3) the text is simply cut to `l` characters.
    """
    s = whitespace_re.sub(" ", s)

    if len(s) <= l:
        return s
    if l <= 3:
        return s[:max(l, 0)]
    return s[:l-3] + "..."

def display_review_history(review_history_df):
    """Print a one-line history of a single review across all crawls.

    The line holds one symbol per crawl in constants.CRAWL_ORDER ("_" when the
    review has no row in that crawl, "F" when its row is flagged, "R"
    otherwise), followed by the date, business id and trimmed content of the
    first row. Asserts that the review has at most one row per crawl.
    """
    first_review = review_history_df.iloc[0]
    content = trim(first_review.content)
    date = first_review.date
    business_id = first_review.business_id

    def symbol_for(crawl_id):
        flagged = review_history_df[review_history_df.crawl_id == crawl_id].flagged
        assert len(flagged) <= 1
        if len(flagged) == 0:
            return '_'
        return "F" if flagged.iloc[0] else "R"

    history = [symbol_for(crawl_id) for crawl_id in constants.CRAWL_ORDER]

    print(f"{','.join(history)} | {date.strftime('%Y-%m-%d')} | {business_id} | {content}")

In [None]:
# NOTE(review): this blanks df's author_id column, so the author_id lookup in
# the next cell can only return None until the column is recomputed.
df["author_id"] = None #Set this temporarily

In [None]:
content_hash = stats_df[stats_df.reclassification_swaps >= 2].sample(1).index[0]
author_ids = df[df.content_hash == content_hash].author_id.unique()
display(author_ids)

In [None]:
%%script false --no-raise-error
review_hashes = df[(df.author_id.isin(author_ids))].content_hash.unique()
print(f"Author identifiers: {','.join(author_ids)}")
for review_hash, review_df in df[df.content_hash.isin(review_hashes)].groupby("content_hash"):
    display_review_history(review_df)

In [None]:
%%script false --no-raise-error
df[df.content_hash == content_hash]

In [None]:
%%script false --no-raise-error
review_hashes

# How many authors are there?

In [None]:
# Length of the profile-URL prefix; only referenced by the commented-out slice below.
upu_offset = len("/user_details?userid=")
# Link tables between the two identifier kinds:
# user_page_url -> data_hovercard_id, and the reverse.
r_nr_author_id_table = {}
nr_r_author_id_table = {}
def get_author_ids(row):
    """Return (user_page_url, data_hovercard_id) seen among a review's member rows.

    Walks row.members and stops as soon as both identifiers have been seen.
    Either element is None when no member carries that kind of identifier.
    Raises Exception when a member row has neither identifier.
    """
    r = None
    nr = None
    for member_id in row.members:
        member = df.loc[member_id]  # one lookup per member
        if pd.notna(member.user_page_url):
            r = member.user_page_url#[upu_offset:]
        elif pd.notna(member.data_hovercard_id):
            nr = member.data_hovercard_id
        else:
            raise Exception(f"Member row {member_id!r} has neither user_page_url nor data_hovercard_id")
        if r and nr:
            break
    return r, nr

for r, nr in stats_df.progress_apply(get_author_ids,axis=1):
    # Only a complete pair is a link. Storing incomplete pairs would put None
    # in the tables as a key, and could overwrite a link found through another
    # review of the same author with None.
    if r is None or nr is None:
        continue
    r_nr_author_id_table[r] = nr
    nr_r_author_id_table[nr] = r

In [None]:
def fix_author_ids(row):
    """Fill in a row's other identifier from the link tables.

    A row with a hovercard id gets the linked user_page_url when one is known;
    otherwise a row with a user_page_url gets the linked hovercard id. Rows
    without a known link are returned unchanged.
    """
    hovercard_id = row.data_hovercard_id
    if pd.notna(hovercard_id):
        if hovercard_id in nr_r_author_id_table:
            row["user_page_url"] = nr_r_author_id_table[hovercard_id]
        return row

    profile_url = row.user_page_url
    if pd.notna(profile_url) and profile_url in r_nr_author_id_table:
        row["data_hovercard_id"] = r_nr_author_id_table[profile_url]
    return row
df_fixed = df.progress_apply(fix_author_ids,axis=1)

In [None]:
# Count distinct authors by which identifiers their rows carry after linking.
_has_hovercard = df_fixed.data_hovercard_id.notnull()
_has_profile = df_fixed.user_page_url.notnull()

num_linked = len(df_fixed[_has_hovercard & _has_profile].data_hovercard_id.unique())
num_unlinked_nr = len(df_fixed[_has_hovercard & ~_has_profile].data_hovercard_id.unique())
num_unlinked_r = len(df_fixed[~_has_hovercard & _has_profile].user_page_url.unique())

# Lower bound: the smaller unlinked set is fully contained in the larger one.
# Upper bound: no unlinked identifiers belong to the same author.
_fewest_authors = num_linked + max(num_unlinked_r,num_unlinked_nr)
_most_authors = num_linked + num_unlinked_r + num_unlinked_nr
print(f"{_fewest_authors:,}-{_most_authors:,}")

## Let's look at this from a statistical perspective

In [None]:
df_all["rc_author_id"] = None #reclassification author ID

In [None]:
# Re-index every review row by (content_hash, original row label), sorted by
# hash; content_hash is also kept as a regular column (drop=False).
df_all_hash_index = df_all.set_index("content_hash",append=True,drop=False).reorder_levels([1,0]).sort_index(level=[0])
# Only the rows of that frame that have a user_page_url.
df_all_recommended_hash_index = df_all_hash_index[df_all_hash_index.user_page_url.notnull()]

In [None]:
chars = {True: "-", False: "+", None: "_"}
stats_df[stats_df.reclassification_swaps == 1].reclassification_order.apply(lambda ar: "".join(chars[x] for x in ar)).value_counts()

In [None]:
stats_df_all = stats_df_all.reset_index()

In [None]:
def _has_both_classes(stats_row):
    """True when the row's reclassification_order contains both True and False.

    A float value (presumably NaN for a missing history) counts as False.
    """
    order = stats_row.reclassification_order
    if type(order) is float:
        return False
    return True in order and False in order

stats_df_reclass = stats_df_all[stats_df_all.progress_apply(_has_both_classes,axis=1)]

In [None]:
#%%script false --no-raise-error
# For every review seen in both classes, take author id, date and business id
# from its rows in the recommended index and write them onto that review's
# row in stats_df_all.
author_id_map = {}

for idx, row in tqdm(stats_df_reclass.iterrows(),total=len(stats_df_reclass)):
    content_hash = row.content_hash
    rows = df_all_recommended_hash_index.xs(content_hash,level=0)
    first_row = rows.iloc[0]
    upu_rows = rows[rows.user_page_url.notnull()]
    dhi_rows = rows[rows.data_hovercard_id.notnull()]
    if len(upu_rows) > 0 and len(dhi_rows) > 0:
        # Map identifier to identifier. The rows themselves are Series, which
        # are unhashable and cannot serve as dictionary keys.
        author_id_map[dhi_rows.iloc[0].data_hovercard_id] = upu_rows.iloc[0].user_page_url
    author_id = upu_rows.iloc[0].user_page_url if len(upu_rows) > 0 else first_row.data_hovercard_id
    # stats_df_reclass is a row subset of stats_df_all, so idx is the label of
    # this review's row there. After reset_index() the content hash is a
    # column, not a label; using it with .loc would append a new row instead.
    stats_df_all.loc[idx,"author_id"] = author_id
    stats_df_all.loc[idx,"date"] = first_row.date
    stats_df_all.loc[idx,"business_id"] = first_row.business_id

In [None]:
# Rewrite any stats author id that has an entry in author_id_map.
stats_df_all["author_id"] = stats_df_all["author_id"].replace(author_id_map)

def get_author_id(row):
    """Return the row's user_page_url, else its mapped or raw hovercard id."""
    if not pd.isnull(row.user_page_url):
        return row.user_page_url
    return author_id_map.get(row.data_hovercard_id, row.data_hovercard_id)
df_all["author_id"] = df_all.progress_apply(get_author_id,axis=1)

# Authorship basic statistics

In [None]:
df_all.groupby("experiment").apply(lambda subdf: f"Author range: {len(subdf[~subdf.author_id.isin(subdf.data_hovercard_id)].author_id.unique()):,} - {len(subdf.author_id.unique()):,}")

# Let's get stats on those reviews that are reclassified

In [None]:
stats_df = stats_df_all[stats_df_all.Experiment == experiment]

In [None]:
def get_reclass_pattern(reclassification_order):
    """Collapse a classification history into its sequence of distinct runs.

    None entries are ignored; each remaining entry is kept only when it
    differs from the non-None entry before it (the first one is always kept).
    """
    observed = [label for label in reclassification_order if label is not None]
    # Pair every entry with its predecessor; the first entry is paired with None.
    return [current for previous, current in zip([None] + observed, observed) if previous != current]

In [None]:
matching = collections.defaultdict(lambda : 0)
matching_bg = collections.defaultdict(lambda : 0)

def get_matching_stats(m_dict,suffix="",do_sum=False):
    """Turn pair counts into agreement percentages.

    m_dict maps (class1, class2) pairs of booleans to a count, or to a list of
    counts when do_sum is True (each list is summed first). Missing pairs
    count as 0. The caller's dictionary is never modified.

    Returns a Series with two entries, labelled with `suffix` appended:
    the share of (False, *) pairs that are (False, False), and the share of
    (True, *) pairs that are (True, True).

    Raises ZeroDivisionError when either class has no pairs at all.
    """
    # Work on a private copy so the zero defaults never leak into m_dict.
    counts = {pair: (sum(value) if do_sum else value) for pair, value in m_dict.items()}
    for pair in itertools.product([False, True], repeat=2):
        counts.setdefault(pair, 0)

    rec_perc = counts[(False,False)] / (counts[(False,False)] + counts[(False,True)])
    not_rec_perc = counts[(True,True)] / (counts[(True,True)] + counts[(True,False)])
    return pd.Series([rec_perc,
                    not_rec_perc],
                    index=[f"Recommended percentage matches{suffix}", f"Not recommended percentage matches{suffix}"])

reclass_count = stats_df.reclassification_order.apply(lambda x: 0 if type(x) is float else len(x)).max()

def mode(series):
    """Return the first mode of `series`, or None when it has no non-missing value."""
    if series.dropna().empty:
        return None
    return series.mode().iloc[0]
    
def get_match_class(x,t=None,v=None,v_prev=None):
    """
    Classify how the history `x` moved across the step t-1 -> t, relative to
    the reference labels `v_prev` (for t-1) and `v` (for t).

    matches prev and new -> 1
    matches prev but not new -> 2
    does not match prev, but matches new -> 3
    matches neither prev nor new -> 4
    no label at t-1 or at t -> None
    """
    before = x[t-1]
    if before is None:
        return None
    after = x[t]
    if after is None:
        return None

    if before == v_prev:
        return 1 if after == v else 2
    return 3 if after == v else 4

def get_review_group_stats(group):
    """Summarise how consistently one author's reviews are classified over time.

    `group` holds the stats rows of a single author. Returns None when the
    author has fewer than two reviews, otherwise a one-row DataFrame labelled
    with the author's id that contains:

    * the matching percentages from get_matching_stats, for time-aligned
      pairs and for all pairs ("background"); when those cannot be computed
      (ZeroDivisionError) the four percentages are None and "ZDE" is True,
    * "Reclassification pattern": the collapsed per-step majority vote,
    * "Recommended Follow percentage" / "Not Recommended Follow Percentage":
      one ratio per majority change, built from the get_match_class counts.

    Side effect: the pair counts are added to the module-level `matching`
    and `matching_bg` accumulators.
    """
    # A single review has nothing to be compared with.
    if len(group) < 2:
        return None

    results = []

    l_matching = collections.defaultdict(lambda : [0 ] * reclass_count)
    l_matching_bg = collections.defaultdict(lambda: 0)

    majority_vote = [mode(group.reclassification_order.apply(lambda x: x[t])) for t in range(reclass_count)]
    reclass_pattern = get_reclass_pattern(majority_vote)

    # Every review the author wrote against every other review they wrote.
    for _, stats1 in group.iterrows():
        order1 = stats1.reclassification_order
        for _, stats2 in group.iterrows():
            if stats1.content_hash == stats2.content_hash:
                continue
            order2 = stats2.reclassification_order

            # "background" is every pairing, regardless of time.
            for c1,c2 in itertools.product(order1,order2):
                l_matching_bg[(c1,c2)] += 1

            # Normal is synchronized stepping through both histories.
            for t in range(len(order1)):
                c1 = order1[t]
                c2 = order2[t]
                if c1 is None or c2 is None:
                    continue #Skip if review not present at this time
                l_matching[(c1,c2)][t] += 1

    try:
        results += [get_matching_stats(l_matching,do_sum=True),get_matching_stats(l_matching_bg, suffix=" (background)")]
    except ZeroDivisionError:
        # Zero division means the reviews never line up. Only the percentage
        # labels get a placeholder here: the pattern/follow labels are always
        # appended further down, and repeating them would give this row
        # duplicate columns that cannot be concatenated with other rows.
        percentage_names = ["Recommended percentage matches", "Not recommended percentage matches","Recommended percentage matches (background)","Not recommended percentage matches (background)"]
        results.append(pd.Series([None] * len(percentage_names) + [True],index=percentage_names + ["ZDE"]))

    for idx in l_matching:
        matching[idx] += sum(l_matching[idx])
    for idx in l_matching_bg:
        matching_bg[idx] += l_matching_bg[idx]

    perc_follow = []
    perc_stay = []

    # See how many reviews follow the majority whenever the majority changes.
    v_prev = None
    for t, v in enumerate(majority_vote):
        # The first step has no predecessor; both labels must be non-null.
        if t > 0 and v is not None and v_prev is not None and v != v_prev:
            class_following = group.reclassification_order.apply(get_match_class,t=t,v=v,v_prev=v_prev)
            cf_vc = class_following.value_counts()

            matched_before = cf_vc.get(1,0) + cf_vc.get(2,0)
            if matched_before > 0:
                # Of the reviews that matched the old majority, the share that matches the new one.
                perc_follow.append(cf_vc.get(1,0) / matched_before)
            unmatched_before = cf_vc.get(3,0) + cf_vc.get(4,0)
            if unmatched_before > 0:
                # Of the reviews that did not match the old majority, the share that matches the new one.
                perc_stay.append(cf_vc.get(3,0) / unmatched_before)
        v_prev = v

    results.append(pd.Series([reclass_pattern, perc_follow, perc_stay], index=["Reclassification pattern", "Recommended Follow percentage", "Not Recommended Follow Percentage"]))

    results_s = pd.concat(results,axis=0).rename(group.author_id.iloc[0]).to_frame().T

    return results_s
    
#review_group_stats = stats_df[(stats_df.reclassification_swaps >= 1)].groupby("author_id").progress_apply(get_review_group_stats)
# One entry per author among the reviews with at least one reclassification
# swap: a one-row frame, or None when the author has fewer than two such reviews.
rows = []
for author_id, subdf in tqdm(stats_df[(stats_df.reclassification_swaps >= 1)].groupby("author_id")):
    rows.append(get_review_group_stats(subdf))
    


In [None]:

# Stack the per-author rows into one frame. Authors that were skipped are None.
not_na_rows = [row for row in rows if row is not None]
try:
    review_group_stats = pd.concat(not_na_rows,axis=0)
except Exception as exc:
    print("Failed to build; removing bad rows")
    print(f"Reason: {exc!r}")

    # Diagnostic only: report the first author id that labels more than one row.
    index_set = set()
    for row in not_na_rows:
        row_index = set(row.index)
        intersection = (index_set & row_index)
        if len(intersection) != 0:
            print(intersection)
            break
        index_set |= row_index

    # A row with repeated column labels cannot be aligned with the others by
    # pd.concat. Check every row (including the first) in a single pass
    # instead of re-concatenating the accepted rows for each candidate.
    good_rows = []
    for i, row in enumerate(not_na_rows):
        if row.columns.is_unique:
            good_rows.append(row)
        else:
            print(f"Found error {i}")
            display(row)
    review_group_stats = pd.concat(good_rows,axis=0)

# Overall matching percentages. Kept separate so that a failure here does not
# trigger the row clean-up above.
try:
    display(pd.concat([get_matching_stats(matching),get_matching_stats(matching_bg, suffix=" (background)")]))
except ZeroDivisionError:
    print("No overall matching statistics: one of the classes has no review pairs")

In [None]:
# Authors for which the matching percentages could be computed.
_with_match_stats = review_group_stats[review_group_stats["Recommended percentage matches"].notnull()]
display(len(_with_match_stats), _with_match_stats["Recommended percentage matches"].value_counts())
display(_with_match_stats.mean())

### How many reviews does each author make?

In [None]:
# Reviews per author, most prolific author first, on a logarithmic author axis.
vcs = stats_df.author_id.value_counts().rename("Number of reviews")
_reviews_per_author_ax = sns.lineplot(x=range(len(vcs)),y=vcs)
_reviews_per_author_ax.set(xlabel='Number of authors')
plt.xscale("log")
#plt.yscale("log")

In [None]:
review_group_stats

### How do the reviews follow when the class changes?

In [None]:
# Long-format data for the violin plot. The two "follow" columns hold a list
# of ratios per author, so they are flattened to one row per ratio.
_follow_columns = ["Recommended Follow percentage", "Not Recommended Follow Percentage"]
data = review_group_stats[review_group_stats["Recommended percentage matches"].notnull()].melt(
    value_vars=["Recommended percentage matches", "Not recommended percentage matches"] + _follow_columns)
rfps = list(itertools.chain.from_iterable(data[data.variable == "Recommended Follow percentage"].value))
nrfps = list(itertools.chain.from_iterable(data[data.variable == "Not Recommended Follow Percentage"].value))
data = data[~data.variable.isin(_follow_columns)]
for _label, _ratios in (("Recommended Follow percentage", rfps), ("Not Recommended Follow Percentage", nrfps)):
    data = pd.concat([data,pd.DataFrame.from_records(({"variable": _label, "value": ratio} for ratio in _ratios))])
data["value"] = data.value.astype("float")
sns.violinplot(x="value",y="variable",data=data,orient="h")
plt.savefig("../../graphs/author_based_recommended_matching.pdf", bbox_inches = 'tight')

In [None]:
data[data.variable == "Not Recommended Follow Percentage"].value.mean()

In [None]:
fig = sns.histplot(review_group_stats[review_group_stats["Recommended percentage matches"].notnull()],fill=False,kde=True)
plt.savefig("../../graphs/author_based_recommended_matching")

In [None]:
review_group_stats["Reclassification pattern"].apply(lambda ar: None if ar is None else "".join(chars[x] for x in ar)).value_counts()

In [None]:
stats_df[stats_df.reclassification_order.notnull()].reclassification_order.apply(get_reclass_pattern).apply(lambda ar: "".join(chars[x] for x in ar)).value_counts()

In [None]:
matching

In [None]:
# Histogram of every "Recommended Follow percentage" ratio, flattened across authors.
_rec_follow_lists = review_group_stats.loc[review_group_stats["Recommended percentage matches"].notnull(), "Recommended Follow percentage"]
fig = sns.histplot(list(itertools.chain.from_iterable(_rec_follow_lists)),fill=False,kde=True)
plt.savefig("../../graphs/author_based_reclassification_recommended_following.png")

In [None]:
# Histogram of every "Not Recommended Follow Percentage" ratio, flattened across authors.
_not_rec_follow_lists = review_group_stats.loc[review_group_stats["Recommended percentage matches"].notnull(), "Not Recommended Follow Percentage"]
fig = sns.histplot(list(itertools.chain.from_iterable(_not_rec_follow_lists)),fill=False,kde=True)
plt.savefig("../../graphs/author_based_reclassification_not_recommended_following.png")

# Cases

In [None]:
#Authors with at least 2 reviews, and their reviews don't match
_rec_share = review_group_stats["Recommended percentage matches"]
_not_rec_share = review_group_stats["Not recommended percentage matches"]
_imperfect = (_rec_share.notnull() & (_rec_share != 1)) | (_not_rec_share.notnull() & (_not_rec_share != 1))
author_ids = review_group_stats[_imperfect].index.to_list()

# Show ten random picks (the same author may come up more than once).
for i in range(10):
    author_id = random.sample(author_ids,1)[0]
    print(f"Author: {author_id}")
    display(stats_df[stats_df.author_id == author_id])
    print("----------------------------------------------")

In [None]:
stats_df[stats_df.author_id == "AV8opO3Pqb7q33FbsHgvEQ"]

In [None]:
#CDF of reclassification swaps
sns.ecdfplot(data=stats_df, x="date", hue="reclassification_swaps")

In [None]:
stats_df.reclassification_swaps.value_counts()

In [None]:
# stats_df is a filtered slice of stats_df_all; assigning columns onto a slice
# triggers SettingWithCopyWarning, so derive new frames with .assign() instead.
# Bucket the swap counts as "0", "1", "2+".
stats_df = stats_df.assign(reclass_swaps_joined=stats_df.reclassification_swaps.apply(lambda x: "2+" if x >= 2 else str(int(x))))
# Legend labels carry the number of reviews in each bucket, e.g. "1 (123)".
swap = {k: f"{k} ({v})" for k,v in stats_df["reclass_swaps_joined"].value_counts().items()}
stats_df = stats_df.assign(**{"Number of reclassifications": stats_df["reclass_swaps_joined"].replace(swap)})
sns.ecdfplot(data=stats_df, x="date", hue="Number of reclassifications")