In [None]:
import constants
import pandas  as pd
from tqdm import tqdm
import hashlib
import math
from pandarallel import pandarallel
#import swifter

In [None]:
# Register tqdm's progress_* methods on pandas objects (progress_apply is used further down).
tqdm.pandas()
#pandarallel.initialize()

In [None]:
# Choose which crawl the constants module is configured for:
# CRAWL_SOURCE_EXTENDED when `extended` is True, CRAWL_SOURCE_CHICAGO otherwise.
# NOTE(review): presumably this also determines the file paths read below -- confirm in constants.
extended = False
constants.set_crawl_source(constants.CRAWL_SOURCE_EXTENDED if extended else constants.CRAWL_SOURCE_CHICAGO)

In [None]:
# Load the DataFrame analysed by the rest of the notebook.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle(constants.LONG_DATA_FILE)

In [None]:
# Look up each row's crawl_id in constants.CRAWL_NUMBER (the lookup fails for ids missing
# from that mapping), then order the rows by the resulting crawl number.
df["crawl_number"] = df["crawl_id"].apply(lambda x: constants.CRAWL_NUMBER[x])
df = df.sort_values("crawl_number")

In [None]:
def get_stripped_text(text):
    """Return ``text`` with all ``link_re`` matches removed, then all ``nonalpha_re`` matches removed.

    NOTE(review): ``link_re`` and ``nonalpha_re`` are not defined in the visible
    part of this notebook; they must already exist in the kernel when this runs.
    """
    without_links = link_re.sub("", text)
    stripped = nonalpha_re.sub("", without_links)
    return stripped

def hash_review_stripped(review):
    """Return the raw SHA-1 digest (bytes) of ``review`` after it went through ``get_stripped_text``."""
    normalised = get_stripped_text(review).encode()
    return hashlib.sha1(normalised).digest()

def hash_review(review):
    """Return the raw SHA-1 digest (bytes) of the string ``review`` using its default encoding."""
    return hashlib.sha1(review.encode()).digest()

In [None]:
def get_grouped_stats(group):
    """Summarise one group of review rows as a single-row DataFrame.

    Parameters
    ----------
    group : DataFrame
        Rows belonging to one group; the columns ``crawl_id``, ``flagged``,
        ``experiment``, ``stratum``, ``Stratum1``, ``Stratum2``, ``date``,
        ``business_id`` and ``rating`` are read.

    Returns
    -------
    DataFrame
        One row. It always holds ``recurring_reviews``, ``duplicate_reviews``,
        ``members`` and the experiment/stratum fields. The date, business id,
        rating and all reclassification fields are only filled in when no crawl
        contributes more than one row to the group.

    Raises
    ------
    StopIteration
        Re-raised (after printing the order) when none of the crawls in
        ``constants.CRAWL_ORDER`` contains a row of this group.
    """
    stats_row = pd.Series()
    # Number of rows this group has in each crawl.
    val_counts = group.crawl_id.value_counts()
    # Seen in more than one distinct crawl.
    stats_row["recurring_reviews"] = len(val_counts) > 1
    # At least one crawl holds the same content more than once.
    stats_row["duplicate_reviews"] = val_counts.apply(lambda x: x > 1).any()
    
    # Index labels of the rows in this group, so they can be looked up again later.
    stats_row["members"] = list(group.index)
    
    #Business metadata -- taken from the first row of the group.
    #NOTE(review): this assumes every row of a group shares these values; the visible call groups by content_hash -- confirm.
    single_review = group.iloc[0]
    stats_row["experiment"] = single_review.experiment
    stats_row["stratum"] = single_review.stratum
    stats_row["Stratum1"] = single_review.Stratum1
    stats_row["Stratum2"] = single_review.Stratum2
    
    # The per-crawl history below is only built when every crawl has at most one row for this group.
    if not stats_row["duplicate_reviews"]:
        
        #Review metadata
        stats_row["date"] = single_review.date
        stats_row["business_id"] = single_review.business_id
        stats_row["rating"] = single_review.rating
        
        
        
        # One entry per crawl, in constants.CRAWL_ORDER: the `flagged` value seen in that crawl,
        # or None when the group has no row in it.
        reclass_order = []
        for crawl_id in constants.CRAWL_ORDER:
            flagged_list = group[group.crawl_id == crawl_id].flagged
            if len(flagged_list) == 0:
                reclass_order.append(None)
            else:
                reclass_order.append(flagged_list.iloc[0])
                
        # First pass: count every change after an observed value (including value -> None),
        # and count how often the review comes back after having gone missing.
        prev_val = None
        swaps = 0
        disappeared = False
        reappearances = 0
        for val in reclass_order:
            if prev_val is not None:
                if prev_val != val: #Did it get reclassified
                    swaps += 1
                if val is None:
                    disappeared = True
            # Here prev_val is None: this is a reappearance only if the review was
            # observed earlier and then went missing (disappeared is set).
            elif disappeared and val is not None:
                reappearances += 1
            
            prev_val = val
        
        stats_row["reclassification_and_disappearance_swaps"] = swaps
        
        # Second pass: ignore the crawls where the review is missing and count only
        # changes between consecutive observed values.
        prev_val = None
        swaps = 0
        for val in reclass_order:
            if val is None:
                continue
            if prev_val is not None:
                if prev_val != val: #Did it get reclassified
                    swaps += 1
            prev_val = val
            
        stats_row["reclassification_swaps"] = swaps
        
        stats_row["reappearances"] = reappearances
        try:
            # First and last observed (non-None) value of the history.
            stats_row["reclassification_start"] = next(filter(lambda x: x is not None, reclass_order))
            stats_row["reclassification_end"] = next(filter(lambda x: x is not None, reversed(reclass_order)))
            stats_row["reclassification_order"] = reclass_order
        except StopIteration:
            # No crawl in CRAWL_ORDER contained this group: show the order and fail loudly.
            print(reclass_order)
            raise
    # Wrapped in a one-row DataFrame, which is what the groupby/apply call collects.
    return pd.DataFrame([stats_row])

In [None]:
# Build one stats row per distinct content_hash; progress_apply is the tqdm-instrumented apply.
# get_grouped_stats returns a one-row DataFrame per group, so the result carries an extra inner index level.
stats_df = df.groupby(["content_hash"]).progress_apply(get_grouped_stats)

In [None]:
# Drop the inner index level left by the one-row frames so that content_hash is the only index level.
stats_df = stats_df.droplevel(1) #Remove the "0" level

In [None]:
# Persist the per-hash statistics to the path held in constants.RECALSSIFICATION_DATA_FILE.
stats_df.to_pickle(constants.RECALSSIFICATION_DATA_FILE)

In [None]:
# Reload the statistics from the same file written by the save cell.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
stats_df = pd.read_pickle(constants.RECALSSIFICATION_DATA_FILE)

In [None]:
# Share of content hashes that appear more than once within a single crawl.
# The mean of the boolean column is used instead of value_counts()[True], which
# raises KeyError when no hash at all is flagged as a duplicate.
print(f"Percentage of duplicates reviews: {stats_df.duplicate_reviews.astype(bool).mean():%}")

In [None]:
def reappearances(reclass_order):
    """Count how often a review comes back after having gone missing.

    Parameters
    ----------
    reclass_order : iterable
        Per-crawl observations in crawl order; ``None`` marks a crawl in which
        the review was not present.

    Returns
    -------
    int
        Number of times a non-``None`` entry follows a gap that itself came
        after at least one non-``None`` entry. Leading ``None`` entries (the
        review had not appeared yet) are not counted.
    """
    seen = False      # the review has been observed at least once
    missing = False   # the review is currently in a gap after having been observed
    count = 0
    for val in reclass_order:
        if val is None:
            # A gap only counts as a disappearance once the review has been seen.
            if seen:
                missing = True
        else:
            if missing:
                count += 1
                missing = False
            seen = True
    return count

# Distribution of reappearance counts over the hashes that have a reclassification order.
stats_df.reclassification_order.dropna().apply(reappearances).value_counts()

In [None]:
# Symbol per classification: True (not recommended) reads more naturally as '-', False as '+'.
class_chars = {True: "-", False: "+", None: ""}
def get_pattern(reclass_order):
    """Collapse a per-crawl history into a string of '+'/'-' symbols.

    Missing observations (``None``) are skipped and runs of the same value are
    reduced to a single symbol, so only the sequence of changes remains.
    """
    changes = []
    last = None
    for flag in reclass_order:
        if flag is not None and flag != last:
            changes.append(flag)
            last = flag
    return "".join([class_chars[flag] for flag in changes])
# Only rows that have a reclassification order get a pattern; index alignment leaves the others missing.
stats_df["reclassification_pattern"] = stats_df.reclassification_order.dropna().apply(get_pattern)

In [None]:
# Spot check: draw one random hash with exactly two reclassification swaps, print its
# per-crawl history and show the underlying review rows via the stored index labels.
# NOTE(review): sample() is unseeded, so the chosen row differs between runs.
row = stats_df[(stats_df.reclassification_swaps == 2) & stats_df.reclassification_order.notnull()].sample(1)
print(row.reclassification_order.iloc[0])
display(df.loc[row.iloc[0].members])

In [None]:
# Per-experiment report: for every swap count, show how the affected reviews started,
# how many there are, how often they were observed and which patterns occur.
for experiment in sorted(stats_df.experiment.unique()):
    print(f"Experiment: {experiment}")
    sub_df = stats_df[stats_df.experiment == experiment]
    # Only rows with a reclassification order (those without in-crawl duplicates) take part.
    tracked_df = sub_df[sub_df.reclassification_order.notnull()]
    # dropna(): rows without a swap count hold NaN, which makes the sorted() order
    # undefined and would only produce an empty "nan" section.
    for i in sorted(stats_df.reclassification_swaps.dropna().unique()):
        swap_df = tracked_df[tracked_df.reclassification_swaps == i]
        print(i)
        print("How many started as filtered?")
        print(swap_df.reclassification_pattern.apply(lambda x: x[0]).value_counts().sort_index())
        print(f"Sum: {len(swap_df):,}")
        # Guard against an experiment that has no tracked review at all.
        perc = len(swap_df)/len(tracked_df) if len(tracked_df) else float("nan")
        print(f"Percentage: {perc:%} ({1-perc:%})")
        print("How many observations do we have?")
        print(swap_df.reclassification_order.apply(lambda order: sum(v is not None for v in order)).value_counts().sort_index())
        print("How many of each pattern do we have?")
        print(swap_df.reclassification_pattern.value_counts().sort_values())
        print("------------------------------------")
        

In [None]:
# Number of crawls in which each tracked review was actually observed (non-None entries).
stats_df.reclassification_order.dropna().apply(lambda order: sum(v is not None for v in order))

In [None]:
# Same count for every row; rows without a reclassification order yield None.
stats_df.reclassification_order.apply(
    lambda order: sum(v is not None for v in order) if (type(order) is list or not pd.isnull(order)) else None
)

In [None]:
# Store the number of observed (non-None) crawls per row; rows without a reclassification order get None.
stats_df["observations"] = stats_df.reclassification_order.apply(
    lambda order: sum(v is not None for v in order) if (type(order) is list or not pd.isnull(order)) else None
)

In [None]:
# Sanity check: reclassification_swaps counts changes between consecutive observed values,
# so a review with a single observation cannot have any -- this selection should be empty.
stats_df[(stats_df.observations == 1) & (stats_df.reclassification_swaps > 0)]

In [None]:
# How many hashes do / do not occur more than once within a single crawl.
stats_df.duplicate_reviews.value_counts()

In [None]:
# Distribution of the number of reclassification swaps (rows without a swap count are not included by value_counts).
stats_df.reclassification_swaps.value_counts()

In [None]:
# Show the raw review rows behind the first hash that has exactly three reclassification swaps.
# stats_df is indexed by content_hash, so the index label is the hash itself.
# NOTE(review): index[0] raises IndexError when no hash has three swaps.
h = stats_df[stats_df.reclassification_swaps == 3].index[0]
print(h)

df[df.content_hash == h]

In [None]:
# Inspect a random hash with exactly two reclassification swaps, together with its raw review rows.
sample_df = stats_df[stats_df.reclassification_swaps == 2].sample(1)
# stats_df has a flat content_hash index (the inner level was dropped earlier), so the
# label is the hash itself; indexing it once more would only take the hash's first element
# and the lookup below would match nothing.
h = sample_df.index[0]
display(sample_df)

df[df.content_hash == h]

In [None]:
# Inspect a random hash with exactly one reclassification swap, together with its raw review rows.
sample_df = stats_df[stats_df.reclassification_swaps == 1].sample(1)
# stats_df has a flat content_hash index (the inner level was dropped earlier), so the
# label is the hash itself; indexing it once more would only take the hash's first element
# and the lookup below would match nothing.
h = sample_df.index[0]
display(sample_df)

df[df.content_hash == h]

# Clean up

In [None]:
# Rebind the large working names to None (presumably so the kernel can release the data they referenced).
df_original = df = stats_df = group = value = None