In [None]:
import os
import constants
import pandas as pd
import json
import glob
import itertools
import numpy as np
import hashlib
import pickle
import seaborn as sns

In [None]:
from tqdm import tqdm
tqdm.pandas()

In [None]:
import importlib
importlib.reload(constants)

# Change this to determine which experiment is loaded
extended = False

# Set data source
if extended:
    source = constants.CRAWL_SOURCE_EXTENDED
else:
    source = constants.CRAWL_SOURCE_CHICAGO
constants.set_crawl_source(source)

# Load experiment ID mapping.
# Opened read-only ("rb"): "rb+" additionally requests write access, which is
# never used here and makes the load fail on read-only data.
# NOTE(review): pickle.load can execute arbitrary code from the file; only point
# constants.ZIPCODE_TO_STRATA at a trusted, locally produced file.
with open(constants.ZIPCODE_TO_STRATA, "rb") as fp:
    zc_to_strata = pickle.load(fp)


def zc_to_experiment_strata_extended_f(zc):
    """Return the entry stored for zipcode ``zc`` in ``zc_to_strata``.

    Callers unpack the result as ``(experiment, stratum)``.
    """
    return zc_to_strata[zc]


def zc_to_experiment_strata_chicago_f(zc):
    """Return the fixed ``(CrawlExperiment.CHICAGO, 0)`` pair; ``zc`` is ignored."""
    return (constants.CrawlExperiment.CHICAGO, 0)


# Lookup used by load_reviews, chosen to match the selected data source.
if extended:
    zc_to_experiment_strata_f = zc_to_experiment_strata_extended_f
else:
    zc_to_experiment_strata_f = zc_to_experiment_strata_chicago_f

In [None]:
# Crawl identifiers processed by this notebook, taken from constants.CRAWL_ORDER.
crawl_ids = constants.CRAWL_ORDER

In [None]:
def load_reviews(crawl_id, low_memory=True):
    """Yield one annotated review dict per review stored for ``crawl_id``.

    Walks the recommended directory and then the not-recommended directory of
    the crawl. Each ``{zipcode}.json`` file maps a business ID to a list of
    review dicts; every review is annotated in place and yielded.

    Args:
        crawl_id: Value substituted into ``constants.RECOMMENDED_DIR`` and
            ``constants.NOT_RECOMMENDED_DIR`` to locate the review files.
        low_memory: Accepted for backward compatibility; it has no effect.

    Yields:
        dict: the review as loaded from JSON plus the keys ``business_id``,
        ``experiment``, ``stratum``, ``Stratum1``, ``Stratum2``, ``crawl_id``
        and ``flagged`` (``True`` for the not-recommended directory).
        ``updated`` is set to ``True`` only on reviews whose date carried the
        "Updated review" suffix; other reviews do not receive that key.
    """
    updated_suffix = "Updated review"
    sources = [
        (constants.RECOMMENDED_DIR % crawl_id, False),
        (constants.NOT_RECOMMENDED_DIR % crawl_id, True),
    ]
    for files_dir, not_rec in sources:
        # os.listdir returns entries in arbitrary order; sorting makes the row
        # order (and hence which copy a later keep="first" de-duplication
        # retains) reproducible between runs.
        for data_fn in sorted(os.listdir(files_dir)):

            # All data files are named "{zipcode}.json"; skip anything else
            # (for example a ".ipynb_checkpoints" directory) instead of
            # failing when trying to open it.
            zipcode, extension = os.path.splitext(os.path.basename(data_fn))
            if extension.lower() != ".json":
                continue
            experiment, stratum = zc_to_experiment_strata_f(zipcode)

            # Open the file
            with open(os.path.join(files_dir, data_fn)) as f:
                business_to_reviews = json.load(f)

            # Extract the reviews
            for businessID, reviews in business_to_reviews.items():
                for review in reviews:
                    review["business_id"] = businessID
                    review["experiment"] = experiment
                    review["stratum"] = stratum
                    review["Stratum1"] = constants.STRATA_COMMON_NAMES[experiment][stratum]
                    review["Stratum2"] = constants.COMBINED_STRATUM_COMMON_NAMES[stratum]
                    review["crawl_id"] = crawl_id
                    review["flagged"] = not_rec
                    # Updated reviews have the marker appended to the date text;
                    # cut it off and remember that it was there.
                    if review["date"].endswith(updated_suffix):
                        review["date"] = review["date"][:-len(updated_suffix)]
                        review["updated"] = True

                    yield review

In [None]:
# Column dtypes applied to the review frame via DataFrame.astype.
types = {
    "content": "string",
    # The unit must be spelled out: pandas >= 2.0 rejects the unit-less
    # "datetime64" in astype, while "datetime64[ns]" also works on older versions.
    "date": "datetime64[ns]",
    "user_image_url": "string",
    "user_page_url": "string",
    "user_name": "string",
    "user_location": "string",
    "user_friends": "int64",
    "user_photos": "int64",
    "elite": "bool",
    "business_id": "category",
    "user_review_count": "int64",
    "data_hovercard_id": "string",
    "experiment": "category",
    "stratum": "category",
}

def _print(x):
    print(x)
    return x

def get_df_for_crawl(*crawl_ids):
    """Build one review DataFrame covering every crawl ID passed in.

    Reviews are streamed from ``load_reviews`` crawl by crawl (each crawl ID is
    printed as its reviews start being consumed) and the resulting frame is
    cast to the dtypes declared in ``types``.
    """
    def _all_reviews():
        # Lazily walk the crawls so only one review stream is open at a time.
        for crawl_id in crawl_ids:
            yield from load_reviews(_print(crawl_id))

    reviews_frame = pd.DataFrame.from_records(_all_reviews())
    return reviews_frame.astype(types)

In [None]:
# Load the reviews of every crawl in crawl_ids into a single DataFrame.
df = get_df_for_crawl(*crawl_ids)

In [None]:
# Keep every row that shares content/rating/crawl/date/user fields with another
# row (keep=False marks all copies); inspected in the "Duplicates" section.
duplicates = df[df.duplicated(["content","rating","crawl_id","date","user_image_url","user_name","user_location"],keep=False)]

In [None]:
# Drop repeated reviews within a crawl, keeping the first occurrence, and
# report how many rows were removed.
dedupe_columns = [
    "content",
    "rating",
    "crawl_id",
    "date",
    "user_image_url",
    "user_name",
    "user_location",
]
reviews_before_dedupe = len(df)
df = df.drop_duplicates(subset=dedupe_columns, keep="first")
removed_reviews = reviews_before_dedupe - len(df)
print(f"Removed {removed_reviews:,} duplicate reviews, {removed_reviews/reviews_before_dedupe:%}")

In [None]:
def get_stripped_text(text):
    """Return ``text`` with all ``link_re`` matches, then all ``nonalpha_re`` matches, removed.

    NOTE(review): ``link_re`` and ``nonalpha_re`` are not defined in the visible
    part of this notebook. Presumably they are compiled regexes for links and
    non-alphabetic characters; confirm they are in scope before calling this.
    """
    text = link_re.sub("",text)
    return nonalpha_re.sub("",text)

def hash_review_stripped(row):
    """Return the raw SHA-1 digest of ``row.content`` after ``get_stripped_text``."""
    normalized = get_stripped_text(row.content)
    return hashlib.sha1(normalized.encode()).digest()

def hash_review(row):
    """Return the raw SHA-1 digest of the business ID followed by the review text.

    ``row`` must expose ``content`` and ``business_id`` string attributes.
    """
    review_text = row.content
    business = row.business_id
    digest = hashlib.sha1()
    # Feed the business ID first, then the review body.
    for part in (business, review_text):
        digest.update(part.encode())
    return digest.digest()

# Add a per-row SHA-1 digest of business ID + review text (with a tqdm progress bar).
df["content_hash"] = df.progress_apply(hash_review,axis=1)

In [None]:
# Inspect the column dtypes of the review frame.
df.dtypes

In [None]:
# Persist the de-duplicated, hashed review frame to constants.LONG_DATA_FILE.
df.to_pickle(constants.LONG_DATA_FILE)

# Now do the businesses

In [None]:
def flatten_json(v):
    """Flatten nested JSON-like data into a single-level dict with dotted keys.

    Scalars (``str``, ``int``, ``float``, ``bool``, ``None``) are returned
    unchanged. A list becomes a dict keyed by element index (as a string), a
    dict keeps its own keys, and any nested container contributes
    ``"<outer>.<inner>"`` keys instead of a nested value. Empty nested
    containers therefore contribute no keys at all.

    Args:
        v: A scalar, list or dict (arbitrarily nested).

    Returns:
        The scalar itself, or a flat dict for list/dict input.

    Raises:
        TypeError: if a value is neither a supported scalar, a list nor a dict.
    """
    # Identity check for None and isinstance for the scalar types, rather than
    # "== None" and exact-type comparisons.
    if v is None or isinstance(v, (str, int, float, bool)):
        return v

    if isinstance(v, list):
        # List positions become string keys: "0", "1", ...
        items = ((str(idx), item) for idx, item in enumerate(v))
    elif isinstance(v, dict):
        items = v.items()
    else:
        raise TypeError(v, type(v))

    flat = {}
    for key, value in items:
        flattened = flatten_json(value)
        if isinstance(flattened, dict):
            # Hoist the nested keys, prefixed with the parent key.
            for sub_key, sub_value in flattened.items():
                flat["%s.%s" % (key, sub_key)] = sub_value
        else:
            flat[key] = flattened
    return flat
            

def get_businesses(zipcodes):
    """Yield one flattened business record per business listed for ``zipcodes``.

    For each zipcode, ``{constants.BUSINESSES_DIR}/{zipcode}.json`` is loaded as
    a list of business dicts. The ``hours`` and ``special_hours`` entries are
    dropped, ``experiment`` and ``stratum`` are added, and the record is passed
    through ``flatten_json``.

    Zipcodes whose file cannot be read or parsed are reported with
    "Empty zipcode" and skipped. The strata lookup tries the extended mapping
    first and falls back to the Chicago assignment.

    Args:
        zipcodes: Iterable of zipcodes used to build the file names.

    Yields:
        dict: flattened business record.
    """
    for zipcode in zipcodes:
        try:
            with open(f"{constants.BUSINESSES_DIR}/{zipcode}.json") as f:
                businesses = json.load(f)
        except (OSError, ValueError):
            # Missing/unreadable file, or invalid JSON (JSONDecodeError is a
            # ValueError). A bare except would also swallow KeyboardInterrupt.
            print(f"Empty zipcode: {zipcode}")
            continue

        try:
            experiment, stratum = zc_to_experiment_strata_extended_f(zipcode)
        except (LookupError, TypeError, ValueError):
            # Zipcode not usable in the extended mapping: use the Chicago assignment.
            experiment, stratum = zc_to_experiment_strata_chicago_f(zipcode)

        for business in businesses:
            business.pop("special_hours", None)
            business.pop("hours", None)

            business["experiment"] = experiment
            business["stratum"] = stratum

            # Flatten inside the try but yield outside it, so closing the
            # generator (GeneratorExit raised at the yield) does not trigger
            # the debugging display below.
            try:
                flattened = flatten_json(business)
            except Exception:
                display(business)
                raise
            yield flattened

In [None]:
# Materialize the flattened business records for every zipcode in constants.ZIPCODES.
business_df = pd.DataFrame(get_businesses(constants.ZIPCODES))

In [None]:
# Display the business frame for a visual sanity check.
business_df

In [None]:
# Persist the business frame to constants.BUSINESS_DATA_FILE.
business_df.to_pickle(constants.BUSINESS_DATA_FILE)

# Crawled business data

In [None]:
# Load the optional replacement records, keyed by business ID (see how
# get_crawled_businesses indexes mask_data). Falls back to an empty mapping.
try:
    # now that we have the missing data fixed...
    with open(constants.REPLACEMENT_MASK_DATA, "r") as fp:
        mask_data = json.load(fp)
except (OSError, ValueError):
    # Missing/unreadable file, or invalid JSON (JSONDecodeError is a ValueError).
    # Narrowed from a bare except, which would also swallow KeyboardInterrupt.
    print("No replacement data")
    mask_data = {}

In [None]:
def get_crawled_businesses(zipcodes):
    """Yield one flattened record per (crawl, business) in the crawled business files.

    For every crawl in the notebook-level ``crawl_ids`` whose
    ``constants.BUSINESS_DATA_DIR % crawl_id`` directory exists, reads
    ``{zipcode}.json`` (a mapping of business ID to business data) for each
    zipcode. The amenity list is expanded into ``ammenity_<alias>`` entries and
    counted in ``num_ammenities``; the record is tagged with experiment,
    stratum, crawl and business identifiers and passed through ``flatten_json``.

    A business whose amenity list is missing or empty is replaced by a copy of
    its entry in the notebook-level ``mask_data`` when one exists. Records that
    still have no amenity list get ``needs_manual_invervention = True`` (key
    spelled as in the existing data).

    Args:
        zipcodes: Iterable of zipcodes used to build the file names.

    Yields:
        dict: flattened business record.
    """
    for crawl_id in crawl_ids:
        crawl_dir = constants.BUSINESS_DATA_DIR % crawl_id
        if not os.path.exists(crawl_dir):
            print(f"Empty crawl {crawl_id}")
            continue
        for zipcode in zipcodes:
            try:
                with open(f"{crawl_dir}/{zipcode}.json") as f:
                    businesses = json.load(f)
            except (OSError, ValueError):
                # Missing/unreadable file, or invalid JSON (JSONDecodeError is
                # a ValueError). A bare except would also swallow KeyboardInterrupt.
                print(f"Empty zipcode: {zipcode}")
                continue

            try:
                experiment, stratum = zc_to_experiment_strata_extended_f(zipcode)
            except (LookupError, TypeError, ValueError):
                # Zipcode not usable in the extended mapping: use the Chicago assignment.
                experiment, stratum = zc_to_experiment_strata_chicago_f(zipcode)

            for business_id, business_data in businesses.items():

                if "amenities" in business_data:
                    business_data["ammenities"] = business_data["amenities"] #we corrected a spelling error in data release. This un-corrects it

                # Try to recover from the replacement data. Work on a copy: the
                # record is modified below (keys added, "ammenities" deleted),
                # and the same mask_data entry may be needed again for another
                # crawl. Mutating it in place would leave it without an amenity
                # list and wrongly flag the later record.
                if ("ammenities" not in business_data or len(business_data["ammenities"]) == 0) and business_id in mask_data:
                    business_data = dict(mask_data[business_id])

                # Flag
                if "ammenities" not in business_data:
                    business_data["ammenities"] = []
                    business_data["needs_manual_invervention"] = True

                business_data["num_ammenities"] = len(business_data["ammenities"])

                # One column per amenity alias, holding its isActive value.
                for ammenity in business_data["ammenities"]:
                    business_data[f"ammenity_{ammenity['alias']}"] = ammenity["isActive"]

                del business_data["ammenities"]

                business_data["experiment"] = experiment
                business_data["stratum"] = stratum
                business_data["Stratum"] = constants.STRATA_COMMON_NAMES[experiment][stratum]
                business_data["crawl_id"] = crawl_id
                business_data["business_id"] = business_id

                # Flatten inside the try but yield outside it, so closing the
                # generator (GeneratorExit raised at the yield) does not
                # trigger the debugging display below.
                try:
                    flattened = flatten_json(business_data)
                except Exception:
                    display(business_data)
                    raise
                yield flattened

In [None]:
# Materialize the per-crawl business records for every zipcode in constants.ZIPCODES.
crawled_business_df = pd.DataFrame(get_crawled_businesses(constants.ZIPCODES))

In [None]:
# Display the crawled-business frame for a visual sanity check.
crawled_business_df

In [None]:
# Index by (crawl_id, business_id); drop=False keeps both as regular columns too.
crawled_business_df = crawled_business_df.set_index(["crawl_id","business_id"],drop=False)

In [None]:
# Persist the indexed crawled-business frame to constants.CRAWLED_BUSINESS_DATA_FILE.
crawled_business_df.to_pickle(constants.CRAWLED_BUSINESS_DATA_FILE)

In [None]:
# List the column names of the crawled-business frame.
list(crawled_business_df)

In [None]:
# Distribution of the "customers must wear masks" amenity values across records.
crawled_business_df["ammenity_customers_must_wear_masks"].value_counts()

In [None]:
# Treat a missing manual-intervention flag as False.
# NOTE(review): get_crawled_businesses writes its flag under the key
# "needs_manual_invervention", not "ammenities_need_manual_intervention". This
# column therefore has to come from the crawled/replacement JSON itself;
# confirm it exists, otherwise this lookup raises KeyError.
crawled_business_df["ammenities_need_manual_intervention"] = crawled_business_df["ammenities_need_manual_intervention"].fillna(False)

In [None]:
# Distribution of the number of amenities per crawled business record.
crawled_business_df["num_ammenities"].value_counts()

In [None]:
# Business IDs of the records flagged as needing manual intervention.
crawled_business_df[crawled_business_df.ammenities_need_manual_intervention].business_id

In [None]:
# Pair every business whose crawled record has no amenities with its listing
# URL (query string removed). The Series of (id, url) tuples is built
# explicitly rather than with DataFrame.apply(axis=1): on an empty selection
# apply returns an empty DataFrame, and iterating that yields column names
# instead of (id, url) pairs.
_missing_ids = crawled_business_df.loc[crawled_business_df["num_ammenities"] == 0, "business_id"]
_missing_rows = business_df.loc[business_df["id"].isin(_missing_ids)]
missing_business_urls = pd.Series(
    [(bid, url.split("?")[0]) for bid, url in zip(_missing_rows["id"], _missing_rows["url"])],
    index=_missing_rows.index,
    dtype=object,
)

In [None]:
# Write a {business_id: url} mapping of the businesses without amenity data to
# constants.MISSING_MASK_DATA.
with open(constants.MISSING_MASK_DATA,"w+") as fp:
    json.dump({bid: url for bid, url in missing_business_urls}, fp)

In [None]:
# Display the replacement data loaded earlier.
mask_data

In [None]:
# Drop the references to the large objects built above.
# NOTE(review): the "Duplicates" section further down still reads `df`; it needs
# a non-None `df` to run.
crawled_business_df = None
missing_business_urls = None
business_df = None
mask_data = None
df = None

In [None]:
# Typically this is the next step in processing: execute the Reclassification notebook.
%run ./Reclassification.ipynb

In [None]:
# Then execute the Authorship notebook.
%run ./Authorship.ipynb

# Duplicates

In [None]:
# NOTE(review): `random` is not used anywhere in the visible notebook.
import random
# Group the duplicated reviews by the same fields that define a duplicate.
gby = duplicates.groupby(["content","rating","crawl_id","date","user_image_url","user_name","user_location"])

In [None]:
# Show the first duplicate group whose copies disagree on the "flagged" value.
for name, group in gby:
    flagged_values = group.flagged.unique()
    if len(flagged_values) != 1:
        display(name)
        display(group)
        break

In [None]:
# `df` is set to None in the clean-up cell before the downstream notebooks are
# run, so reload the persisted review frame when it is gone. Without this, the
# section fails under Restart & Run All.
# NOTE(review): assumes constants.LONG_DATA_FILE still holds the frame pickled
# earlier in this notebook; confirm the %run notebooks do not overwrite it.
if df is None:
    df = pd.read_pickle(constants.LONG_DATA_FILE)

# Mean of "flagged" per crawl.
sns.barplot(x="crawl_id", y="flagged", data=df)

In [None]:
# Rows of df that share all listed fields with another row.
# NOTE(review): df was already de-duplicated on exactly these columns by the
# drop_duplicates cell near the top, so this is expected to be empty if df is
# still that frame.
df[df.duplicated(["content","rating","crawl_id","date","user_image_url","user_name","user_location"],keep=False)]

In [None]:
# Number of duplicated rows per crawl.
duplicates.crawl_id.value_counts()

In [None]:
# De-duplicate again, this time also requiring the "flagged" value to match and
# keeping the last occurrence; report how many rows that removes.
flag_aware_columns = [
    "content",
    "rating",
    "crawl_id",
    "date",
    "user_image_url",
    "user_name",
    "user_location",
    "flagged",
]
reviews_before_dedupe = len(df)
df_deduped = df.drop_duplicates(subset=flag_aware_columns, keep="last")
removed_flag_aware = reviews_before_dedupe - len(df_deduped)
print(f"Removed {removed_flag_aware:,} duplicate reviews, {removed_flag_aware/reviews_before_dedupe:%}")

In [None]:
# Rows that still collide when "flagged" is ignored, i.e. copies of one review
# that differ only in their flagged value.
duplicates = df_deduped[df_deduped.duplicated(["content","rating","crawl_id","date","user_image_url","user_name","user_location"],keep=False)]

In [None]:
# Number of remaining conflicting duplicates per crawl.
duplicates.crawl_id.value_counts()

In [None]:
import os
import constants
import pandas as pd
import json
import glob
import itertools
import numpy as np
import hashlib
import pickle
