In [None]:
import json
import os
import re
import traceback

import math
import scipy
import scipy.stats

import pyzipcode
import hashlib

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown
from IPython.display import HTML
from tqdm import tqdm
#import fuzzywuzzy
#import fuzzywuzzy.fuzz

import sqlite3
import pandas as pd
import nltk

import itertools
import collections
import functools
import collections
import random


In [None]:
# We need to be able to hash data: regexes used by the hashing helpers.

# Matches a single ASCII letter.
alpha_re = re.compile(r'[a-zA-Z]')
# Matches a run of one or more characters that are not ASCII letters.
nonalpha_re = re.compile(r'[^a-zA-Z]+')
#https://urlregex.com/
# NOTE(review): inside the character class `[$-_@.&+]`, `$-_` is a range
# (from `$` through `_`), not three literal characters — confirm intended.
link_re = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

def get_bag_of_words(review):
    """Count the lowercased word tokens of ``review`` that contain a letter.

    The text is split into sentences and then into word tokens with NLTK.
    Tokens without any ASCII letter (as decided by ``alpha_re``) are dropped.

    Returns:
        collections.Counter mapping each kept token to its number of
        occurrences.
    """
    bag = collections.Counter()
    for sentence in nltk.tokenize.sent_tokenize(review):
        for word in nltk.tokenize.word_tokenize(sentence):
            word = word.lower()
            # Keep only tokens with at least one ASCII letter.
            if alpha_re.search(word):
                bag[word] += 1
    return bag

def compare_reviews(review1, review2):
    """Return a 0-100 similarity score between two review strings.

    ``fuzzywuzzy.fuzz.ratio`` is used when the package is installed. The
    notebook's top-level ``import fuzzywuzzy`` lines are commented out, so
    referencing the module directly raised ``NameError``; it is therefore
    imported lazily here. When fuzzywuzzy is unavailable, a standard-library
    ``difflib`` approximation of the same score is returned instead.

    Args:
        review1: first review text.
        review2: second review text.

    Returns:
        int in [0, 100]; 100 means the strings are identical.
    """
    # Earlier bag-of-words variant, kept for reference:
#     bag1 = get_bag_of_words(review1)
#     bag2 = get_bag_of_words(review2)
#     all_words = set(itertools.chain(bag1.keys(),bag2.keys()))
#     mismatches = 0
#     total = 0
#     for word in all_words:
#         mismatches += abs(bag1[word] - bag2[word])
#         total += bag1[word] + bag2[word]
#     return 1-mismatches/total
    try:
        import fuzzywuzzy.fuzz
    except ImportError:
        import difflib

        # Fallback approximating fuzz.ratio: None/empty inputs score 0,
        # identical inputs score 100, otherwise a scaled SequenceMatcher ratio.
        if review1 is None or review2 is None:
            return 0
        if review1 == review2:
            return 100
        if len(review1) == 0 or len(review2) == 0:
            return 0
        matcher = difflib.SequenceMatcher(None, review1, review2)
        return int(round(100 * matcher.ratio()))
    return fuzzywuzzy.fuzz.ratio(review1, review2)

def hash_review_bow(review):
    """Return the SHA-1 digest (bytes) of the review's bag of words.

    The bag is serialised as its ``(token, count)`` pairs sorted by token, so
    the digest depends only on which tokens occur and how often. Hashing
    ``str(Counter)`` instead lists equal-count tokens in first-seen order,
    which makes the digest depend on word order.

    NOTE: digests differ from those produced by hashing ``str(bag)``; any
    previously stored hashes must be regenerated with this function.
    """
    bag = get_bag_of_words(review)
    canonical = repr(sorted(bag.items()))
    return hashlib.sha1(canonical.encode()).digest()


def get_stripped_text(text):
    """Return ``text`` with links removed and every non-letter run deleted.

    Links are matched by ``link_re``; afterwards everything that is not an
    ASCII letter (per ``nonalpha_re``) is dropped. Case is preserved.
    """
    without_links = link_re.sub("", text)
    letters_only = nonalpha_re.sub("", without_links)
    return letters_only

def hash_review_stripped(review):
    """Return the SHA-1 digest (bytes) of the review's stripped text.

    The review is first reduced with ``get_stripped_text`` and the result is
    encoded with the default ``str.encode()`` encoding before hashing.
    """
    payload = get_stripped_text(review).encode()
    return hashlib.sha1(payload).digest()

# Load Data

In [None]:
def flatten_json(v):
    """Flatten a decoded JSON value into a single-level dict.

    Scalars (``str``, ``int``, ``float``, ``bool``, ``None``) are returned
    unchanged. Lists and dicts are flattened recursively into one dict whose
    keys join the path with dots; list positions become their decimal index,
    e.g. ``{"a": {"b": 1}, "c": [1, {"d": 2}]}`` becomes
    ``{"a.b": 1, "c.0": 1, "c.1.d": 2}``.

    Empty lists and dicts contribute no keys, so they vanish from the result.

    Args:
        v: value produced by ``json.load`` / ``json.loads``.

    Returns:
        The scalar itself, or a flat ``dict`` for list/dict input.

    Raises:
        TypeError: if ``v`` (or a nested value) is not one of the types above.
    """
    if v is None or type(v) in (str, int, float, bool):
        return v

    if type(v) is list:
        # List positions are used as (string) path components.
        entries = ((str(idx), item) for idx, item in enumerate(v))
    elif type(v) is dict:
        entries = v.items()
    else:
        raise TypeError(
            "cannot flatten value %r of type %s" % (v, type(v).__name__)
        )

    flat = {}
    for key, item in entries:
        item = flatten_json(item)
        if type(item) is dict:
            # Nested container: prefix its keys with the current path part.
            for sub_key, sub_value in item.items():
                flat["%s.%s" % (key, sub_key)] = sub_value
        else:
            flat[key] = item
    return flat
            

def get_yelpchi_businesses(path="../data/eyg_data/businessid_to_data.json"):
    """Yield one flattened record per business stored in the JSON file.

    The file maps a business id to that business's data. For each entry the
    ``special_hours`` and ``hours`` fields are dropped, the id is stored
    under ``businessID_alternate``, and the entry is flattened with
    ``flatten_json``.

    Args:
        path: location of the business-id JSON file.

    Yields:
        dict: the flattened record for one business.

    Raises:
        Whatever ``flatten_json`` raises; the offending record is displayed
        first to ease debugging.
    """
    with open(path) as f:
        businessid_map = json.load(f)

    for bid, business in businessid_map.items():
        business.pop("special_hours", None)
        business.pop("hours", None)
        business["businessID_alternate"] = bid
        try:
            record = flatten_json(business)
        except Exception:
            # Show the record that could not be flattened, then propagate.
            display(business)
            raise
        # Yield outside the try so that closing the generator early
        # (GeneratorExit) is not mistaken for a flattening failure.
        yield record

In [None]:
# One row per business, built from the flattened records of the JSON file.
ycu_businesses = pd.DataFrame.from_records(get_yelpchi_businesses())

## Load Yelp CHI

In [None]:
# Load restaurant reviews flagged 'Y' or 'N', plus the reviewers and
# restaurants referenced by those reviews.
# sqlite3's connection context manager only commits/rolls back; it does not
# close the connection, so the connection is closed explicitly here.
conn = sqlite3.connect("../data/eyg_data/yelpResData.db")
try:
    # Decode text leniently: undecodable bytes are dropped instead of raising.
    conn.text_factory = lambda b: b.decode(errors = 'ignore')
    yc_r_reviews = pd.read_sql_query("SELECT * FROM review WHERE flagged = 'Y' OR flagged = 'N';", conn)
    yc_r_reviewers = pd.read_sql_query("SELECT * FROM reviewer WHERE reviewerID IN (SELECT DISTINCT(reviewerID) FROM review WHERE flagged == 'Y' OR flagged == 'N');", conn)
    yc_r_data = pd.read_sql_query("SELECT * FROM restaurant WHERE restaurantID IN (SELECT DISTINCT(restaurantID) FROM review WHERE flagged == 'Y' OR flagged == 'N');", conn)
finally:
    conn.close()

In [None]:
# Load hotel reviews flagged 'Y' or 'N', plus the reviewers and hotels
# referenced by those reviews.
# sqlite3's connection context manager only commits/rolls back; it does not
# close the connection, so the connection is closed explicitly here.
conn = sqlite3.connect("../data/eyg_data/yelpHotelData.db")
try:
    # Decode text leniently: undecodable bytes are dropped instead of raising.
    conn.text_factory = lambda b: b.decode(errors = 'ignore')
    yc_h_reviews = pd.read_sql_query("SELECT * FROM review WHERE flagged = 'Y' OR flagged = 'N';", conn)
    yc_h_reviewers = pd.read_sql_query("SELECT * FROM reviewer WHERE reviewerID IN (SELECT DISTINCT(reviewerID) FROM review WHERE flagged == 'Y' OR flagged == 'N');", conn)
    yc_h_data = pd.read_sql_query("SELECT * FROM hotel WHERE hotelID IN (SELECT DISTINCT(hotelID) FROM review WHERE flagged == 'Y' OR flagged == 'N');", conn)
finally:
    conn.close()

In [None]:
# Sanity check: number of hotel rows loaded.
len(yc_h_data)

In [None]:
# Stack restaurant and hotel reviewers, keep the first row per reviewerID,
# and index the result by reviewerID.
yc_reviewers = pd.concat([yc_r_reviewers,yc_h_reviewers]).drop_duplicates(subset=["reviewerID"]).set_index("reviewerID")

In [None]:
# Add both content hashes (stripped-text and bag-of-words) to each review table.
for _reviews_frame in (yc_r_reviews, yc_h_reviews):
    _reviews_frame["reviewHashStripped"] = _reviews_frame.reviewContent.apply(hash_review_stripped)
    _reviews_frame["reviewHashBOW"] = _reviews_frame.reviewContent.apply(hash_review_bow)

In [None]:
# Distinct business ids present in each review table.
restaurantIDs = yc_r_reviews.restaurantID.unique()
hotelIDs = yc_h_reviews.hotelID.unique()

# Tag each table with its kind ("r"/"h") and give both a common businessID
# column so they can be stacked into one frame.
yc_r_reviews["type"] = "r"
yc_r_reviews["businessID"] = yc_r_reviews["restaurantID"]
yc_h_reviews["type"] = "h"
yc_h_reviews["businessID"] = yc_h_reviews["hotelID"]

# Restaurants first, then hotels, with a fresh 0..n-1 index.
yc_reviews = pd.concat([yc_r_reviews, yc_h_reviews], ignore_index=True)




In [None]:

# Give both business tables a common id column and a kind tag.
yc_h_data = yc_h_data.rename(columns={"hotelID":"businessID"})
yc_r_data = yc_r_data.rename(columns={"restaurantID":"businessID"})
yc_r_data["type"] = "r"
yc_h_data["type"] = "h"

# Stack restaurants then hotels, index by businessID, and prefix every
# remaining column with "yc_".
yc_business_data = (
    pd.concat([yc_r_data, yc_h_data])
    .set_index("businessID")
    .rename(columns=lambda colname: "yc_%s" % colname)
)

In [None]:
# Inspect the combined restaurant + hotel review table.
yc_reviews

In [None]:
# Inspect the combined business table (columns carry the "yc_" prefix).
yc_business_data

## Load updated data

In [None]:
# Load the raw business-id -> business-data mapping.
with open("../data/eyg_data/businessid_to_data.json") as f:
    business_data = json.load(f)

In [None]:
# One row per business; the mapping's keys (business ids) are discarded here.
# NOTE: this rebinds `business_data` from the loaded JSON mapping to a DataFrame.
business_data = pd.DataFrame.from_records(list(business_data.values()))

In [None]:
# Re-collected reviews, keyed by business id. Further down, entries from
# `yc_n_updated` are labelled flagged="N" and those from `yc_y_updated`
# are labelled flagged="Y".
with open("../data/eyg_data/yelpchi_reviews.json") as f:
    yc_n_updated = json.load(f)
with open("../data/eyg_data/yelpchi_filtered_reviews.json") as f:
    yc_y_updated = json.load(f)


In [None]:
# Spot-check the entry stored for one business id in yelpchi_reviews.json.
yc_n_updated['50gFzdVglOz88eFJ6v-26A']

In [None]:
# Spot-check the same business id in yelpchi_filtered_reviews.json.
yc_y_updated['50gFzdVglOz88eFJ6v-26A']

In [None]:
def fix_reviews(reviews):
    """
    Fixes a formatting bug in some of the reviews

    Some entries are stored as ``[list_of_reviews, int]`` instead of a plain
    list of reviews. For those the inner list is returned; any other input
    (including an empty list) is returned unchanged.

    Raises:
        AssertionError: if the first element is a list but the entry is not
            exactly ``[list, int]``.
    """
    if len(reviews) != 0 and type(reviews[0]) == list:
        # Check the length before indexing so a malformed wrapper fails the
        # assertion rather than raising IndexError on reviews[1].
        assert len(reviews) == 2, "expected [reviews, int], got %d elements" % len(reviews)
        assert type(reviews[1]) == int, "expected int as second element, got %r" % (reviews[1],)
        return reviews[0]
    return reviews

In [None]:
# Spot-check the first element stored for another business id.
yc_n_updated["Btjt4D8dJ-yEdsts3Tj5Hg"][0]

In [None]:
def _annotate_review(review, bid, flagged):
    """Add the business id, flag and the three content hashes to ``review`` in place."""
    review["businessID"] = bid
    review["flagged"] = flagged
    content = review["content"]
    review["reviewHashStripped"] = hash_review_stripped(content)
    review["reviewHashBOW"] = hash_review_bow(content)
    # Same bag-of-words hash, computed with line feeds removed first.
    review["reviewHashBOWNoLF"] = hash_review_bow(content.replace("\n",""))


flat_reviews = []
# Both loops report into one logical progress range covering both mappings.
total = len(yc_n_updated) + len(yc_y_updated)

for bid, reviews in tqdm(yc_n_updated.items(),total=total):
    for review in reviews:
        _annotate_review(review, bid, "N")
        # A date may arrive as a list; keep its first element. Anything that
        # is neither a list nor a str is rejected.
        date = review["date"]
        if type(date) == list:
            review["date"] = date[0]
        elif type(date) != str:
            raise Exception(date)
    flat_reviews.extend(reviews)

for bid, reviews in tqdm(yc_y_updated.items(),initial=len(yc_n_updated),total=total):
    reviews = fix_reviews(reviews)
    for review in reviews:
        _annotate_review(review, bid, "Y")

    #Filter out removed due to ToS violations
    reviews = [
        review
        for review in reviews
        if review["content"] != "This review has been removed for violating our Terms of Service"
    ]

    flat_reviews.extend(reviews)

# One row per review, "N" reviews first, then "Y" reviews.
yc_updated_reviews = pd.DataFrame.from_records(flat_reviews)

In [None]:
# Inspect the flagged == "N" reviews ordered by their (not yet parsed) date values.
yc_updated_reviews[yc_updated_reviews.flagged == "N"].sort_values("date")

In [None]:
# Parse the date column into pandas datetimes.
yc_updated_reviews["date"] = pd.to_datetime(yc_updated_reviews["date"])

In [None]:
# Derive reviewerID from the text following "userid=" in user_page_url;
# non-string values (e.g. missing URLs) become None.
# NOTE(review): a string URL without "userid=" raises IndexError here — confirm
# every URL carries that parameter.
yc_updated_reviews["reviewerID"] = yc_updated_reviews.user_page_url.apply(lambda s: s.split("userid=")[1] if type(s) is str else None)

In [None]:
# Convert the per-user count columns to numeric dtypes.
for _column in ("user_friends", "user_photos", "user_review_count"):
    yc_updated_reviews[_column] = pd.to_numeric(yc_updated_reviews[_column])

### How much data have we collected?

In [None]:
# Distinct business ids in the updated data and in each original table.
_updated_id_set = set(yc_updated_reviews.businessID.unique())
_restaurant_id_set = set(yc_r_reviews.restaurantID.unique())
_hotel_id_set = set(yc_h_reviews.hotelID.unique())

num_businesses_updated = len(_updated_id_set)
num_restaurants = len(_restaurant_id_set)
num_hotels = len(_hotel_id_set)
# How many of the original restaurants / hotels also appear in the updated data.
num_restaurants_updated = len(_updated_id_set & _restaurant_id_set)
num_hotels_updated = len(_updated_id_set & _hotel_id_set)

print(f"""Number of businesses updated: {num_businesses_updated}/{num_hotels+num_restaurants}
Number of restaurants: {num_restaurants_updated}/{num_restaurants}
Number of hotels: {num_hotels_updated}/{num_hotels}
""")

In [None]:
# Total number of updated reviews collected.
len(yc_updated_reviews)

In [None]:
# Dates containing "Updated" carry a 10-character prefix; drop it before parsing.
# pd.to_datetime replaces astype("datetime64"): the unit-less dtype is rejected
# by pandas 2.x, and this matches how the updated reviews' dates are parsed.
# The isinstance guard lets the cell be re-run after the column is already
# datetime, and lets missing values pass through (they become NaT).
yc_reviews["date"] = pd.to_datetime(
    yc_reviews.date.apply(
        lambda s: s[10:] if isinstance(s, str) and "Updated" in s else s
    )
)

# Do more cleanup and save

In [None]:
#Todo multi-index with alternate ID
ycu_businesses = ycu_businesses.set_index("businessID_alternate")

In [None]:
# Ids present in the updated business table but absent from the original one,
# followed by the shapes of both tables.
set(ycu_businesses.index) - set(yc_business_data.index), ycu_businesses.shape, yc_business_data.shape

In [None]:
# Join updated and original business data side by side, aligned on the index.
yc_businesses = pd.concat([ycu_businesses,yc_business_data],axis=1)

In [None]:
# Ensure the output directory exists. exist_ok=True makes an existing directory
# a no-op, while real failures (e.g. permissions) surface as errors instead of
# being reported as "already exists".
os.makedirs("../data/pickles/", exist_ok=True)

In [None]:
# Persist the three frames; file names mark them as not yet matched.
yc_reviews.to_pickle("../data/pickles/yelpchi_reviews_unmatched.pkl")
yc_updated_reviews.to_pickle("../data/pickles/yelpchi_updated_reviews_unmatched.pkl")
yc_businesses.to_pickle("../data/pickles/yelpchi_businesses_unmatched.pkl")
#business_data.to_pickle("../data/pickles/yelpchi_business_data.pkl")

In [None]:
# Run the follow-up notebooks in order.
# NOTE(review): presumably these read the pickles written above — confirm.
%run ./YelpChi_Match_Reviews.ipynb
%run ./YelpChi_Chain_Label.ipynb