In [None]:
import json
import os
import re
import traceback

import math
import scipy
import scipy.stats
import numpy as np
import random

import pyzipcode
import hashlib

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown
from IPython.display import HTML
from tqdm import tqdm

import sqlite3
import pandas as pd
import nltk
import statsmodels.stats.multitest as multitest

import itertools
import collections
import functools
import collections


In [None]:
# Load three pickled objects from ../data/pickles; the cells below use them as DataFrames.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so these
# files must come from a trusted source.
yc_reviews = pd.read_pickle("../data/pickles/yelpchi_reviews.pkl")
yc_updated_reviews = pd.read_pickle("../data/pickles/yelpchi_updated_reviews.pkl")
yc_businesses = pd.read_pickle("../data/pickles/yelpchi_businesses_with_chain.pkl")

In [None]:
def flatten_json(v):
    """Flatten a decoded JSON value into a single-level dict with dotted keys.

    Scalars (``str``, ``int``, ``float``, ``bool``, ``None``) are returned
    unchanged. Dicts and lists are flattened recursively: a nested key path is
    joined with ``"."`` and list positions are rendered as decimal strings, so
    ``{"a": [{"b": 1}]}`` becomes ``{"a.0.b": 1}``. An empty dict or list nested
    inside a container contributes no keys at all.

    Args:
        v: a scalar, list or dict (subclasses such as ``OrderedDict`` are accepted).

    Returns:
        ``v`` itself for scalars, otherwise a new flat ``dict``.

    Raises:
        TypeError: if ``v`` (or any nested value) is of an unsupported type.
    """
    if v is None or isinstance(v, (str, int, float, bool)):
        return v

    if isinstance(v, dict):
        items = v.items()
    elif isinstance(v, list):
        # List positions become string keys ("0", "1", ...).
        items = (("%d" % idx, item) for idx, item in enumerate(v))
    else:
        raise TypeError("cannot flatten %r of type %s" % (v, type(v).__name__))

    flat = {}
    for key, value in items:
        flattened = flatten_json(value)
        if isinstance(flattened, dict):
            # Hoist the child's keys into this level, prefixed with the parent key.
            for sub_key, sub_value in flattened.items():
                flat["%s.%s" % (key, sub_key)] = sub_value
        else:
            flat[key] = flattened
    return flat
            

# Filled by get_chicago_businesses(): zip-code key -> list of
# (name, id, index, own zip code) for businesses listed under a different zip code.
mismatches = None


def get_chicago_businesses(path="../data/ranking_zip_map.json"):
    """Yield one flattened record per business in the zip-code ranking map.

    The JSON file maps each zip code to a list of business dicts. Every
    business is flattened with ``flatten_json`` after its ``hours`` and
    ``special_hours`` entries are removed. As a side effect the module-level
    ``mismatches`` mapping is reset and then filled with the businesses whose
    ``location.zip_code`` differs from the zip code they are listed under.

    Args:
        path: location of the ranking map JSON file.

    Yields:
        dict: the flattened business record.

    Raises:
        Exception: whatever ``flatten_json`` raises; the offending business is
            displayed before the error is re-raised.
    """
    global mismatches
    mismatches = collections.defaultdict(list)

    with open(path, encoding="utf-8") as f:
        zipmap = json.load(f)

    for zipcode, businesses in zipmap.items():
        print(zipcode)
        for business in businesses:
            listed_zip = business["location"]["zip_code"]
            if listed_zip != zipcode:
                index = None
                try:
                    # presumably the business's rank within this zip code -- TODO confirm
                    index = business[f"index_{zipcode}"]
                except KeyError:
                    print("Couldn't get index")
                mismatches[zipcode].append((business["name"], business["id"], index, listed_zip))

            # These two entries are dropped before flattening.
            business.pop("special_hours", None)
            business.pop("hours", None)

            # Flatten outside the yield so that closing the generator (GeneratorExit)
            # or interrupting the consumer is not mistaken for a flattening failure.
            try:
                flattened = flatten_json(business)
            except Exception:
                display(business)
                raise
            yield flattened

In [None]:
# Materialise every flattened record yielded by get_chicago_businesses() into one frame.
# Consuming the generator also resets and fills the module-level `mismatches` mapping.
chicago_businesses = pd.DataFrame.from_records(get_chicago_businesses())

In [None]:
# Show the businesses whose own zip code differs from the zip-code key they were listed under.
mismatches

In [None]:
# Business ids known to each data set, followed by (overlap, YelpChi total, ranking total).
yc_bids = set(yc_businesses.index.values)
chicago_bids = set(chicago_businesses["id"])
(len(yc_bids.intersection(chicago_bids)), len(yc_bids), len(chicago_bids))

In [None]:
# Open/closed counts for YelpChi businesses that are in the ranking data, that are
# missing from it, and overall.
# A boolean mask replaces `.loc[<set>]`: pandas 2 no longer accepts a set as an indexer.
in_ranking = yc_businesses.index.isin(chicago_bids)
print(yc_businesses[in_ranking].groupby("is_closed").size())
print(yc_businesses[~in_ranking].groupby("is_closed").size())
print(yc_businesses.groupby("is_closed").size())

In [None]:
# YelpChi businesses that do not appear in the ranking data.
# A boolean mask replaces `.loc[<set>]`: pandas 2 no longer accepts a set as an indexer.
yc_businesses_no_rank = yc_businesses[~yc_businesses.index.isin(chicago_bids)]
print(yc_businesses_no_rank.groupby("rating").size())

# Review-count distributions: all unranked businesses vs. the highly rated subsets.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; it is kept here so
# the cell still runs on the seaborn version this notebook was written against.
sns.distplot(yc_businesses_no_rank.review_count, label="All")
sns.distplot(yc_businesses_no_rank[yc_businesses_no_rank.rating >= 4].review_count, label="Rating >= 4")
sns.distplot(yc_businesses_no_rank[yc_businesses_no_rank.rating > 4].review_count, label="Rating > 4")
plt.legend()

In [None]:
# Zip codes that occur in exactly one of the two business tables.
chicago_zips = set(chicago_businesses["location.zip_code"])
yc_zips = set(yc_businesses["location.zip_code"])
chicago_zips.symmetric_difference(yc_zips)

In [None]:
# Keep only the first row for each business id.
chicago_businesses = chicago_businesses.drop_duplicates(subset="id")

In [None]:
# Histogram of ratings with bin edges at 0.75, 1.25, ..., 5.25
# (presumably so that each half-star value sits in the middle of its own bin).
bins = np.arange(0.75, 5.5, 0.5)
sns.distplot(chicago_businesses["rating"], bins=bins, kde=False)

In [None]:
# Scan the Yelp academic review file (one JSON object per line) and count how many
# review lines refer to a business id from the ranking data.
chicago_business_ids = set(chicago_businesses["id"])
matches = 0  # number of review lines (not distinct businesses) that hit a ranked id
all_businesses = set()  # every business id seen in the review file

# The file is opened explicitly as UTF-8 so the scan does not depend on the platform's
# default encoding.
with open("../data/yelp_academic_dataset/yelp_academic_dataset_review.json", encoding="utf-8") as f:
    for line in f:
        review = json.loads(line)
        bid = review["business_id"]
        all_businesses.add(bid)
        if bid in chicago_business_ids:
            matches += 1

# Guard the ratio so an empty id set reports NaN instead of raising ZeroDivisionError.
match_ratio = matches / len(chicago_business_ids) if chicago_business_ids else float("nan")
print(matches, match_ratio, len(chicago_business_ids), len(all_businesses))

In [None]:
# Peek at one arbitrary id from the ranking data to see what its ids look like.
next(iter(chicago_business_ids))

In [None]:
# Peek at one arbitrary id from the review file for comparison.
next(iter(all_businesses))

In [None]:
# Inspect the `review` dict left over from the last line parsed by the review-file loop.
review

In [None]:
# Centroid of the ranked businesses as a (mean latitude, mean longitude) tuple.
c_coords = (
    chicago_businesses["coordinates.latitude"].mean(),
    chicago_businesses["coordinates.longitude"].mean(),
)

In [None]:
# Find the business in the Yelp academic data set that lies closest to `c_coords`.
# The distance is the Euclidean norm of the raw latitude/longitude difference (in
# degrees), not a geodesic distance.
shortest_dist = float("inf")  # an unbounded sentinel instead of a magic number
sd_bus = None

# The file is opened explicitly as UTF-8 so the scan does not depend on the platform's
# default encoding.
with open("../data/yelp_academic_dataset/yelp_academic_dataset_business.json", encoding="utf-8") as f:
    for line in f:
        business = json.loads(line)
        if business["latitude"] is None or business["longitude"] is None:
            # A null coordinate cannot be subtracted from the centroid; skip the record.
            continue
        coords = np.array((business["latitude"], business["longitude"]))
        dist = np.linalg.norm(c_coords - coords)
        if dist < shortest_dist:
            sd_bus = business
            shortest_dist = dist
            

In [None]:
# Smallest distance found by the scan above (Euclidean norm of the lat/lon difference).
shortest_dist

In [None]:
# The business record that produced `shortest_dist`.
sd_bus

In [None]:
# Distance of every ranked business from the centroid `c_coords`, collected in row
# order, together with the smallest and largest distance seen.
# Note that `sd_bus` is re-bound here to the closest row of `chicago_businesses`.
shortest_dist = 1000000
longest_dist = 0
distances = []

for bid, business in chicago_businesses.iterrows():
    coords = np.array((business["coordinates.latitude"], business["coordinates.longitude"]))
    dist = np.linalg.norm(c_coords - coords)
    distances.append(dist)

    if dist < shortest_dist:
        shortest_dist, sd_bus = dist, business
    if dist > longest_dist:
        longest_dist = dist

(shortest_dist, longest_dist)

In [None]:
# `distances` holds one value per row in iteration order, so this positional assignment
# lines up with the frame's rows. This mutates `chicago_businesses` in place.
chicago_businesses["distance"] = distances

In [None]:
# Businesses ordered from closest to farthest from the centroid.
chicago_businesses.sort_values(by="distance")

In [None]:
# Tally how many reviews in the Yelp academic data set carry each star value.
ratings = collections.defaultdict(int)

# The file is opened explicitly as UTF-8 so the scan does not depend on the platform's
# default encoding.
with open("../data/yelp_academic_dataset/yelp_academic_dataset_review.json", encoding="utf-8") as f:
    for line in f:
        review = json.loads(line)
        ratings[review["stars"]] += 1

In [None]:
# Star value -> number of reviews, as tallied from the review file.
ratings

In [None]:
# Split the star -> count mapping into two parallel tuples ordered by star value:
# `x` holds the star values and `y` the matching review counts.
x, y = zip(*sorted(ratings.items(), key=lambda pair: pair[0]))
(x, y)
#sns.barplot(x=x,y=y)

In [None]:
# Latest review date in the original YelpChi reviews. The raw dates are strings that may
# carry surrounding whitespace and an "Updated - " prefix, both removed before parsing.
# pd.to_datetime replaces `.astype("datetime64")`: pandas 2 rejects that unit-less dtype,
# and the vectorised `.str` methods tolerate missing values where the element-wise
# `apply` would raise.
yc_dates = pd.to_datetime(
    yc_reviews.date.str.strip().str.replace(r"^Updated - ", "", regex=True)
)
max_yc = yc_dates.max()

# Reviews in the updated crawl that are newer than anything in the original data set.
# NOTE(review): assumes yc_updated_reviews.date is already datetime-typed -- confirm.
yc_new = yc_updated_reviews[yc_updated_reviews.date > max_yc]

In [None]:
# Review counts per rating for each review set, split by the `flagged` value
# ("N", "Y") and unsplit ("_all"), plus the academic-data-set counts.
# The first column assigned fixes the index; later columns are aligned to it.
rating_sources = (
    ("yc_updated", yc_updated_reviews),
    ("yc", yc_reviews),
    ("yc_new", yc_new),
)

df_ratings = pd.DataFrame()
for suffix, flag in (("", "N"), ("_filtered", "Y")):
    for source_name, source in rating_sources:
        df_ratings[source_name + suffix] = source[source.flagged == flag].groupby("rating").size()
for source_name, source in rating_sources:
    df_ratings[source_name + "_all"] = source.groupby("rating").size()

# `y` holds the academic-data-set counts ordered by star value; it is assigned
# positionally, so it must have exactly one entry per row of df_ratings.
df_ratings["yad"] = y

In [None]:
# Reshape df_ratings into long form for plotting: one row per (rating, column) with
# the column's counts normalised to fractions.
# The counts go in a fresh local name rather than `y`, which the df_ratings cell reads;
# overwriting it made that cell fail or produce wrong data when re-run.
plottable_parts = []
for column in df_ratings.columns:
    counts = np.asarray(df_ratings[column], dtype=float)
    plottable_parts.append(
        pd.DataFrame(
            {
                "x": df_ratings.index.values,
                # nansum: a rating missing from this column (NaN) should not turn
                # every fraction of the column into NaN.
                "y": counts / np.nansum(counts),
                "hue": column,
            }
        )
    )

# Concatenate once instead of growing the frame inside the loop.
df_plottable = pd.concat(plottable_parts) if plottable_parts else pd.DataFrame()

In [None]:
# Grouped bars: fraction of reviews per rating, one bar colour per review set.
sns.barplot(data=df_plottable, x="x", y="y", hue="hue")

In [None]:
# Two-sample KS test: original YelpChi ratings vs. ratings of the newer reviews.
scipy.stats.ks_2samp(yc_reviews["rating"], yc_new["rating"])

In [None]:
# Sanity check: KS test between two random 1000-review samples drawn from the same set.
# Fixed (and different) seeds make the result reproducible across kernel restarts.
sample_a = yc_updated_reviews.rating.sample(1000, random_state=0)
sample_b = yc_updated_reviews.rating.sample(1000, random_state=1)
scipy.stats.ks_2samp(sample_a, sample_b)

In [None]:
# Two-sample KS test: ratings of the updated crawl vs. the original YelpChi ratings.
scipy.stats.ks_2samp(yc_updated_reviews["rating"], yc_reviews["rating"])

In [None]:
# Two-sample KS test on the per-rating count columns.
# NOTE(review): this compares the handful of count values in each column, not the
# individual ratings -- confirm that is the intended comparison.
scipy.stats.ks_2samp(df_ratings["yc"], df_ratings["yc_updated"])

In [None]:
# Column labels available on the ranked-business frame.
chicago_businesses.columns.tolist()