In [1]:
import pandas as pd
import re
from itertools import chain,cycle, islice
import itertools
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display, HTML, display_html

In [2]:
# Do not truncate column content and show all columns
pd.set_option('display.max_colwidth', None)  # never elide long cell text
pd.set_option('display.max_columns', None)   # never collapse wide frames into "..."

# Update output cell format to display multiple objects (dataframes)
# https://stackoverflow.com/questions/38783027/jupyter-notebook-display-two-pandas-tables-side-by-side
def display_side_by_side(*args,titles=cycle([''])):
    """Render several DataFrames next to each other in a single output cell.

    Parameters
    ----------
    *args : pandas.DataFrame
        Frames to show, in order. Any object exposing ``to_html()`` works.
    titles : iterable of str, optional
        Heading for each frame, inserted as raw HTML inside an ``<h4>``.
        When there are fewer titles than frames, the remaining frames get a
        ``'</br>'`` placeholder heading. Defaults to empty headings.

    Returns
    -------
    None
        The assembled markup is handed to ``display_html`` with ``raw=True``.
    """
    cells = []
    for df, title in zip(args, chain(titles, cycle(['</br>']))):
        # Restyle only the opening <table> tag (count=1). Replacing every
        # occurrence of the word 'table' would also rewrite '</table>' and any
        # cell text that contains "table" (e.g. "vegetable").
        table_html = df.to_html().replace('<table', '<table style="display:inline"', 1)
        cells.append(
            '<th style="text-align:center"><td style="vertical-align:top">'
            + f'<h4>{title}</h4>'
            + table_html
            + '</td></th>'
        )
    display_html(''.join(cells), raw=True)

In [3]:
# Directory holding the Yelp academic dataset dumps, relative to this notebook
prefix = '../../data/'

# Dataset key -> path of the corresponding JSON file under `prefix`
filePaths = {
    key: prefix + fileName
    for key, fileName in (
        ("reviews", 'yelp_academic_dataset_review.json'),
        ("businesses", 'yelp_academic_dataset_business.json'),
        ("checkins", 'yelp_academic_dataset_checkin.json'),
        ("tips", 'yelp_academic_dataset_tip.json'),
        ("users", 'yelp_academic_dataset_user.json'),
    )
}

***
# Reviews json

# Data Pre-Processing: Review Text
---

## Problem Description:
- Bin and cluster average user rating to detect and remove "outlier" users that preferentially give the highest or lowest possible rating.
- Use the bag-of-words model to simplify review text (i.e. tokenize words, generate feature vectors for sentences).
- Model review text sentiment to classify reviews as negative, positive, or neutral.

- Trendy foods, Time and Restaurant Category 

In [4]:
# Number of review records handed back per chunk
chunksize = 1_000_000

# Passing `chunksize` makes read_json return a reader that is iterated chunk by
# chunk instead of a single DataFrame; each record is one line of the file.
reviewReader = pd.read_json(
    filePaths['reviews'],
    lines=True,
    dtype=dict(
        review_id=str, user_id=str, business_id=str,
        stars=int, date=str, text=str,
        useful=int, funny=int, cool=int,
    ),
    chunksize=chunksize,
)

## Todo: Pre-Process

***
## Merge json files: Reviews + Restaurants
> Only reviews for businesses in our restaurants dataset are merged.
***

In [5]:
# Collect all df chunks
# NOTE(review): `restaurantsTrimmed` is not defined in any cell shown before this one
# (the recorded run of this cell ended in a NameError) — run the cell that builds it first.
# NOTE(review): `reviewReader` is presumably exhausted once this loop finishes; re-create
# it before re-running this cell, otherwise `chunk_list` stays empty.
chunk_list = []
for reviewChunk in reviewReader:
    reviewChunk = (
        reviewChunk
        .drop(columns=['review_id','useful','funny','cool'])
        # Rename the review rating so it has its own explicit column name after the merge
        .rename(columns={'stars': 'review_stars'})
    )
    # Inner merge to obtain reviews for only restaurant
    chunk_merged = pd.merge(restaurantsTrimmed, reviewChunk, on='business_id', how='inner')
    # Show feedback on progress. Report the actual chunk length: the last chunk
    # can hold fewer than `chunksize` rows, so the nominal size would overstate it.
    print(f"{chunk_merged.shape[0]} merged out of {len(reviewChunk):,} reviews")
    chunk_list.append(chunk_merged)

NameError: name 'restaurantsTrimmed' is not defined

In [26]:
# Stack the per-chunk merge results row-wise into one frame with a fresh 0..n-1 index
restaurants_reviews_df = pd.concat(objs=chunk_list, axis='index', ignore_index=True)
restaurants_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5498553 entries, 0 to 5498552
Data columns (total 18 columns):
 #   Column        Dtype  
---  ------        -----  
 0   business_id   object 
 1   name          object 
 2   address       object 
 3   city          object 
 4   state         object 
 5   postal_code   object 
 6   latitude      float64
 7   longitude     float64
 8   stars         float64
 9   review_count  int64  
 10  is_open       int64  
 11  attributes    object 
 12  categories    object 
 13  hours         object 
 14  user_id       object 
 15  review_stars  int32  
 16  text          object 
 17  date          object 
dtypes: float64(3), int32(1), int64(2), object(12)
memory usage: 734.1+ MB


### City Distribution

In [203]:
# Count merged review rows per city (most frequent first) as a two-column frame
restaurant_city = (
    restaurants_reviews_df['city']
    .value_counts()
    .rename_axis('city')
    .reset_index(name='count')
)
restaurant_city.head(20)

Unnamed: 0,city,count
0,Las Vegas,1609318
1,Phoenix,550164
2,Toronto,483619
3,Scottsdale,294496
4,Charlotte,283581
5,Pittsburgh,212637
6,Henderson,156332
7,Tempe,153852
8,Montréal,150174
9,Mesa,120877


### Which cities have the best `<category>`?

In [27]:
# Cap displayed cell text at 50 characters for the tables that follow
pd.options.display.max_colwidth = 50

In [28]:
# Preview the first five rows of the restaurants frame
# TODO(review): an unfinished `dfCities = restaurantsTrimmed[[]]` was left here —
# presumably the column subset for the per-city comparison; confirm and complete.
restaurantsTrimmed.head(n=5)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
8,pQeaRpvuhoEqudo3uymHIQ,The Empanadas House,404 E Green St,Champaign,IL,61820,40.110446,-88.233073,4.5,5,1,"{'RestaurantsAttire': 'u'casual'', 'Restaurant...","Ethnic Food, Food Trucks, Specialty Food, Impo...","{'Monday': '11:30-14:30', 'Tuesday': '11:30-14..."
12,M_guz7Dj7hX0evS672wIwA,Chocolate Shoppe Ice Cream,2831 Parmenter St,Middleton,WI,53562,43.10531,-89.510142,3.5,6,1,"{'BikeParking': 'True', 'Caters': 'True', 'Whe...","Desserts, Food, Ice Cream & Frozen Yogurt","{'Monday': '0:0-0:0', 'Tuesday': '11:0-21:0', ..."
20,CsLQLiRoafpJPJSkNX2h5Q,Middle East Deli,4508 E Independence Blvd,Charlotte,NC,28205,35.194894,-80.767442,3.0,5,0,"{'RestaurantsGoodForGroups': 'True', 'OutdoorS...","Food, Restaurants, Grocery, Middle Eastern",
23,mKTq1T_IAplDpHUcMzOXkw,Tipsy Ryde,,Gastonia,NC,28054,35.252842,-81.152698,3.5,3,1,{'BusinessAcceptsCreditCards': 'True'},"Hotels & Travel, Transportation, Taxis, Beer, ...","{'Monday': '0:0-0:0', 'Tuesday': '9:0-0:0', 'W..."
24,eBEfgOPG7pvFhb2wcG9I7w,Philthy Phillys,"15480 Bayview Avenue, unit D0110",Aurora,ON,L4G 7J1,44.010962,-79.448677,4.5,4,1,"{'RestaurantsTableService': 'False', 'Restaura...","Restaurants, Cheesesteaks, Poutineries","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."


# Reviews: Text Features

## Todo: Data integration

> Compile all .json files into a relational database to improve accessibility.
