In [4]:
#import the python scientific suite
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import sklearn
import seaborn as sns
import scipy
import pylab



In [5]:
from collections import defaultdict

In [7]:
# Python 2/3 compatibility: make print() a function and make "/" perform
# true division, as they do in Python 3.
from __future__ import print_function
from __future__ import division
# Render matplotlib figures inline in the notebook output.
%matplotlib inline 

# Use seaborn for plot styling: apply seaborn's defaults first, then switch to
# the 'paper' context with fonts scaled by 1.5 and the 'ticks' axes style.
sns.set()
sns.set_context('paper', font_scale = 1.5)
sns.set_style('ticks')
# Turn on matplotlib's 'figure.autolayout' option so figure layouts are
# adjusted automatically.
pylab.rcParams.update({'figure.autolayout': True})

In [27]:
#define some helper functions 

def CountInstance(df, col_name):
    """Attach a per-value occurrence count for one column of a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col_name : str
        Name of the column whose distinct values are counted.

    Returns
    -------
    pandas.DataFrame
        The result of left-merging ``df`` with the count table on
        ``col_name``: all columns of ``df`` plus a new column named
        ``'n_' + col_name`` holding how many rows of ``df`` share that
        row's value of ``col_name``.
    """
    count_col = 'n_' + col_name

    # Distinct values of the column and how often each one occurs.
    values, tallies = np.unique(df[col_name], return_counts=True)
    lookup = pd.DataFrame(data={col_name: values, count_col: tallies})

    # Left merge keeps every row of df and adds its count alongside.
    return df.merge(lookup, how='left', on=col_name)

def TopChains(df, cutoff):
    """List the names that occur more than ``cutoff`` times in ``df['name']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a column called 'name'.
    cutoff : number
        Exclusive lower bound: a name is kept only if its number of
        occurrences is strictly greater than this value.

    Returns
    -------
    list
        The qualifying names, ordered from most to least frequent.
    """
    names, tallies = np.unique(df['name'], return_counts=True)

    # Rank the names by how often they occur, most frequent first, and
    # renumber the rows 0..n-1.
    ranked = (
        pd.DataFrame(data={'name': names, 'counts': tallies})
        .sort_values(by='counts', ascending=False)
        .reset_index(drop=True)
    )

    above_cutoff = ranked['counts'] > cutoff
    return ranked.loc[above_cutoff, 'name'].tolist()

In [18]:
# Load each CSV file of the dataset into a dictionary of dataframes.
file_dir = '/Volumes/1TB_BAY/yelp_dataset_challenge_academic_dataset/' #replace this with the path to your data files

# One-letter key -> name of the CSV file inside file_dir.
f = {
    'R': 'A_review_stat.csv',
    'T': 'B_review_text.csv',
    'N': 'C_neighborhood.csv',
    'B': 'D_business.csv',
    'U': 'E_user.csv',
}

# Holds the dataframes under the same one-letter keys. No default factory is
# given, so a missing key still raises KeyError, exactly as with a plain dict.
d = defaultdict()

# The file name is appended directly to file_dir, so file_dir must end with a
# path separator.
for fi, csv_name in f.items():
    d[fi] = pd.read_csv(file_dir + csv_name)

In [19]:
# Show the keys of the dataframe dictionary. Wrapping them in list() prints a
# plain list on Python 3 as well, where a bare d.keys() would print
# "dict_keys([...])"; on Python 2 the output is unchanged.
print(list(d.keys()))

['R', 'B', 'U', 'T', 'N']


In [22]:
# Begin filtering the data on several minimum-count criteria.
# All thresholds are inclusive: the comparisons below use >=.
min_business_per_neighborhood = 5
min_reviews_per_business = 2
min_reviews_per_user = 5

# Keep only the rows that meet each threshold. The threshold names must match
# the definitions above exactly; the previous spellings
# (min_review_per_business / min_review_per_user) were never defined and
# raised NameError on a fresh kernel.
# NOTE(review): 'n' is presumably the number of businesses per neighborhood
# and 'n_reviews' the number of reviews per user -- confirm against the CSVs.
d['N'] = d['N'][d['N']['n'] >= min_business_per_neighborhood]
d['B'] = d['B'][d['B']['review_count'] >= min_reviews_per_business]
d['U'] = d['U'][d['U']['n_reviews'] >= min_reviews_per_user]

In [33]:
# A business name counts as a chain when it appears in more than this many
# rows of the business table (the comparison in TopChains is strict).
min_stores_per_chain = 35

# Names of all chains with more than min_stores_per_chain stores, most common first.
top_chains = TopChains(df=d['B'], cutoff=min_stores_per_chain)
print(top_chains)

['Starbucks', 'Subway', "McDonald's", 'Walgreens', 'Taco Bell', 'Pizza Hut', 'The UPS Store', "Wendy's", 'Burger King', 'Bank of America', 'Wells Fargo Bank', 'Circle K', 'Great Clips', "Domino's Pizza", 'Panda Express', "Dunkin' Donuts", 'Chase Bank', 'Supercuts', 'Chipotle Mexican Grill', 'Jiffy Lube', 'US Post Office', 'QuikTrip', "Jimmy John's", "Papa John's Pizza", 'KFC', 'Enterprise Rent-A-Car', 'Dairy Queen', 'Cvs Pharmacy', 'Jack in the Box', 'FedEx Office Print & Ship Center', 'T-Mobile', "Denny's", "Arby's"]


Unnamed: 0.1,Unnamed: 0,review_id,words,sentences,ease,grade,dale_chall,stars,business_id,z_hood,latest_zhvi,latest_sqft,lat,lon,bbox_area,polarity,subjectivity,year,user_id
0,0,6ZWmWypGMdnwwavY8ykXlg,197,13,73.17,6.8,6.62,4,uI0LvTXFACid1308KFO5ew,343635,,,40.392222,-79.947974,5.378061,0.260963,0.527941,2009.624658,iTmWHtltCtk0Gm55AOxrUA
1,1,3m6vLe955opYMSrTrlLIKw,385,25,90.09,4.4,6.10,2,uI0LvTXFACid1308KFO5ew,343635,,,40.392222,-79.947974,5.378061,0.009410,0.485827,2011.197260,JEvkfVPf_DuhX-ntE5L6bQ
2,2,MkHaKWwZ_OngdUsMt_xAqg,55,4,92.12,3.6,7.15,2,uI0LvTXFACid1308KFO5ew,343635,,,40.392222,-79.947974,5.378061,0.190000,0.580000,2011.438356,QOdrDkYXhqA8jtPzfpugWQ
3,3,gB-kzO-Jg_u2p7Cmf9xG_w,206,12,79.60,6.4,6.70,4,uI0LvTXFACid1308KFO5ew,343635,,,40.392222,-79.947974,5.378061,0.324351,0.606169,2011.586301,qPGLUQUG43b3aBZKDUBlMw
4,4,cCh6xPIcV6P2ben4bCsq7Q,208,9,65.05,9.9,8.19,3,uI0LvTXFACid1308KFO5ew,343635,,,40.392222,-79.947974,5.378061,0.209776,0.511859,2011.679452,lJsCDFtnR2-AMhDdWtCRug
5,5,QraufK_S6pdcLHXz15n8gQ,418,30,83.66,4.8,6.81,4,uI0LvTXFACid1308KFO5ew,343635,,,40.392222,-79.947974,5.378061,0.054153,0.463127,2012.459016,7R79mN6iJNU6o0CI1aBesA
6,6,p1j9N_P02v0oN6nWWPwDSg,412,34,84.68,4.4,6.80,3,uI0LvTXFACid1308KFO5ew,343635,,,40.392222,-79.947974,5.378061,0.249842,0.619531,2012.538251,nEYPahVwXGD2Pjvgkm7QqQ
7,7,6SHLQH067F_M9ZZQ48TjLA,43,5,88.74,2.9,5.87,1,uI0LvTXFACid1308KFO5ew,343635,,,40.392222,-79.947974,5.378061,-0.014444,0.271111,2013.032877,yIhhNAk3aAVUChiUhqpFaA
8,8,I0CGTPDXtQr9wmpVhAasKQ,236,15,73.17,6.8,7.39,5,uI0LvTXFACid1308KFO5ew,343635,,,40.392222,-79.947974,5.378061,0.194185,0.617168,2013.213699,qUPwH9Vvx65RxcGgZL-f7A
9,9,fYyklVeVe9Bcj71eA8UwWA,357,22,80.62,6.0,6.42,3,uI0LvTXFACid1308KFO5ew,343635,,,40.392222,-79.947974,5.378061,0.050576,0.450636,2013.591781,WzaaorVCmUTQvu4mScunNg
