# How does the city influence the distribution of ratings?

Are there cities that tend to give higher ratings? Are there cities whose spread of ratings is narrower?

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import datetime as dt
import seaborn as sns
from sklearn import base, linear_model
import dill as pickle
# Save figures at 1.5x the base resolution.  'savefig.dpi' may hold the string
# 'figure' (meaning "use figure.dpi") instead of a number, in which case it
# cannot be multiplied directly, so resolve it to a number first.
# NOTE(review): re-running this cell compounds the 1.5x factor.
base_dpi = mpl.rcParams['savefig.dpi']
if base_dpi == 'figure':
    base_dpi = mpl.rcParams['figure.dpi']
mpl.rcParams['savefig.dpi'] = 1.5 * base_dpi

In [None]:
def adjust_date(df, date_col='date', year=False, month=False, week=False):
    """Coarsen the 'YYYY-MM-DD' strings in ``df[date_col]`` and parse them.

    The column is modified on ``df`` itself and ``df`` is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a column of 'YYYY-MM-DD' date strings.
    date_col : str
        Name of the date column.
    year : bool
        If true, replace the year of every date with '2000'.
    month : bool
        If true, replace the day of every date with '01'.
    week : bool
        If true, snap the day to the start of its quarter-month bucket:
        days 1-7 -> '01', 8-14 -> '08', 15-21 -> '15', 22 and later -> '22'.

    Returns
    -------
    pandas.DataFrame
        ``df``, with ``date_col`` converted to datetimes.
    """
    def _week_bucket(d):
        # Snap the day-of-month to the first day of its bucket.
        day = int(d[8:])
        if day < 8: new_day = '01'
        elif day < 15: new_day = '08'
        elif day < 22: new_day = '15'
        else: new_day = '22'
        return d[:8] + new_day

    dates = df[date_col]
    # Rebuild each string by position.  str.replace() would rewrite *every*
    # occurrence of the matched digits (e.g. the day '12' in '2012-12-12'
    # also matches inside the year and the month).
    if year: dates = dates.map(lambda x: '2000' + x[4:])
    if month: dates = dates.map(lambda x: x[:8] + '01')
    # Series.map keeps the frame's own index labels, so this works whatever
    # the index is (no positional counter needed).
    if week: dates = dates.map(_week_bucket)
    # Convert 'date' column from strings to datetimes
    df[date_col] = pd.to_datetime(dates, format='%Y-%m-%d')
    return df

In [None]:
# Load the business and review tables, then keep only restaurant reviews.
business_datafile = '~/capstone/data/yelp_academic_dataset_business.csv'
biz_id = 16  # Column containing the business_id, variable used as dataframe index name
#user_datafile = '~/capstone/data/yelp_academic_dataset_user.csv'
#usr_id = 16  # Column containing the user_id, variable used as dataframe index name
review_datafile = '~/capstone/data/yelp_academic_dataset_review.csv'
rev_id = 1   # Column containing the review_id, variable used as dataframe index name

business = pd.read_csv(business_datafile, index_col=biz_id)
#user = pd.read_csv(user_datafile, index_col=usr_id)
review = pd.read_csv(review_datafile, index_col=rev_id)

# Boolean mask: True where the 'categories' value contains 'Restaurants'.
rest_crit = business['categories'].map(lambda x: 'Restaurants' in x)
restaurants = business[rest_crit]
restaurant_ids = restaurants.index.values
# .copy() makes rest_reviews an independent frame: adjust_date() assigns to
# its date column, and doing that on a filtered slice of `review` is a
# chained assignment (SettingWithCopyWarning).
rest_reviews = review[review['business_id'].isin(restaurant_ids)].copy()

rest_reviews = adjust_date(rest_reviews)

In [None]:
# Quick look at the restaurant mask built above.  Only the last expression of
# a cell is displayed, so the type() result on the first line is not shown.
type(rest_crit)
rest_crit.head()

In [None]:
# Minimum review_count a restaurant needs to be included.
min_reviews = 1

rest_train = restaurants[restaurants.review_count >= min_reviews]
rest_ids = rest_train.index.values
# Build the trimmed review table as a new frame instead of dropping columns
# in place on a filtered slice of rest_reviews (chained assignment).
reviews_train = (
    rest_reviews[rest_reviews['business_id'].isin(rest_ids)]
    .drop(['votes.cool','votes.funny','votes.useful','type', 'text', 'user_id'], axis=1)
    .reset_index()
    .drop(['review_id'], axis=1)
)
# Average the remaining columns per business.
# NOTE(review): presumably only 'stars' (and the parsed date) remain at this
# point; how non-numeric columns are treated by mean() depends on the pandas
# version -- confirm.
gby_biz_reviews = reviews_train.groupby(['business_id'], as_index=False).mean()
gby_biz_reviews.head()

In [None]:
tmp = restaurants.reset_index()
# .copy() gives an independent two-column frame, so the relabelling below is
# a plain column assignment rather than an in-place edit on a slice.
state = tmp[['business_id','state']].copy()
print(state.state.unique())
# Combine multi-state metro areas to a single state metro-area
# Drop the clearly mislabeled/bad data
# XGL - is the code for greater London
# NW - is the code for Nordrhein-Westfalen, not near Karlsruhe
state['state'] = state['state'].replace(
    to_replace=['SC','MLN','FIF','ELN','BW','RP'],
    value=['NC','EDH','EDH','EDH','KHL','KHL'])
state = state[~state['state'].isin(['XGL', 'NW'])]
print(state.state.unique())
state.head()

In [None]:
# Attach each business's (relabelled) state code to its averaged review row;
# businesses missing from either table are left out by the default join.
grouped_reviews = gby_biz_reviews.merge(state, on='business_id')
grouped_reviews.tail()

Get the histogram of star ratings for each city

In [None]:
# Per-state histogram (as fractions), mean and standard deviation of the
# per-restaurant average star rating.
state_dist = {}
state_mean = {}
state_sd = {}
# 42 bin edges from 0.95 to 5.05 in steps of 0.1 -> 41 bins centred on 1.0 ... 5.0
bins = np.arange(42)/10.+0.95
bin_centers = np.arange(41)/10.+1.
# Group by the bare column name: a one-element list key makes newer pandas
# yield 1-tuples such as ('IL',), which would break the 'IL'/'QC' lookups later.
# The loop variables are deliberately not called `state`, so the `state`
# DataFrame used by the merge cell is not overwritten.
for state_code, state_df in grouped_reviews.groupby('state'):
    counts = np.histogram(state_df.stars, bins=bins)[0]
    state_dist[state_code] = 1.*counts/np.sum(counts)
    state_mean[state_code] = state_df.stars.mean()
    state_sd[state_code] = state_df.stars.std()
state_dist_df = pd.DataFrame(state_dist)
state_dist_df['bins'] = bin_centers
state_dist_df.head()

In [None]:
import matplotlib.patches as mpatches

# Label every fifth bin (whole and half stars); all other ticks stay blank.
tick_text = ['1', '1.5', '2', '2.5', '3', '3.5', '4', '4.5', '5']
xlabels = ['']*len(state_dist_df.bins)
for slot, text in zip(range(0, 41, 5), tick_text):
    xlabels[slot] = text

# Overlay the two distributions on the same axes, both half-transparent.
ax = sns.barplot(x='bins', y='IL', data=state_dist_df, color='goldenrod', alpha=.5)
sns.barplot(x='bins', y='QC', data=state_dist_df, color='dodgerblue', alpha=.5, ax=ax)

ax.set_xticklabels(xlabels)
ax.set(xlabel='Yelp Star Rating', ylabel='Fraction of Restaurants', title='Original ratings')

# Hand-built legend entries, one coloured patch per plotted series.
yellow_patch = mpatches.Patch(color='goldenrod', label='Champaign-Urbana', alpha=0.5)
blue_patch = mpatches.Patch(color='dodgerblue', label='Montreal', alpha=0.5)
ax.legend(handles=[yellow_patch, blue_patch], loc=2)

In [None]:
# Per-state mean and standard deviation of the restaurant star averages.
# Function-call form of print works on both Python 2 and Python 3.
print(state_mean)
print(state_sd)

In [None]:
def Standardize(row, mean_dict, sd_dict, center=3.0):
    """Return a row's star rating as a z-score within its state, shifted to ``center``.

    Parameters
    ----------
    row : mapping
        Must provide ``row['stars']`` and ``row['state']``.
    mean_dict : dict
        Mean star rating keyed by state.
    sd_dict : dict
        Standard deviation of the star rating keyed by state.
    center : float
        Value added to the z-score (default 3.0).

    Returns
    -------
    float
        ``(stars - mean) / sd + center``.  When the state's standard deviation
        is zero or NaN the z-score is undefined, so ``center`` is returned.
    """
    key = row['state']
    sd = sd_dict[key]
    # `not sd > 0` is also true for NaN (e.g. the std of a one-row group),
    # so this guards both the division by zero and NaN propagation.
    if not sd > 0:
        return center
    return (row['stars'] - mean_dict[key])/sd + center

In [None]:
# Row-wise standardized rating, then stored next to its difference from the
# raw star average ('adjustment' first, 'city_adj_stars' second).
tmp = grouped_reviews.apply(Standardize, axis=1, args=(state_mean, state_sd))
grouped_reviews = grouped_reviews.assign(adjustment=tmp - grouped_reviews['stars'],
                                         city_adj_stars=tmp)
grouped_reviews.head()

In [None]:
# Per-state histogram (as fractions) of the city-adjusted star ratings.
state_dist = {}
state_mean = {}
state_sd = {}
# 42 bin edges from 0.95 to 5.05 in steps of 0.1 -> 41 bins centred on 1.0 ... 5.0
bins = np.arange(42)/10.+0.95
bin_centers = np.arange(41)/10.+1.
# Group by the bare column name: a one-element list key makes newer pandas
# yield 1-tuples such as ('IL',), which would break the 'IL'/'QC' lookups below.
# The loop variables are deliberately not called `state`, so the `state`
# DataFrame used by the merge cell is not overwritten.
for state_code, state_df in grouped_reviews.groupby('state'):
    counts = np.histogram(state_df.city_adj_stars, bins=bins)[0]
    state_dist[state_code] = 1.*counts/np.sum(counts)
    # NOTE(review): mean/sd are still taken from the raw `stars` column, not
    # from `city_adj_stars` -- confirm this is intended.
    state_mean[state_code] = state_df.stars.mean()
    state_sd[state_code] = state_df.stars.std()
state_dist_df = pd.DataFrame(state_dist)
state_dist_df['bins'] = bin_centers

import matplotlib.patches as mpatches

# Label every fifth bin (whole and half stars); all other ticks stay blank.
tick_text = ['1', '1.5', '2', '2.5', '3', '3.5', '4', '4.5', '5']
xlabels = ['']*len(state_dist_df.bins)
for slot, text in zip(range(0, 41, 5), tick_text):
    xlabels[slot] = text

# Overlay the two distributions on the same axes, both half-transparent.
ax = sns.barplot(x='bins', y='IL', data=state_dist_df, color='goldenrod', alpha=.5)
sns.barplot(x='bins', y='QC', data=state_dist_df, color='dodgerblue', alpha=.5, ax=ax)

ax.set_xticklabels(xlabels)
ax.set(xlabel='Yelp Star Rating', ylabel='Fraction of Restaurants', title='City corrected ratings')

# Hand-built legend entries, one coloured patch per plotted series.
yellow_patch = mpatches.Patch(color='goldenrod', label='Champaign-Urbana', alpha=0.5)
blue_patch = mpatches.Patch(color='dodgerblue', label='Montreal', alpha=0.5)
ax.legend(handles=[yellow_patch, blue_patch], loc=2)

In [None]:
# adjustment = city_adj_stars - stars, so the largest value is the biggest
# upward correction and the smallest value the biggest downward correction.
# Function-call form of print works on both Python 2 and Python 3.
print('The maximum restaurant UNDER-rating is {}'.format(grouped_reviews.adjustment.max()))
print('The maximum restaurant OVER-rating is {}'.format(grouped_reviews.adjustment.min()))

In [None]:
# Persist the city-adjusted ratings table.
# NOTE(review): this is an absolute path, unlike the '~/capstone/data/...'
# paths used for the input files -- confirm it is valid on the target machine.
filename = '/home/vagrant/capstone/data/city_adjusted_ratings.pkl'
# Pickle data is a byte stream, so the file must be opened in binary mode
# ('wb'); text mode 'w' fails on Python 3 and can corrupt the file on
# platforms that translate newlines.
with open(filename,'wb') as f:
    pickle.dump(grouped_reviews, f)