This notebook explores how a restaurant's Yelp rating relates to the number of reviews it receives.

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import datetime as dt
import seaborn as sns
from sklearn import base, linear_model
import dill as pickle
# Scale the 'savefig.dpi' setting by 1.5.
# 'savefig.dpi' can hold the string 'figure' (the default in matplotlib >= 2.0),
# and multiplying a string by 1.5 raises TypeError; in that case use the
# numeric 'figure.dpi' value as the base instead.
# NOTE: re-running this cell compounds the scaling (another 1.5x each time).
_base_dpi = mpl.rcParams['savefig.dpi']
if not isinstance(_base_dpi, (int, float)):
    _base_dpi = mpl.rcParams['figure.dpi']
mpl.rcParams['savefig.dpi'] = 1.5 * _base_dpi

In [None]:
# --- Business data -----------------------------------------------------------
# biz_id is the positional column passed to read_csv as the index; per the
# original note it is the column holding the business_id.
business_datafile = '~/capstone/data/yelp_academic_dataset_business.csv'
biz_id = 16
business = pd.read_csv(business_datafile, index_col=biz_id)

# --- Review data -------------------------------------------------------------
# rev_id is the positional column passed to read_csv as the index; per the
# original note it is the column holding the review_id.
review_datafile = '~/capstone/data/yelp_academic_dataset_review.csv'
rev_id = 1
review = pd.read_csv(review_datafile, index_col=rev_id)

# The user dataset ('~/capstone/data/yelp_academic_dataset_user.csv', index
# column 16) is intentionally not loaded in this notebook.

In [None]:
# Minimum value of 'review_count' a business needs in order to be kept.
min_reviews = 10

# True where the 'categories' value contains 'Restaurants'.
rest_crit = business['categories'].map(lambda cats: 'Restaurants' in cats)
# True where the business meets the review-count threshold.
enough_reviews = business['review_count'] >= min_reviews

# Keep businesses satisfying both conditions, then pull their reviews.
restaurants = business[rest_crit & enough_reviews]
restaurant_ids = restaurants.index.values
is_restaurant_review = review['business_id'].isin(restaurant_ids)
rest_reviews = review[is_restaurant_review]

In [None]:
# Group the restaurant reviews by business and compute mean / std / count,
# giving two-level columns of the form (<column>, <statistic>).
by_business = rest_reviews.groupby('business_id')
grouped_rest_reviews = by_business.agg(['mean', 'std', 'count'])
grouped_rest_reviews.head()

In [None]:
# Scatter of each business's mean star rating against its review count,
# with the review count on a logarithmic axis.
x = grouped_rest_reviews['stars']['count']
y = grouped_rest_reviews['stars']['mean']
plt.scatter(x, y, alpha=0.5)

scatter_ax = plt.gca()
scatter_ax.set_xscale('log')
scatter_ax.set_xlabel('Number of Reviews')
scatter_ax.set_ylabel('Average Rating')
scatter_ax.set_title('All Restaurants')
scatter_ax.set_xlim([10, 10000])

In [None]:
def binemup(x):
    """Map a review count onto one of the bin labels 10, 50, 100, 500, 1000.

    The label returned is the one paired with the first exclusive upper
    bound that ``x`` falls below; values of 563 or more get the label 1000.
    """
    # (exclusive upper bound, bin label), checked in ascending order.
    bounds_and_labels = ((32, 10), (57, 50), (317, 100), (563, 500))
    for upper_bound, label in bounds_and_labels:
        if x < upper_bound:
            return label
    return 1000

In [None]:
# Assign every business a bin label based on its number of reviews.
review_counts = grouped_rest_reviews['stars']['count']
tmp = review_counts.map(binemup)

In [None]:
# Work on a copy so the aggregated frame itself is left untouched, then add
# the bin label and the mean star rating as extra columns.
binned_up = grouped_rest_reviews.copy()
mean_stars = grouped_rest_reviews['stars']['mean']
binned_up['count_bin'] = tmp
binned_up['star_mean'] = mean_stars
binned_up.head()

In [None]:
# Distribution of mean star rating within each review-count bin.
order = [10, 50, 100, 500, 1000]
ax = sns.violinplot(x='count_bin', y='star_mean', data=binned_up, order=order)
ax.set_xlabel('Number of reviews')
ax.set_ylabel('Yelp rating')
ax.set_ylim([0, 6])

# Fraction of businesses per bin. The histogram edges are the same thresholds
# binemup uses, so each bin label (10, 50, 100, 500, 1000) lands in its own
# interval.
bins = [0, 32, 57, 317, 563, 10000]
my_hist = np.histogram(binned_up.count_bin, bins=bins)
my_hist_frac = 1. * my_hist[0] / my_hist[0].sum()
# Parenthesised so the cell parses under both Python 2 and Python 3.
print(my_hist_frac)
my_dict = {'order': order, 'hist': my_hist_frac}
hist_df = pd.DataFrame(my_dict)

# Start a new figure: without it the bar plot is drawn onto the violin plot's
# axes (both seaborn calls default to the current axes), overlaying the bars
# on the violins and overwriting the axis labels set above.
plt.figure()
sns.barplot(x='order', y='hist', data=hist_df)

Now let's look at how the ratings of the restaurants with the highest number of reviews evolve over time.

In [None]:
# Fraction of businesses falling in each review-count bin (this recomputes the
# same histogram as the preceding plotting cell). The edges match the
# thresholds used by binemup, so every bin label falls in its own interval.
bins = [0, 32, 57, 317, 563, 10000]
my_hist = np.histogram(binned_up.count_bin, bins=bins)
my_hist_frac = 1. * my_hist[0] / my_hist[0].sum()
# Parenthesised so the cell parses under both Python 2 and Python 3.
print(my_hist_frac)

In [None]:
# Index values (business ids) of the businesses in the top bin (label 1000).
is_top_bin = binned_up['count_bin'] == 1000
high_reviews = binned_up[is_top_bin].index.values
len(high_reviews)

In [None]:
# All reviews written for the businesses in the top review-count bin.
in_top_bin = review['business_id'].isin(high_reviews)
high_restaurants = review[in_top_bin]
high_restaurants.shape

In [None]:
# Non-null counts of every column, grouped by review-count bin; displayed as
# the cell output to show how many businesses fall in each bin.
# NOTE(review): binned_up has two-level columns, so the 'count_bin' key is
# resolved against a MultiIndex -- confirm this behaves the same on the pandas
# version in use.
tmp_df = binned_up.groupby(['count_bin']).count()
tmp_df

In [None]:
def GetAvg10(dseries):
    """Return the mean of the first 10 entries of ``dseries`` (by position).

    If the series holds fewer than 10 entries, all of them are averaged.
    """
    first_entries = dseries.head(10)
    return first_entries.mean()

def GetAvg50(dseries):
    """Return the mean of the first 50 entries of ``dseries`` (by position).

    If the series holds fewer than 50 entries, all of them are averaged.
    """
    first_entries = dseries.head(50)
    return first_entries.mean()

def GetAvg100(dseries):
    """Return the mean of the first 100 entries of ``dseries`` (by position).

    If the series holds fewer than 100 entries, all of them are averaged.
    """
    first_entries = dseries.head(100)
    return first_entries.mean()

def GetAvg500(dseries):
    """Return the mean of the first 500 entries of ``dseries`` (by position).

    If the series holds fewer than 500 entries, all of them are averaged.
    """
    first_entries = dseries.head(500)
    return first_entries.mean()

def GetAvg(dseries):
    """Return the mean over every entry of ``dseries``."""
    overall_mean = dseries.mean()
    return overall_mean

In [None]:
# Running averages of the star rating per business: over the first 10, 50,
# 100 and 500 reviews, and over all reviews. The function names become the
# second-level column labels of the result.
aggregates = {'stars': [GetAvg10, GetAvg50, GetAvg100, GetAvg500, GetAvg]}

# Order the reviews by date so "the first N reviews" means the earliest N.
# DataFrame.sort was replaced by sort_values in later pandas releases, so use
# whichever one this pandas version provides.
# NOTE(review): assumes the 'date' column sorts chronologically (e.g. ISO
# 'YYYY-MM-DD' strings or datetimes) -- confirm.
_sort_by = getattr(high_restaurants, 'sort_values', None) or high_restaurants.sort
sorted_df = _sort_by('date')

# Aggregate the *sorted* frame. The original grouped the unsorted
# high_restaurants, leaving sorted_df unused, so the GetAvgN helpers averaged
# the first N rows in file order rather than the earliest N reviews. groupby
# keeps rows in their existing order within each group.
my_df = sorted_df.groupby('business_id', as_index=False).agg(aggregates)
my_df.head()

In [None]:
# Reshape the per-business running averages into long form: one row per
# (business, review-count label), stacked label by label.
# Each pair is (review-count label, aggregate column holding that average).
label_and_column = [(10, 'GetAvg10'), (50, 'GetAvg50'), (100, 'GetAvg100'),
                    (500, 'GetAvg500'), (1000, 'GetAvg')]
n_high = len(high_reviews)

my_stars = []
my_counts = []
for bin_label, agg_name in label_and_column:
    my_stars = my_stars + my_df['stars'][agg_name].values.tolist()
    my_counts = my_counts + [bin_label] * n_high

# The business ids repeat once per label, in the same order as the averages.
my_id = my_df['business_id'].values.tolist() * 5
my_dict = {
    'stars': my_stars,
    'counts': my_counts,
    'cat': ['high'] * 5 * n_high,
    'business_id': my_id,
}
high_review_df = pd.DataFrame(my_dict)

In [None]:
# Rating distribution of the most-reviewed restaurants after their first
# 10 / 50 / 100 / 500 reviews and over all reviews (label 1000).
order = [10, 50, 100, 500, 1000]
ax = sns.violinplot(data=high_review_df, x='counts', y='stars', order=order)
ax.set(xlabel='Number of reviews', ylabel='Yelp rating')
ax.set_ylim([0, 6])

In [None]:
# Bar chart of the fraction of businesses in each review-count bin.
my_dict = dict(order=order, hist=my_hist_frac)
hist_df = pd.DataFrame(my_dict)

sns.barplot(data=hist_df, x='order', y='hist')

In [None]:
# Long-form frame for all restaurants (cat == 'all'), with the same columns as
# high_review_df so the two can be stacked and compared in one plot.
count_bin = binned_up['count_bin'].tolist()
star_mean = binned_up['star_mean'].tolist()
cat = ['all'] * len(star_mean)
start = pd.DataFrame({'counts': count_bin, 'stars': star_mean, 'cat': cat,
                      'business_id': binned_up.index.values.tolist()})

# ignore_index gives the stacked frame a fresh 0..n-1 index instead of two
# overlapping ranges of duplicate labels.
combined = pd.concat([start, high_review_df], ignore_index=True)

# The original called combined.rename(columns={'cat': 'Review Type'}) here but
# discarded the result, so it never took effect. It is dropped rather than
# assigned because the plot that follows selects the column as hue='cat'.
combined.head()

In [None]:
# Split violins: for each review-count label, one half shows the 'all'
# category and the other the 'high' category of the 'cat' column.
order = [10, 50, 100, 500, 1000]
violin_options = dict(order=order, split=True, palette="muted", bw=0.25,
                      inner=None)
ax = sns.violinplot(x='counts', y='stars', hue='cat', data=combined,
                    **violin_options)
ax.set(xlabel='Number of reviews', ylabel='Yelp rating')
ax.set_ylim([0, 6])

In [None]:
# Restaurants whose 'open' flag is set; 'open' is used directly as a boolean mask.
open_rest = restaurants[restaurants['open']]
# Parenthesised so the cell parses under both Python 2 and Python 3.
print('{} open restaurants, {} total restaurants'.format(len(open_rest), len(restaurants)))

In [None]:
# Put each restaurant's 'open' flag next to its binned review statistics;
# axis=1 concatenation aligns the two objects on their index.
binned_up2 = pd.concat([binned_up,restaurants['open']], axis=1)
# Turn the index into an ordinary column (modifies binned_up2 in place).
binned_up2.reset_index(inplace=True)
# Give the bin label and mean rating the flat names the plotting cell uses.
# NOTE(review): assumes the concatenated frame exposes these columns under the
# tuple labels ('count_bin', '') and ('star_mean', '') -- confirm against the
# head() output below, since a non-matching key is silently ignored by rename.
binned_up2.rename(columns={('count_bin', ''): 'counts', ('star_mean', ''): 'stars'}, inplace=True)
binned_up2.head()

In [None]:
# Split violins of the mean rating per review-count bin, with the two halves
# separated by the 'open' flag.
order = [10, 50, 100, 500, 1000]
violin_options = dict(order=order, split=True, palette='muted', bw=0.3,
                      inner=None)
ax = sns.violinplot(x='counts', y='stars', hue='open', data=binned_up2,
                    **violin_options)
ax.set(xlabel='Number of reviews', ylabel='Yelp rating')
ax.set_ylim([0, 6])