# What happens when restaurants change stars?

If a restaurant's Yelp star rating changes (moves up or down 1/2 star), how do the subsequent N reviews compare to the N reviews immediately preceding the change? If we can take an average of this, then maybe we can estimate what effect the change in star rating has.

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import datetime as dt
import seaborn as sns
from sklearn import base, linear_model
import dill as pickle
# Save figures at 1.5x the configured resolution.  'savefig.dpi' may hold the
# string 'figure' (meaning "use figure.dpi") instead of a number, and that
# cannot be multiplied, so resolve it to a numeric value first.
_base_dpi = mpl.rcParams['savefig.dpi']
if _base_dpi == 'figure':
    _base_dpi = mpl.rcParams['figure.dpi']
mpl.rcParams['savefig.dpi'] = 1.5 * _base_dpi

In [None]:
def adjust_date(df, date_col='date', year=False, month=False, week=False):
    """Coarsen the 'YYYY-MM-DD' strings in ``df[date_col]`` and parse them.

    The column is modified in place on ``df`` and ends up holding datetimes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``date_col`` column holds 'YYYY-MM-DD' strings.
    date_col : str
        Name of the column to adjust.
    year : bool
        If true, replace every year with '2000'.
    month : bool
        If true, replace every day-of-month with '01'.
    week : bool
        If true, snap the day-of-month down to '01', '08', '15' or '22'.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    def _snap_to_week(date_str):
        # Map the day-of-month onto the first day of its 7-day bucket; days
        # 22 and later all share the last bucket.
        day = int(date_str[8:])
        if day < 8:
            bucket_start = '01'
        elif day < 15:
            bucket_start = '08'
        elif day < 22:
            bucket_start = '15'
        else:
            bucket_start = '22'
        return date_str[:8] + bucket_start

    # Rebuild the strings by slicing rather than str.replace(): replace()
    # substitutes every occurrence of the matched digits, so a day of '11'
    # would also rewrite the '11' inside the year or month.
    if year:
        df[date_col] = df[date_col].map(lambda d: '2000' + d[4:])
    if month:
        df[date_col] = df[date_col].map(lambda d: d[:8] + '01')
    if week:
        # map() keeps each value on its own row whatever the index labels are.
        df[date_col] = df[date_col].map(_snap_to_week)

    # Convert the column from strings to datetimes.
    df[date_col] = pd.to_datetime(df[date_col], format='%Y-%m-%d')
    return df

In [None]:
# Locations of the Yelp academic dataset CSV exports and, for each file, the
# position of the column that pd.read_csv should use as the DataFrame index.
business_datafile = '~/capstone/data/yelp_academic_dataset_business.csv'
biz_id = 16  # Column containing the business_id, variable used as dataframe index name
# The user table is not loaded in this notebook; its settings are kept for reference.
#user_datafile = '~/capstone/data/yelp_academic_dataset_user.csv'
#usr_id = 16  # Column containing the user_id, variable used as dataframe index name
review_datafile = '~/capstone/data/yelp_academic_dataset_review.csv'
rev_id = 1   # Column containing the review_id, variable used as dataframe index name

# Read both tables, indexing each one by the column position chosen above.
# NOTE(review): the positional index_col values depend on the column order of
# these particular CSV files -- confirm them if the files are regenerated.
business = pd.read_csv(business_datafile, index_col=biz_id)
#user = pd.read_csv(user_datafile, index_col=usr_id)
review = pd.read_csv(review_datafile, index_col=rev_id)

In [None]:
# Keep the businesses whose categories text mentions 'Restaurants'.
# fillna('') makes a row with a missing categories value count as "not a
# restaurant" instead of raising inside the substring test.
rest_crit = business['categories'].fillna('').map(lambda cats: 'Restaurants' in cats)
restaurants = business[rest_crit]
restaurant_ids = restaurants.index.values

# Take an explicit copy of the matching reviews: the frame is modified below,
# and modifying a filtered selection of `review` directly is what triggers
# pandas' SettingWithCopyWarning.
rest_reviews = review[review['business_id'].isin(restaurant_ids)].copy()

# Parse the date strings into datetimes, then drop the columns that the
# rating analysis does not use.
rest_reviews = adjust_date(rest_reviews)
rest_reviews = rest_reviews.drop(
    ['user_id', 'text', 'votes.cool', 'votes.funny', 'type', 'votes.useful'],
    axis=1,
)
rest_reviews.head()

In [None]:
def _anchor_adjusted_stars(stars, weight=0.35):
    """Return each rating pushed further from the previously displayed rating.

    The displayed rating after a review is the running mean of all ratings so
    far, rounded to the nearest 0.5.  Each rating's drift is its distance from
    the displayed rating left by the review before it; the first rating has
    no predecessor, so its drift is 0.  The result is
    ``stars + weight * drift`` as a NumPy array.
    """
    # Work on a plain NumPy array so the one-review offset below is purely
    # positional; subtracting two offset pandas slices would re-align them on
    # their index labels and cancel the offset.
    values = np.asarray(stars, dtype=float)
    # Expanding mean: cumulative sum divided by the number of reviews so far.
    running_avg = np.cumsum(values) / np.arange(1, len(values) + 1)
    displayed = np.round(running_avg * 2.) / 2.
    drift = np.zeros(len(values))
    drift[1:] = values[1:] - displayed[:-1]
    return values + weight * drift


# For every restaurant, record the plain mean rating next to the mean of the
# anchor-adjusted ratings.
# NOTE(review): reviews are used in the order they appear in rest_reviews;
# this presumably needs to be chronological -- confirm, or sort by date first.
final_list = []
# Group by the bare column name (not a one-element list) so that business_id
# is the plain id value rather than a 1-tuple on newer pandas versions.
for business_id, df in rest_reviews.groupby('business_id', sort=False):
    raw_avg = df['stars'].mean()
    if len(df) > 10:
        adj_avg = _anchor_adjusted_stars(df['stars']).mean()
    else:
        # Businesses with 10 or fewer reviews are left unadjusted.
        adj_avg = raw_avg
    final_list.append(
        {'business_id': business_id, 'raw_avg': raw_avg, 'anchor_avg': adj_avg}
    )

In [None]:
# Collect the per-business records into a table and plot the plain average
# against the anchor-adjusted average, one point per business.
final_df = pd.DataFrame(final_list)
plt.scatter(
    final_df['raw_avg'],
    final_df['anchor_avg'],
    color='dodgerblue',
    alpha=0.4,
    edgecolor='black',
)
plt.xlabel('Yelp Average Rating')
plt.ylabel('Anchor Adjusted Rating')

In [None]:
# Show every review belonging to one specific business.
tmp_df = rest_reviews.loc[rest_reviews['business_id'] == 'kXpQzN1jJ3vHuvUnbvqxzg']
tmp_df

Example to play with

In [None]:
# Select the reviews of the example business and renumber the rows 0..n-1.
# reset_index() returns a new frame, so nothing is modified in place on a
# filtered selection of rest_reviews.
tmp_df = rest_reviews[rest_reviews.business_id == '--5jkZ3-nUPZxUvtcbr8Uw'].reset_index()
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(len(tmp_df))
tmp_df.head()

In [None]:
# Step-by-step anchor adjustment for the example business, one column per step.
b = tmp_df.copy()
stars = b['stars'].values.astype(float)

# Running (expanding) mean of the ratings: cumulative sum divided by the number
# of reviews so far, then rounded to the nearest 0.5.
b['avg'] = np.cumsum(stars) / np.arange(1, len(stars) + 1)
b['nearest'] = np.round(b.avg*2.)/2.

# Drift of each rating from the rounded average left by the PREVIOUS review.
# The offset is applied on plain arrays because subtracting two offset pandas
# slices would re-align them on index labels and cancel the offset.  The first
# review has no predecessor, so its drift is 0.
rating_drift = np.zeros(len(stars))
rating_drift[1:] = stars[1:] - b['nearest'].values[:-1]
b['rating_drift'] = rating_drift*0.35
b['anchor_adjusted'] = b['stars']+b['rating_drift']
# Optional clamp of the adjusted ratings to the 1-5 range (left disabled):
#b.anchor_adjusted[b['anchor_adjusted'] > 5] = 5.
#b.anchor_adjusted[b['anchor_adjusted'] < 1] = 1.
b

In [None]:
# Compare the mean anchor-adjusted rating with the running average at row 51.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
# NOTE(review): 51 is presumably the last row of this example business -- confirm.
print(b.anchor_adjusted.mean())
print(b.avg[51])

In [None]:
# Save the per-business averages to disk.
filename = '/home/vagrant/capstone/data/anchor_adjusted_ratings.pkl'
# Pickle data is bytes, so the file has to be opened in binary mode ('wb').
with open(filename,'wb') as f:
    pickle.dump(final_df, f)

In [None]:
# Reload the saved per-business averages.
# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.
anchor_datafile = '/home/vagrant/capstone/data/anchor_adjusted_ratings.pkl'
# Pickle data is bytes, so the file has to be opened in binary mode ('rb').
with open(anchor_datafile,'rb') as f:
    df = pickle.load(f)
df.head()

In [None]:
# Plot (raw average - anchor-adjusted average) for each business against its
# row position in final_df.
plt.scatter(
    x=np.arange(len(final_df)),
    y=final_df['raw_avg'] - final_df['anchor_avg'],
    alpha=0.4,
)

In [None]:
# 62 bin edges spaced 1/30 apart, starting at -1.05 (61 bins in total).
x = -1.05 + np.arange(62) / 30.
# Histogram of (raw average - anchor-adjusted average) over those edges;
# np.histogram returns the pair (counts, bin_edges).
y = np.histogram(final_df['raw_avg'] - final_df['anchor_avg'], bins=x)

In [None]:
# Draw the distribution of (raw average - anchor-adjusted average) using 62
# equal-width bins.
plt.hist(final_df['raw_avg'] - final_df['anchor_avg'], bins=62)