In [28]:
import pandas as pd 
import numpy as np

import math
import scipy.stats as st

In [14]:
# --- BeerAdvocate: ratings, users and beers tables ---
ratings_ba_df = pd.read_csv('data/BeerAdvocate/ratings_BA.csv')
users_ba_df = pd.read_csv('data/BeerAdvocate/users.csv')
beers_ba_df = pd.read_csv('data/BeerAdvocate/beers.csv')

# --- RateBeer: ratings, users and beers tables ---
ratings_rb_df = pd.read_csv('data/RateBeer/ratings_RB.csv')
users_rb_df = pd.read_csv('data/RateBeer/users.csv')
beers_rb_df = pd.read_csv('data/RateBeer/beers.csv')

In [15]:
# Both rating tables get the same treatment: 'date' is parsed with unit='s'
# (i.e. read as a number of seconds) into datetimes, after which the calendar
# year and month are exposed as their own columns.
for site_ratings in (ratings_ba_df, ratings_rb_df):
    site_ratings['date'] = pd.to_datetime(site_ratings['date'], unit='s')
    site_ratings['year'] = site_ratings['date'].dt.year
    site_ratings['month'] = site_ratings['date'].dt.month

## Filter on location

In [16]:
US_country = 'United States'

# Reduce every location to the text before its first comma, so that it can be
# compared directly with the country name.
# NOTE(review): presumably US locations look like "United States, <state>" - confirm.
users_rb_df['location'] = users_rb_df['location'].str.split(',').str[0]
users_ba_df['location'] = users_ba_df['location'].str.split(',').str[0]

# keep only the users whose (reduced) location is the United States
users_ba_df = users_ba_df[users_ba_df.location == US_country]
users_rb_df = users_rb_df[users_rb_df.location == US_country]

# Attach the user's location to each rating. Because the users tables now only
# contain US users, the inner join also discards every rating written by a
# user who is not in them.
ratings_users_ba_df = pd.merge(ratings_ba_df, users_ba_df[['user_id', 'location']], on='user_id', how='inner')
ratings_users_rb_df = pd.merge(ratings_rb_df, users_rb_df[['user_id', 'location']], on='user_id', how='inner')

# Every merged row already has location == US_country, so no second filter is
# needed. The helper column is removed with a non-mutating drop: dropping
# in place on a boolean-mask slice raises SettingWithCopyWarning.
ratings_ba_df = ratings_users_ba_df.drop(columns='location')
ratings_rb_df = ratings_users_rb_df.drop(columns='location')

Since the rating distributions of the two datasets differ, i.e. users are harsher on RateBeer than on BeerAdvocate, we first compare the mean rating of each dataset.

In [17]:
# Average rating per platform, computed over the ratings currently held in
# each frame.
mean_ba = ratings_ba_df['rating'].mean()
mean_rb = ratings_rb_df['rating'].mean()

print("Mean rating in BeerAdvocate:", mean_ba)
print("Mean rating in RateBeer:", mean_rb)

Mean rating in BeerAdvocate: 3.846906563579535
Mean rating in RateBeer: 3.3793541115704993


In [18]:
# Distinct beer identifiers that received at least one rating on each platform.
beer_ba = ratings_ba_df['beer_id'].unique()
beer_rb = ratings_rb_df['beer_id'].unique()

# Identifiers present on both platforms.
# NOTE(review): this compares raw beer_id values, which assumes both sites
# share one identifier space - TODO confirm.
similar_beer = np.intersect1d(beer_ba, beer_rb)

print("Number of beers in BeerAdvocate:", len(beer_ba))
print("Number of beers in RateBeer:", len(beer_rb))
print("Number of beers in common:", len(similar_beer))

Number of beers in BeerAdvocate: 142593
Number of beers in RateBeer: 232437
Number of beers in common: 67861


## Using the Wilson Lower Bound

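To rank the beers robustly, we use the lower bound of the Wilson score interval.
The Wilson lower bound is the lower end of the 95% confidence interval for a beer's true proportion of positive ratings, so beers with only a few ratings are ranked conservatively (https://medium.com/@okanckaya/rating-products-sorting-reviews-aff32cbd29c1). With $n$ ratings, an observed positive fraction $\hat{p}$ and the normal quantile $z$ for the chosen confidence level, it is $\left(\hat{p} + \frac{z^2}{2n} - z\sqrt{\frac{\hat{p}(1-\hat{p}) + \frac{z^2}{4n}}{n}}\right) / \left(1 + \frac{z^2}{n}\right)$.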

Here, a rating counts as negative (i.e. a bad rating) if it is less than or equal to the average rating of its dataset, and as positive (i.e. a good rating) if it is strictly above that average.

We will count separately the number of positive and negative ratings on the two datasets. 

In [19]:
def wilson_lower_bound(pos, neg, confidence=0.95):
    """Return the lower bound of the Wilson score interval for a proportion.

    Parameters
    ----------
    pos : int or float
        Number of positive observations (must be >= 0).
    neg : int or float
        Number of negative observations (must be >= 0).
    confidence : float, default 0.95
        Two-sided confidence level, strictly between 0 and 1.

    Returns
    -------
    float
        Lower end of the Wilson interval for pos / (pos + neg), or 0.0 when
        there are no observations at all.

    Raises
    ------
    ValueError
        If a count is negative or `confidence` is not strictly inside (0, 1).
    """
    # Fail loudly on inputs that would otherwise end in an opaque
    # "math domain error", a nan, or a meaningless score.
    if pos < 0 or neg < 0:
        raise ValueError("pos and neg must be non-negative counts")
    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")

    n = pos + neg
    if n == 0:
        # No evidence at all: rank at the very bottom (float, like every other result).
        return 0.0

    # Two-sided normal quantile, e.g. ~1.96 for confidence=0.95.
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    phat = pos / n
    z2_over_n = z * z / n

    centre = phat + z2_over_n / 2
    margin = z * math.sqrt((phat * (1 - phat) + z2_over_n / 4) / n)
    return (centre - margin) / (1 + z2_over_n)

In [20]:
def summarize_beer_ratings(ratings_df):
    """Aggregate one site's ratings into per-beer statistics.

    A rating is "above average" when it is strictly greater than the mean of
    all ratings in `ratings_df`, and "below average" otherwise (ties included).

    Parameters
    ----------
    ratings_df : pandas.DataFrame
        One row per rating, with at least the columns 'beer_id' and 'rating'.

    Returns
    -------
    pandas.DataFrame
        One row per beer with the columns 'beer_id', 'average_rating',
        'count' (number of rating rows), 'above_average' and 'below_average'
        (number of ratings on each side of the site-wide mean).
    """
    site_mean = ratings_df['rating'].mean()
    flagged = ratings_df.assign(
        above_average=ratings_df['rating'] > site_mean,
        below_average=ratings_df['rating'] <= site_mean,
    )
    # Named aggregation avoids aggregating the grouping key itself;
    # 'size' counts the rows of each group, and summing booleans counts the Trues.
    return flagged.groupby('beer_id', as_index=False).agg(
        average_rating=('rating', 'mean'),
        count=('rating', 'size'),
        above_average=('above_average', 'sum'),
        below_average=('below_average', 'sum'),
    )


# Each site is thresholded against its own mean rating.
beer_rating_rb_df = summarize_beer_ratings(ratings_rb_df)
beer_rating_ba_df = summarize_beer_ratings(ratings_ba_df)

In [23]:
# Join the per-beer statistics of both sites on beer_id, keeping only beers
# present in both. The suffixes mark which site each statistic comes from
# ('_rb' = RateBeer, '_ba' = BeerAdvocate).
beer_rating_df = beer_rating_rb_df.merge(
    beer_rating_ba_df,
    on='beer_id',
    how='inner',
    suffixes=('_rb', '_ba'),
)


In [25]:
# Combined rating of a beer: the plain (unweighted) mean of its two per-site
# average ratings.
beer_rating_df['average_rating'] = (
    beer_rating_df['average_rating_rb'] + beer_rating_df['average_rating_ba']
) / 2

# The rating counts are simply added across the two sites.
summed_stats = ('count', 'above_average', 'below_average')
for stat in summed_stats:
    beer_rating_df[stat] = beer_rating_df[f'{stat}_rb'] + beer_rating_df[f'{stat}_ba']

# The per-site columns have served their purpose; keep only the combined ones.
per_site_columns = [
    f'{stat}_{site}'
    for stat in ('average_rating',) + summed_stats
    for site in ('rb', 'ba')
]
beer_rating_df = beer_rating_df.drop(columns=per_site_columns)

In [29]:
# Wilson lower bound of each beer's share of above-average ratings.
lower_bounds = beer_rating_df.apply(
    lambda beer: wilson_lower_bound(beer['above_average'], beer['below_average']),
    axis=1,
)

# Attach the score and rank the beers from the highest lower bound downwards.
beer_rating_df = (
    beer_rating_df
    .assign(lower_bound=lower_bounds)
    .sort_values(by='lower_bound', ascending=False)
)

beer_rating_df.head(10)

Unnamed: 0,beer_id,average_rating,count,above_average,below_average,lower_bound
39616,113283,4.348602,187,187,0,0.979871
10338,21690,4.121717,934,919,15,0.973672
40324,116684,4.324103,322,319,3,0.97297
42069,126305,3.665346,319,316,3,0.97272
41197,121397,4.632759,262,260,2,0.972599
17986,43947,4.091419,1112,1089,23,0.969154
51838,184008,3.646854,179,178,1,0.969039
25710,66438,4.042035,318,314,4,0.968109
25533,65881,3.977089,1030,1007,23,0.966715
4896,8936,4.017019,2089,2035,54,0.966426


In [30]:
# Mean of the combined 'count' column, i.e. the average number of ratings a
# beer received.
mean_rating_count = beer_rating_df['count'].mean()
print("Average count of ratings for each beer:", mean_rating_count)

Average count of ratings for each beer: 44.47473512031948


The table above shows the top 10 beers ranked by their Wilson lower bound. The top-ranked beer received 187 ratings, well above the average of about 44 ratings per beer, and none of them was at or below the average rating (i.e. it has no bad rating).