In [69]:
# imports

import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import math
import scipy.stats as st

In [70]:
# load the merged dataframes
beers_combined_df = pd.read_csv('data/pMerged/beers_combined_df.csv')
ratings_combined_df = pd.read_csv('data/pMerged/ratings_combined_df.csv')

# load the brewery dataframes
breweries_ba_df = pd.read_csv('data/BeerAdvocate/breweries.csv')
breweries_rb_df = pd.read_csv('data/RateBeer/breweries.csv')

  ratings_combined_df = pd.read_csv('data/pMerged/ratings_combined_df.csv')


In [71]:
# process and concatenate brewery dataframes

# BA and RB dont use the same brewery_id's
# --> add column 'dataset' to the brewery dataframes, containing BA and RB respectively
breweries_ba_df['dataset'] = 'BeerAdvocate'
breweries_rb_df['dataset'] = 'RateBeer'

# concatenate the two dataframes
breweries_combined_df = pd.concat([breweries_ba_df, breweries_rb_df], ignore_index=True)

# rename id column of breweries df to brewery_id
breweries_combined_df.rename(columns={'id': 'brewery_id'}, inplace=True)

In [72]:
# add the location from the breweries df to the ratings df
# only add BA brewery_id to BA entries and RB brewery_id to RB entries
ratings_combined_df = pd.merge(ratings_combined_df, breweries_combined_df[['brewery_id', 'dataset', 'location']], how='left', on=['brewery_id', 'dataset'])

## Lower Bound of Wilson

To correctly rank the beers, we will consider the lower bound of Wilson score. 
Wilson lower bound score presents the usefulness value of a product/comment to the user with a 95% confidence interval (https://medium.com/@okanckaya/rating-products-sorting-reviews-aff32cbd29c1).

Here we will consider as negative ratings (i.e bad ratings) the number of ratings that are below the average rating. And we will consider as positive rating (i.e. good rating) the number of ratings that are above the average.

In [73]:
def wilson_lower_bound (pos, neg, confidence=0.95):
    n = pos + neg
    if n == 0:
        return 0
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    phat = 1.0 * pos / n
    return (phat + z * z / (2 * n) - z * math. sqrt ((phat * (1 - phat) + z * z / (4 * n)) / n)) / (1 + z * z / n)

In [74]:
ratings_combined_df.groupby('beer_id').head()

Unnamed: 0,beer_name,beer_id,brewery_id,brewery_name,style,date,user_id,user_name,appearance,aroma,palate,taste,overall,rating,text,year,month,year_month,dataset,location
0,Régab,142544,37262,Societe des Brasseries du Gabon (SOBRAGA),Pale Lager,2015-08-20 10:00:00,nmann08.184925,nmann08,3.25,2.75,3.25,2.75,3.0,2.88,"From a bottle, pours a piss yellow color with ...",2015,8,2015-08-01,BeerAdvocate,Gabon
1,Americano Imperial Coffee IPA,131646,31221,Siren Craft Brew,India Pale Ale (IPA),2014-08-29 10:00:00,nmann08.184925,nmann08,,,,,,3.50,"On draft, pours a dark brown with red hues, wi...",2014,8,2014-08-01,BeerAdvocate,England
2,Broken Dream,99556,31221,Siren Craft Brew,Sweet Stout,2014-08-29 10:00:00,nmann08.184925,nmann08,,,,,,4.00,"On draft, pours an opaque black with a light b...",2014,8,2014-08-01,BeerAdvocate,England
3,Maiden 2013,127138,31221,Siren Craft Brew,Barley Wine,2014-08-29 10:00:00,nmann08.184925,nmann08,,,,,,3.75,"On draft, pours a translucent but very dark re...",2014,8,2014-08-01,BeerAdvocate,England
4,Shattered Dream,135343,31221,Siren Craft Brew,Imperial Stout,2014-08-29 10:00:00,nmann08.184925,nmann08,,,,,,4.00,"On draft, pours a dark brown, pretty much opaq...",2014,8,2014-08-01,BeerAdvocate,England
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5455436,Meduz Blonde,229764,17699,Meduz Brewery,Bière de Garde,2015-06-05 10:00:00,372299,Mitch87,4.00,8.00,4.00,8.00,17.0,4.10,Medium hoppy with a mild fruity and citrus fin...,2015,6,2015-06-01,RateBeer,France
5455438,Saint-Mihal Beer Lager de Luxe,316004,29320,Unknown Origin - France,Pale Lager,2017-07-19 10:00:00,483276,stmihalfan,5.00,10.00,5.00,10.00,20.0,5.00,Probably my favourite beer ever. its really re...,2017,7,2017-07-01,RateBeer,France
5455448,Holland Beer Dunkel,86571,9468,Holland Beer,Schwarzbier,2008-04-20 10:00:00,72689,microlover,3.00,3.00,4.00,7.00,15.0,3.20,"south-german Dunkel, could be a bit more tasty...",2008,4,2008-04-01,RateBeer,Thailand
5455449,Holland Beer Lager,86570,9468,Holland Beer,Dortmunder/Helles,2008-04-20 10:00:00,72689,microlover,5.00,8.00,4.00,8.00,20.0,4.50,Found this place by coming home from the seasi...,2008,4,2008-04-01,RateBeer,Thailand


In [75]:
# calculate if a beer is above or below average
# ratings are not equally strict for BA and RB
# --> calculate the average rating for each subset

average_rating_ba = ratings_combined_df[ratings_combined_df['dataset'] == 'BeerAdvocate'].rating.mean()
average_rating_rb = ratings_combined_df[ratings_combined_df['dataset'] == 'RateBeer'].rating.mean()

# compare the rating to average_rating for the respective subset
ratings_combined_df['above_average'] = False
ratings_combined_df.loc[ratings_combined_df['dataset'] == 'BeerAdvocate', 'above_average'] = ratings_combined_df['rating'] > average_rating_ba
ratings_combined_df.loc[ratings_combined_df['dataset'] == 'RateBeer', 'above_average'] = ratings_combined_df['rating'] > average_rating_rb
ratings_combined_df['below_average'] = ~ratings_combined_df['above_average']

# copy the ratings_combined_df to a new dataframe
ratings_beerGroups_df = ratings_combined_df.copy()

# group the ratings by beer_id & dataset and take average of rating and sum of above_average
ratings_beerGroups_df = ratings_beerGroups_df.groupby(['beer_id', 'dataset']).agg({'rating': 'mean', 'above_average': 'sum', 'below_average': 'sum'}).reset_index()
ratings_beerGroups_df.rename(columns={'rating': 'average_rating'}, inplace=True)

In [79]:
# add beer_name, brewery_id and location to the ratings_beerGroups_df
ratings_beerGroups_df = pd.merge(ratings_beerGroups_df, beers_combined_df[['beer_id', 'dataset', 'beer_name', 'brewery_id']], how='left', on=['beer_id', 'dataset'])
ratings_beerGroups_df = pd.merge(ratings_beerGroups_df, breweries_combined_df[['brewery_id', 'dataset', 'location']], how='left', on=['brewery_id', 'dataset'])

In [84]:
# checking for duplicates in top X of wilson lower bound ranking might be different from merging same beers first and then calculating the wilson lower bound
# get the min and max values for average_rating in BA and RB
min_avg_r_BA = ratings_beerGroups_df[ratings_beerGroups_df['dataset'] == 'BeerAdvocate'].average_rating.min()
max_avg_r_BA = ratings_beerGroups_df[ratings_beerGroups_df['dataset'] == 'BeerAdvocate'].average_rating.max()
min_avg_r_RB = ratings_beerGroups_df[ratings_beerGroups_df['dataset'] == 'RateBeer'].average_rating.min()
max_avg_r_RB = ratings_beerGroups_df[ratings_beerGroups_df['dataset'] == 'RateBeer'].average_rating.max()

# normalize the average_rating to a scale from 0 to 1 for all entries in BA and RB
ratings_beerGroups_df.loc[ratings_beerGroups_df['dataset'] == 'BeerAdvocate', 'average_rating'] = (ratings_beerGroups_df['average_rating'] - min_avg_r_BA) / (max_avg_r_BA - min_avg_r_BA)
ratings_beerGroups_df.loc[ratings_beerGroups_df['dataset'] == 'RateBeer', 'average_rating'] = (ratings_beerGroups_df['average_rating'] - min_avg_r_RB) / (max_avg_r_RB - min_avg_r_RB)


In [86]:
# do a similar thing for the number of above_average ratings and below_average ratings

In [54]:
## Probably delete this cell



# --> normalize average_ratings and above_average/below_average counts by number of ratings within each dataset
ratings_beerGroups_df['average_rating_normalized'] = ratings_beerGroups_df['average_rating']
ratings_beerGroups_df['above_average_normalized'] = ratings_beerGroups_df['above_average']
ratings_beerGroups_df['below_average_normalized'] = ratings_beerGroups_df['below_average']
# Q: do i need to normalize by dividing only be the sum of above_average/below_average in that dataset or by the total number of ratings in the two datasets?
ratings_beerGroups_df.loc[ratings_beerGroups_df['dataset'] == 'BeerAdvocate', 'average_rating_normalized'] = ratings_beerGroups_df['average_rating'] / ratings_beerGroups_df['above_average'].sum()
ratings_beerGroups_df.loc[ratings_beerGroups_df['dataset'] == 'RateBeer', 'average_rating_normalized'] = ratings_beerGroups_df['average_rating'] / ratings_beerGroups_df['below_average'].sum()
ratings_beerGroups_df.loc[ratings_beerGroups_df['dataset'] == 'BeerAdvocate', 'above_average_normalized'] = ratings_beerGroups_df['above_average'] / ratings_beerGroups_df['above_average'].sum()
ratings_beerGroups_df.loc[ratings_beerGroups_df['dataset'] == 'RateBeer', 'below_average_normalized'] = ratings_beerGroups_df['below_average'] / ratings_beerGroups_df['below_average'].sum()


In [53]:
# compute the lower bound of the average rating for each beer
ratings_beerGroups_df['lower_bound'] = ratings_beerGroups_df.apply(lambda row: wilson_lower_bound(row['above_average'], row['below_average']), axis=1)

# sort the dataframe by the lower bound of the average rating
ratings_beerGroups_df.sort_values(by='lower_bound', ascending=False, inplace=True)


KeyboardInterrupt: 

In [44]:
ratings_beerGroups_df.head()

Unnamed: 0,beer_id,dataset,average_rating,above_average,below_average,beer_name_x,lower_bound,beer_name_y,brewery_id,location
0,56716,RateBeer,4.204762,210,0,3 Fonteinen Oude Geuze Vintage (all from 2002-*),0.982036,3 Fonteinen Oude Geuze Vintage (all from 2002-*),2058,Belgium
1,56716,RateBeer,4.204762,210,0,3 Fonteinen Oude Geuze Vintage (all from 2002-*),0.982036,3 Fonteinen Oude Geuze Vintage (all from 2002-*),2058,Belgium
2,56716,RateBeer,4.204762,210,0,3 Fonteinen Oude Geuze Vintage (all from 2002-*),0.982036,3 Fonteinen Oude Geuze Vintage (all from 2002-*),2058,Belgium
3,56716,RateBeer,4.204762,210,0,3 Fonteinen Oude Geuze Vintage (all from 2002-*),0.982036,3 Fonteinen Oude Geuze Vintage (all from 2002-*),2058,Belgium
4,56716,RateBeer,4.204762,210,0,3 Fonteinen Oude Geuze Vintage (all from 2002-*),0.982036,3 Fonteinen Oude Geuze Vintage (all from 2002-*),2058,Belgium


In [None]:
# OLD CODE BELOW

In [18]:
# as the breweries are a more general entity for whose subcomponents (the beers) do not change location, we can group according to brewery_id and take the mean of the ratings
avg_ratings_ba_df = ratings_ba_df.groupby('brewery_id')['rating'].mean().reset_index()
avg_ratings_rb_df = ratings_rb_df.groupby('brewery_id')['rating'].mean().reset_index()
display(avg_ratings_ba_df)

Unnamed: 0,brewery_id,rating
0,1,3.665596
1,2,3.130769
2,3,3.501711
3,4,4.081997
4,5,3.568344
...,...,...
9794,49747,3.833333
9795,49765,3.085714
9796,49781,3.826000
9797,49795,4.080000


In [19]:
# add column with location of brewery to avg_ratings df
avg_ratings_ba_df = avg_ratings_ba_df.merge(breweries_ba_df[['brewery_id', 'location']], on='brewery_id', how='left')
avg_ratings_rb_df = avg_ratings_rb_df.merge(breweries_rb_df[['brewery_id', 'location']], on='brewery_id', how='left')
display(avg_ratings_ba_df)

Unnamed: 0,brewery_id,rating,location
0,1,3.665596,Czech Republic
1,2,3.130769,"United States, Texas"
2,3,3.501711,"United States, Louisiana"
3,4,4.081997,"United States, Maine"
4,5,3.568344,"United States, Washington"
...,...,...,...
9794,49747,3.833333,"United States, Illinois"
9795,49765,3.085714,"United States, Illinois"
9796,49781,3.826000,"United States, New Hampshire"
9797,49795,4.080000,Vietnam


In [21]:
# sort the top 10 breweries by average rating in each df
top10_ba_df = avg_ratings_ba_df.sort_values(by='rating', ascending=False).head(10)
top10_rb_df = avg_ratings_rb_df.sort_values(by='rating', ascending=False).head(10)
display(top10_ba_df)

Unnamed: 0,brewery_id,rating,location
9580,47779,5.0,"United States, Pennsylvania"
9374,46653,5.0,Canada
9689,48719,5.0,"United States, New Mexico"
7861,38705,5.0,Hungary
6754,33739,5.0,"United States, Virginia"
6980,34425,5.0,"United States, California"
9460,47048,5.0,"United States, California"
8260,41033,5.0,"United States, North Carolina"
8773,43688,5.0,"United States, Oregon"
7949,39301,5.0,Canada


In [22]:
# these "top breweries" may have very little ratings --> lets look at the most rated breweries
# count the number of ratings for each brewery
count_ratings_ba_df = ratings_ba_df.groupby('brewery_id')['rating'].count().reset_index()
count_ratings_rb_df = ratings_rb_df.groupby('brewery_id')['rating'].count().reset_index()
display(count_ratings_ba_df)


Unnamed: 0,brewery_id,rating
0,1,1326
1,2,39
2,3,7832
3,4,10809
4,5,646
...,...,...
9794,49747,3
9795,49765,7
9796,49781,5
9797,49795,1


In [23]:
# add column with location of brewery to count_ratings df
count_ratings_ba_df = count_ratings_ba_df.merge(breweries_ba_df[['brewery_id', 'location']], on='brewery_id', how='left')
count_ratings_rb_df = count_ratings_rb_df.merge(breweries_rb_df[['brewery_id', 'location']], on='brewery_id', how='left')
display(count_ratings_ba_df)

Unnamed: 0,brewery_id,rating,location
0,1,1326,Czech Republic
1,2,39,"United States, Texas"
2,3,7832,"United States, Louisiana"
3,4,10809,"United States, Maine"
4,5,646,"United States, Washington"
...,...,...,...
9794,49747,3,"United States, Illinois"
9795,49765,7,"United States, Illinois"
9796,49781,5,"United States, New Hampshire"
9797,49795,1,Vietnam


In [24]:
# sort the top 10 breweries by number of ratings in each df
top10_count_ba_df = count_ratings_ba_df.sort_values(by='rating', ascending=False).head(10)
top10_count_rb_df = count_ratings_rb_df.sort_values(by='rating', ascending=False).head(10)
display(top10_count_ba_df)

Unnamed: 0,brewery_id,rating,location
28,35,50705,"United States, Massachusetts"
109,140,47775,"United States, California"
116,147,45124,"United States, California"
51,64,43402,"United States, Delaware"
879,1199,35217,"United States, Michigan"
226,287,33798,"United States, Michigan"
182,220,27788,"United States, California"
261,345,25344,"United States, Pennsylvania"
104,132,25165,"United States, Oregon"
859,1146,24835,"United States, Illinois"


The top 10 breweries (and probably more) are all located in the US, which is suspicious as only US users where considered. The assumption can be made that US users are more likely to drink US beer and thus also more likely to rate US beer. This skews our perception

In [None]:
def get_wilson_beers(df, attribute, n=10):
    average_rating = df[attribute].mean()
    
    beer_rating_df = df.copy()

    beer_rating_df['above_average'] = beer_rating_df[attribute] > average_rating
    beer_rating_df['below_average'] = beer_rating_df[attribute] <= average_rating

    beer_rating_df = beer_rating_df.groupby('beer_id').agg({attribute: 'mean', 'beer_id': 'count', 'above_average': 'sum', 'below_average': 'sum'})
    beer_rating_df.rename(columns={'beer_id': 'count'}, inplace=True)
    beer_rating_df.reset_index(inplace=True)
    
    # compute the lower bound of the confidence interval for each beer
    beer_rating_df['lower_bound'] = beer_rating_df.apply(lambda row: wilson_lower_bound(row['above_average'], row['below_average']), axis=1)

    # sort by the lower bound of the confidence interval
    beer_rating_df.sort_values(by=['lower_bound'], ascending=False, inplace=True)
    
    # select the top n beers
    beer_rating_df = beer_rating_df.head(n)
    
    print(beer_rating_df)
    
    return beer_rating_df.beer_id.values