In [5]:
# imports

import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# load the brewery tables of both platforms (BeerAdvocate first, then RateBeer)
breweries_ba_df, breweries_rb_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/BeerAdvocate/breweries.csv', 'data/RateBeer/breweries.csv')
)

In [7]:
# Every table exists once per platform: BeerAdvocate (ba) and RateBeer (rb).

# ratings
ratings_ba_df, ratings_rb_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/BeerAdvocate/ratings_BA.csv', 'data/RateBeer/ratings_RB.csv')
)

# users
users_ba_df, users_rb_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/BeerAdvocate/users.csv', 'data/RateBeer/users.csv')
)

# beers
beers_ba_df, beers_rb_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/BeerAdvocate/beers.csv', 'data/RateBeer/beers.csv')
)

In [8]:
# reduce `location` to the country, i.e. the text before the first comma
# (only the first piece is used, so a single split is enough)
users_rb_df['location'] = users_rb_df['location'].str.split(',', n=1).str.get(0)
users_ba_df['location'] = users_ba_df['location'].str.split(',', n=1).str.get(0)

In [9]:
US_country = 'United States'

# restrict both user tables to the users whose country is the United States
users_ba_df = users_ba_df.loc[users_ba_df['location'].eq(US_country)]
users_rb_df = users_rb_df.loc[users_rb_df['location'].eq(US_country)]

In [10]:
# merge the ratings and users data frames to get the location of the user for each rating
ratings_users_ba_df = pd.merge(ratings_ba_df, users_ba_df[['user_id', 'location']], on='user_id', how='inner')
ratings_users_rb_df = pd.merge(ratings_rb_df, users_rb_df[['user_id', 'location']], on='user_id', how='inner')

# keep only the ratings written by users from the US, then remove the helper
# `location` column again. `drop(columns=...)` returns a new frame; dropping
# in place on the filtered slice would raise a SettingWithCopyWarning.
ratings_ba_df = ratings_users_ba_df[ratings_users_ba_df.location == US_country].drop(columns='location')
ratings_rb_df = ratings_users_rb_df[ratings_users_rb_df.location == US_country].drop(columns='location')

In [11]:
# Assumption: BA and RB don't use the same brewery_id's

# rename id column of breweries df to brewery_id
breweries_ba_df = breweries_ba_df.rename(columns={'id': 'brewery_id'})
breweries_rb_df = breweries_rb_df.rename(columns={'id': 'brewery_id'})
# add the location from the breweries df to the ratings df
# (a `location` column left over from a previous run of this cell is dropped first;
#  otherwise re-running the cell would produce `location_x` / `location_y` columns)
ratings_ba_df = ratings_ba_df.drop(columns='location', errors='ignore').merge(breweries_ba_df[['brewery_id', 'location']], on='brewery_id', how='left')
ratings_rb_df = ratings_rb_df.drop(columns='location', errors='ignore').merge(breweries_rb_df[['brewery_id', 'location']], on='brewery_id', how='left')

In [13]:
# inspect the BeerAdvocate ratings of US users, now carrying the brewery's `location`
display(ratings_ba_df)

Unnamed: 0,beer_name,beer_id,brewery_id,brewery_name,style,date,user_id,user_name,appearance,aroma,palate,taste,overall,rating,text,location
0,Régab,142544,37262,Societe des Brasseries du Gabon (SOBRAGA),Euro Pale Lager,1440064800,nmann08.184925,nmann08,3.25,2.75,3.25,2.75,3.0,2.88,"From a bottle, pours a piss yellow color with ...",Gabon
1,Americano Imperial Coffee IPA,131646,31221,Siren Craft Brew,English India Pale Ale (IPA),1409306400,nmann08.184925,nmann08,,,,,,3.50,"On draft, pours a dark brown with red hues, wi...",England
2,Broken Dream,99556,31221,Siren Craft Brew,Milk / Sweet Stout,1409306400,nmann08.184925,nmann08,,,,,,4.00,"On draft, pours an opaque black with a light b...",England
3,Maiden 2013,127138,31221,Siren Craft Brew,English Barleywine,1409306400,nmann08.184925,nmann08,,,,,,3.75,"On draft, pours a translucent but very dark re...",England
4,Shattered Dream,135343,31221,Siren Craft Brew,Russian Imperial Stout,1409306400,nmann08.184925,nmann08,,,,,,4.00,"On draft, pours a dark brown, pretty much opaq...",England
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2346489,Michigan Brewing Superior Stout,4474,565,Michigan Brewing Company,American Stout,1176112800,quaffer83.125656,Quaffer83,3.00,3.00,3.00,3.50,4.0,3.40,I'd call the overall character roasty. Not ver...,"United States, Michigan"
2346490,Chaotic Double IPA,70471,23640,Twisted Manzanita Ales,American Double / Imperial IPA,1397383200,everman.532342,Everman,4.75,4.50,4.75,4.50,4.5,4.54,"While it is a double IPA, the hops presence is...","United States, California"
2346491,IPA,61818,23640,Twisted Manzanita Ales,American IPA,1309168800,justin0001.352175,Justin0001,3.00,2.50,3.00,1.50,2.0,2.08,"San Diego has countless fantastic taco shops, ...","United States, California"
2346492,Riverwalk Blonde,61815,23640,Twisted Manzanita Ales,American Blonde Ale,1400234400,joetex.800347,JoeTex,3.50,4.00,4.75,4.75,4.5,4.45,This is a great session beer. You can drink th...,"United States, California"


In [18]:
# A brewery is the coarser entity and its beers do not change location,
# so the ratings are averaged per brewery_id.
avg_ratings_ba_df = ratings_ba_df.groupby('brewery_id').agg(rating=('rating', 'mean')).reset_index()
avg_ratings_rb_df = ratings_rb_df.groupby('brewery_id').agg(rating=('rating', 'mean')).reset_index()
display(avg_ratings_ba_df)

Unnamed: 0,brewery_id,rating
0,1,3.665596
1,2,3.130769
2,3,3.501711
3,4,4.081997
4,5,3.568344
...,...,...
9794,49747,3.833333
9795,49765,3.085714
9796,49781,3.826000
9797,49795,4.080000


In [19]:
# add column with location of brewery to avg_ratings df
# (only the base columns are taken from the left frame, so re-running this cell
#  does not pile up `location_x` / `location_y` columns)
avg_ratings_ba_df = avg_ratings_ba_df[['brewery_id', 'rating']].merge(breweries_ba_df[['brewery_id', 'location']], on='brewery_id', how='left')
avg_ratings_rb_df = avg_ratings_rb_df[['brewery_id', 'rating']].merge(breweries_rb_df[['brewery_id', 'location']], on='brewery_id', how='left')
display(avg_ratings_ba_df)

Unnamed: 0,brewery_id,rating,location
0,1,3.665596,Czech Republic
1,2,3.130769,"United States, Texas"
2,3,3.501711,"United States, Louisiana"
3,4,4.081997,"United States, Maine"
4,5,3.568344,"United States, Washington"
...,...,...,...
9794,49747,3.833333,"United States, Illinois"
9795,49765,3.085714,"United States, Illinois"
9796,49781,3.826000,"United States, New Hampshire"
9797,49795,4.080000,Vietnam


In [21]:
# order the breweries by average rating (best first) and keep the first 10 of each df
top10_ba_df = avg_ratings_ba_df.sort_values('rating', ascending=False).iloc[:10]
top10_rb_df = avg_ratings_rb_df.sort_values('rating', ascending=False).iloc[:10]
display(top10_ba_df)

Unnamed: 0,brewery_id,rating,location
9580,47779,5.0,"United States, Pennsylvania"
9374,46653,5.0,Canada
9689,48719,5.0,"United States, New Mexico"
7861,38705,5.0,Hungary
6754,33739,5.0,"United States, Virginia"
6980,34425,5.0,"United States, California"
9460,47048,5.0,"United States, California"
8260,41033,5.0,"United States, North Carolina"
8773,43688,5.0,"United States, Oregon"
7949,39301,5.0,Canada


In [22]:
# These "top breweries" may have very few ratings, so look at the most rated breweries instead:
# count the number of (non-missing) ratings per brewery.
count_ratings_ba_df = ratings_ba_df.groupby('brewery_id').agg(rating=('rating', 'count')).reset_index()
count_ratings_rb_df = ratings_rb_df.groupby('brewery_id').agg(rating=('rating', 'count')).reset_index()
display(count_ratings_ba_df)


Unnamed: 0,brewery_id,rating
0,1,1326
1,2,39
2,3,7832
3,4,10809
4,5,646
...,...,...
9794,49747,3
9795,49765,7
9796,49781,5
9797,49795,1


In [23]:
# add column with location of brewery to count_ratings df
# (only the base columns are taken from the left frame, so re-running this cell
#  does not pile up `location_x` / `location_y` columns)
count_ratings_ba_df = count_ratings_ba_df[['brewery_id', 'rating']].merge(breweries_ba_df[['brewery_id', 'location']], on='brewery_id', how='left')
count_ratings_rb_df = count_ratings_rb_df[['brewery_id', 'rating']].merge(breweries_rb_df[['brewery_id', 'location']], on='brewery_id', how='left')
display(count_ratings_ba_df)

Unnamed: 0,brewery_id,rating,location
0,1,1326,Czech Republic
1,2,39,"United States, Texas"
2,3,7832,"United States, Louisiana"
3,4,10809,"United States, Maine"
4,5,646,"United States, Washington"
...,...,...,...
9794,49747,3,"United States, Illinois"
9795,49765,7,"United States, Illinois"
9796,49781,5,"United States, New Hampshire"
9797,49795,1,Vietnam


In [24]:
# order the breweries by number of ratings (most rated first) and keep the first 10 of each df
top10_count_ba_df = count_ratings_ba_df.sort_values('rating', ascending=False).iloc[:10]
top10_count_rb_df = count_ratings_rb_df.sort_values('rating', ascending=False).iloc[:10]
display(top10_count_ba_df)

Unnamed: 0,brewery_id,rating,location
28,35,50705,"United States, Massachusetts"
109,140,47775,"United States, California"
116,147,45124,"United States, California"
51,64,43402,"United States, Delaware"
879,1199,35217,"United States, Michigan"
226,287,33798,"United States, Michigan"
182,220,27788,"United States, California"
261,345,25344,"United States, Pennsylvania"
104,132,25165,"United States, Oregon"
859,1146,24835,"United States, Illinois"


The top 10 breweries by number of ratings (and probably more) are all located in the US, which is suspicious given that only US users were considered. A plausible assumption is that US users are more likely to drink, and thus also more likely to rate, US beer. This skews our perception of brewery popularity.

In [None]:
def get_wilson_beers(df, attribute, n=10):
    """Return the ids of the ``n`` beers with the highest Wilson lower bound.

    Every rating is turned into a vote: "above average" when its ``attribute``
    value is strictly greater than the mean of ``attribute`` over all rows of
    ``df``, "below average" when it is less than or equal to that mean. The two
    vote counts of each beer are passed to ``wilson_lower_bound`` and the beers
    are ranked by the value it returns, highest first.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per rating; must contain a ``beer_id`` column and the
        ``attribute`` column. The frame is not modified.
    attribute : str
        Name of the numeric column the ranking is based on.
    n : int, default 10
        Number of beers to return.

    Returns
    -------
    numpy.ndarray
        ``beer_id`` values of the top ``n`` beers, best first.

    Notes
    -----
    * Prints the summary table of the selected beers as a side effect.
    * ``wilson_lower_bound(above, below)`` is not defined in this function and
      must be available in the notebook namespace.
    * Rows whose ``attribute`` is missing count as neither above nor below
      average (comparisons with NaN are False) but are included in ``count``.
    """
    average_rating = df[attribute].mean()

    # Copy only the two columns that are needed: rating frames can carry wide
    # columns (e.g. review text) that would make a full copy needlessly expensive.
    votes_df = df[['beer_id', attribute]].copy()
    votes_df['above_average'] = votes_df[attribute] > average_rating
    votes_df['below_average'] = votes_df[attribute] <= average_rating

    # One row per beer: mean attribute, number of ratings and the two vote counts.
    beer_rating_df = (
        votes_df.groupby('beer_id')
        .agg({attribute: 'mean', 'beer_id': 'count', 'above_average': 'sum', 'below_average': 'sum'})
        .rename(columns={'beer_id': 'count'})
        .reset_index()
    )

    # compute the lower bound of the confidence interval for each beer;
    # iterating over the two count columns avoids building a Series per row and
    # also works when there are no beers at all
    beer_rating_df['lower_bound'] = [
        wilson_lower_bound(above, below)
        for above, below in zip(beer_rating_df['above_average'], beer_rating_df['below_average'])
    ]

    # sort by the lower bound of the confidence interval and select the top n beers
    beer_rating_df = beer_rating_df.sort_values(by=['lower_bound'], ascending=False).head(n)

    print(beer_rating_df)

    return beer_rating_df.beer_id.values