In [110]:
# imports

import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import math
import scipy.stats as st

## Load Datasets and Simple Processing

In [111]:
# load the merged dataframes
beers_combined_df = pd.read_csv('data/pMerged/beers_combined_df.csv')
# low_memory=False lets pandas infer every column's dtype from the whole file
# instead of chunk by chunk; chunked inference is what produces a DtypeWarning
# (mixed types within a column), which this statement presumably triggered
# NOTE(review): this trades higher peak memory for consistent dtypes
ratings_combined_df = pd.read_csv('data/pMerged/ratings_combined_df.csv', low_memory=False)

# load the brewery dataframes (one file per site)
breweries_ba_df = pd.read_csv('data/BeerAdvocate/breweries.csv')
breweries_rb_df = pd.read_csv('data/RateBeer/breweries.csv')

  ratings_combined_df = pd.read_csv('data/pMerged/ratings_combined_df.csv')


In [112]:
# Build one brewery table covering both sites.

# BA and RB do not share brewery ids, so an id alone is ambiguous
# --> tag every row with the site it comes from (column 'dataset')
breweries_ba_df['dataset'] = 'BeerAdvocate'
breweries_rb_df['dataset'] = 'RateBeer'

# stack both tables under a fresh 0..n-1 index and expose the 'id' column as
# 'brewery_id', the key name the later merges join on
breweries_combined_df = (
    pd.concat([breweries_ba_df, breweries_rb_df], ignore_index=True)
    .rename(columns={'id': 'brewery_id'})
)

In [113]:
# attach the brewery location to every rating
# the join key is (brewery_id, dataset), so BA ratings only pick up BA
# breweries and RB ratings only pick up RB breweries; ratings without a
# matching brewery are kept (left join) with a missing location
brewery_locations = breweries_combined_df[['brewery_id', 'dataset', 'location']]
ratings_combined_df = ratings_combined_df.merge(
    brewery_locations,
    on=['brewery_id', 'dataset'],
    how='left',
)

## Group the ratings for each product, while keeping distinction BA/RB

In [114]:
# Flag every rating as above or below the average of its own site.
# BA and RB ratings are not equally strict
# --> compare each rating against the mean of its own dataset
is_ba = ratings_combined_df['dataset'] == 'BeerAdvocate'
is_rb = ratings_combined_df['dataset'] == 'RateBeer'

average_rating_ba = ratings_combined_df.loc[is_ba, 'rating'].mean()
average_rating_rb = ratings_combined_df.loc[is_rb, 'rating'].mean()

# start from False everywhere, then fill in the rows of each site
ratings_combined_df['above_average'] = False
ratings_combined_df.loc[is_ba, 'above_average'] = ratings_combined_df.loc[is_ba, 'rating'].gt(average_rating_ba)
ratings_combined_df.loc[is_rb, 'above_average'] = ratings_combined_df.loc[is_rb, 'rating'].gt(average_rating_rb)
# everything that is not strictly above the mean (including ratings exactly
# equal to it) is counted as below average
ratings_combined_df['below_average'] = ~ratings_combined_df['above_average']

# one row per (beer_id, dataset): mean rating plus the number of
# above-/below-average ratings; name and brewery are taken from the first rating
ratings_beerGroups_df = (
    ratings_combined_df
    .groupby(['beer_id', 'dataset'])
    .agg(
        beer_name=('beer_name', 'first'),
        brewery_id=('brewery_id', 'first'),
        average_rating=('rating', 'mean'),
        above_average=('above_average', 'sum'),
        below_average=('below_average', 'sum'),
    )
    .reset_index()
)

In [115]:
# attach the brewery location to every per-beer row
# joined on (brewery_id, dataset) so ids from one site never match the other
beer_group_locations = breweries_combined_df[['brewery_id', 'dataset', 'location']]
ratings_beerGroups_df = ratings_beerGroups_df.merge(
    beer_group_locations,
    on=['brewery_id', 'dataset'],
    how='left',
)

## Merging the ratings from the two datasets

Merge the two datasets by normalizing each beer's average_rating and weighting it by the number of ratings it is based on.

In [116]:
# work on a copy so ratings_beerGroups_df keeps the un-normalized averages
ratings_beerGroupsMERGED_df = ratings_beerGroups_df.copy()

# Normalize
# select the rows (and their average_rating values) of each site once
ba_rows = ratings_beerGroupsMERGED_df['dataset'] == 'BeerAdvocate'
rb_rows = ratings_beerGroupsMERGED_df['dataset'] == 'RateBeer'
ba_avg_ratings = ratings_beerGroupsMERGED_df.loc[ba_rows, 'average_rating']
rb_avg_ratings = ratings_beerGroupsMERGED_df.loc[rb_rows, 'average_rating']

# smallest and largest average_rating observed on each site
min_avg_r_BA, max_avg_r_BA = ba_avg_ratings.min(), ba_avg_ratings.max()
min_avg_r_RB, max_avg_r_RB = rb_avg_ratings.min(), rb_avg_ratings.max()

# min-max scale average_rating to [0, 1], separately for BA and RB
ratings_beerGroupsMERGED_df.loc[ba_rows, 'average_rating'] = (ba_avg_ratings - min_avg_r_BA) / (max_avg_r_BA - min_avg_r_BA)
ratings_beerGroupsMERGED_df.loc[rb_rows, 'average_rating'] = (rb_avg_ratings - min_avg_r_RB) / (max_avg_r_RB - min_avg_r_RB)

### Merging average_rating
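To compute the merged average_rating correctly, we need to:
- weight each average_rating by the number of ratings it is based on
- sum the weighted average_ratings that belong to the same beer
- divide by the sum of the weights

That is, for the entries $i$ of one beer: $\text{average\_rating} = \frac{\sum_i w_i \, r_i}{\sum_i w_i}$.

The aggregation sums both the weights and the weighted average_ratings, so afterwards we only need to divide the summed average_rating by the weight column, which at that point holds the sum of weights.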


In [117]:
# weight of a row = total number of ratings behind it (above + below average)
rating_counts = ratings_beerGroupsMERGED_df['above_average'] + ratings_beerGroupsMERGED_df['below_average']

# scale the weights to (0, 1] by dividing by the maximum only
# (no min-shift, so that no weight becomes 0)
max_weight = rating_counts.max()
ratings_beerGroupsMERGED_df['weight'] = rating_counts / max_weight

# turn average_rating into its weighted contribution
ratings_beerGroupsMERGED_df['average_rating'] = ratings_beerGroupsMERGED_df['weight'] * ratings_beerGroupsMERGED_df['average_rating']

# collapse to one row per beer_name; columns not listed below (beer_id,
# dataset, brewery_id) are dropped by the aggregation
# NOTE(review): rows are matched on the name alone, so different beers that
# share a name are pooled and keep the first location -- confirm this is intended
ratings_beerGroupsMERGED_df = (
    ratings_beerGroupsMERGED_df
    .groupby(['beer_name'])
    .agg(
        location=('location', 'first'),
        average_rating=('average_rating', 'sum'),
        above_average=('above_average', 'sum'),
        below_average=('below_average', 'sum'),
        weight=('weight', 'sum'),
    )
    .reset_index()
)

# weighted mean = sum of weighted ratings / sum of weights
ratings_beerGroupsMERGED_df['average_rating'] = ratings_beerGroupsMERGED_df['average_rating'].div(ratings_beerGroupsMERGED_df['weight'])

# the helper column is no longer needed
ratings_beerGroupsMERGED_df = ratings_beerGroupsMERGED_df.drop(columns=['weight'])

## Lower Bound of Wilson

To rank the beers fairly, we use the lower bound of the Wilson score interval.
The Wilson lower bound is a conservative estimate of the true share of positive ratings at a given confidence level (95% here), so a beer with only a handful of good ratings is not ranked above a beer with many consistently good ratings (https://medium.com/@okanckaya/rating-products-sorting-reviews-aff32cbd29c1).

Here, a rating counts as positive (i.e. a good rating) if it is above the average rating of its dataset, and as negative (i.e. a bad rating) otherwise.

In [118]:
def wilson_lower_bound(pos, neg, confidence=0.95):
    """Return the lower bound of the Wilson score interval for a proportion.

    Parameters
    ----------
    pos : int or float
        Number of positive observations.
    neg : int or float
        Number of negative observations.
    confidence : float, default 0.95
        Two-sided confidence level of the interval.

    Returns
    -------
    float
        Lower bound of the interval for the share ``pos / (pos + neg)``;
        ``0.0`` when there are no observations at all.
    """
    n = pos + neg
    if n == 0:
        # no observations: return a float so every code path has the same type
        return 0.0

    # two-sided critical value of the standard normal distribution
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    z_sq = z * z
    phat = pos / n

    centre = phat + z_sq / (2 * n)
    margin = z * math.sqrt((phat * (1 - phat) + z_sq / (4 * n)) / n)
    bound = (centre - margin) / (1 + z_sq / n)

    # for pos == 0 centre and margin are mathematically equal, so rounding can
    # leave a tiny negative value; a proportion bound cannot be below zero
    if bound < 0:
        return 0.0
    return float(bound)

In [119]:
# Wilson lower bound of the share of above-average ratings, one value per beer
ratings_beerGroupsMERGED_df['lower_bound'] = [
    wilson_lower_bound(n_above, n_below)
    for n_above, n_below in zip(
        ratings_beerGroupsMERGED_df['above_average'],
        ratings_beerGroupsMERGED_df['below_average'],
    )
]

# rank the beers: highest lower bound first
ratings_beerGroupsMERGED_df = ratings_beerGroupsMERGED_df.sort_values(by='lower_bound', ascending=False)


In [120]:
# preview the first five rows of the merged per-beer table
ratings_beerGroupsMERGED_df.head(5)

Unnamed: 0,beer_name,location,average_rating,above_average,below_average,lower_bound
3060,3 Fonteinen Oude Geuze Vintage (all from 2002-*),Belgium,0.82328,210,0,0.982036
189367,Lost Abbey Isabelle Proximus,"United States, California",0.845354,389,2,0.981544
266109,Russian River Pliny the Younger,"United States, California",0.860782,636,6,0.979762
110258,Evil Twin Double Barrel Jesus,"United States, New York",0.86156,183,0,0.97944
314095,Tilquin Oude Quetsche à lAncienne,Belgium,0.79638,178,0,0.978875


## Plotting location

In [126]:
# preview the location strings of the first ten rows
ratings_beerGroupsMERGED_df.loc[:, 'location'].head(10)


3060                        Belgium
189367    United States, California
266109    United States, California
110258      United States, New York
314095                      Belgium
149329       United States, Vermont
149274       United States, Vermont
149341       United States, Vermont
131936      United States, Illinois
334933                      Belgium
Name: location, dtype: object

## Per-Month Analysis