# Pre-processing and exploration of the data

# Summary
* Exploration of the Datasets - Augmented Milestone 2 
    * Pre-processing : checking for NaN values
        * Beer datasets
        * User datasets
        * Brewery datasets
    * Exploration of the datasets
        * Textual reviews
* Analysis for Data Story - Milestone 3
    * Pre-processing for bias correction
    * Bias correction
    * Beer characteristics
    * SAT dataset matches with BeerAdvocate and RateBeer
    * Querying the dataset for the beers preferred by each country
    
* Graphs and Plots for Data Story
    * t-SNE of SAT beers and countries preferences
    * SAT beer ranking
    * wordclouds
    * Interactive World Map
    
* Addendum : Our helper modules exposed for easy reading and code-checking



In [None]:
#Import of all the necessary libraries:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import re
import pandas as pd
import numpy as np

from PIL import Image

import plotly.tools 
import plotly.graph_objs as go
from  plotly.offline import plot
import plotly.express as px
from sklearn.manifold import TSNE
from plotly.subplots import make_subplots 

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,silhouette_score
from statsmodels.stats import diagnostic
import openai
import nltk
nltk.download('punkt')


#We created a certain number of modules to encapsulate functions we created for this project

from functions import read_data
from functions import NLP_utils
from functions import SAT_helpers
from functions import plot_helpers
from functions.happyness_helpers import *


In [None]:
#Loading the datasets:

# Root folders of the raw data (relative paths)
DATASET_BEERADVOCATE = 'DATA/BeerAdvocate/'
DATASET_RATEBEER = 'DATA/RateBeer/'
DATASET_MATCHEDBEER = 'DATA/matched_beer_data/'

# The same three tables are read for each site, in the order: beers, breweries, users

#Beer advocate dataset
df_adv_beer, df_adv_breweries, df_adv_users = (
    pd.read_csv(DATASET_BEERADVOCATE + table_file)
    for table_file in ('beers.csv', 'breweries.csv', 'users.csv')
)

#Ratebeer dataset
df_rb_beer, df_rb_breweries, df_rb_users = (
    pd.read_csv(DATASET_RATEBEER + table_file)
    for table_file in ('beers.csv', 'breweries.csv', 'users.csv')
)

#Matched dataset - We do not use it in our project.

<a id='preprocessing'></a>

## Pre-processing: checking for NaN values


<a id='beer_processing'></a>

### Beer datasets

In [None]:
#First two rows and (rows, columns) shape of each beer dataset
print("BeerAdvocate beers:")
display(df_adv_beer.head(2))
# DataFrame.shape is a (rows, columns) pair, unpacked straight into the message
print("Beer advocate beer dataset has {} rows and {} colomns".format(*df_adv_beer.shape))

print("RateBeer beers:")
display(df_rb_beer.head(2))
print("RateBeer beer dataset has {} rows and {} colomns".format(*df_rb_beer.shape))


In [None]:
#We are looking at the proportion of NaNs in each column of the beer datasets.
##Creation of one summary dataframe per beer dataset, each having 3 columns and one row per beer column.
#The column names are stored in `nan_summary_columns` (not `list`) so that the builtin `list` is not shadowed.
nan_summary_columns = ['Total','NaN_proportion','NaN_number']
dfnan_adv = pd.DataFrame(columns = nan_summary_columns)
dfnan_rate = pd.DataFrame(columns = nan_summary_columns)


dfnan_adv['NaN_proportion'] = (df_adv_beer.isna().sum()/df_adv_beer.shape[0]) #proportion of NaN values in each column 
dfnan_adv['NaN_number'] = (df_adv_beer.isna().sum()) #total number of NaN values in each column 
dfnan_adv.loc[:,'Total'] = df_adv_beer.shape[0] #Number of rows in the dataset

dfnan_rate['NaN_proportion'] = (df_rb_beer.isna().sum()/df_rb_beer.shape[0]) #proportion of NaN values in each column
dfnan_rate['NaN_number'] = (df_rb_beer.isna().sum()) #total number of NaN values in each column
dfnan_rate.loc[:,'Total'] = df_rb_beer.shape[0] #Number of rows in the dataset


In [None]:
#Distribution of NaNs per column of beers.csv, for BeerAdvocate (first panel) and RateBeer (second panel)
plt.close('all')
fig = plt.figure(figsize=(11,6))

ax1 = plt.subplot(221)
ax2 = plt.subplot(222)

# One entry per panel: (axis, NaN summary, legend label of the background bar,
# legend label of the NaN bar, x-axis label)
nan_panels = (
    (ax1, dfnan_adv, "Valid values", "NaN Values", "NaN values BeerAdvocate"),
    (ax2, dfnan_rate, "Valid Values", "NaN", "NaN values RateBeer"),
)

for panel_ax, nan_summary, background_label, nan_label, x_label in nan_panels:
    # Pastel bar first (x="Total", clipped by the 0-1 x-limit), then the muted NaN-proportion bar on top
    sns.set_color_codes("pastel")
    sns.barplot(x="Total", y=nan_summary.index, data=nan_summary, label=background_label, color="b", ax=panel_ax)
    sns.set_color_codes("muted")
    sns.barplot(x='NaN_proportion', y=nan_summary.index, data=nan_summary, label=nan_label, color="b", ax=panel_ax)
    panel_ax.legend(ncol=1, loc="upper right", frameon=True)
    panel_ax.set(xlim=(0, 1), ylabel="", xlabel=x_label)

plt.suptitle('Missing values proportion per columns per beer dataset', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

We can see that several columns in the beer datasets have a high proportion of NaN values. For instance, around 90% of the values of avg_matched_valid_ratings are NaN in both datasets. In each dataset, 4 columns have a high NaN content. 

In [None]:
# Check for missing values in the breweries datasets of both sites:
# each printed boolean is True as soon as one cell of the corresponding dataset is NaN.
# (Original note: we see that there are no missing values.)


print('Test of the presences of NaN in the entire breweries datasets from AdvocatedBeer and RateBeer:', df_adv_breweries.isna().values.any(),
      'and ', df_rb_breweries.isna().values.any())

In [None]:
# Proportion of missing values in each column of the BeerAdvocate *users* dataset
# (original note: there are less than 30% of missing values)
df_adv_users.isna().sum()/df_adv_users.shape[0]

In [None]:
# Proportion of missing values in each column of the RateBeer *users* dataset
# (original note: there are less than 30% of missing values)
df_rb_users.isna().sum()/df_rb_users.shape[0]

We notice that a number of columns have a very high percentage of NA values especially in the Beer datasets. Analysis on these columns would be limited to a very narrow portion of the dataset, so we decide to drop these columns instead. We decide to drop columns that have more than 60% of NA values. 

In [None]:
# Keep only the beer columns whose share of missing values is strictly below 60%
adv_nan_share = df_adv_beer.isna().sum() / df_adv_beer.shape[0]
adv_kept_columns = df_adv_beer.columns[adv_nan_share < 0.60]
df_adv_beer = df_adv_beer[adv_kept_columns]

rb_nan_share = df_rb_beer.isna().sum() / df_rb_beer.shape[0]
rb_kept_columns = df_rb_beer.columns[rb_nan_share < 0.60]
df_rb_beer = df_rb_beer[rb_kept_columns]

In [None]:
#We check that all beers are unique in both datasets:
#each line prints True when the number of distinct beer_id values equals the number of rows
print(df_adv_beer["beer_id"].nunique()==df_adv_beer.shape[0])
print(df_rb_beer["beer_id"].nunique()==df_rb_beer.shape[0])


In [None]:
#Beers with zero ratings are of no interest to us: remove them and report how many rows were removed
unrated_adv_index = df_adv_beer.index[df_adv_beer['nbr_ratings'] == 0]
df_adv_beer_wrating = df_adv_beer.drop(index=unrated_adv_index)
print("{} beers of the Beer Advocate dataset have been dropped".format(len(df_adv_beer) - len(df_adv_beer_wrating)))

unrated_rb_index = df_rb_beer.index[df_rb_beer['nbr_ratings'] == 0]
df_rb_beer_wrating = df_rb_beer.drop(index=unrated_rb_index)
print("{} beers of the RateBeer dataset have been dropped".format(len(df_rb_beer) - len(df_rb_beer_wrating)))


<a id='user_processing'></a>

### Users datasets 

In [None]:
#Let's have a look 
df_adv_users.head()  # first rows of the BeerAdvocate users dataset

In [None]:
df_rb_users.head()  # first rows of the RateBeer users dataset

In [None]:
#Shape of the users datasets: DataFrame.shape is a (rows, columns) pair, unpacked into the message
print("Beer advocate users dataset has {} rows and {} colomns".format(*df_adv_users.shape))
print("RateBeer users dataset has {} rows and {} colomns".format(*df_rb_users.shape))


In [None]:
#Some user ids appear several times in the users datasets. We keep a single row per user id.
df_adv_unique_users=df_adv_users.drop_duplicates(subset="user_id",keep="first") #removes the repeated rows of a user_id and keeps its first occurrence
#sanity check: prints True when the number of rows equals the number of distinct user ids
print(df_adv_unique_users.shape[0]==df_adv_unique_users["user_id"].nunique())
df_rb_unique_users=df_rb_users.drop_duplicates(subset="user_id",keep="first") 
#sanity check: prints True when the number of rows equals the number of distinct user ids
print(df_rb_unique_users.shape[0]==df_rb_unique_users["user_id"].nunique())



<a id='brewery_processing'></a>

### Breweries dataset

In [None]:
#Shape of the breweries datasets: DataFrame.shape is a (rows, columns) pair, unpacked into the message
print("Beer advocate breweries dataset has {} rows and {} colomns".format(*df_adv_breweries.shape))
print("RateBeer breweries dataset has {} rows and {} colomns".format(*df_rb_breweries.shape))


In [None]:
#Let's have a look
df_adv_breweries.head()  # first rows of the BeerAdvocate breweries dataset

In [None]:
df_rb_breweries.head()  # first rows of the RateBeer breweries dataset

In [None]:
#We don't care about breweries that don't have any beers, so we drop them
#and report how many rows were removed from each dataset.
df_adv_breweries_wbeer=df_adv_breweries.drop(df_adv_breweries[df_adv_breweries['nbr_beers']==0].index)
print("{} breweries have been dropped from the Beer Advocate dataset".format((df_adv_breweries.shape[0]-df_adv_breweries_wbeer.shape[0])))
df_rb_breweries_wbeer=df_rb_breweries.drop(df_rb_breweries[df_rb_breweries['nbr_beers']==0].index)
print("{} breweries have been dropped from the RateBeer dataset".format((df_rb_breweries.shape[0]-df_rb_breweries_wbeer.shape[0])))




<a id='exploration'></a>

## Exploration of the datasets 


### Beers dataset

In [None]:
#Histograms
#Beers binned by ratings count:
#quartiles drawn as vertical dashed lines on every histogram (also reused by the following histogram cells)
quantiles = [0.25, 0.5, 0.75]

titles = ['Beer Advocate - ratings by beer','Rate Beer - ratings by beer']
plot_data = [df_adv_beer_wrating.nbr_ratings, df_rb_beer_wrating.nbr_ratings]


fig, axes = plt.subplots(2, 1, figsize=(20, 7),sharey=True,sharex=True)
fig.tight_layout(pad=5)
for i,datum in enumerate(plot_data):
    sns.histplot(ax=axes.flat[i],data=datum,bins=100,log_scale=(True,True), kde=False, color="blue")
    #x-axis: number of ratings received by a beer; y-axis: number of beers falling in each bin
    axes.flat[i].set_xlabel("Rating count")
    axes.flat[i].set_ylabel("Beer count")
    axes.flat[i].set_title(titles[i], pad=20)
    for q in quantiles:
        quantile_value = datum.quantile(q)
        axes.flat[i].axvline(quantile_value, 0, 1, color="black", ls='--',linewidth=3)
        #label slightly shifted to the right of the line so it stays readable
        axes.flat[i].text(quantile_value+0.1, 2e5, str(int(q*100))+'%', horizontalalignment='left', size='medium', color='black', weight='semibold')

We observe a skewed distribution of beers according to the rating count.

In [None]:
#Some stats
print(df_adv_beer_wrating.shape)
df_adv_beer_wrating[["nbr_ratings","avg","abv","avg_computed"]].describe()

In [None]:
#Same statistics for the RateBeer beers that have ratings
print(df_rb_beer_wrating.shape)
df_rb_beer_wrating[["nbr_ratings","avg","abv","avg_computed"]].describe()

### Users

In [None]:
#Users binned by rating count
#NOTE: `quantiles` is the list of quartiles defined in the beers histogram cell.

#titles and plot_data are listed in the same order (Beer Advocate first) so that each panel gets its own title
titles = ['Beer Advocate - ratings by user','Rate Beer - ratings by user']
plot_data = [df_adv_unique_users.nbr_ratings, df_rb_unique_users.nbr_ratings]


fig, axes = plt.subplots(2, 1, figsize=(20, 7),sharey=True,sharex=True)
fig.tight_layout(pad=5)
for i,datum in enumerate(plot_data):
    sns.histplot(ax=axes.flat[i],data=datum,bins=100,log_scale=(True,True), kde=False,color="blue")
    axes.flat[i].set_xlabel("Rating count")
    axes.flat[i].set_ylabel("User Count")
    axes.flat[i].set_title(titles[i], pad=20)
    for q in quantiles:
        quantile_value = datum.quantile(q)
        #the dashed line marks the quantile itself, as in the other histograms
        axes.flat[i].axvline(quantile_value, 0, 1, color="black", ls='--',linewidth=3)
        #Offset of 0.1 on the label so it is more readable
        axes.flat[i].text(quantile_value+0.1, 1e5, str(int(q*100))+'%', horizontalalignment='left', size='medium', color='black', weight='semibold')
      

We observe a skewed distribution of the number of users according to their rating count. 

In [None]:
#Summary statistics of users
print(df_adv_unique_users.shape)
df_adv_unique_users[["nbr_ratings"]].describe()

In [None]:
#Same statistics for the deduplicated RateBeer users
print(df_rb_unique_users.shape)
df_rb_unique_users[["nbr_ratings"]].describe()

In [None]:
#Number of different locations the users come from (distinct values of the "location" column):
print("The users of Beer Advocate come from {} different locations".format(df_adv_unique_users["location"].nunique()))
print("The users of RateBeer come from {} different locations".format(df_rb_unique_users["location"].nunique()))
#A location can be a country, a national region (case of the UK) or a state (case of the USA)

In [None]:
#Top 5 locations of the users for each dataset:
#value_counts() orders locations from most to least frequent, so the first five index labels are the top 5
adv_top_user_locations = df_adv_unique_users["location"].value_counts().index[:5].tolist()
rb_top_user_locations = df_rb_unique_users["location"].value_counts().index[:5].tolist()
users_top5_locations = pd.DataFrame({
    'Beer Advocate users': adv_top_user_locations,
    'Rate Beer users': rb_top_user_locations,
})

users_top5_locations

### Breweries

In [None]:
#Histograms
#Breweries binned by beer count
#NOTE: `quantiles` is the list of quartiles defined in the beers histogram cell.

#titles and plot_data are listed in the same order (Beer Advocate first) so that each panel gets its own title
titles = ['Beer Advocate - beers by brewery','Rate Beer - beers by brewery']
plot_data = [df_adv_breweries_wbeer.nbr_beers, df_rb_breweries_wbeer.nbr_beers]

fig, axes = plt.subplots(2, 1, figsize=(20, 7),sharey=True,sharex=True)
fig.tight_layout(pad=5)
for i,datum in enumerate(plot_data):
    sns.histplot(ax=axes.flat[i],data=datum,bins=100,log_scale=(True,True), kde=False, color="blue")
    #x-axis: number of beers of a brewery; y-axis: number of breweries falling in each bin
    axes.flat[i].set_xlabel("Beer count")
    axes.flat[i].set_ylabel("Brewery count")
    axes.flat[i].set_title(titles[i], pad=20)
    for q in quantiles:
        quantile_value = datum.quantile(q)
        axes.flat[i].axvline(quantile_value, 0, 1, color="black", ls='--',linewidth=3)
        #label slightly shifted to the right of the line so it stays readable
        axes.flat[i].text(quantile_value+0.1, 4e3, str(int(q*100))+'%', horizontalalignment='left', size='medium', color='black', weight='semibold')

    

We observe a skewed distribution of the number of breweries according to their beer count. In addition, we notice that the RateBeer dataset has no breweries with more than 200 beers. It is possible that some data was lost.

In [None]:
#Some stats
print(df_adv_breweries_wbeer.shape)
df_adv_breweries_wbeer[["nbr_beers"]].describe()

In [None]:
#Same statistics for the RateBeer breweries that have beers
print(df_rb_breweries_wbeer.shape)
df_rb_breweries_wbeer[["nbr_beers"]].describe()

In [None]:
#Number of different locations the breweries come from (distinct values of the "location" column):
print("The breweries of Beer Advocate come from {} different locations".format(df_adv_breweries_wbeer["location"].nunique()))
print("The breweries of RateBeer come from {} different locations".format(df_rb_breweries_wbeer["location"].nunique()))


In [None]:
#Top 5 locations of the breweries for each dataset:
#value_counts() orders locations from most to least frequent, so the first five index labels are the top 5
adv_top_brewery_locations = df_adv_breweries_wbeer["location"].value_counts().index[:5].tolist()
rb_top_brewery_locations = df_rb_breweries_wbeer["location"].value_counts().index[:5].tolist()
breweries_top5_locations = pd.DataFrame({
    'Beer Advocate breweries': adv_top_brewery_locations,
    'Rate Beer breweries': rb_top_brewery_locations,
})

breweries_top5_locations

<a id='textual_reviews'></a>
## Textual reviews and rating analysis

### Textual reviews

In [None]:
#Here we use a utility function of our NLP_utils module to directly compute everything;
#help() prints its docstring.
#NOTE(review): NLP_utils is already imported in the first cell; this import is kept so the cell can run on its own.
from functions import NLP_utils

help(NLP_utils.summary_analysis)

In [None]:
#RateBeer: the helper returns two values, used below as the per-review word counts and the review dates
RB_counts, RB_dates = NLP_utils.summary_analysis("RateBeer")

In [None]:
#BeerAdvocate: the helper returns two values, used below as the per-review word counts and the review dates
BA_counts, BA_dates = NLP_utils.summary_analysis("BeerAdvocate")

In [None]:
# How does the review rate of users change with time for each site:

fig, axes = plt.subplots(2, 1, figsize=(20, 7),sharey=True,sharex=True)
# The dates are converted with unit='s', i.e. interpreted as numbers of seconds since the Unix epoch
RB_dates = pd.to_datetime(RB_dates,unit='s')
BA_dates = pd.to_datetime(BA_dates,unit='s')
# Each series is drawn on the axis that carries its own title: BeerAdvocate on top, RateBeer below
sns.histplot(BA_dates,log_scale=(False,False),kde=True,ax=axes[0])
axes[0].set_title("Time evolution of BeerAdvocate review posting", pad=20)
sns.histplot(RB_dates,log_scale=(False,False),kde=True,ax=axes[1])
axes[1].set_title("Time evolution of RateBeer review posting", pad=20)

We see for both datasets that the review posting shows a trend of increasing with time during the 2000s. However, while the review posting seems to keep increasing for BeerAdvocate, we see a decrease in the case of RateBeer.

In [None]:
#Word count distributions for each site: 
fig, axes = plt.subplots(1, 2, figsize=(20, 7),sharey=True,sharex=False)

# Each series is drawn on the axis that carries its own title: Beer Advocate on the left, Rate Beer on the right
sns.histplot(BA_counts,bins=100, log_scale=(True,False),ax=axes[0])
axes[0].set_title('Beer Advocate - Review word count')
axes[0].set_xlabel('Word per review')
axes[0].set_ylabel('Review count')

sns.histplot(RB_counts,bins=100, log_scale=(True,False),ax=axes[1])
axes[1].set_title('Rate Beer - Review word count')
axes[1].set_xlabel('Word per review')
axes[1].set_ylabel('Review count')

In [None]:
#It looks like the two distributions might be normal. Let's check with a normality test:
#NOTE(review): a NaN result may come from missing values in the input counts - TODO confirm before interpreting it
print(diagnostic.kstest_normal(BA_counts.values, dist = 'norm'))
print(diagnostic.kstest_normal(RB_counts.values, dist = 'norm'))

We reject the null hypothesis that the distribution of words per review of the Beer Advocate dataset follows a normal distribution. The test returns NaN values for the distribution of words per review of the Rate Beer dataset. We suppose that the distribution was so far from normal that the test did not work properly; another possible cause, which remains to be checked, is the presence of missing values in the word counts.

### Rating analysis

In [None]:
#We start by fetching the ratings of each site with our helper (help() prints its docstring)
help(NLP_utils.fetch_ratings)
rb_ratings = NLP_utils.fetch_ratings("RateBeer")
ba_ratings = NLP_utils.fetch_ratings("BeerAdvocate")


In [None]:
#Ratings distribution for each site
fig, axes = plt.subplots(1, 2, figsize=(20, 7),sharey=True,sharex=False)

# Each series is drawn on the axis that carries its own title: Beer Advocate on the left, Rate Beer on the right
sns.histplot(ba_ratings,bins=100, log_scale=(False,False),ax=axes[0])
axes[0].set_title('Beer Advocate - Rating distribution')
axes[0].set_xlabel('Ratings')
axes[0].set_ylabel('Ratings count')

sns.histplot(rb_ratings,bins=100, log_scale=(False,False),ax=axes[1])
axes[1].set_title('Rate beer - Rating distribution') 
axes[1].set_xlabel('Ratings')
axes[1].set_ylabel('Ratings count')

In [None]:
#It looks like the first distribution might be normal. The second is most likely not (looks like users like giving
#marks that are round numbers more than fractional marks), but let's check! @Auriane: I do not understand this plot.
print(diagnostic.kstest_normal(ba_ratings.values, dist = 'norm'))
print(diagnostic.kstest_normal(rb_ratings.values, dist = 'norm'))

In both cases, we reject the null hypothesis that the distributions are normal. 

# Milestone 3 - Data Analysis


## Preprocessing for bias correction

In [None]:
#To facilitate our analysis, we add a column in the beer dataset corresponding to the location of the brewery.
#NOTE(review): despite its name, the dictionary maps brewery name -> location. If two breweries share a name,
#only the last one is kept by dict(zip(...)) - confirm that brewery names are unique.
location_to_brewery_name_adv=dict(zip(df_adv_breweries_wbeer.name,df_adv_breweries_wbeer.location)) #create a dictionary whose keys are the breweries names and values are the breweries locations
df_adv_beer_wrating["location"]=df_adv_beer_wrating.brewery_name.map(location_to_brewery_name_adv) #create a new location column indicating the location of the brewery (NaN when the brewery name is not in the dictionary)
df_adv_beer_wrating.head()

In [None]:
#Same for RateBeer. NOTE(review): despite its name, the dictionary maps brewery name -> location - confirm that brewery names are unique.
location_to_brewery_name_rb=dict(zip(df_rb_breweries_wbeer.name,df_rb_breweries_wbeer.location)) #create a dictionary whose keys are the breweries names and values are the breweries locations
df_rb_beer_wrating["location"]=df_rb_beer_wrating.brewery_name.map(location_to_brewery_name_rb) #create a new location column indicating the location of the brewery (NaN when the brewery name is not in the dictionary)
df_rb_beer_wrating.head()

### Check on the best beers before applying corrections

In [None]:
#Here we sort the BeerAdvocate beers by decreasing average rating.
#reset_index() renumbers the rows and keeps the previous index as an "index" column.
df_beer_absolute_adv=df_adv_beer_wrating.sort_values(by="avg",ascending=False).reset_index()
df_beer_absolute_adv.head()

In [None]:
#Here we sort the RateBeer beers by decreasing average rating.
#reset_index() renumbers the rows and keeps the previous index as an "index" column.
df_beer_absolute_rb=df_rb_beer_wrating.sort_values(by="avg",ascending=False).reset_index()
df_beer_absolute_rb.head()

In [None]:
#We want to extract the best beer brewed in each location:
#rows are sorted by location then by decreasing average rating, so the first row of each location group is its top-rated beer
grouped_adv_beer=df_adv_beer_wrating.sort_values(["location",'avg'],ascending=False).groupby('location').head(1)
#same rows with a fresh 0..n-1 index; the result above is reused instead of sorting and grouping a second time
top_adv_country=grouped_adv_beer.reset_index(drop=True)
top_adv_country.head()

In [None]:
#We want to extract the best beer brewed in each location:
#rows are sorted by location then by decreasing average rating, so the first row of each location group is its top-rated beer
grouped_rb_beer=df_rb_beer_wrating.sort_values(["location",'avg'],ascending=False).groupby('location').head(1)
#same rows with a fresh 0..n-1 index; the result above is reused instead of sorting and grouping a second time
top_rb_country=grouped_rb_beer.reset_index(drop=True)
top_rb_country.head()

In [None]:
#Now that we have the best beer brewed in each location, let's rank the BeerAdvocate locations
#by the average rating of their best beer (highest first)
top_ranked_adv_country=top_adv_country.sort_values(by="avg",ascending=False).reset_index()
top_ranked_adv_country

In [None]:
#Now that we have the best beer brewed in each location, let's rank the RateBeer locations
#by the average rating of their best beer (highest first)
top_ranked_rb_country=top_rb_country.sort_values(by="avg",ascending=False).reset_index()
top_ranked_rb_country

## Implementation of the bias correction

In [None]:
#Computation of the mean of the "avg" column over the RateBeer beers
#NOTE(review): computed on df_rb_beer, i.e. before the beers without ratings were removed - confirm this is intended
rb_average_rating=df_rb_beer.avg.mean()

In [None]:

#Print the documentation of our debiasing helper, then run it for each site
#NOTE(review): later cells read a "debiased_avg" column from both beer dataframes, so the helper presumably
#adds it in place - confirm in NLP_utils
help(NLP_utils.debiasing)
NLP_utils.debiasing("RateBeer",df_rb_beer_wrating, df_rb_unique_users)
NLP_utils.debiasing("BeerAdvocate",df_adv_beer_wrating,df_adv_unique_users)

In [None]:
display(df_adv_beer_wrating)  # BeerAdvocate beers after the debiasing call

In [None]:
#Ratings distribution for each site
def legend_without_duplicate_labels(ax):
    """Draw the legend of `ax`, keeping only the first handle of each distinct label."""
    handles, labels = ax.get_legend_handles_labels()
    # A dict preserves insertion order, so labels stay in order of first appearance
    first_handle_by_label = {}
    for handle, label in zip(handles, labels):
        first_handle_by_label.setdefault(label, handle)
    deduplicated = [(handle, label) for label, handle in first_handle_by_label.items()]
    # zip(*pairs) turns the (handle, label) pairs back into a handles tuple and a labels tuple
    ax.legend(*zip(*deduplicated))

#Plots of results of debiasing: for each site, histogram of the initial average rating (red)
#overlaid with the histogram of the debiased average rating
fig, axes = plt.subplots(1, 2, figsize=(20, 7),sharey=True,sharex=False)

debiasing_panels = (
    (axes[0], 'Rate Beer - Distribution of ratings', df_rb_beer_wrating),
    (axes[1], 'Beer Advocate - Distribution of ratings', df_adv_beer_wrating),
)
for panel_ax, panel_title, beers in debiasing_panels:
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Ratings')
    panel_ax.set_ylabel('Ratings count')
    sns.histplot(beers["avg"], bins=100, log_scale=(False,False), ax=panel_ax, color='red', label="initial average rating")
    sns.histplot(beers["debiased_avg"], bins=100, log_scale=(False,False), ax=panel_ax, label="debiased average rating")

# One legend per panel, without repeated entries
legends = [legend_without_duplicate_labels(ax) for ax in axes]

### Beer characteristics

In [None]:
# Grouping of the detailed beer styles into a few style families.
# Each list holds the detailed style names of one family; `beer_style_dict` maps every
# detailed style name to the name of its family.
# Spelling fixes: "Traditional Ale", "Dunkler Bock" and "Eisbock" (a misspelled key never matches
# a style of the dataset, so the corresponding beers would get no family);
# the sour style 'Grodziskie/Gose/Lichtenhainer' is now listed only once.
Anglo_American_Ales=[
    'Altbier', 'Barley Wine', "Bitter", 'Premium Bitter/ESB', "Golden Ale/Blond Ale", "Brown Ale",
    "California Common", "Cream Ale", "Black IPA", "India Pale Ale (IPA)", "Imperial IPA", "Session IPA",
    "Kölsch", "American Pale Ale", "Irish Ale", "English Strong Ale", "American Strong Ale", "Mild Ale",
    "Amber Ale", "English Pale Ale", "Traditional Ale", "Scotch Ale", "Old Ale", "Scottish Ale",
]
Beligan_Style_Ales=[
    "Belgian Ale", "Belgian Strong Ale", "Bière de Garde", "Abbey Dubbel", 'Abt/Quadrupel', "Saison",
    "Abbey Tripel",
]
Lagers=[
    "Pale Lager", "Premium Lager", "Imperial Pils/Strong Pale Lager", "India Style Lager",
    "Amber Lager/Vienna", 'Czech Pilsner (Světlý)', "Pilsener", "Heller Bock", "Doppelbock", "Dunkler Bock",
    "Weizen Bock", "Eisbock", "Malt Liquor", "Oktoberfest/Märzen", "Radler/Shandy",
    "Zwickel/Keller/Landbier", "Dortmunder/Helles", 'Dunkel/Tmavý', 'Schwarzbier', 'Polotmavý',
]
Stout_and_Porter=[
    "Stout", "Imperial Stout", "Foreign Stout", "Sweet Stout", "Dry Stout", "Porter", "Baltic Porter",
    "Imperial Porter",
]
Wheat_beer=["Wheat Ale", "Witbier", 'German Hefeweizen', 'Dunkelweizen', 'German Kristallweizen']
Sour_beer=[
    "Berliner Weisse", "Sour/Wild Ale", "Sour Red/Brown", 'Grodziskie/Gose/Lichtenhainer',
    'Lambic Style - Gueuze', 'Lambic Style - Unblended', 'Lambic Style - Faro', 'Lambic Style - Fruit',
]
Other_styles=[
    "Spice/Herb/Vegetable", "Smoked", 'Fruit Beer', "Sahti/Gotlandsdricke/Koduõlu", 'Low Alcohol',
    'Specialty Grain',
]
Cider_Mead_Saké=[
    'Cider', 'Mead', 'Saké - Daiginjo', 'Saké - Namasaké', 'Saké - Ginjo', 'Saké - Infused',
    'Saké - Tokubetsu', 'Saké - Junmai', 'Saké - Nigori', 'Saké - Koshu', 'Saké - Taru', 'Saké - Honjozo',
    'Saké - Genshu', 'Saké - Futsu-shu', 'Perry',
]
beer_style_dict=(
    {key: "Anglo American Ales" for key in Anglo_American_Ales}
    | {key: "Belgian Style Ales" for key in Beligan_Style_Ales}
    | {key: "Lagers" for key in Lagers}
    | {key: "Stout and Porter" for key in Stout_and_Porter}
    | {key: "Wheat beer" for key in Wheat_beer}
    | {key: "Sour beer" for key in Sour_beer}
    | {key: "Other styles" for key in Other_styles}
    | {key: "Cider, Mead and Saké" for key in Cider_Mead_Saké}
)


# Style family of each RateBeer beer; styles that are not keys of beer_style_dict become NaN
df_rb_beer_wrating["streamline_style"]=df_rb_beer_wrating["style"].map(beer_style_dict)

In [None]:
# US locations are written "United States, <state>" in the location column;
# the state names are listed once and the common prefix is added below.
_us_state_names = [
    'Wisconsin', 'Georgia', 'North Carolina', 'Arkansas', 'Louisiana', 'West Virginia',
    'California', 'Washington', 'Massachusetts', 'New Jersey', 'Maryland', 'Arizona',
    'Pennsylvania', 'Indiana', 'Montana', 'South Dakota', 'Tennessee', 'Mississippi',
    'Virginia', 'Missouri', 'Maine', 'Alabama', 'New Hampshire', 'Delaware',
    'Iowa', 'Minnesota', 'Kentucky', 'Nebraska', 'Wyoming', 'Vermont',
    'New Mexico', 'Alaska', 'Rhode Island', 'Kansas', 'Idaho', 'Washington DC',
    'Ohio', 'Michigan', 'North Dakota', 'Nevada', 'Oregon', 'Hawaii',
    'Connecticut', 'Texas', 'Illinois', 'South Carolina', 'Oklahoma', 'Utah',
    'Florida', 'Colorado', 'New York',
]
states = ['United States, ' + state_name for state_name in _us_state_names]
kingdoms = ["England", 'Northern Ireland', "Scotland", "Wales"]

# Mapping from the location names of the dataset to the country names used afterwards:
# UK nations and US states are merged into their country, then come individual renamings.
# NOTE(review): some targets are approximations (e.g. "Kosovo" -> "Albania", 'Dem Rep of Congo' -> 'Congo') - confirm they are intended.
country_dict = {
    **dict.fromkeys(kingdoms, "United Kingdom of Great Britain and Northern Ireland"),
    **dict.fromkeys(states, "United States of America"),
    'Virgin Islands (British)': "United Kingdom of Great Britain and Northern Ireland",
    'Northern Marianas': 'Northern Mariana Islands',
    'South Ossetia': 'Georgia',
    'Dem Rep of Congo': 'Congo',
    'Nagorno-Karabakh': 'Azerbaijan',
    'Transdniestra': 'Moldova',
    'Saint Vincent and The Grenadines': 'Saint Vincent and the Grenadines',
    'Trinidad & Tobago': 'Trinidad and Tobago',
    "Kosovo": "Albania",
    "Reunion": "Réunion",
    "Virgin Islands (U.S.)": "United States of America",
    'Tibet': "China",
    "Abkhazia": "Georgia",
    'Cape Verde Islands': 'Cabo Verde',
    "Fiji Islands": "Fiji",
    'Turkish Republic of Cyprus': 'Cyprus',
    'Antigua & Barbuda': 'Antigua and Barbuda',
}

# Country of each RateBeer beer: locations that are keys of country_dict are replaced, the others are kept as they are
df_rb_beer_wrating["Country"]=df_rb_beer_wrating.location.replace(country_dict)

In [None]:
# NOTE(review): same statement as the last line of the previous cell; it recomputes the same "Country" column
df_rb_beer_wrating["Country"]=df_rb_beer_wrating.location.replace(country_dict)

In [None]:
df_rb_beer_wrating.sample(10)  # 10 random rows, to eyeball the new columns

In [None]:
import pycountry_convert as pc

def country_to_continent(country_name):
    """Return the name of the continent of a country.

    Parameters
    ----------
    country_name : str
        Country name, converted to an alpha-2 code with pycountry_convert (`pc`).
        A missing value (NaN or None) is accepted.

    Returns
    -------
    str or float
        The continent name, or NaN when `country_name` is missing.
    """
    # A missing country cannot be looked up: propagate it as NaN instead of
    # passing it to the conversion, so that a later dropna() can discard the row
    if pd.isna(country_name):
        return np.nan
    country_alpha2 = pc.country_name_to_country_alpha2(country_name)
    # "TL" is assigned to Asia directly instead of going through the continent lookup
    if country_alpha2 == "TL":
        return "Asia"
    country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
    return pc.convert_continent_code_to_continent_name(country_continent_code)

# Continent of each RateBeer beer, computed row by row from its "Country" value
df_rb_beer_wrating["Continent"]=df_rb_beer_wrating["Country"].apply(country_to_continent)

In [None]:
# Feature table used for the clustering: abv plus the one-hot encoded style family and continent.
# Rows with a missing value in any of the three columns are left out.
categorical_columns = ["streamline_style", "Continent"]
beer_features = df_rb_beer_wrating[["abv"] + categorical_columns].dropna()
beer_features = pd.get_dummies(beer_features, columns=categorical_columns)
beer_features.head()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [None]:
# Standardise every feature, then project the beers on the first two principal components
feature_scaler = StandardScaler()
scaled_features = feature_scaler.fit_transform(beer_features)

pca_2d = PCA(n_components=2, random_state=0)
X_reduced_PCA = pca_2d.fit_transform(scaled_features)

# Scatter plot of the beers in the plane of the two components
plt.scatter(X_reduced_PCA[:, 0], X_reduced_PCA[:, 1])

In [None]:
# Share of the variance explained by each of the first 2 principal components
# (the projection returned by fit_transform is not kept; only the fitted model is used)
mypca=PCA(n_components=2,random_state=0)
mypca.fit_transform(scaled_features)
mypca.explained_variance_ratio_

In [None]:
def plot_sse(features_X, start=10, end=20):
    """Plot the k-means sum of squared errors (inertia) against the number of clusters.

    One k-means model is fitted on `features_X` for every k in range(start, end).
    """
    sse = pd.DataFrame(
        [
            {
                "k": n_clusters,
                # inertia_ of the fitted model is the value plotted as the sum of squared errors
                "sse": KMeans(n_clusters=n_clusters, random_state=10, n_init=10).fit(features_X).inertia_,
            }
            for n_clusters in range(start, end)
        ]
    )
    plt.plot(sse.k, sse.sse)
    plt.xlabel("K")
    plt.ylabel("Sum of Squared Errors")
    

In [None]:
plot_sse(scaled_features)  # SSE curve with the default range of k (10 to 19)

In [None]:
# K-means with 13 clusters on the scaled features; the beers are then drawn in the plane of the
# two principal components and coloured by cluster.
# NOTE(review): `labels` is not used in this cell.
labels=0
cluster = KMeans(n_clusters=13, random_state=0,n_init=10).fit(scaled_features)
plt.scatter(X_reduced_PCA[:,0], X_reduced_PCA[:,1],c=cluster.labels_,alpha=0.5)

In [None]:
# Share of the variance explained by each of the first 15 principal components
# (the projection returned by fit_transform is not kept; only the fitted model is used)
mypca=PCA(n_components=15,random_state=0)
mypca.fit_transform(scaled_features)
mypca.explained_variance_ratio_

In [None]:
# Cluster label assigned to each row of scaled_features
print(cluster.labels_)
#labels.labels_()

In [None]:
#beer_features=pd.DataFrame(df_rb_beer_wrating[["abv","streamline_style","Continent"]])
#beer_features.dropna(inplace=True)
# RateBeer beers without missing value in the three columns used to build beer_features
df_rb_beer_wrating_test=df_rb_beer_wrating.dropna(subset=["abv","streamline_style","Continent"])
df_rb_beer_wrating_test.head()
#df_rb_beer_wrating_test["cluster"]=cluster.labels_
#df_rb_beer_wrating_test["cluster"]

In [None]:
# Attach to each beer the cluster found by k-means.
# NOTE(review): assumes the rows are the same, in the same order, as the rows of beer_features on which
# the clustering was fitted (both drop the rows with NaN in the same three columns) - confirm
df_rb_beer_wrating_test["cluster"]=cluster.labels_
df_rb_beer_wrating_test["cluster"]

In [None]:
df_rb_beer_wrating_test["cluster"].unique()  # distinct cluster labels present in the data

In [None]:
# Summary statistics of the debiased average rating within each cluster
rb_grouped_by_cluster=df_rb_beer_wrating_test.groupby(["cluster"]).debiased_avg
rb_grouped_by_cluster.describe()

In [None]:
# Boxplot of the debiased average rating per cluster, all clusters on the same axes (subplots=False)
df_rb_beer_wrating_test.groupby(["cluster"]).boxplot(column=['debiased_avg'],subplots=False)

### Positive and Negative words analysis in the reviews

For this analysis we worked with CSV files, which allowed more flexibility and made it possible to run the code only once. We divided the BeerAdvocate reviews into 10 CSV files of at most 250,000 entries each, and the RateBeer reviews into 28 CSV files of at most 250,000 entries each. For each CSV file we calculated the average number of positive and negative words per country. The generated result files are small (a few kB).

In [None]:
# The lines of code below compute the average number of positive and negative words per country for each csv file
# and put the results in the generated data folder.
# They only need to be run once to produce the csv files;
# a run can take multiple hours.

#loop_RateBeer_get_pos_neg_words()
#loop_AdvBeer_get_pos_neg_words()

In [None]:
# Rate Beer Management

# This function merges the results of all the csv files generated above
df_all_rb =  ratebeer_merging_csv_results()

# We group all the concatenated results by country to get one dataframe summarising the analysis,
# which is saved as a csv to add the information to the map.
# NOTE(review): the helper is called with ('pos_words','neg_words','nb_review') while the columns below are
# named ['neg_words','pos_words','nb_review'] - confirm the order of the values returned by weighted_average_all.
country_res_groups_all_rb = df_all_rb.groupby(by='country').apply(weighted_average_all, 'pos_words','neg_words','nb_review')
# NOTE(review): `columns` is given as a nested list ([[...]]) rather than a flat list - confirm this is intended.
df_final_rb = pd.DataFrame(country_res_groups_all_rb.tolist(), columns=[['neg_words','pos_words','nb_review']])
df_final_rb['location'] = country_res_groups_all_rb.index

# save the results in a csv file to ease the integration in the world map
df_final_rb.to_csv(rateBeer_root + 'RateBeer_pos_neg_analysis_resume.csv',index=False)


In [None]:
# Advocate Beer Management

# This function merges the results of all the csv files generated above
df_all_adv =  advbeer_merging_csv_results()

# We group all the concatenated results by country to get one dataframe summarising the analysis,
# which is saved as a csv to add the information to the map.
# NOTE(review): the helper is called with ('pos_words','neg_words','nb_review') while the columns below are
# named ['neg_words','pos_words','nb_review'] - confirm the order of the values returned by weighted_average_all.
country_res_groups_all_adv = df_all_adv.groupby(by='country').apply(weighted_average_all, 'pos_words','neg_words','nb_review')
# NOTE(review): `columns` is given as a nested list ([[...]]) rather than a flat list - confirm this is intended.
df_final_adv = pd.DataFrame(country_res_groups_all_adv.tolist(), columns=[['neg_words','pos_words','nb_review']])
df_final_adv['location'] = country_res_groups_all_adv.index

# save the results in a csv file to ease the integration in the world map
df_final_adv.to_csv(advBeer_root + 'BeerAdv_pos_neg_analysis_resume.csv',index=False)

## Matching SAT Beers with RateBeer/BeerAdvocate - Information Retrieval

In [None]:

#Number of candidate matches retrieved per SAT beer and per site.
#The SAT rows are repeated the same number of times when assembling the results, so both must agree.
N_CANDIDATES = 5

SAT_beers = read_data.fetch_satellite_df()
BA_beers = df_adv_beer_wrating
RB_beers = df_rb_beer_wrating
#Only beers with at least one rating are kept as retrieval candidates (copies, so the originals are untouched)
BA_beers = BA_beers[BA_beers["nbr_ratings"] != 0].copy()
RB_beers = RB_beers[RB_beers["nbr_ratings"] != 0].copy()
sat_queries = NLP_utils.querify(SAT_beers)


#The queries should have the same document structure as the corpus:
#every query is rebuilt from the tokenized name ('nom'), brewer ('brasseur') and alcohol content ('alcool') of the SAT beer.
for index, biere in SAT_beers.iterrows():
    sat_queries[index] = (NLP_utils.tokenize(biere['nom']) +" "  + NLP_utils.tokenize(biere['brasseur']) + " " + NLP_utils.tokenize(str(biere['alcool'])))

#We implemented Vector Space Retrieval with cosine similarities to find matches between SAT and datasets
#More information can be found in the docstring:
help(NLP_utils.vector_space_retrieval)
BA_results = NLP_utils.vector_space_retrieval(sat_queries,BA_beers,k=N_CANDIDATES)
RB_results =  NLP_utils.vector_space_retrieval(sat_queries,RB_beers,k=N_CANDIDATES)
#We rename columns to avoid mixing up results when finding ratings of SAT beers.
BA_results.columns = 'BA_' + BA_results.columns.values
RB_results.columns = 'RB_' + RB_results.columns.values

#One row per (SAT beer, candidate): each SAT beer is repeated N_CANDIDATES times and placed side by side with its candidates.
#NOTE(review): assumes the retrieval results are ordered by query, with N_CANDIDATES consecutive rows per query - confirm in NLP_utils.
sat_repeated = pd.DataFrame(np.repeat(SAT_beers[["nom","alcool"]].values, N_CANDIDATES, axis=0),columns=["nom","alcool"])
ba_candidates = BA_results.reset_index()[["BA_beer_name","BA_avg",'BA_similarity','BA_abv', 'BA_brewery_name', 'BA_style', 'BA_beer_id']]
rb_candidates = RB_results.reset_index()[["RB_beer_name","RB_avg",'RB_similarity','RB_abv', 'RB_brewery_name', 'RB_style', 'RB_beer_id']]
beer_retrieval_results = pd.concat([sat_repeated, ba_candidates, rb_candidates],axis=1)
#We save our work
beer_retrieval_results.to_csv("DATA/matched_SATbeers.csv",index=False)

## Extracting ratings of matched beers and estimating ratings of not matched beers - Supervised Learning

In [None]:
# A helper applies our matching heuristic to the retrieval candidates and
# matches beers automatically. Its docstring is printed with help().
help(SAT_helpers.generate_automatic_beer_matches)
SAT_match_candidates = beer_retrieval_results

(automatic_matches_BA,
 not_matched_BA,
 manual_match_BA) = SAT_helpers.generate_automatic_beer_matches(
    "BeerAdvocate", SAT_match_candidates, df_adv_beer_wrating)
(automatic_matches_RB,
 not_matched_RB,
 manual_match_RB) = SAT_helpers.generate_automatic_beer_matches(
    "RateBeer", SAT_match_candidates, df_rb_beer_wrating)

# Example: first rows of the automatic matches and of the unmatched beers
for preview in (automatic_matches_BA, not_matched_BA):
    display(preview.head(3))

In [None]:
# The candidates that were not matched automatically were displayed and checked
# entry by entry:
#display(manual_match_RB[["nom","alcool","RB_beer_name","RB_abv"]])
#display(manual_match_BA[["nom","alcool","BA_beer_name","BA_abv"]])
# Row positions (in SAT_match_candidates) of the candidates accepted by hand.
RB_manually_matched_list = [15,20,25,31,35,40,70,100,120,125,130,196,215,220,275,280]
BA_manually_matched_list = [15,27,30,40,71,95,100,120,130,150,156,195,215,220,225,250,260,275,280]

# Same set of fields for both websites, only the column prefix changes.
match_fields = ("beer_name", "avg", "abv", "similarity", "brewery_name", "style", "beer_id")
RB_manually_matched = (
    SAT_match_candidates
    .iloc[RB_manually_matched_list][["nom"] + ["RB_" + field for field in match_fields]]
    .drop_duplicates(subset="nom", keep="first")
)
BA_manually_matched = (
    SAT_match_candidates
    .iloc[BA_manually_matched_list][["nom"] + ["BA_" + field for field in match_fields]]
    .drop_duplicates(subset="nom", keep="first")
)

# Final dataframes with all the beers found in the datasets, whether they were
# matched automatically or manually.
RB_matches = pd.concat([automatic_matches_RB, RB_manually_matched], axis=0)
BA_matches = pd.concat([automatic_matches_BA, BA_manually_matched], axis=0)


In [None]:
# A helper prepares the features (alcohol content, dummy variables for beer
# production location and beer style). Its docstring is printed with help().
help(SAT_helpers.prepare_features)
(sat_rb_features,
 rb_features,
 RB_SAT_beer_to_predict,
 RB_training_beers) = SAT_helpers.prepare_features("RateBeer", RB_matches)
(sat_ba_features,
 ba_features,
 BA_SAT_beer_to_predict,
 BA_training_beers) = SAT_helpers.prepare_features("BeerAdvocate", BA_matches)

# Example
display(sat_rb_features.head(5))

In [None]:
# A helper trains a regression model and evaluates it. It also outputs the
# predictions for the SAT beers that were not found in the dataset (our main
# goal in this step). Its docstring is printed with help().

help(SAT_helpers.randomforest_sat_beers_ratings)
RB_r2_score, RB_predictions = SAT_helpers.randomforest_sat_beers_ratings(rb_features,sat_rb_features)
BA_r2_score, BA_predictions = SAT_helpers.randomforest_sat_beers_ratings(ba_features,sat_ba_features)
print(f"The R2 score for ratings estimation with our Random Forest Regressor applied to SAT beers with RateBeer is: {RB_r2_score}")
# The second score belongs to BeerAdvocate (the message previously said RateBeer).
print(f"The R2 score for ratings estimation with our Random Forest Regressor applied to SAT beers with BeerAdvocate is: {BA_r2_score}")

# Unfortunately, our results are not spectacular. This is somewhat expected since
# we use very limited features here.

In [None]:
# A helper displays our results and saves them for further use in the notebook.
# Its docstring is printed with help().
help(SAT_helpers.save_and_display_sat_ratings)

RB_sorted = SAT_helpers.save_and_display_sat_ratings(
    "RateBeer",
    RB_predictions,
    RB_matches,
    RB_SAT_beer_to_predict,
    RB_training_beers,
)
BA_sorted = SAT_helpers.save_and_display_sat_ratings(
    "BeerAdvocate",
    BA_predictions,
    BA_matches,
    BA_SAT_beer_to_predict,
    BA_training_beers,
)



# Querying the dataset for the beers preferred by each country

In [None]:
# We created a helper function that searches the dataset for the preferred beers of the users of each country.
# Note: the function uses the 'unbiased_ratings' we generated in our bias correction step.
# A brief explanation of the steps performed by the function:
# 1. Group all ratings by the country of origin of the corresponding user, summing and counting the ratings.
# 2. Calculate the average rating of each beer for each country of origin.
# 3. Find the most rated beer and the beer with the highest rating, given that the beer is at least in the 50% quantile of number of ratings (i.e. the beer is not rated by only a few individuals).
# (bis) Steps 1 to 3 are repeated in parallel with beer styles instead of individual beers.

help(read_data.find_favourite_beers)

In [None]:
# Most reviewed / favourite beer and beer style of each country, per website.
(most_reviewed_beer_RB,
 favorite_beer_RB,
 most_reviewed_style_RB,
 favorite_style_RB) = read_data.find_favourite_beers("RateBeer")
(most_reviewed_beer_BA,
 favorite_beer_BA,
 most_reviewed_style_BA,
 favorite_style_BA) = read_data.find_favourite_beers("BeerAdvocate")

In [None]:
# An example: one caption followed by its table, for each website.
for caption, table in (
    ("Favourite beer style of BeerAdvocate users, by country:", favorite_style_BA),
    ("Favourite beer of RateBeer users, by country:", favorite_beer_RB),
):
    print(caption)
    display(table)

# Graphs and Plots for Data Story

## Beer and Country t-SNE

In [None]:
import openai
from functions import plot_helpers


In [None]:
# A helper retrieves the reviews of the subset of the SAT beers that are in the
# datasets. Its docstring is printed with help().
help(plot_helpers.retrieve_reviews_SAT)

(best_RB_reviews,
 best_reviews_SAT) = plot_helpers.retrieve_reviews_SAT("RateBeer")
(best_BA_reviews,
 best_reviews_SAT_BA) = plot_helpers.retrieve_reviews_SAT("BeerAdvocate")

In [None]:
# Country information is added to the reviews so that countries can be plotted
# separately from the SAT beers.
# Only non-empty good reviews are kept: the filter keeps entries of
# 'good_reviews' whose length is greater than 1.
# NOTE(review): .str.len() counts characters for strings (elements for lists),
# not words -- confirm this is the intended notion of "empty".
rb_has_review = best_RB_reviews["good_reviews"].str.len() > 1
best_RB_reviews = best_RB_reviews[rb_has_review]
ba_has_review = best_BA_reviews["good_reviews"].str.len() > 1
best_BA_reviews = best_BA_reviews[ba_has_review]

help(plot_helpers.add_country_column)
best_BA_reviews_with_countries = plot_helpers.add_country_column(
    best_BA_reviews, "DATA/favourite_beer_BA.csv", california_as_usa=True)
best_RB_reviews_with_countries = plot_helpers.add_country_column(
    best_RB_reviews, "DATA/favourite_beer_RB.csv", california_as_usa=True)

In [None]:
## Transforming reviews to embeddings with OpenAI API and ADA model ##
# We delegate the hard job of producing word embeddings to a foundation model.
# In this case, ADA-002, launched on 16/12/2022
import os

help(plot_helpers.request_embeddings)

# SECURITY: the API key must never be committed with the notebook. It is read
# from the OPENAI_API_KEY environment variable instead. The key that used to be
# hard-coded here has been exposed and must be revoked.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before requesting embeddings.")
openai.api_key = openai_api_key

RB_embeddings = plot_helpers.request_embeddings(best_RB_reviews_with_countries["good_reviews"],verbose=False)
SAT_embeddings = plot_helpers.request_embeddings(best_reviews_SAT["good_reviews"],verbose=False)
BA_embeddings = plot_helpers.request_embeddings(best_BA_reviews_with_countries["good_reviews"],verbose=False)


In [None]:
help(plot_helpers.plot_tsne)

# Settings shared by both t-SNE plots; only the marker size of the countries,
# the acronym and the title differ between the two websites.
tsne_settings = dict(perplexity=15, beer_size=25)

plot_helpers.plot_tsne(
    SAT_embeddings,
    RB_embeddings,
    best_reviews_SAT,
    best_RB_reviews_with_countries,
    country_size=10,
    acronym="RB",
    title="t-SNE representation of best beers sold on SAT and favourite beers of each country according to RateBeer",
    **tsne_settings,
)
plot_helpers.plot_tsne(
    SAT_embeddings,
    BA_embeddings,
    best_reviews_SAT,
    best_BA_reviews_with_countries,
    country_size=20,
    acronym="BA",
    title="t-SNE representation of best beers sold on SAT and favourite beers of each country according to BeerAdvocate",
    **tsne_settings,
)

## SAT Beer rankings

In [None]:
# We plot the rankings of the SAT beers according to both datasets. The plot
# comes with a button to choose the ranking that is shown.

# The sorted rankings are reloaded from the csv files saved earlier.
ranking_files = {
    "RB": "DATA/predicted_SAT_RB_sorted.csv",
    "BA": "DATA/predicted_SAT_BA_sorted.csv",
}
RB_sorted = pd.read_csv(ranking_files["RB"])
BA_sorted = pd.read_csv(ranking_files["BA"])

help(plot_helpers.create_rank_plot)
plot_helpers.create_rank_plot(BA_sorted,RB_sorted)

## Wordclouds

In [None]:
# Print the docstring of the wordcloud helper, then call it once per website.
help(plot_helpers.plot_wordcloud_dropdown)
plot_helpers.plot_wordcloud_dropdown("BeerAdvocate")
# NOTE(review): this last call is left as a bare expression on purpose; if the
# helper returns a figure, the notebook renders the value of the last expression.
plot_helpers.plot_wordcloud_dropdown("RateBeer")

### Interactive World Map

In [None]:
# We created our own plotly-based functions to prepare our interactive World Map.
# They are called through plot_helpers; help(plot_helpers) prints everything
# that was used to produce this plot.
from functions import map_generation
from functions import plot_helpers

help(plot_helpers)

# One entry per website:
# (pos/neg analysis csv, favourite beer csv, combined map source csv, html map name, plot title)
map_jobs = (
    ('RateBeer_pos_neg_analysis_resume.csv',
     'favourite_beer_RB.csv',
     'RB_map_source.csv',
     'map_favourite_beer_RB',
     "Favourite beer of users by country according to RateBeer"),
    ('BeerAdv_pos_neg_analysis_resume.csv',
     'favourite_beer_BA.csv',
     'BA_map_source.csv',
     'map_favourite_beer_BA',
     "Favourite beer of users by country according to BeerAdvocate"),
)

# First build both combined source files, then generate both maps.
for neg_pos_file, favourite_file, map_source_file, _, _ in map_jobs:
    plot_helpers.combine_neg_pos_and_favoured_beer(neg_pos_file, favourite_file, map_source_file)

for _, _, map_source_file, map_name, map_title in map_jobs:
    plot_helpers.generate_map(map_source_file, map_name, title=map_title)

### Addendum : Our helper modules exposed for easy reading and code-checking

NLP_utils.py :

In [None]:

# Module-level TF-IDF vectorizer shared by the retrieval helpers below:
# vector_space_retrieval fits it on the corpus, and both vector_space_retrieval
# and search_vec use it to transform queries.
tfidf = TfidfVectorizer()

# Downloads the NLTK 'stopwords' resource when the module is loaded.
# NOTE(review): none of the functions shown in this module reads the stopwords
# corpus -- confirm the download is still needed.
nltk.download('stopwords')
def vector_space_retrieval(queries,dataframe,k=5):
    """
    Implementation of Vector Space retrieval with cosine similarities.

    The corpus is turned into documents with `tokenize_dataframe`, the
    module-level `tfidf` vectorizer is fitted on them, and every query is
    compared with every document.

    Parameters
    ----------
    queries   (dict or iterable of str) : tokenized query strings. A dict (as built by `querify`)
                                          is read through its values; any other iterable
                                          (list, pandas.Series, ...) is iterated directly.
    dataframe (pandas.DataFrame)        : corpus considered for retrieval
    k         (int)                     : number of matches to be found for each query

    Returns
    -------
    (pandas.DataFrame) Copy of the rows of 'dataframe' matched by the queries: up to k rows per query,
    in query order, best match first. Has a 'similarity' column with the cosine similarity value
    calculated during retrieval.
    """
    document_dict = tokenize_dataframe(dataframe)
    # NOTE(review): documents are keyed by beer_id while the rows are looked up by
    # position below, which assumes beer_id is unique in 'dataframe' -- confirm.
    doc_vectors = tfidf.fit_transform(document_dict.values())

    # The docstring used to promise a pandas.Series, but `.values()` only exists
    # on mappings; both kinds of input are accepted now.
    query_texts = queries.values() if isinstance(queries, dict) else queries
    top_k = max(k, 0)  # a negative k selects nothing, as before

    doc_ids = []
    similarity_coefs = []
    for query in query_texts:
        vector_query = tfidf.transform([query])
        cosine_similarities = linear_kernel(vector_query, doc_vectors).flatten()
        # A stable sort on the negated scores ranks by decreasing similarity and
        # keeps ties in corpus order, like the previous sorted(..., reverse=True).
        best_positions = np.argsort(-cosine_similarities, kind="stable")[:top_k]
        doc_ids.extend(best_positions.tolist())
        similarity_coefs.extend(cosine_similarities[best_positions].tolist())

    new_df = dataframe.iloc[doc_ids].copy()
    new_df["similarity"] = np.array(similarity_coefs)
    return new_df

def tokenize(text):
    """Turn a 'text' into a single string of space-separated tokens.

    Punctuation characters are removed, the text is split with
    nltk.word_tokenize, and every token is lowercased and stemmed with a
    Porter stemmer.

    Parameters
    ----------
    text         (string)  : text to be tokenized

    Returns
    -------
    (string) tokenized text
    """
    without_punctuation = "".join(filter(lambda ch: ch not in string.punctuation, text))
    porter = PorterStemmer()
    stems = (porter.stem(token.lower()) for token in nltk.word_tokenize(without_punctuation))
    return " ".join(stems)

def tokenize_dataframe(dataframe):
    """Build one tokenized document per beer from the columns 'beer_name', 'brewery_name' and 'abv'.
    Used for preparing the documents of the information retrieval.

    Parameters
    ----------
    dataframe    (pandas.DataFrame)  : beer dataset with columns 'beer_id', 'beer_name', 'brewery_name' and 'abv'

    Returns
    -------
    (dict) maps each beer_id to its document "name brewery abv"; a beer_id that appears
    several times keeps the document of its last row
    """
    return {
        row["beer_id"]: " ".join((
            tokenize(row["beer_name"]),
            tokenize(row["brewery_name"]),
            tokenize(str(row["abv"])),
        ))
        for _, row in dataframe.iterrows()
    }

def search_vec(query, features, threshold=0.1):
    """Return the positions of the documents similar enough to 'query'.

    Parameters
    ----------
    query      (string) : tokenized query, transformed with the module-level tfidf vectorizer
    features            : document vectors the query is compared with
    threshold  (float)  : minimal cosine similarity for a document to be kept

    Returns
    -------
    (list(int)) positions of the kept documents, by decreasing similarity
    """
    query_vector = tfidf.transform([query])
    scores = linear_kernel(query_vector, features).flatten()
    ranked = sorted(enumerate(scores), key=itemgetter(1), reverse=True)
    positions, ranked_scores = zip(*ranked)

    doc_ids = []
    for position, score in zip(positions, ranked_scores):
        # Scores are sorted, so the first one under the threshold ends the search.
        if score < threshold:
            break
        doc_ids.append(position)
    return doc_ids

def querify(query_df):
    """Produce the retrieval queries for the SAT beer dataset.

    Parameters
    ----------
    query_df    (pandas.DataFrame)  : dataframe of SAT beers. Must have columns 'nom', 'alcool' and 'brasseur'.

    Returns
    -------
    (dict) maps the index label of each beer to its query "name brewer alcohol"
    """
    return {
        label: " ".join((
            tokenize(sat_beer['nom']),
            tokenize(sat_beer['brasseur']),
            tokenize(str(sat_beer['alcool'])),
        ))
        for label, sat_beer in query_df.iterrows()
    }

def display_results_df(base_df,results_df,reference_column,results):
    """Put a reference column and a results column side by side.

    Parameters
    ----------
    base_df           (pandas.DataFrame) : dataframe holding the reference column
    results_df        (pandas.DataFrame) : dataframe holding the results column
    reference_column  (string)           : name of the column taken from 'base_df'
    results           (string)           : name of the column taken from 'results_df'

    Returns
    -------
    (pandas.DataFrame) new dataframe indexed like 'base_df', with the results aligned on that index
    """
    side_by_side = pd.DataFrame({reference_column: base_df[reference_column]})
    side_by_side[results] = results_df[results]
    return side_by_side

### Helper functions for word count, language identification and date 
def fetch_ratings(dataset):
    """Fetches ratings of beers for plotting and exploratory data analysis.

    The ratings are read part by part from "data/{dataset}_ratings_part_{i}.csv"
    (17 parts for 'BeerAdvocate', 15 otherwise). Missing ratings are dropped in
    every part.

    Parameters
    ----------
    dataset    (string)  : name of the dataset. Either 'BeerAdvocate' or 'RateBeer'

    Returns
    -------
    (pandas.Series) pandas.Series of floats with all the non-missing ratings of the dataset
    """
    # NOTE(review): this function reads from "data/" while debiasing reads from
    # "DATA/" -- confirm the intended folder on case-sensitive file systems.
    csv_count = 17 if dataset == "BeerAdvocate" else 15

    rating_parts = []
    for index in range(csv_count):
        part = pd.read_csv(f"data/{dataset}_ratings_part_{index}.csv",low_memory=False)
        # NaNs are not standardized between the datasets ('NaN', 'nan', ' nan').
        # float() understands all of these markers, so converting first and
        # dropping afterwards removes them uniformly in every part. Previously
        # the filter of the later parts was computed but never applied.
        rating_parts.append(part["rating"].astype(float).dropna())
    return pd.concat(rating_parts)

def summary_analysis(dataset):
    """Produces the word counts of the reviews and their creation dates for exploratory data analysis purposes.

    The ratings are read part by part from "data/{dataset}_ratings_part_{i}.csv"
    (17 parts for 'BeerAdvocate', 15 otherwise), in order to not load the full
    dataset in memory. In every part, the rows without a rating and the reviews
    of at most one word are dropped.

    Parameters
    ----------
    dataset     (string)  : name of the dataset. Either 'RateBeer' or 'BeerAdvocate'

    Returns
    -------
    (pandas.Series) word count of each kept review (column 'word_count')
    (pandas.Series) content of the 'date' column for the same reviews
    """
    csv_count = 17 if dataset == "BeerAdvocate" else 15

    print("Started counting words and binning dates...")
    count_parts = []
    date_parts = []
    for index in range(csv_count):
        part = pd.read_csv(f"data/{dataset}_ratings_part_{index}.csv",low_memory=False)

        # NaNs between datasets are not standardized in the txt file ('NaN',
        # 'nan', ' nan', or a real NaN once parsed by pandas). Comparing the
        # normalized text form catches all of them. The same filter is applied
        # to every part (it used to be applied to part 0 only).
        has_rating = part["rating"].astype(str).str.strip().str.lower() != "nan"
        part = part[has_rating].copy()

        part["word_count"] = part["text"].apply(lambda x: len(str(x).split()))
        # We filter reviews which have only one word (NaNs in BeerAdvocate mostly)
        part = part[part["word_count"] > 1]

        count_parts.append(part["word_count"])
        date_parts.append(part["date"])
    print("Done")
    return pd.concat(count_parts), pd.concat(date_parts)



def debiasing(website,beer_df,unique_user):
    """Corrects for the bias of a given dataset. The correction heuristic is inspired from https://krisjensen.github.io/files/bias_blog.pdf/.
    Correction is implemented with clipping (such that all ratings are between 0 and 5) and attenuation (such that users with only 1 rating are not corrected and that the correction increases with number of ratings).

    The ratings are read from 'DATA/{website}_ratings_part_{i}.csv' in two passes:
    the first pass sums the ratings of each user to estimate the user biases, the
    second pass subtracts the bias from every rating and sums the corrected
    ratings of each beer.

    Parameters
    ----------
    website         (string)  :  name of the website/dataset considered. Either "RateBeer" or "BeerAdvocate"
    beer_df         (pandas.DataFrame) : dataframe of beers with columns 'beer_id', 'avg' and 'nbr_ratings'.
                                         It is modified in place: a 'debiased_avg' column is added.
    unique_user     (pandas.DataFrame) : dataframe of beer reviewers (without duplicates) with columns 'user_id' and
                                         'nbr_ratings', used as a basis to determine systematic reviewer bias

    Returns
    -------
    None. The corrected beers are saved in 'DATA/{website}_beers_corrected_avg.csv'.

    Raises
    ------
    ValueError : if 'website' is neither "RateBeer" nor "BeerAdvocate"
    """
    # Number of csv parts of each ratings dataset
    csv_counts = {"RateBeer": 15, "BeerAdvocate": 17}
    if website not in csv_counts:
        raise ValueError(f'Unknown website {website!r}: expected "RateBeer" or "BeerAdvocate"')
    num_csv = csv_counts[website]
    average_rating = beer_df.avg.mean()

    # First pass: sum of the ratings of each user. Every part is read, part 0
    # included: it used to be skipped here although 'nbr_ratings' counts the
    # ratings of all the parts, which distorted the average of each user.
    partial_user_sums = []
    for i in range(num_csv):
        temp = pd.read_csv(f'DATA/{website}_ratings_part_{i}.csv')
        partial_user_sums.append(temp.groupby("user_id").rating.sum())
    grouped_by_users = (
        pd.concat(partial_user_sums)
        .groupby(level=0).sum()  # we compile the results from all our csvs
        .rename_axis("user_id")
        .reset_index()
    )

    # We map the number of ratings to the corresponding user_ids
    user_to_nbr_ratings = dict(zip(unique_user.user_id, unique_user.nbr_ratings))
    grouped_by_users["nbr_ratings"] = grouped_by_users.user_id.map(user_to_nbr_ratings)

    # Attenuation coefficient: affine function of the number of ratings, equal
    # to 0 for a user who rated once (the bias is cancelled) and to 1 for the
    # user who rated the most (the bias is not attenuated).
    maximum_rating = grouped_by_users["nbr_ratings"].max()  # skips users without a known count
    if maximum_rating > 1:
        attenuation = (grouped_by_users["nbr_ratings"] - 1) / (maximum_rating - 1)
    else:
        # Nobody rated more than once: no bias can be estimated.
        attenuation = 0.0
    grouped_by_users["attenuation coeff"] = attenuation

    # The sum of the ratings of a user divided by their number of ratings is
    # their average rating. The bias is the average rating of the user minus the
    # average rating of all beers, multiplied by the attenuation coefficient.
    user_average = grouped_by_users["rating"] / grouped_by_users["nbr_ratings"]
    grouped_by_users["bias"] = (user_average - average_rating) * grouped_by_users["attenuation coeff"]
    user_to_bias = dict(zip(grouped_by_users.user_id, grouped_by_users.bias))

    # Second pass: we apply the correction to each rating and clip the result so
    # it is always in [0,5], then sum the corrected ratings of each beer.
    partial_beer_sums = []
    for i in range(num_csv):
        temp = pd.read_csv(f'DATA/{website}_ratings_part_{i}.csv')
        temp["bias"] = temp.user_id.map(user_to_bias)
        temp["debiased_rating"] = (temp["rating"] - temp["bias"]).clip(lower=0, upper=5)
        #Needed to save work, only need to be done once
        #temp.to_csv(f"DATA/{website}_ratings_part_{i}_corrected_w_attenuation.csv")
        partial_beer_sums.append(temp.groupby("beer_id").debiased_rating.sum())
    beer_to_debiased_rating = pd.concat(partial_beer_sums).groupby(level=0).sum()

    # The debiased average of a beer is the sum of its debiased ratings divided
    # by its number of ratings.
    beer_to_nbr_ratings = dict(zip(beer_df.beer_id, beer_df.nbr_ratings))
    beer_df["debiased_avg"] = beer_df.beer_id.map(beer_to_debiased_rating) / beer_df.beer_id.map(beer_to_nbr_ratings)
    beer_df.to_csv(f"DATA/{website}_beers_corrected_avg.csv",index=False)


# Beer styles grouped by family. Three misspelled style names ("Traditional ALe",
# "Dumbler Bock", "Esibock") are corrected so these styles are no longer left
# without a family, and the duplicated entry of Sour_beer is removed.
# NOTE(review): the corrected spellings assume the dataset uses "Traditional Ale",
# "Dunkler Bock" and "Eisbock" -- confirm against the style column.
Anglo_American_Ales=['Altbier', 'Barley Wine',"Bitter",'Premium Bitter/ESB',"Golden Ale/Blond Ale","Brown Ale", "California Common","Cream Ale","Black IPA","India Pale Ale (IPA)","Imperial IPA","Session IPA","Kölsch","American Pale Ale","Irish Ale","English Strong Ale", "American Strong Ale","Mild Ale","Amber Ale","English Pale Ale","Traditional Ale","Scotch Ale","Old Ale","Scottish Ale"]
Beligan_Style_Ales=["Belgian Ale","Belgian Strong Ale","Bière de Garde","Abbey Dubbel",'Abt/Quadrupel',"Saison","Abbey Tripel"]
Lagers=["Pale Lager","Premium Lager","Imperial Pils/Strong Pale Lager","India Style Lager","Amber Lager/Vienna",'Czech Pilsner (Světlý)',"Pilsener","Heller Bock","Doppelbock","Dunkler Bock","Weizen Bock","Eisbock","Malt Liquor","Oktoberfest/Märzen","Radler/Shandy","Zwickel/Keller/Landbier","Dortmunder/Helles",'Dunkel/Tmavý','Schwarzbier','Polotmavý']
Stout_and_Porter=["Stout","Imperial Stout","Foreign Stout","Sweet Stout","Dry Stout","Porter","Baltic Porter","Imperial Porter"]
Wheat_beer=["Wheat Ale","Witbier",'German Hefeweizen','Dunkelweizen','German Kristallweizen']
Sour_beer=["Berliner Weisse","Sour/Wild Ale","Sour Red/Brown",'Grodziskie/Gose/Lichtenhainer','Lambic Style - Gueuze', 'Lambic Style - Unblended','Lambic Style - Faro','Lambic Style - Fruit']
Other_styles=["Spice/Herb/Vegetable","Smoked",'Fruit Beer',"Sahti/Gotlandsdricke/Koduõlu",'Low Alcohol','Specialty Grain']
Cider_Mead_Saké=['Cider','Mead','Saké - Daiginjo', 'Saké - Namasaké','Saké - Ginjo', 'Saké - Infused', 'Saké - Tokubetsu','Saké - Junmai', 'Saké - Nigori', 'Saké - Koshu', 'Saké - Taru','Saké - Honjozo', 'Saké - Genshu', 'Saké - Futsu-shu','Perry']

# Maps every style to the name of its family. Families are processed in order,
# so a style listed in several families would keep the last one.
beer_style_dict = {
    style: family
    for family, styles in (
        ("Anglo American Ales", Anglo_American_Ales),
        ("Belgian Style Ales", Beligan_Style_Ales),
        ("Lagers", Lagers),
        ("Stout and Porter", Stout_and_Porter),
        ("Wheat beer", Wheat_beer),
        ("Sour beer", Sour_beer),
        ("Other styles", Other_styles),
        ("Cider, Mead and Saké", Cider_Mead_Saké),
    )
    for style in styles
}


plot_helpers.py

In [None]:
# Full name -> two-letter abbreviation for the US states, the District of
# Columbia and the US territories. generate_map uses it to turn state names into
# the codes expected by plotly's 'USA-states' location mode.
STATES = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}


def _map_hover_template(region_label):
    """Build the hover template shared by the world map and the USA map; only the label of the first line differs."""
    separator = "\n" + " " * 28
    return separator.join((
        "   <br><b>" + region_label + "</b>: %{text}",
        "<br><b>Beer</b>: %{customdata[0]}",
        "<br><b>Brewery</b>: %{customdata[1]}",
        "<br><b>Mean Rating</b>: %{customdata[2]:.2f}",
        "<br><b>Type</b>: %{customdata[3]}",
        "<br><b>Positive words in reviews</b>: %{customdata[4]}",
        # customdata[5] holds 'neg_words' (it used to be labelled "Positive words")
        "<br><b>Negative words in reviews</b>: %{customdata[5]}<br><extra></extra>",
    ))


def _build_choropleth(frame, hover_text, locationmode, region_label):
    """Build the choropleth figure of 'frame', coloured by 'normalized_rating' and located with its 'location' column."""
    hover_data = np.stack((frame["beer_name"],
                           frame["brewery_name"],
                           frame["normalized_rating"],
                           frame["style"],
                           frame["pos_words"],
                           frame["neg_words"]), axis=-1)
    return go.Figure(data = go.Choropleth(
        locations = frame['location'], #names or abbreviations used to place data on the map
        locationmode = locationmode,
        z = frame['normalized_rating'], #data that describes the choropleth value-to-color mapping
        text = hover_text, #pop-up for each location
        colorscale = 'Viridis',
        autocolorscale=False,
        reversescale=True,
        marker_line_color='darkgray',
        marker_line_width=0.5,
        colorbar_tickprefix = 'average rating ',
        colorbar_title = "Mean average rating",
        customdata = hover_data,
        hovertemplate = _map_hover_template(region_label),
    ))


def generate_map(filename, map_name, usa= False, 
                 html = True, show_map = True,
                 title = '',
                 source_file_path = 'map/file_for_map/', 
                 html_file_path = 'map/html/'):

    ''' Author: Gabriel Benato @HOTCHOCOLATE, ADA2022
    This function generates an interactive map of the world (and optionally of the USA)
    coloured by the normalized rating of the favoured beer of each location, with the
    beer, its brewery, its style and the positive/negative words shown on hover.
    It is assumed it will be used in a file at the root of the project.
    If it's not the case see: source_file_path and html_file_path.

    Parameters
    ----------
    filename         (string)  :  Name of the source file containing the necessary dataframe for map generation.
                                  It should contain the columns 'location', 'beer_name', 'brewery_name', 'style',
                                  'normalized_rating', 'pos_words' and 'neg_words'.
    map_name         (string)  :  HTML file's name for the resulting interactive map.
    usa              (boolean) :  Activate the generation of the USA's map.
    html             (boolean) :  Activate the generation of html file.
    show_map         (boolean) :  Show the produced map(s).
    title            (string)  :  Title of the world plot.
    source_file_path (string)  :  Path to the source file.
    html_file_path   (string)  :  Path to the future html file.


    Returns
    -------
    None, but may generate html file of interactive map and show generated map.
    '''

    data = pd.read_csv(source_file_path + filename)

    ### NOTE: Clean corrupted data (locations holding a link or html), but we should do it before
    corrupted = (data['location'].str.contains("http", regex=False, na=False)
                 | data['location'].str.contains("<", regex=False, na=False))
    #reset index so we don't make error due to assumption of continuous index
    data = data[~corrupted].reset_index(drop=True)

    # We have multiple occurrences of the USA (one per state) but only one of them
    # is kept for the world map: California, relabelled as "United States".
    # The mask of all the states is kept for the USA-only map.
    is_usa = data['location'].str.contains("United States", regex=False, na=False)
    is_california = is_usa & data['location'].str.contains("California", regex=False, na=False)

    location_country = data[~is_usa | is_california].copy()
    location_country['location'] = location_country['location'].mask(is_california, "United States")

    #Plot the worldwide figure
    fig_world = _build_choropleth(location_country, location_country['location'],
                                  'country names', "Country")
    fig_world.update_layout(
        title_text=title,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
    )
    #print the worldwide map
    if show_map :
        fig_world.show()
    #Create an html file of the map for the site
    if html :
        fig_world.write_html(html_file_path + map_name +"_country.html")

    #Activate if we want USA map
    if usa:
        united_states = data[is_usa].copy()
        #Only keep the name of the state; computed on the whole column because the
        #filtered frame no longer has a continuous index
        state_names = united_states['location'].str.split('States, ', n=1).str[1]
        #switch state by their abbreviation to fit
        #the locationmode 'USA-states' of the plotly library
        united_states['location'] = state_names.map(STATES)

        #plot the usa map; the hover text is the name of the state (it used to be
        #taken from the world dataframe)
        fig_usa = _build_choropleth(united_states, state_names, 'USA-states', "States")
        fig_usa.update_layout(
            title_text='Zoom on the USA',
            geo=dict( scope='usa'), #switch from world-map to USA
            paper_bgcolor='rgba(0,0,0,0)',
            plot_bgcolor='rgba(0,0,0,0)',
        )
        #print the map
        if show_map :
            fig_usa.show()
        #generate html file of USA map
        if html :
            fig_usa.write_html(html_file_path + map_name +"_usa.html")

def combine_neg_pos_and_favoured_beer(neg_pos_filename, favoured_filename, combined_filename, 
                                       source_file_path = 'map/file_for_map/'):
    """
    Join the positive/negative word table onto the favoured beer table and save the result.

    Both inputs are read from 'source_file_path' and joined on their "location"
    column with an outer join, so a location present in only one file is kept.
    Missing "pos_words"/"neg_words" entries are replaced by the string 'Unknown'.
    The combined table is written as csv (index included) to 'source_file_path'.

    Parameters
    ----------
    neg_pos_filename  (string)  :  Name of the source file from the positive-negative word analysis.
    favoured_filename (string)  :  Name of the source file from the favoured beer analysis.
    combined_filename (string)  :  Name of the resulting combined file.
    source_file_path  (string)  :  Path prefix shared by the source files and the output file.

    Returns
    -------
    (None) the combined table is only written to disk
    """
    word_table = pd.read_csv(source_file_path + neg_pos_filename)
    beer_table = pd.read_csv(source_file_path + favoured_filename)

    # Only the word columns (plus the join key) are carried over from the word analysis
    word_columns = word_table[["location", "neg_words", "pos_words"]]
    combined = pd.merge(beer_table, word_columns,
                        how="outer", left_on="location", right_on="location")

    for column in ("pos_words", "neg_words"):
        combined[column] = combined[column].fillna('Unknown')

    combined.to_csv(source_file_path + combined_filename)
    return None

def find_high_rating_review(df):
    """ To be applied to each group of a GroupBy object (e.g. pandas.DataFrame.groupby(...).apply(find_high_rating_review))
    Filters the input dataframe by selecting rows with reviews that verify:
    1. Max rating was given
    2. The overall rating (sum of taste, aroma, ...) is less than 5 points below the maximum value
    3. The review is not empty (has at least 2 characters)
    Up to 5 of these reviews are drawn at random and concatenated (space separated).

    Arguments
    ------
    df (pandas.DataFrame) : reviews of one beer; needs the columns "rating", "overall", "text",
                            "beer_id", "beer_name", "style" and "brewery_name"
    
    Returns
    ------
    (pandas.DataFrame)         : one-row dataframe with the beer description (taken from the first row
                                 of 'df') and the concatenated high rating reviews in "good_reviews"

    """
    max_rating = df["rating"].max()
    max_overall = df["overall"].max()
    # Per-review length: missing reviews count as empty so that they are filtered out too
    has_text = df["text"].fillna("").astype(str).str.len() > 1
    is_nice = (df["rating"] == max_rating) & (df["overall"] > max_overall - 5) & has_text
    nice_reviews = df.loc[is_nice, "text"]
    nice_reviews_sample = nice_reviews.sample(n=min(5, len(nice_reviews)))
    concatenated_nice_reviews = " ".join(str(review) for review in nice_reviews_sample)
    first_row = df.iloc[0]
    return pd.DataFrame([{"beer_id": first_row["beer_id"],
                          "beer_name": first_row["beer_name"],
                          "style": first_row["style"],
                          "brewery_name": first_row["brewery_name"],
                          "good_reviews": concatenated_nice_reviews}])


def retrieve_reviews_SAT(website):
    """ Retrieves reviews corresponding to beers sold on SAT in the dataset corresponding to a given 'website' and to beers favoured by each country
    Parameters
    ----------
    website     (string)  : name of the dataset. Either 'RateBeer' or 'BeerAdvocate'
    Returns
    -------
    (pandas.DataFrame) best reviews of the beers favoured by each country (one row per beer)
    (pandas.DataFrame) best reviews of the best ranked beers sold at SAT that are found in 'website' (one row per beer)

    Raises
    ------
    ValueError : if 'website' is neither 'RateBeer' nor 'BeerAdvocate'
    """
    # Number of partitioned review csv files and file acronym of each dataset
    dataset_info = {"RateBeer": (72, "RB"), "BeerAdvocate": (26, "BA")}
    if website not in dataset_info:
        raise ValueError(f"Unknown website '{website}': expected 'RateBeer' or 'BeerAdvocate'")
    total_csv, acronym = dataset_info[website]

    #Favourite beer of each country according to users
    fav_beer = pd.read_csv(f"DATA/favourite_beer_{acronym}.csv")["beer_id"].unique().tolist()
    #Take the 13 best SAT beers according to users (first rows of the sorted prediction file)
    sat_ranking = pd.read_csv(f"DATA/predicted_SAT_{acronym}_sorted.csv", index_col=0)
    best_sat_ids = sat_ranking[f"{acronym}_beer_id"].values[0:13]

    #Iterate over all the partitioned dataset, keeping only the reviews of interest of each part.
    #The parts are concatenated once at the end instead of growing a dataframe inside the loop.
    fav_parts = []
    sat_parts = []
    for i in range(total_csv):
        part = pd.read_csv(f"DATA/{website}_reviews_part_{i}.csv")
        fav_parts.append(part[part["beer_id"].isin(fav_beer)])
        sat_parts.append(part[part["beer_id"].isin(best_sat_ids)])
    best_reviews = pd.concat(fav_parts, join="outer")
    best_reviews_SAT = pd.concat(sat_parts, join="outer")

    best_reviews = best_reviews.groupby(by="beer_id").apply(find_high_rating_review).reset_index(drop=True)
    best_reviews_SAT = best_reviews_SAT.groupby(by="beer_id").apply(find_high_rating_review).reset_index(drop=True)
    return best_reviews, best_reviews_SAT


def add_country_column(target_df,country_csv_path,california_as_usa=True):
    """
    Given a dataset of beer reviews 'target_df', augments the dataset with the location of the reviewer from an accessory dataset located in 'country_csv_path'
    
    Arguments
    ------
    target_df           (pandas.DataFrame): dataframe in which the country column will be added (needs a "beer_id" column)
    country_csv_path    (string)          : path to the csv file with the "beer_id" and "location" columns
    california_as_usa   (boolean)         : if True, rows of individual USA states are dropped, except for California
    Returns
    ------
    (pandas.DataFrame) DataFrame with "location" column added. Rows of 'target_df' without a match in the
                       country file are kept, with a missing location.

    """
    
    #Recover the dataframe of favourite beer for users of each country. 
    #Drop countries for which there were not enough reviewers (tagged as favourite beer_id = -1)
    countries = pd.read_csv(country_csv_path)
    countries = countries[countries["beer_id"] != -1.]
    best_with_countries = target_df.merge(countries[["beer_id","location"]],how="left",on="beer_id")
    #Clean individual states of United States, by keeping only California
    if california_as_usa:
        location = best_with_countries["location"]
        # na=False: unmatched rows have a missing location, which must count as "not a USA state"
        # (otherwise the mask contains NaN and cannot be negated)
        is_usa = location.str.contains("United States", regex=False, na=False)
        is_california = location.str.contains("California", regex=False, na=False)
        best_with_countries = best_with_countries[~(is_usa & ~is_california)]
    return best_with_countries


def request_embeddings(series,verbose=True,wait_seconds=6):
    """
    Given a series of textual reviews, sends a series of calls to OpenAI Embeddings API for the ADA model embeddings. 
    Calls are separated by 'wait_seconds' seconds to avoid RateLimitError from the OpenAI API, and each call is
    retried with exponential backoff (up to 6 attempts).
    
    Parameters
    ----------
    series       (pd.Series)  : Series of reviews (strings) to be embedded by text-embedding-ada-002
    verbose      (bool)       : if True, reviews are printed as they are sent to the OpenAI API
    wait_seconds (float)      : pause between two consecutive requests. Default is 6 seconds
    Returns
    -------
    (numpy.array) array with one row per review, holding the embedding returned by the API
                  (1536 values per review for text-embedding-ada-002)
    
    
    """
    import time
    from tenacity import retry, wait_random_exponential, stop_after_attempt
    #We use exponential backoff to limit adaptively our request rate
    # NOTE(review): openai.Embedding.create is the interface of the openai package before 1.0 - confirm the pinned version
    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def get_embedding_exponential_bo(text: str, engine="text-embedding-ada-002") -> list[float]:

        # replace newlines, which can negatively affect performance.
        text = text.replace("\n", " ")
        return openai.Embedding.create(input=[text], engine=engine)["data"][0]["embedding"]
    corpus = series.values
    embeddings = []
    last_position = len(corpus) - 1
    #We send request to the API on a loop in order to control the request rate and avoid being limited
    for position, review in enumerate(corpus):
        if verbose:
            print("Sending API request to embed the following review:\n")
            print(review)
        embeddings.append(get_embedding_exponential_bo(review, engine='text-embedding-ada-002'))
        #Waiting time between requests; nothing to wait for after the last one
        if position < last_position:
            time.sleep(wait_seconds)

    embeddings = np.array(embeddings)
    if verbose:
        print(embeddings.shape)
    return embeddings

### Plotting a t-SNE with plotly with custom markers ###

def plot_tsne(SAT_embeddings,dataset_embeddings,sat_df,dataset_df,perplexity=10,country_size=20,beer_size=20,acronym="",title=""):
    """
    Given arrays of embeddings corresponding to reviews of SAT and of the preferred beers of each country for a given dataset,
    plot a t-SNE graph where embeddings corresponding to SAT beers are rendered as beer images and where favourite beers are rendered as the flag of the country that prefers them.
    The plot can be customized for perplexity and marker size. It is shown and saved to "Images/{acronym}_tsne.html".

    Arguments
    --------
    SAT_embeddings      (numpy.array)      : one row of embedding per row of 'sat_df'
    dataset_embeddings  (numpy.array)      : one row of embedding per row of 'dataset_df' (same width as 'SAT_embeddings')
    sat_df              (pandas.DataFrame) : dataset from which the reviews corresponding to SAT beers were taken (for rendering data on hover).
                                             Needs the columns "beer_id", "beer_name", "brewery_name" and "style"
    dataset_df          (pandas.DataFrame) : dataset from which the reviews corresponding to countries were taken (for rendering data on hover).
                                             Needs the columns "location", "beer_name", "brewery_name" and "style"
    perplexity          (int)    : hyperparameter of t-SNE plots
    country_size        (int)    : size of country flag images in plot (in axis units)
    beer_size           (int)    : size of beer figures images in plot (in axis units)
    acronym             (string) : Prefix used for saving the plot
    title               (string) : Title of the plot

    Returns
    -------
    (None) but shows the figure and saves it in a separate file
    """
    #We do not have flag images for certain countries ::sad_face::
    countries_without_flag = frozenset([
        "Afghanistan","Albania","Bahrain","Benin","Bosnia-and-Herzegovina","Botswana","Bulgaria","Burkina-Faso","Burundi","Cambodia",
        "Central-African-Republic","Comoros","Congo,-Dem.-Rep.","Congo,-Rep.","Costa-Rica","Cote-d'Ivoire","El-Salvador",
        "Equatorial-Guinea","Eritrea","Ethiopia","Gabon","Gambia","Ghana","Guatemala","Guinea","Guinea-Bissau","Haiti","Honduras",
        "Hong-Kong,-China","Hungary","Indonesia","Iraq","Jordan","Kenya","Korea,-Dem.-Rep.","Korea,-Rep.","Kuwait","Lebanon",
        "Lesotho","Liberia","Libya","Madagascar","Malawi","Malaysia","Mali","Mauritania","Mauritius","Mongolia","Montenegro",
        "Mozambique","Myanmar","Namibia","Nepal","Nicaragua","Niger","Nigeria","Oman","Pakistan","Panama","Paraguay",
        "Philippines","Puerto-Rico","Reunion","Romania","Rwanda","Sao-Tome-and-Principe","Saudi-Arabia","Senegal","Serbia",
        "Sierra-Leone","Singapore","Slovak-Republic","Slovenia","Somalia","Sri-Lanka","Sudan","Swaziland","Syria","Taiwan",
        "Tanzania","Togo","Trinidad-and-Tobago","Tunisia","Uganda","United-Kingdom","Vietnam","West-Bank-and-Gaza","Yemen,-Rep.",
        "Zambia","Zimbabwe",
    ])

    #SAT embeddings come first, then the embeddings of the beers favoured by each country
    tsne_vectors = np.concatenate([SAT_embeddings,dataset_embeddings])
    label = np.concatenate([np.ones(len(SAT_embeddings)), np.zeros(len(dataset_embeddings))])
    hover_data = np.stack(
        (
            pd.concat([sat_df["beer_name"],dataset_df["beer_name"]]),
            pd.concat([sat_df["brewery_name"],dataset_df["brewery_name"]]),
            pd.concat([sat_df["style"],dataset_df["style"]]),
        ),
        axis=-1,
    )

    # NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` - confirm against the installed version
    tsne = TSNE(
        n_components=2, perplexity=perplexity, random_state=42, init="random", learning_rate=200,n_iter=10000
    )
    vis_dims2 = tsne.fit_transform(tsne_vectors)

    #'general_id' is the beer id for SAT beers and the country name for favoured beers
    df = pd.DataFrame({
        "is_sat_beer": label,
        "x": vis_dims2[:,0],
        "y": vis_dims2[:,1],
        "general_id": [str(number) for number in sat_df["beer_id"].values] + list(dataset_df["location"].values),
        "beer": list(sat_df["beer_name"].values) + list(dataset_df["beer_name"].values),
    })
    fig = px.scatter(
        df,
        x="x",
        y="y",
        hover_name="general_id",
        hover_data=["beer"],
        labels=dict(x="t-SNE  first dimension", y="t-SNE second dimension")

    )
    #Markers are transparent: they only carry the hover information, the images are drawn on top of them
    fig.update_traces(marker_color="rgba(0,0,0,0)",mode='markers',
                      customdata=hover_data,  
                      hovertemplate="""
                            <br><b>Beer</b>: %{customdata[0]}
                            <br><b>Brewery</b>: %{customdata[1]}
                            <br><b>Style</b>: %{customdata[2]}<br><extra></extra>""")

    def add_marker_image(image_path, row, size):
        #Draws the image stored in 'image_path' centered on the t-SNE coordinates of 'row'
        fig.add_layout_image(
            dict(
                source=Image.open(image_path),
                xref="x",
                yref="y",
                xanchor="center",
                yanchor="middle",
                x=row["x"],
                y=row["y"],
                sizex=size,
                sizey=size,
                sizing="contain",
                opacity=1,
                layer="above"
            )
        )

    #Beers sold on SAT are shown with their image, while embeddings representing favoured beers for each
    #country are shown with the country flag.
    for _, row in df.iterrows():
        general_id = row["general_id"].replace(" ","-")
        if row.is_sat_beer:
            add_marker_image(f"Images/beers/{general_id}.0.png", row, beer_size)
        elif general_id not in countries_without_flag:
            add_marker_image(f"Images/country_flags/icons8-{general_id.lower()}-100.png", row, country_size)

    fig.update_xaxes(showline=True, linewidth=1, linecolor='black', mirror=True,showgrid=False,zeroline=False)
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black', mirror=True,showgrid=False,zeroline=False)
    fig.update_layout(height=600, 
                      width=1000, 
                      paper_bgcolor='rgba(0,0,0,0)',
                      plot_bgcolor='rgba(0,0,0,0)',
                      title_text=title)

    fig.show()

    fig.write_html(f"Images/{acronym}_tsne.html",config = {'displayModeBar': True})

def retrieve_style_list(website):
    """
    Returns a list of all beer styles of the dataset of given 'website'

    Parameters
    ----------
    website         (string)  :  website corresponding to the dataset being processed. Either 'RateBeer' or 'BeerAdvocate'

    Returns
    -------
    (list) : A lexicographically sorted list of beer styles (as strings)

    Raises
    ------
    ValueError : if 'website' is neither 'RateBeer' nor 'BeerAdvocate'
    """
    # Number of partitioned review csv files of each dataset
    total_csv = {"RateBeer": 72, "BeerAdvocate": 26}
    if website not in total_csv:
        raise ValueError(f"Unknown website '{website}': expected 'RateBeer' or 'BeerAdvocate'")

    style_set = set()
    for index in range(total_csv[website]):
        # Only the style column is needed: the (large) review text is not loaded
        part = pd.read_csv(f"DATA/{website}_reviews_part_{index}.csv",usecols=["style"],low_memory=False).astype(str)
        style_set.update(part["style"].unique())
    return sorted(style_set)
def plot_wordcloud_dropdown():
    """
    Plots wordclouds corresponding to beer reviews for all beer styles in RateBeer and BeerAdvocate. Style can be chosen with a dropdown menu.
    One image trace is created per style (BeerAdvocate styles first, then RateBeer styles); only the
    trace selected in the dropdown is visible.
    
    Arguments
    -------
    (None)

    Returns
    ------
    (None) but saves figure in a separate file
    
    """
    import base64
    from io import BytesIO

    # Initialize figures
    fig = go.Figure(layout=go.Layout(width=500, height=500,
                                    xaxis=dict(range=[280, 680],
                                            fixedrange = False),
                                    yaxis=dict(range=[620, 100],
                                            fixedrange = False
                                    ),
                                    ))
    #List all styles that will be shown, as (file prefix, dropdown label, style) entries
    style_list_BA = retrieve_style_list("BeerAdvocate")
    style_list_RB = retrieve_style_list("RateBeer")
    entries = [("BA", "BeerAdvocate style : " + style, style) for style in style_list_BA]
    entries += [("RB", "RateBeer style : " + style, style) for style in style_list_RB]

    #Create all renderings in the plot
    for i, (file_prefix, _, style) in enumerate(entries):
        #File names were saved without the "/" of the style name
        file_style = style.replace("/","")
        with Image.open(f'Images/word_clouds/{file_prefix}_{file_style}_wordcloud.png') as pil_img, BytesIO() as stream:
            pil_img.save(stream, format="png")
            base64_string = "data:image/png;base64," + base64.b64encode(stream.getvalue()).decode("utf-8")
        #Only the first rendering is visible when the figure is opened
        fig.add_trace(go.Image(source=base64_string,
                               x0=0,
                               y0=0,
                               dx=1,
                               dy=1,
                               visible=(i == 0),))
    fig.update_traces(
                hovertemplate = None,
                hoverinfo = "skip")

    #Create masks to activate only one rendering at a time
    trace_count = len(entries)
    buttons = [{'label': label,
                'method': 'update',
                'args': [{"visible": [position == i for position in range(trace_count)]}]}
               for i, (_, label, _) in enumerate(entries)]
    # Add Annotations and Buttons

    fig.update_layout(template="simple_white",
                updatemenus=[dict(
                #The selected entry matches the trace that is visible at start
                active=0,
                x=1.05,
                y=1.1,
                buttons=buttons,
            )
        ])
    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)
    fig.show()
    #Export for site
    fig.write_html(f"Images/test_wordclouds.html",config = {'displayModeBar': False})





def create_rank_plot(BA_sorted,RB_sorted):
    """Creates plots of rankings (with option for normalized rankings) for BeerAdvocate and RateBeer datasets.
    RateBeer is drawn in the left panel and BeerAdvocate in the right panel. A pair of buttons switches
    between the raw average rating and the rating normalized by price and serving volume.

    Side effect: both input dataframes are modified in place ("prix" is replaced by the last decimal number
    found in it and a "normalized_rating" column is added).
    
    Arguments
    --------
    BA_sorted   (pandas.DataFrame) : dataframe with SAT beers with ratings estimated from/found in BeerAdvocate
    RB_sorted   (pandas.DataFrame) : dataframe with SAT beers with ratings estimated from/found in RateBeer
                                     Both need the columns "nom", "brasseur", "avg", "type", "prix" and "vol"
    
    Returns
    ----
    (None) but save figure in a separate file
    """

    def compute_normalized_ranking(dataframe):
        #Keep the last decimal number of the price string, then divide the rating by (volume * price)
        dataframe["prix"] = dataframe["prix"].apply(lambda price: re.findall(r"\d+\.\d+", price)[-1])
        dataframe["normalized_rating"] = (dataframe["avg"]/(dataframe["vol"].astype(float)*dataframe["prix"].astype(float)))
        ranked = dataframe.sort_values(by="normalized_rating",ascending=False)
        #Rescale so that the best normalized rating is 5
        ranked["normalized_rating"] = ranked["normalized_rating"] * 5 / ranked["normalized_rating"].max()
        return ranked

    def hover_columns(frame, rating_column):
        #Columns displayed on hover, in the order expected by the hover template
        return np.stack((frame['nom'],frame['brasseur'],frame[rating_column], frame['type'],frame['prix']), axis=-1)

    def hover_template(rating_label):
        line_break = "\n" + " " * 56
        return (" <br><b>Brew</b>: %{customdata[0]}"
                + line_break + "<br><b>Brewery</b>: %{customdata[1]}"
                + line_break + "<br><b>" + rating_label + "</b>: %{customdata[2]:.2f}"
                + line_break + "<br><b>Type</b>: %{customdata[3]}"
                + line_break + "<br><b>Price</b>: %{customdata[4]}<br><extra></extra>")

    RB_normalized = compute_normalized_ranking(RB_sorted)
    BA_normalized = compute_normalized_ranking(BA_sorted)

    raw_label = "Mean Rating"
    normalized_label = "Rating normalized by price and serving volume"
    #(dataframe, rating column, hover label, visible at start, subplot column)
    #The order of the traces must match the "visible" masks of the buttons below
    traces = [
        (RB_sorted, "avg", raw_label, True, 1),
        (RB_normalized, "normalized_rating", normalized_label, False, 1),
        (BA_sorted, "avg", raw_label, True, 2),
        (BA_normalized, "normalized_rating", normalized_label, False, 2),
    ]

    fig_rank = make_subplots(rows=1, cols=2, horizontal_spacing = 0.1)
    fig_rank.update_xaxes(showgrid=False)
    fig_rank.update_yaxes(autorange = "reversed")
    for frame, rating_column, rating_label, visible, subplot_col in traces:
        fig_rank.add_trace(go.Bar(
                                y=frame["nom"],
                                x=frame[rating_column],
                                orientation="h",
                                visible=visible,
                                width=1,
                                customdata=hover_columns(frame, rating_column),
                                hovertemplate=hover_template(rating_label)),
                           row=1,
                           col=subplot_col,)

    fig_rank.update_layout( autosize=False,
                            margin=dict(t=0, b=0, l=0, r=0),
                            paper_bgcolor='rgba(0,0,0,0)',
                            plot_bgcolor='rgba(0,0,0,0)', 
                            xaxis_title="Beer rating",
                            yaxis_title="Beer name",
                       )

    fig_rank.update_layout(showlegend=False,
                            annotations=[
                                dict(text="Normalize by price and volume:", x=0.88, xref="paper", y=1.05, yref="paper",
                                 align="left", showarrow=False)],
                           title_text='Ranking of SAT beers according to RateBeer and BeerAdvocate',
                           width=900,
                           height=1000,
                           #xaxis1 belongs to the left panel (RateBeer), xaxis2 to the right panel (BeerAdvocate)
                           xaxis1=dict(range=[0, 5],title="RateBeer rating"),
                           xaxis2=dict(range=[0, 5],title="BeerAdvocate rating"),
                           updatemenus=[
                                dict(
                                    type="buttons",
                                    direction="right",
                                    active=0,
                                    x=1,
                                    y=1.05,
                                    buttons=([
                                        dict(label="No",
                                             method="update",
                                             args=[{"visible": [True, False, True, False]}
                                                  ]),
                                        dict(label="Yes",
                                             method="update",
                                             args=[{"visible": [False, True, False, True]}
                                                   ]),
                                                ]),
                                            )
                                        ],
                           font=dict(size=10))
    fig_rank.show(config= dict(
                displayModeBar = False))
    fig_rank.write_html("Images/sat_rank_separated.html",config = {'displayModeBar': False})

read_data.py

In [None]:

# Default number of reviews written per csv file by fetch_reviews
MAX_CSV_SIZE = 1_000_000
# NOTE(review): not used in the visible part of this module - confirm it is still needed
CHUNK_SIZE = 200


def fetch_satellite_data():
    """Placeholder: always returns 0."""
    return 0


# Column names given to the review csv files, in the order the fields are collected by fetch_reviews
COLUMNS_NAMES = [
    "beer_name",
    "beer_id",
    "brewery_name",
    "brewery_id",
    "style",
    "abv",
    "date",
    "username",
    "user_id",
    "appearance",
    "aroma",
    "palate",
    "taste",
    "overall",
    "rating",
    "text",
]

def fetch_csv(dataset_path, name):
    """Reads a csv file stored inside a tar archive.

    Parameters
    ----------
    dataset_path  (string) :  path to the tar file containing the dataset
    name          (string) :  text that the path of the csv file inside the archive must contain

    Returns
    -------
    (pandas.DataFrame) content of the matching file. If several files match, the last one of the
                       archive is returned; if none matches, an empty dataframe is returned.
    """
    dataframe = pd.DataFrame()
    with tarfile.open(dataset_path) as tar:
        for member in tar.getmembers():
            if name not in member.name:
                continue
            extracted = tar.extractfile(member)
            # extractfile returns None for members that hold no data (e.g. a directory whose
            # name happens to contain 'name'): there is nothing to read from them
            if extracted is None:
                continue
            with extracted as file:
                dataframe = pd.read_csv(file)
    return dataframe


def fetch_reviews(dataset_path, max_csv_size = MAX_CSV_SIZE,early_stop = 0):
    """dumps ratings and/or reviews that are in a large text file to multiple csv files of max_csv_size length.
    The user is asked (through the console) which "txt.gz" file of the archive should be processed.
    In the text file, each review is a run of "key: value" lines closed by a blank line.
        
        Parameters
        ----------                                  
        dataset_path  (string)    :  path to the tar file containing the dataset
        max_csv_size  (int)       : max number of reviews of the csvs created by the function
        early_stop    (int)       : for debugging purposes. Stops function at early_stop csvs created. Default is 0 and creates as many csvs as needed
        
        
        Returns
        -------
        1 if 'early_stop' is set. Otherwise the dataframe of the last csv written by the final dump
        (an empty dataframe with the review columns if no review was left for it)

   
    """

    tarfile_name = re.search(r"[ \w-]+?(?=\.)",dataset_path)[0]

    def dump_to_csv(rows, csv_prefix, part_number):
        #Writes the reviews gathered so far to the csv file number 'part_number'
        frame = pd.DataFrame.from_dict(rows, orient="index")
        frame.columns = COLUMNS_NAMES
        frame.to_csv(f"{csv_prefix}_part_{part_number}.csv")
        print(f"Dumping data to csv number {part_number}...")
        return frame

    with tarfile.open(dataset_path) as tar:
        datadumps = [filename for filename in tar.getnames() if "txt.gz" in filename]
        print(datadumps)
        filename = input("Please choose a file from the list above to open: ")
        csv_prefix = f"DATA/{tarfile_name.replace('.tar','')}_{filename.replace('.txt.gz','')}"
        with tar.extractfile(filename) as file:
            with gzip.open(file,'rt') as f:
                review = []
                review_dict = {}
                row_count = 0
                csv_count = 0
                for line in f:
                    if len(line) < 2:
                        #Blank line: the current review is complete
                        review_dict[row_count] = review
                        row_count += 1
                        review = []
                        if row_count % max_csv_size == 0:
                            dump_to_csv(review_dict, csv_prefix, csv_count)
                            #Start a new chunk so that memory use stays bounded
                            review_dict = {}
                            csv_count += 1
                            if early_stop and csv_count == early_stop : 
                                break
                    else:
                        (key,value) = line.split(": ", 1)
                        #BeerAdvocate has one column called "review" that is useless and makes everything harder
                        if key != "review":
                            review.append(value.rstrip())
    
    #Dump the reviews of the last (incomplete) chunk. There is none after an early stop or when the
    #number of reviews is a multiple of max_csv_size: naming the columns of an empty frame would fail.
    if review_dict:
        df = dump_to_csv(review_dict, csv_prefix, csv_count)
    else:
        df = pd.DataFrame(columns=COLUMNS_NAMES)
    print("Success!")
    if early_stop :
        return 1
    else : 
        return df

def fetch_satellite_df():
    """Downloads the drinks listed on the Satellite bar web page.
    The page embeds the list as JSON in a javascript variable called 'djangoBoissons'.

    Returns
    -------
    (pandas.DataFrame) dataframe built from the embedded JSON

    Raises
    ------
    IndexError : if the 'djangoBoissons' variable is not found in the page
    """

    url = "https://satellite.bar/bar/"
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    #The JSON is everything between the assignment and the ";" that ends the line
    matches = re.findall(r'var djangoBoissons = (.*?);\s*$', soup.prettify(), re.M)
    if not matches:
        raise IndexError(f"'djangoBoissons' variable not found in the page {url}")
    satellite_dict = json.loads(matches[0])
    satellite_df = pd.DataFrame(satellite_dict)

    return satellite_df



def find_favourite_beers(website,threshold=10):
    """
        Calculates, for each user location, the beers/beer styles with most ratings and with the best
        average rating (among the ones having strictly more than 'threshold' ratings).

    Parameters
    ----------
    website         (string)  :  name of the website/dataset considered. Either 'RateBeer' or 'BeerAdvocate'
    threshold       (int)     :  number of ratings a beer/style must exceed to be considered a valid best beer/style

    Returns
    -------
    most_reviewed_beer    (dataframe)
    favorite_beer         (dataframe)
    most_reviewed_style   (dataframe)
    favorite_style        (dataframe)

    Raises
    ------
    ValueError : if 'website' is not one of the supported datasets

    """
    def correct_double_user_id(users,row):
        """ Routine that corrects the location of users with multiple user_id (case of Ratebeer dataset, when user_id is doubled, one of the entries has no location)
            To be used as an argument of pandas.DataFrame.apply()
            Parameters
        ----------
        users         (dataframe) :  dataframe with user information
        row           (array)     :  row in dataframe corresponding to user with two user_ids


        Returns
        -------
        location of the first entry of 'users' whose 'user_name' equals the stripped 'username' of the row

        """
        name = row["username"].strip()
        return users[users["user_name"] == name]['location'].values[0]

    def most_rated(df):
        """Returns the row of 'df' with the largest 'count'."""
        df = df.reset_index()
        return df.iloc[df['count'].idxmax()].copy()

    def best_rating(df,threshold,metric):
        """Returns the row of 'df' with the highest 'normalized_rating' among rows with 'count' > threshold.
        When no row qualifies, the first row is returned with placeholder values in the 'metric' column."""
        df = df.reset_index()
        row = df.loc[[0]].copy()
        df = df[df["count"] > threshold].copy()
        if len(df) == 0:
            row[metric] = "No beer with enough votes"
            if metric == "beer_name":
                row["beer_id"] = -1
                row["style"] = "-"
                row["brewery_name"] = "-"
                row["normalized_rating"] = "0"
            return row
        return df.loc[[df['normalized_rating'].idxmax()]].copy()

    # Number of "ratings_part_<n>_corrected_w_attenuation.csv" files available per dataset
    csv_parts_per_website = {"RateBeer": 15, "BeerAdvocate": 17}
    if website not in csv_parts_per_website:
        raise ValueError(f"Unknown website '{website}': expected one of {list(csv_parts_per_website)}")
    TOTAL_CSV = csv_parts_per_website[website]

    users = fetch_csv(f"DATA/{website}.tar","users")
    #Unknown location for some users, some users have user_id duplicates either in the users.csv (in this case, no location is available) or in ratings.txt:
    users["location"] = users["location"].fillna("Unknown")
    beer_most_drinked_by_country = pd.DataFrame([])
    style_most_drinked_by_country = pd.DataFrame([])

    beer_keys = ["location","beer_name","beer_id","style","brewery_name"]
    style_keys = ["location","style"]

    print(f"Task : find favourite beers/styles of users. Processing {website} dataset: {TOTAL_CSV} csv files in total.")
    for number in range(TOTAL_CSV):
        temp = pd.read_csv(f"DATA/{website}_ratings_part_{number}_corrected_w_attenuation.csv")

        temp = temp.merge(users[["user_id","location"]], how="left", on="user_id")
        # NOTE(review): user_id is only normalised to a stripped string after the merge - confirm the merge keys already match
        temp["user_id"] = temp["user_id"].apply(lambda x : str(x).strip())
        missing_location = temp["location"].isna()
        if missing_location.any():
            temp.loc[missing_location,"location"] = temp.loc[missing_location].apply(lambda row : correct_double_user_id(users,row),axis=1).apply(str)

        # Per-part number of ratings and sum of ratings; sums are kept so that parts can be added together
        temp_grouped_on_beer = temp[beer_keys + ["user_id","rating"]].groupby(by=beer_keys).agg({"user_id":"count","rating":"sum"}).reset_index()
        temp_grouped_on_style = temp[style_keys + ["user_id","rating"]].groupby(by=style_keys).agg({"user_id":"count","rating":"sum"}).reset_index()
        beer_most_drinked_by_country = pd.concat([beer_most_drinked_by_country, temp_grouped_on_beer]).groupby(beer_keys).sum().reset_index()
        style_most_drinked_by_country = pd.concat([style_most_drinked_by_country, temp_grouped_on_style]).groupby(style_keys).sum().reset_index()

    print("Calculating most rated and best rated beers and styles...")
    #We renormalize the averages that were aggregated from all parts of the dataset
    #(vectorized division: same result as a row-wise apply, without the per-row overhead)
    beer_most_drinked_by_country["normalized_rating"] = beer_most_drinked_by_country["rating"] / beer_most_drinked_by_country["user_id"]
    style_most_drinked_by_country["normalized_rating"] = style_most_drinked_by_country["rating"] / style_most_drinked_by_country["user_id"]

    #Standardize column names before saving work (renamed by name rather than by position)
    standard_names = {"user_id": "count", "rating": "cumulated_rating"}
    beer_most_drinked_by_country = beer_most_drinked_by_country.rename(columns=standard_names)
    style_most_drinked_by_country = style_most_drinked_by_country.rename(columns=standard_names)

    most_reviewed_beer = beer_most_drinked_by_country.groupby(by="location").apply(most_rated).reset_index(drop=True)
    favorite_beer = beer_most_drinked_by_country.groupby(by="location").apply(lambda df : best_rating(df,threshold,"beer_name")).reset_index(drop=True)

    most_reviewed_style = style_most_drinked_by_country.groupby(by="location").apply(most_rated).reset_index(drop=True)
    favorite_style = style_most_drinked_by_country.groupby(by="location").apply(lambda df : best_rating(df,threshold,"style")).reset_index(drop=True)
    print("Success!")
    return most_reviewed_beer,favorite_beer,most_reviewed_style,favorite_style

SAT_helpers.py

In [None]:

# Beer type label (as found in the SAT beer list) -> style name used for the RateBeer dataset
RB_style_dict = {
    "IPA": "India Pale Ale (IPA)",
    "Blanche": "Belgian Ale",
    "White IPA": "India Pale Ale (IPA)",
    "Sour": "Sour/Wild Ale",
    "Blonde": "Golden Ale/Blond Ale",
    "New England IPA": "American Pale Ale",
    "Imperial Stout": "Imperial Stout",
    "Berliner Weisse": "Berliner Weisse",
    "Ambrée": "Amber Ale",
    "Pale Ale": "English Pale Ale",
}

# French country name (as found in the SAT beer list) -> English location name used for the RateBeer dataset
# NOTE(review): unlike BA_countries_dict, there is no "Belgique" entry here - confirm this is intended
RB_countries_dict = {
    "Royaume-Uni": "England",
    "Suisse": "Switzerland",
    "Norvège": "Norway",
    "Allemagne": "Germany",
    "Pays-Bas": "Netherlands",
    "Pologne": "Poland",
    "Espagne": "Spain",
    "France": "France",
}

# Beer type label (as found in the SAT beer list) -> style name used for the BeerAdvocate dataset
BA_style_dict = {
    "IPA": "English India Pale Ale (IPA)",
    "Blanche": "Belgian IPA",
    "White IPA": "American Pale Wheat Ale",
    "Blonde": "Belgian IPA",
    "Lambic": "Lambic - Fruit",
    # NOTE(review): "Sour" is mapped to a bitter (ESB) style - confirm this is the intended closest match
    "Sour": "Extra Special / Strong Bitter (ESB)",
    "New England IPA": "American IPA",
    "Imperial Stout": "American Double / Imperial Stout",
    "Berliner Weisse": "Berliner Weissbier",
    "Ambrée": "American Amber / Red Ale",
    "Pale Ale": "American Pale Ale (APA)",
    "Imperial IPA": "American Double / Imperial IPA",
}

# French country name (as found in the SAT beer list) -> English location name used for the BeerAdvocate dataset
BA_countries_dict = {
    "Royaume-Uni": "England",
    "Suisse": "Switzerland",
    "Norvège": "Norway",
    "Allemagne": "Germany",
    "Pays-Bas": "Netherlands",
    "Pologne": "Poland",
    "Espagne": "Spain",
    "France": "France",
    "Belgique": "Belgium",
}

def generate_automatic_beer_matches(website,matched_dataset,corrected_beers_df):
    """Generates a dataframe with the items/beers of a given 'website' dataset that are automatically matched with SAT beers.
    A candidate is an automatic match when its alcohol content equals the one of the SAT beer ('alcool' column)
    and its similarity with the SAT beer is strictly greater than 0.8. Only the first match per SAT beer is kept.

    Parameters
    ----------
    website         (string)     :  Name of the dataset. Either 'RateBeer' or 'BeerAdvocate'.
    matched_dataset (dataframe)  :  Dataframe with all matches between SAT beers and dataset
    corrected_beers_df (dataframe): Dataframe with bias-corrected ratings for each beer.
                                    Currently not used by the matching; kept so that existing calls keep working.
    Returns
    -------
    (dataframe) : dataframe of all SAT beers that found a reasonable match in the 'website' dataframe
    (dataframe) : dataframe of all SAT beers that have not found a reasonable match in the 'website' dataframe
    (dataframe) : rows of 'matched_dataset' for SAT beers without an automatic match. Used for manual matching in following step of the pipeline

    Raises
    ------
    ValueError : if 'website' is not one of the supported datasets
    """
    acronyms = {"RateBeer": "RB", "BeerAdvocate": "BA"}
    if website not in acronyms:
        raise ValueError(f"Unknown website '{website}': expected one of {list(acronyms)}")
    acronym = acronyms[website]

    SAT_beers = read_data.fetch_satellite_df()
    SAT_match_candidates = matched_dataset

    match_columns = ["nom",f"{acronym}_beer_name",f"{acronym}_avg",f"{acronym}_abv",f"{acronym}_similarity",f"{acronym}_brewery_name",f"{acronym}_style",f"{acronym}_beer_id"]
    same_abv = SAT_match_candidates["alcool"] == SAT_match_candidates[f"{acronym}_abv"]
    similar_enough = SAT_match_candidates[f"{acronym}_similarity"] > 0.8
    mask = same_abv & similar_enough
    automatic_matches = SAT_match_candidates.loc[mask, match_columns].drop_duplicates(subset="nom", keep='first')

    # Everything that did not get an automatic match is left for the manual matching step
    matched_names = automatic_matches["nom"].unique()
    not_matched = SAT_beers[~SAT_beers["nom"].isin(matched_names)]
    top5_for_manual_matching = SAT_match_candidates[~SAT_match_candidates["nom"].isin(matched_names)]
    return automatic_matches,not_matched, top5_for_manual_matching



def prepare_features(website,matched_dataset,corrected_beers_df):
    """ Constructs feature vectors for beers.
        These features are used for training a model on rating estimation and/or to estimate ratings
        . Feature vectors consists of:
        - Alcohol content (float),
        - dummy variables for country of origin of the brewery (int)
        - dummy variables for the beer style

        The SAT beers to estimate are the ones whose name ('nom') is not found in 'matched_dataset'.

    Parameters
    ----------
    website         (string)     :  Name of the dataset. Either 'RateBeer' or 'BeerAdvocate'

    matched_dataset (dataframe)  : dataframe of all the beers sold on SAT that were found
                on the dataset corresponding with 'website'

    corrected_beers_df (dataframe) :  Dataframe with bias-corrected ratings for each beer


    Returns
    -------
    (dataframe) : dataframe without the rating, but with abv (alcohol content) value and
                dummy variables for all considered features. Used for estimating ratings of SAT beers
    (dataframe) : dataframe with the rating ('avg') given in the 'website' dataset and dummy variables. Used to train the model
    (dataframe) : SAT beers without a match, with 'type' and 'from' translated to the naming of the 'website' dataset
    (dataframe) : training features before dummy encoding (columns 'abv', 'location', 'style', 'avg')

    Raises
    ------
    ValueError : if 'website' is not one of the supported datasets
    KeyError   : if a SAT beer has a type or country that has no translation for 'website'

    """
    if website == "RateBeer":
        style_dict = {
            "IPA" : 'India Pale Ale (IPA)',
            "Blanche" : "Belgian Ale",
            "White IPA": "India Pale Ale (IPA)",
            "Sour": 'Sour/Wild Ale',
            "Blonde" : 'Golden Ale/Blond Ale',
            "New England IPA" : "American Pale Ale",
            "Imperial Stout" : "Imperial Stout",
            "Berliner Weisse" : 'Berliner Weisse',
            "Ambrée": 'Amber Ale',
            "Pale Ale" : 'English Pale Ale'
        }
        countries_dict = {
            'Royaume-Uni' : 'England',
            "Suisse" : "Switzerland",
            "Norvège": "Norway",
            "Allemagne": 'Germany',
            "Pays-Bas" : "Netherlands",
            "Pologne" : "Poland",
            "Espagne" : 'Spain',
            "France" : "France"
        }
    elif website == "BeerAdvocate":
        style_dict = {
            "IPA" : 'English India Pale Ale (IPA)',
            "Blanche" : 'Belgian IPA',
            "White IPA": 'American Pale Wheat Ale',
            "Blonde" : 'Belgian IPA',
            "Lambic" : 'Lambic - Fruit',
            "Sour": 'Extra Special / Strong Bitter (ESB)',
            "New England IPA" : 'American IPA',
            "Imperial Stout" : 'American Double / Imperial Stout',
            "Berliner Weisse" : 'Berliner Weissbier',
            "Ambrée": 'American Amber / Red Ale',
            "Pale Ale" :  'American Pale Ale (APA)',
            "Imperial IPA" :  'American Double / Imperial IPA'
        }
        countries_dict = {
            'Royaume-Uni' : 'England',
            "Suisse" : "Switzerland",
            "Norvège": "Norway",
            "Allemagne": 'Germany',
            "Pays-Bas" : "Netherlands",
            "Pologne" : "Poland",
            "Espagne" : 'Spain',
            "France" : "France",
            "Belgique" : "Belgium"
        }
    else:
        raise ValueError(f"Unknown website '{website}': expected 'RateBeer' or 'BeerAdvocate'")
    SAT_beers = read_data.fetch_satellite_df()

    # Explicit copy: the columns below are overwritten, which must not happen on a slice of SAT_beers
    beers_to_predict = SAT_beers.loc[~SAT_beers["nom"].isin(matched_dataset["nom"])].copy()
    beers_to_predict["type"] = beers_to_predict["type"].apply(lambda x : style_dict[x])
    beers_to_predict["from"] = beers_to_predict["from"].apply(lambda x : countries_dict[x])

    # Beers without a rating cannot be used for training; remaining missing values are replaced by 0
    features_for_traning = corrected_beers_df[["abv","location","style","avg"]].dropna(subset=["avg"]).fillna(0)

    SAT_features = beers_to_predict[["alcool","from","type"]].copy()
    SAT_features.columns = ["abv","location","style"]

    # SAT beers are appended below the training rows before dummy encoding, then recovered with tail()
    sat_beers_to_rate = pd.concat([features_for_traning[["abv","location","style"]],SAT_features],axis=0)
    sat_beers_to_rate_with_dummies = pd.get_dummies(sat_beers_to_rate,columns=["style","location"])
    features = pd.get_dummies(features_for_traning,columns=["style","location"])
    return sat_beers_to_rate_with_dummies.tail(len(beers_to_predict)), features,beers_to_predict, features_for_traning


def randomforest_sat_beers_ratings(features_to_train,features_to_estimate,random_state=None):
    """ Trains a RandomForestRegressor with 'features_to_train' in order to estimate ratings of beers corresponding to 'features_to_estimate'
    The model is fitted on 70% of 'features_to_train'; the remaining 30% are used to compute the r2 score.

    Parameters
    ----------
    features_to_train    (dataframe)  :  dataframe of features used to train the dataset. Column 'avg' holds the labels.

    features_to_estimate (dataframe)  : dataframe of features used to estimate beer ratings of beers without a match.

    random_state         (int or None): seed forwarded to the train/test split and to the regressor.
                                        None (default) keeps the results non-deterministic.

    Returns
    -------
    (float) : r2 score of the regression performed
    (np.array) : array with predicted ratings

    """
    # The label column is removed by name, so that it can never end up among the features
    X = features_to_train.drop(columns="avg")
    y = features_to_train["avg"]
    regressor = RandomForestRegressor(random_state=random_state)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)
    regressor.fit(X_train,y_train)
    y_fitted = regressor.predict(X_test)
    r2_score_result = r2_score(y_test,y_fitted)
    if isinstance(features_to_estimate, pd.DataFrame):
        # Align with the training columns: dummy columns unseen during training are dropped,
        # dummy columns missing from the beers to estimate are filled with 0
        features_to_estimate = features_to_estimate.reindex(columns=X.columns, fill_value=0)
    predictions = regressor.predict(features_to_estimate)
    return r2_score_result, predictions


def save_and_display_sat_ratings(website,predictions,matching_results,beers_to_predict,training_set):
    """
    saves rating estimation results to a csv and displays the full set of ratings of sat beers.
    Ratings correspond to :
    - Ratings that were automatically matched with generate_automatic_beer_matches()
    - Ratings that were manually matched
    - Ratings that were estimated with a regressor model with randomforest_sat_beers_ratings()

     Parameters
    ----------
    website          (string)    : Name of the dataset. Either 'RateBeer' or 'BeerAdvocate'.

    predictions      (np.array)  : Array of predicted ratings for beers without a match in the 'website' dataframe.

    matching_results (dataframe) : Ratings for beers that found a match in 'website' dataframe.

    beers_to_predict (dataframe) : SAT beers without a match. Side effect: a 'predictions' column is added to this dataframe.

    training_set     (dataframe) : dataframe of features used to train the dataset, with columns 'abv', 'location' and 'avg'.

    Returns
    -------
    (dataframe) : SAT beers sorted by decreasing rating ('avg'), as written to the csv file

    Raises
    ------
    ValueError : if 'website' is not one of the supported datasets
    """
    acronyms = {"BeerAdvocate": "BA", "RateBeer": "RB"}
    if website not in acronyms:
        raise ValueError(f"Unknown website '{website}': expected one of {list(acronyms)}")
    acronym = acronyms[website]

    SAT_beers = read_data.fetch_satellite_df()
    beers_to_predict["predictions"] = predictions
    #III.5. As a sanity check and to have an alternative for our model, we naively compute averages
    #of beers that come from the same country and have the same ABV. We ignore information about beer type
    #in order to have averages over bigger sets (otherwise, some combinations of (origin,abv,type) would have a single element)
    # NOTE(review): the aggregation yields two-level columns such as ('avg', 'mean'); the merge below relies on
    # pandas flattening them to tuples - verify against the pandas version in use
    naive_average = training_set.groupby(by=["abv","location"]).agg({"mean"})
    SAT_ratings = beers_to_predict.merge(naive_average,how="left",left_on=["alcool","from"],right_on=["abv","location"])
    SAT_beers = SAT_beers.merge(matching_results[["nom",f"{acronym}_avg",f"{acronym}_beer_id"]],how="left",on="nom")
    SAT_results = SAT_beers.merge(SAT_ratings[["nom","predictions",('avg', 'mean')]],how="left",left_on="nom",right_on="nom")
    # Beers without a matched rating fall back to the model prediction.
    # Assigned back explicitly: an inplace fillna on a column selection is not guaranteed to update the dataframe
    SAT_results[f'{acronym}_avg'] = SAT_results[f'{acronym}_avg'].fillna(SAT_results['predictions'])
    SAT_results.rename({f'{acronym}_avg' : 'avg'},inplace=True,axis=1)
    SAT_results.drop_duplicates(subset="nom", keep='first', inplace=True, ignore_index=False)
    SAT_results.sort_values(by="avg",ascending=False,inplace=True)
    # NOTE(review): lowercase "data/" while other paths of the project use "DATA/" - confirm the target directory
    SAT_results.to_csv(f"data/predicted_SAT_{acronym}_sorted.csv",index=True)
    display(SAT_results[["nom","type","brasseur","avg"]])

    return SAT_results