# Pre-processing

This notebook contains pre-processing steps to create cleaned data in the `/data` folder from the raw datasets.

In [None]:
import pandas as pd
import os
import subprocess

from utils.location_extracter import extract_location

In [None]:
# Create the output tree: the parent folder first, then one sub-folder per dataset.
# exist_ok=True makes the cell safe to re-run.
for output_dir in ('data', 'data/beer_advocate', 'data/matched_beer_data', 'data/rate_beer'):
    os.makedirs(output_dir, exist_ok=True)

# Downloading data

The dataset is stored in a remote Google Drive folder. The following cell executes a Python script that downloads the files and stores them in a `raw_data` folder at the root of the repository.

In [15]:
# Run the download script, passing it the absolute path of `raw_data` under the current working directory.
%run src/scripts/download.py {os.path.join(os.getcwd(), 'raw_data')}

# Beer Advocate

## Beers

In [None]:
def preprocess_ba_beers(df_beers: pd.DataFrame, df_beers_styles: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw BeerAdvocate beers table and attach the style lookup.

    The input frames are not modified.

    Args:
        df_beers: Raw beers table, using the raw column names listed in the
            rename mapping below plus `beer_id`, `brewery_id`, `beer_name`
            and `zscore`.
        df_beers_styles: Lookup table with a `beer_style` column; its other
            columns are appended to each beer.

    Returns:
        A new frame with the renamed columns in a fixed order, left-joined
        with `df_beers_styles` on `beer_style`.

    Raises:
        KeyError: If one of the expected columns is missing.
    """
    # rename() returns a new frame, so the caller's DataFrame keeps its raw column names
    df_beers = df_beers.rename(columns={
        'style': 'beer_style',
        'nbr_ratings': 'ratings_count',
        'nbr_reviews': 'reviews_count',
        'avg': 'ratings_average',
        'ba_score': 'ratings_ba_score',
        'bros_score': 'ratings_bros_score',
        'abv': 'beer_alcohol_by_volume',
        'avg_computed': 'ratings_average_computed',
        'nbr_matched_valid_ratings': 'matching_ratings_count',
        'avg_matched_valid_ratings': 'matching_ratings_average',
    })

    # Explicit selection fixes the column order and drops everything else (e.g. `brewery_name`)
    df_beers = df_beers[[
        'beer_id', 'brewery_id', 'beer_name', 'beer_style',
        'ratings_count', 'reviews_count', 'ratings_average',
        'ratings_ba_score', 'ratings_bros_score', 'beer_alcohol_by_volume',
        'ratings_average_computed', 'zscore',
        'matching_ratings_count', 'matching_ratings_average',
    ]]

    # Left join: beers whose style is absent from the lookup are kept, with missing values
    return df_beers.merge(df_beers_styles, on='beer_style', how='left')

In [None]:
# Inputs: the style lookup table shipped with the repository and the raw BeerAdvocate beers
df_ba_beers_styles = pd.read_csv('src/utils/beers_styles.csv')
df_ba_beers = pd.read_csv('raw_data/beer_advocate/beers.csv')

# Clean the beers, attach the style lookup, then persist without the index
df_ba_beers = preprocess_ba_beers(df_ba_beers, df_ba_beers_styles)
df_ba_beers.to_csv('data/beer_advocate/beers.csv', index=False)

## Breweries

In [None]:
def preprocess_ba_breweries(df_breweries: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw BeerAdvocate breweries table.

    The input frame is not modified.

    Args:
        df_breweries: Raw breweries table with the columns `id`, `location`,
            `name` and `nbr_beers`.

    Returns:
        A new frame with renamed columns, where `brewery_location` is
        replaced by the three columns produced by `extract_location`:
        `brewery_country`, `brewery_region` and `brewery_country_code`.
    """
    # rename() returns a new frame, so every assignment below leaves the caller's data untouched
    df_breweries = df_breweries.rename(columns={
        'id': 'brewery_id',
        'location': 'brewery_location',
        'name': 'brewery_name',
        'nbr_beers': 'brewery_beers_count'
    })

    # The location of two breweries was missing; it was looked up manually
    df_breweries.loc[df_breweries['brewery_id'] == 18989, 'brewery_location'] = 'United States'
    df_breweries.loc[df_breweries['brewery_id'] == 11016, 'brewery_location'] = 'Austria'

    df_breweries['brewery_country'], df_breweries['brewery_region'], df_breweries['brewery_country_code'] = extract_location(df_breweries['brewery_location'])

    # The raw location string is no longer needed once it has been split
    return df_breweries.drop(columns=['brewery_location'])

In [None]:
# Load the raw BeerAdvocate breweries, clean them, and persist the result without the index
df_ba_breweries = preprocess_ba_breweries(
    pd.read_csv('raw_data/beer_advocate/breweries.csv')
)
df_ba_breweries.to_csv('data/beer_advocate/breweries.csv', index=False)

## Users

In [None]:
def preprocess_ba_users(df_users: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw BeerAdvocate users table.

    The input frame is not modified.

    Args:
        df_users: Raw users table with at least the columns `joined`,
            `location`, `nbr_ratings` and `nbr_reviews`.

    Returns:
        A new frame with renamed columns, where `user_location` is replaced
        by the three columns produced by `extract_location`: `user_country`,
        `user_region` and `user_country_code`.
    """
    # rename() returns a new frame, so the caller's DataFrame keeps its raw column names
    df_users = df_users.rename(columns={
        'joined': 'user_created_date',
        'location': 'user_location',
        'nbr_ratings': 'user_ratings_count',
        'nbr_reviews': 'user_reviews_count',
    })

    df_users['user_country'], df_users['user_region'], df_users['user_country_code'] = extract_location(df_users['user_location'])

    # The raw location string is no longer needed once it has been split
    return df_users.drop(columns=['user_location'])

In [None]:
# Load the raw BeerAdvocate users, clean them, and persist the result without the index
df_ba_users = preprocess_ba_users(
    pd.read_csv('raw_data/beer_advocate/users.csv')
)
df_ba_users.to_csv('data/beer_advocate/users.csv', index=False)

## Ratings & Reviews

On BeerAdvocate, users can either submit a *rating* or a *review*. The dataset contains two separate text files `ratings.txt` and `reviews.txt`. Ratings and reviews are formatted as (key, value) pairs on each line of a plain text file with empty lines to split different ratings/reviews. 

The program `txt_to_csv.cpp` is a C++ program that transforms such a plain text file into a `.csv` file, which can be analyzed more easily using Python and Pandas. The following cell compiles the program with a C++ compiler and runs it on the `ratings.txt` file only. Indeed, after analyzing the overlap between the two files, we found that `reviews.txt` is a subset of `ratings.txt`. Thus, we discard `reviews.txt` and only keep `ratings.txt`.

In [None]:
# Compile the converter. check=True raises CalledProcessError if the compiler fails,
# instead of silently continuing with a missing or stale binary. An argument list
# (no shell) avoids depending on shell parsing of the command line.
subprocess.run(
    ['g++', '-std=c++17', '-o', 'src/scripts/txt_to_csv', 'src/scripts/txt_to_csv.cpp'],
    check=True,
)

# Convert the BeerAdvocate ratings. A non-zero exit code stops the notebook here
# rather than letting a later cell read an incomplete CSV.
subprocess.run([
    'src/scripts/txt_to_csv',
    'raw_data/beer_advocate/ratings.txt',
    'raw_data/beer_advocate/ratings.csv'
], check=True)

In [None]:
def preprocess_ba_ratings(df_ratings: pd.DataFrame) -> pd.DataFrame:
    """Keep only the columns of the BeerAdvocate ratings that are used downstream.

    The input frame is not modified.

    Args:
        df_ratings: Ratings table produced by the text-to-CSV converter.

    Returns:
        A new frame restricted to the identifier, date, score and text
        columns, in a fixed order.

    Raises:
        KeyError: If one of the selected columns is missing.
    """
    # The selection already excludes the denormalised name/style/abv columns, so
    # no in-place drop is needed and the caller's DataFrame stays intact.
    return df_ratings[[
        'user_id', 'beer_id', 'brewery_id', 'date', 'review', 'rating',
        'overall', 'aroma', 'appearance', 'palate', 'taste', 'text',
    ]]

In [None]:
# Parse the CSV generated by the converter and run the ratings pre-processing on it
df_ba_ratings = preprocess_ba_ratings(
    pd.read_csv('raw_data/beer_advocate/ratings.csv')
)

For several analyses, we need the average score given by each user, over all beers and per beer style, at the time of each rating. We also need the average rating of the rated beer at the time of each rating. The function `get_past_ratings_count_and_average` computes the count and average of ratings according to the following parameters:

|Parameter|Description|
|---|---|
|`sorting_columns` | Columns to sort to ensure the cumulative count and mean are done in the right order |
|`grouping_columns` | Columns forming the group within which we want to compute the count and mean |
|`name` | Prefix of the resulting columns |
|`shift` | If `True`, the count and mean will **not** include the current rating in the computation |

In [47]:
def get_past_ratings_count_and_average(df_ratings: pd.DataFrame, sorting_columns: list[str], grouping_columns: list[str], name: str, shift: bool) -> pd.DataFrame:
    """Add the running count and running mean of `rating` within each group.

    Two columns are added: `{name}_past_ratings_count` and
    `{name}_past_ratings_average`. The input frame is not modified.

    Args:
        df_ratings: Ratings table with a `rating` column. Its index is used to
            align the results, so it is assumed to be unique.
        sorting_columns: Columns to sort by (stable sort) so that the
            cumulative statistics are accumulated in the intended order.
        grouping_columns: Columns defining the groups in which the
            statistics are accumulated.
        name: Prefix of the two resulting columns.
        shift: If True, the current row is excluded: the first row of each
            group gets a count of 0 and a missing average. If False, the
            current row is included.

    Returns:
        A copy of `df_ratings`, in its original index order, with the two
        additional columns.
    """
    count_column   = f'{name}_past_ratings_count'
    average_column = f'{name}_past_ratings_average'

    # sort_values() returns a new frame; all assignments below target that copy
    df_ratings = df_ratings.sort_values(sorting_columns, kind='stable')

    # cumcount() is the number of earlier rows of the same group
    past_count = df_ratings.groupby(grouping_columns).cumcount()
    df_ratings[count_column] = past_count if shift else past_count + 1

    # The expanding mean is indexed by (group keys..., original index); dropping
    # the key levels leaves the original index, which aligns the assignment.
    group_levels = list(range(len(grouping_columns)))
    df_ratings[average_column] = (
        df_ratings.groupby(grouping_columns)['rating']
        .expanding()
        .mean()
        .reset_index(level=group_levels, drop=True)
    )

    if shift:
        # Shift inside each group so that a value never leaks from one group into
        # the next; the first row of every group becomes missing.
        df_ratings[average_column] = df_ratings.groupby(grouping_columns)[average_column].shift()

    return df_ratings.sort_index()

### Past ratings of each user

The next cell computes the number and the average of past ratings of the user that posted a given rating at the time of this rating.
$$
\begin{align*}
& \forall i \in \{0, 1, \dots, N\} \\ 
\mathcal S_i &= \{\verb|ratings[n]| | (\verb|ratings[i][user] == ratings[n][user]|) \land (\verb|ratings[i][date] > ratings[n][date]|) \} \\
\verb|count[i]| &= |\mathcal S_i| \\
\verb|average[i]| &= \frac{1}{|\mathcal S_i|} \sum_{\verb|rating| \in \mathcal S_i} \verb|rating[rating]|
\end{align*}
$$



In [48]:
# Per-user running statistics, excluding the current rating
df_ba_ratings = get_past_ratings_count_and_average(
    df_ba_ratings,
    sorting_columns=['user_id', 'date'],
    grouping_columns=['user_id'],
    name='user',
    shift=True,
)

### Past ratings of each beer

The next cell computes the number and the average of past ratings of the beer that is rated in a given rating at the time of this rating.
$$
\begin{align*}
& \forall i \in \{0, 1, \dots, N\} \\ 
\mathcal S_i &= \{\verb|ratings[n]| | (\verb|ratings[i][beer] == ratings[n][beer]|) \land (\verb|ratings[i][date] > ratings[n][date]|) \} \\
\verb|count[i]| &= |\mathcal S_i| \\
\verb|average[i]| &= \frac{1}{|\mathcal S_i|} \sum_{\verb|rating| \in \mathcal S_i} \verb|rating[rating]|
\end{align*}
$$



In [49]:
# Per-beer running statistics, excluding the current rating
df_ba_ratings = get_past_ratings_count_and_average(
    df_ba_ratings,
    sorting_columns=['beer_id', 'date'],
    grouping_columns=['beer_id'],
    name='beer',
    shift=True,
)

### Past ratings of each user for a specific beer style

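The next cell computes, for each rating, the number and the average of the ratings posted by the same user for beers of the same style as the currently rated beer. Note that the cell calls the function with `shift=False`, so the values stored here still include the current rating; the current rating is only excluded afterwards, when the per-style columns are built in the next section.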
$$
\begin{align*}
& \forall i \in \{0, 1, \dots, N\} \\ 
\mathcal S_i &= \{\verb|ratings[n]| |
    \\ & \quad\quad (\verb|ratings[i][user] == ratings[n][user]|)
    \\ & \quad\quad \land  (\verb|ratings[i][date] > ratings[n][date]|)
    \\ & \quad\quad \land (\verb|ratings[i][beer_style] == ratings[n][beer_style]|) 
\\ & \} \\
\verb|count[i]| &= |\mathcal S_i| \\
\verb|average[i]| &= \frac{1}{|\mathcal S_i|} \sum_{\verb|rating| \in \mathcal S_i} \verb|rating[rating]|
\end{align*}
$$

In [83]:
# Add global beer style to ratings.
# Any existing 'beer_global_style' column is dropped first so the cell can be re-run:
# merging twice would otherwise create '_x'/'_y' suffixed columns and the call below
# would fail on the missing 'beer_global_style' column.
df_ba_ratings = (
    df_ba_ratings
    .drop(columns=['beer_global_style'], errors='ignore')
    .merge(df_ba_beers[['beer_id', 'beer_global_style']], on='beer_id', how='left')
)

# shift=False: the per-user, per-style statistics computed here include the current rating
df_ba_ratings = get_past_ratings_count_and_average(df_ba_ratings, ['user_id', 'beer_global_style', 'date'], ['user_id', 'beer_global_style'], 'user_beer_style', shift=False)

### Past ratings of each user for all beer styles

The next cell computes the number and the average of past ratings of the user that posted a given rating at the time of this rating, for all styles of beer.

Let $\mathcal B$ be the set of beer styles.

$$
\begin{align*}
& \forall i \in \{0, 1, \dots, N\}, \forall j \in \mathcal B\\ 
\mathcal S_{i, j} &= \{\verb|ratings[n]| |
    \\ & \quad\quad (\verb|ratings[i][user] == ratings[n][user]|)
    \\ & \quad\quad \land  (\verb|ratings[i][date] > ratings[n][date]|)
    \\ & \quad\quad \land (\verb|ratings[n][beer_style] == j|) 
\\ & \} \\
\verb|count[i][j]| &= |\mathcal S_{i, j}| \\
\verb|average[i][j]| &= \frac{1}{|\mathcal S_{i, j}|} \sum_{\verb|rating| \in \mathcal S_{i, j}} \verb|rating[rating]|
\end{align*}
$$

Thus this function will create $2 \times |\mathcal B|$ new columns with the counts and averages for each style of beer.

In [86]:
def get_past_ratings_counts_all_style(df_ratings: pd.DataFrame, dummy_column: str, multiplying_columns_prefix: str) -> pd.DataFrame:
    """Build, for every rating, one count and one average column per value of `dummy_column`.

    Each row of `df_ratings` carries the count/average of its own category only
    (columns `{multiplying_columns_prefix}_past_ratings_count` and
    `{multiplying_columns_prefix}_past_ratings_average`). This function spreads
    those values over one column per category, forward-fills them within each
    user in (`user_id`, `date`) order, shifts them by one row within each user,
    and finally replaces the remaining missing values with 0.

    Args:
        df_ratings: Ratings table containing `user_id`, `date`, `beer_id`,
            `dummy_column` and the two `{multiplying_columns_prefix}_past_ratings_*`
            columns. Results are aligned on its index.
        dummy_column: Column whose distinct values each get a pair of output columns.
        multiplying_columns_prefix: Prefix of the count/average columns to spread.

    Returns:
        A frame with columns `user_past_ratings_<category>_count` and
        `user_past_ratings_<category>_average` for every category, plus
        `user_id`, `date` and `beer_id`.
    """
    # One-hot encode the categories; zeros become NA so that the product below is
    # only defined in the column of the row's own category
    beer_styles_one_hot_encoding = pd.get_dummies(df_ratings[dummy_column], dtype=int, prefix='user_past_ratings').replace(0, pd.NA)

    # Multiply the one-hot matrix by the count and by the average of each row.
    # Both products have identical column names, so the merge suffixes every
    # column with '_count' / '_average'.
    df_user_beer_style_past_ratings = pd.merge(
        beer_styles_one_hot_encoding.mul(df_ratings[f'{multiplying_columns_prefix}_past_ratings_count'],   axis=0),
        beer_styles_one_hot_encoding.mul(df_ratings[f'{multiplying_columns_prefix}_past_ratings_average'], axis=0),
        left_index=True, 
        right_index=True, 
        suffixes=('_count', '_average')
    )

    # Columns to fill/shift below: everything produced so far (substring match on the suffixes)
    modified_columns = [column for column in df_user_beer_style_past_ratings.columns if ('_count' in column or '_average' in column)]

    # Bring in user_id and date (and beer_id) by index so the values can be
    # forward-filled within groups of user_id
    df_user_beer_style_past_ratings = df_user_beer_style_past_ratings.merge(
        df_ratings[['user_id', 'date', 'beer_id']], 
        left_index=True, 
        right_index=True, 
        how='left'
    )

    # Put rows in index order first so that the stable sorts below break ties
    # (same user_id and date) in index order
    df_user_beer_style_past_ratings.sort_index(inplace=True)

    # Forward-fill the values within groups of user_id, in (user_id, date) order.
    # The result is assigned back by index alignment.
    df_user_beer_style_past_ratings[modified_columns] = df_user_beer_style_past_ratings\
        .sort_values(['user_id', 'date'], kind='stable')\
        .groupby('user_id')\
        .ffill()[modified_columns]

    # Shift the values one row forward within each user so that the counts and
    # averages do not take the current rating into account (the input columns
    # are expected to include it)
    df_user_beer_style_past_ratings[modified_columns] = df_user_beer_style_past_ratings\
        .sort_values(['user_id', 'date'], kind='stable')\
        .groupby('user_id')\
        .shift(1)[modified_columns]
    
    # Fill the remaining cells with 0: they belong to ratings occurring before
    # the user's first rating in the given category
    df_user_beer_style_past_ratings[modified_columns] = df_user_beer_style_past_ratings[modified_columns].fillna(0.0)
    
    return df_user_beer_style_past_ratings

In [None]:
# Spread the per-style statistics over one column pair per global style, then persist them
df_user_beer_style_past_ratings = get_past_ratings_counts_all_style(
    df_ratings=df_ba_ratings,
    dummy_column='beer_global_style',
    multiplying_columns_prefix='user_beer_style',
)
df_user_beer_style_past_ratings.to_csv('data/beer_advocate/user_beer_style_past_ratings.csv', index=False)

### User rating habits of different beer styles

The next cell computes the Gini index of each user with respect to the number of beers of each style they rated. The Gini index lies in $[0, 1]$, where a higher score indicates a higher heterogeneity in the styles of beer rated. A score of $0$ means that the user has only rated one style of beer.

In [102]:
def compute_gini_index(df_ratings: pd.DataFrame, df_user_beer_style_past_ratings: pd.DataFrame) -> pd.DataFrame:
    """Add a `gini_index` column equal to 1 minus the sum of squared per-style shares.

    The shares are taken row by row from the columns of
    `df_user_beer_style_past_ratings` whose name ends with `_count`. Rows whose
    counts sum to zero have undefined shares, which are replaced by 0, so
    these rows end up with a value of 1.

    Args:
        df_ratings: Frame receiving the new column; it is modified in place.
        df_user_beer_style_past_ratings: Frame holding the per-style
            `*_count` columns, aligned with `df_ratings` on the index.

    Returns:
        The same `df_ratings` object, with the `gini_index` column set.
    """
    per_style_counts = df_user_beer_style_past_ratings[
        [name for name in df_user_beer_style_past_ratings.columns if name.endswith('_count')]
    ]

    # Share of each style in the row total; 0/0 yields a missing value, mapped to 0
    row_totals = per_style_counts.sum(axis=1)
    shares = per_style_counts.div(row_totals, axis=0).fillna(0)

    squared_share_sum = (shares ** 2).sum(axis=1)
    df_ratings['gini_index'] = 1 - squared_share_sum
    return df_ratings

In [103]:
# Attach the per-rating Gini index computed from the per-style counts
df_ba_ratings = compute_gini_index(
    df_ratings=df_ba_ratings,
    df_user_beer_style_past_ratings=df_user_beer_style_past_ratings,
)

In [124]:
# Put the rows back in index order, then persist them (the index itself is not written)
df_ba_ratings = df_ba_ratings.sort_index()
df_ba_ratings.to_csv('data/beer_advocate/ratings.csv', index=False)

## Memory cleaning

In [125]:
# Release every BeerAdvocate frame created above, including the style lookup and
# the wide per-style frame, which would otherwise stay in kernel memory.
del df_ba_beers
del df_ba_beers_styles
del df_ba_breweries
del df_ba_users
del df_ba_ratings
del df_user_beer_style_past_ratings

# RateBeer

## Beers

In [126]:
def preprocess_rb_beers(df_beers: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw RateBeer beers table.

    Removes the `brewery_name` column and renames the raw columns to the
    names used in the cleaned data. The input frame is not modified.

    Args:
        df_beers: Raw beers table; it must contain `brewery_name`.

    Returns:
        A new frame without `brewery_name` and with renamed columns.
    """
    column_names = {
        'style': 'beer_style',
        'nbr_ratings': 'ratings_count',
        'avg': 'ratings_average',
        'overall_score': 'ratings_overall_score',
        'abv': 'beer_alcohol_by_volume',
        'avg_computed': 'ratings_average_computed',
        'nbr_matched_valid_ratings': 'matching_ratings_count',
        'avg_matched_valid_ratings': 'matching_ratings_average',
    }

    return (
        df_beers
        .drop(columns=['brewery_name'])
        .rename(columns=column_names)
    )

In [127]:
# Load the raw RateBeer beers, clean them, and persist the result without the index
df_rb_beers = preprocess_rb_beers(
    pd.read_csv('raw_data/rate_beer/beers.csv')
)
df_rb_beers.to_csv('data/rate_beer/beers.csv', index=False)

## Breweries

In [None]:
def preprocess_rb_breweries(df_breweries: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw RateBeer breweries table.

    Renames the raw columns and replaces the location string by the three
    columns returned by `extract_location`. The input frame is not modified.

    Args:
        df_breweries: Raw breweries table with the columns `id`, `location`,
            `name` and `nbr_beers`.

    Returns:
        A new frame with `brewery_country`, `brewery_region` and
        `brewery_country_code` instead of the raw location.
    """
    column_names = {
        'id': 'brewery_id',
        'location': 'brewery_location',
        'name': 'brewery_name',
        'nbr_beers': 'brewery_beers_count',
    }
    breweries = df_breweries.rename(columns=column_names)

    # extract_location yields three values, stored in this order
    country, region, country_code = extract_location(breweries['brewery_location'])
    breweries['brewery_country'] = country
    breweries['brewery_region'] = region
    breweries['brewery_country_code'] = country_code

    return breweries.drop(columns=['brewery_location'])

In [129]:
# Load the raw RateBeer breweries, clean them, and persist the result without the index
df_rb_breweries = preprocess_rb_breweries(
    pd.read_csv('raw_data/rate_beer/breweries.csv')
)
df_rb_breweries.to_csv('data/rate_beer/breweries.csv', index=False)

## Users

In [130]:
# NOTE(review): the same file is read again into `df_rb_users` in the cell that follows
# the `preprocess_rb_users` definition, so this read looks redundant — confirm before removing.
df_rb_users = pd.read_csv('raw_data/rate_beer/users.csv')

In [None]:
def preprocess_rb_users(df_users: pd.DataFrame) -> pd.DataFrame:
    """Deduplicate and clean the raw RateBeer users table.

    Users are sorted by `joined` and collapsed to one row per `user_id` with
    `groupby().last()`, which keeps the last non-missing value of each column
    within the group. The columns are then renamed and the location string is
    replaced by the three columns returned by `extract_location`. The input
    frame is not modified.

    Args:
        df_users: Raw users table with at least `user_id`, `joined`,
            `location` and `nbr_ratings`.

    Returns:
        A new frame with one row per `user_id`, with `user_id` as the first
        column.
    """
    column_names = {
        'joined': 'user_created_date',
        'location': 'user_location',
        'nbr_ratings': 'user_ratings_count',
    }

    # Stable sort so that rows sharing a join date keep their original relative order
    users = (
        df_users
        .sort_values('joined', kind='stable')
        .groupby('user_id')
        .last()
        .reset_index()
        .rename(columns=column_names)
    )

    # extract_location yields three values, stored in this order
    country, region, country_code = extract_location(users['user_location'])
    users['user_country'] = country
    users['user_region'] = region
    users['user_country_code'] = country_code

    return users.drop(columns=['user_location'])

In [132]:
# Load the raw RateBeer users, deduplicate and clean them, and persist the result without the index
df_rb_users = preprocess_rb_users(
    pd.read_csv('raw_data/rate_beer/users.csv')
)
df_rb_users.to_csv('data/rate_beer/users.csv', index=False)

## Ratings & Reviews

Since there are only ratings on Rate Beer, the files `ratings.txt` and `reviews.txt` are strictly identical. We discard `reviews.txt` and parse `ratings.txt` using the `txt_to_csv.cpp` utility.

In [None]:
# Convert the RateBeer ratings with the converter binary at `src/scripts/txt_to_csv`
# (it must already have been compiled). check=True raises CalledProcessError on a
# non-zero exit code instead of letting a later cell read a missing or incomplete CSV.
subprocess.run([
    'src/scripts/txt_to_csv',
    'raw_data/rate_beer/ratings.txt',
    'raw_data/rate_beer/ratings.csv'
], check=True)

In [135]:
def preprocess_rb_ratings(df_ratings: pd.DataFrame) -> pd.DataFrame:
    """Keep the most recent rating of each (user, beer) pair and the useful columns.

    When a user rated the same beer several times, only the row with the
    latest `date` is kept, as a whole. The previous implementation used
    `groupby().last()`, which takes the last non-missing value of each
    column separately and could therefore combine the scores of the latest
    rating with the text of an older one. The input frame is not modified.

    Args:
        df_ratings: Ratings table produced by the text-to-CSV converter.

    Returns:
        A new frame with one row per (`user_id`, `beer_id`), sorted by these
        two keys, restricted to the identifier, date, score and text columns.

    Raises:
        KeyError: If one of the expected columns is missing.
    """
    key_columns = ['user_id', 'beer_id']

    # Rows without a complete key cannot be attributed to a (user, beer) pair;
    # groupby() excluded them too.
    df_ratings = df_ratings.dropna(subset=key_columns)

    # Stable sort keeps the file order for equal dates; rows with a missing date
    # go first so they never win over a dated rating.
    df_ratings = df_ratings.sort_values('date', kind='stable', na_position='first')
    df_ratings = df_ratings.drop_duplicates(subset=key_columns, keep='last')

    # Same row order as the former groupby()-based output: sorted by the keys
    df_ratings = df_ratings.sort_values(key_columns, kind='stable').reset_index(drop=True)

    return df_ratings[['user_id', 'beer_id', 'brewery_id', 'date', 'rating', 'overall', 'aroma', 'appearance', 'palate', 'taste', 'text']]

In [136]:
# Parse the CSV generated by the converter, pre-process the ratings, and persist them without the index
df_rb_ratings = preprocess_rb_ratings(
    pd.read_csv('raw_data/rate_beer/ratings.csv')
)
df_rb_ratings.to_csv('data/rate_beer/ratings.csv', index=False)

## Memory cleaning

In [None]:
# Unbind the RateBeer frames so their memory can be reclaimed
del df_rb_beers, df_rb_breweries, df_rb_users, df_rb_ratings

# Matched Beer Data

## Beers

In [None]:
# The file has a two-row header; keep only the two beer id columns and the two `scores` columns
matched_beer_columns = [('ba', 'beer_id'), ('rb', 'beer_id'), ('scores', 'diff'), ('scores', 'sim')]

df_ma_beers = pd.read_csv('raw_data/matched_beer_data/beers.csv', header=[0, 1])[matched_beer_columns]
df_ma_beers.to_csv('data/matched_beer_data/beers.csv', index=False)

## Breweries

In [None]:
# The file has a two-row header; `id` is renamed on the second header level before
# selecting the two brewery id columns and the two `scores` columns
matched_brewery_columns = [('ba', 'brewery_id'), ('rb', 'brewery_id'), ('scores', 'diff'), ('scores', 'sim')]

df_ma_breweries = pd.read_csv('raw_data/matched_beer_data/breweries.csv', header=[0, 1]).rename(columns={'id': 'brewery_id'}, level=1)
df_ma_breweries = df_ma_breweries[matched_brewery_columns]
df_ma_breweries.to_csv('data/matched_beer_data/breweries.csv', index=False)

## Users

In [None]:
# The file has a two-row header; keep only the user id of each site
matched_user_columns = [('ba', 'user_id'), ('rb', 'user_id')]

df_ma_users = pd.read_csv('raw_data/matched_beer_data/users.csv', header=[0, 1])[matched_user_columns]
df_ma_users.to_csv('data/matched_beer_data/users.csv', index=False)

In [None]:
# The file has a two-row header; keep the user id of each site and the `sim` score
approx_user_columns = [('ba', 'user_id'), ('rb', 'user_id'), ('scores', 'sim')]

df_ma_users_approx = pd.read_csv('raw_data/matched_beer_data/users_approx.csv', header=[0, 1])[approx_user_columns]
df_ma_users_approx.to_csv('data/matched_beer_data/users_approx.csv', index=False)

## Ratings & Reviews

In [None]:
# The file has a two-row header; keep only the beer and user ids of each site
matched_rating_columns = [
    ('ba', 'beer_id'), ('ba', 'user_id'),
    ('rb', 'beer_id'), ('rb', 'user_id'),
]

df_ma_ratings = pd.read_csv('raw_data/matched_beer_data/ratings.csv', header=[0, 1])[matched_rating_columns]
df_ma_ratings.to_csv('data/matched_beer_data/ratings.csv', index=False)

## Memory cleaning

In [None]:
# Unbind the matched-data frames so their memory can be reclaimed
del df_ma_beers, df_ma_breweries, df_ma_users, df_ma_users_approx, df_ma_ratings