In [1]:
# import all the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import regex as re
import os
import warnings
import spacy

from textblob import TextBlob
from IPython.display import display, HTML, Markdown

from sklearn.feature_extraction.text import CountVectorizer

import string
import nltk
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

from wordcloud import WordCloud
from collections import Counter

warnings.filterwarnings('ignore') # remove the warnings

In [2]:
def load_txt_file(website, Nb_files):
    if website == 'BeerAdvocate':
        # Specify the directory containing the text files 
        # Need to be outside of the repo folder and might need to change the path according to one's unique folders configuration
        directory = './../../dataset_BeerReviews/BeerAdvocate/ratings_split_BA/'
    elif website == 'RateBeer':
        directory = './../../dataset_BeerReviews/RateBeer/ratings_split_RB/'

    # Initialize an empty list to store DataFrames
    dfs = []

    # Loop through the files in reverse order (ratings-5.txt to ratings-1.txt)
    for i in range(Nb_files, 0, -1):
        file_name = f'ratings-{i}.txt'
        file_path = os.path.join(directory, file_name)

        with open(file_path, 'r') as f:
            text = f.read()

        # Remove double quotes at the beginning of each line
        data = re.sub('"', '', text)
        data = re.sub(r'^"', '', data, flags=re.MULTILINE)

        # Split the text into individual beer reviews
        beer_reviews = data.split('beer_name')

        # Extract the beer information from each review
        beer_data = []
        for review in beer_reviews:
            beer_info = {}
            for line in [entry.split(':', 1) for entry in review.split('\n') if ':' in entry]:# and 'text' not in entry]:
                if line:  # Check if the list is not empty
                    key, value = line[0].strip(), line[1].strip()
                    beer_info[key] = value
            beer_data.append(beer_info)

        # Convert the beer data into a DataFrame
        df = pd.DataFrame(beer_data)

        # Append the DataFrame to the list
        dfs.append(df)

    # Concatenate all DataFrames into a single DataFrame
    final_df = pd.concat(dfs, ignore_index=True)

    # Rename the columns depending on the website
    if website == 'BeerAdvocate':
        final_df.columns = [
            'beer_name',
            'beer_id',
            'brewery_name',
            'brewery_id',
            'style',
            'abv',
            'date',
            'user_name',
            'user_id',
            'appearance',
            'aroma',
            'palate',
            'taste',
            'overall',
            'rating',
            'text',
            'review'] # additional column compared to RB
    else:
        final_df.columns = [
            'beer_name',
            'beer_id',
            'brewery_name',
            'brewery_id',
            'style',
            'abv',
            'date',
            'user_name',
            'user_id',
            'appearance',
            'aroma',
            'palate',
            'taste',
            'overall',
            'rating',
            'text']
    return final_df

def sample_data(df, ratio):
    # sample the data to make it even smaller
    return df.sample(frac = ratio, random_state=42)

In [3]:
# Load data : for faster processing, we will only load 1 file out of the whole dataset
ratings_BA = load_txt_file('BeerAdvocate', 1)
ratings_RB = load_txt_file('RateBeer', 1)

In [25]:
# Sample data to make it even smaller AND RANDOMIZED (not randomized in the initial txt splitting process)
ratings_BA = sample_data(df = ratings_BA, ratio = 0.1)
ratings_RB = sample_data(df = ratings_RB, ratio = 0.1)

In [24]:
# BeerAdvocate
beers_BA = pd.read_csv("./dataset_BeerReviews/BeerAdvocate/beers.csv")
breweries_BA = pd.read_csv("./dataset_BeerReviews/BeerAdvocate/breweries.csv")
users_BA = pd.read_csv("./dataset_BeerReviews/BeerAdvocate/users.csv")

# RateBeer
beers_RB = pd.read_csv("./dataset_BeerReviews/RateBeer/beers.csv")
breweries_RB = pd.read_csv("./dataset_BeerReviews/RateBeer/breweries.csv")
users_RB = pd.read_csv("./dataset_BeerReviews/RateBeer/users.csv")

# matched_beer_data
beers_matched = pd.read_csv("./dataset_BeerReviews/matched_beer_data/beers.csv", header=1)
breweries_matched = pd.read_csv("./dataset_BeerReviews/matched_beer_data/breweries.csv", header = 1)
ratings_matched = pd.read_csv("./dataset_BeerReviews/matched_beer_data/ratings.csv", header=1)
users_approx = pd.read_csv("./dataset_BeerReviews/matched_beer_data/users_approx.csv", header=1)
users_matched = pd.read_csv("./dataset_BeerReviews/matched_beer_data/users.csv", header=1)

In [17]:
# Merge the dataframes
display(ratings_matched)

Unnamed: 0,abv,appearance,aroma,beer_id,beer_name,brewery_id,brewery_name,date,overall,palate,...,brewery_name.1,date.1,overall.1,palate.1,rating.1,style.1,taste.1,text.1,user_id.1,user_name.1
0,11.3,4.50,4.50,645,Trappistes Rochefort 10,207,Brasserie de Rochefort,1324810800,5.0,4.50,...,Brasserie Rochefort,1387710000,19.0,4.0,4.6,Abt/Quadrupel,9.0,a) Geruch malzig-schwer-sÃ¼Ã. Riecht schon ...,83106,Erzengel
1,5.0,,,28191,Myanmar Lager Beer,9369,Myanmar Brewery and Distillery,1322650800,,,...,Myanmar Brewery and Distillery,1322564400,6.0,2.0,1.7,Pale Lager,4.0,"Can. Weak and watery, not the best beer of the...",91324,visionthing
2,5.0,3.50,3.50,57911,Cantillon Tyrnilambic Baie D’Argousier Lambic,388,Brasserie Cantillon,1344074400,4.0,4.00,...,Cantillon,1353582000,17.0,4.0,4.1,Lambic Style - Fruit,8.0,"Bottle @ One Pint Pub, Helsinki. Originally ra...",98624,tiong
3,5.0,4.00,3.50,57913,Cantillon Pikkulinnun Viskilambic,388,Brasserie Cantillon,1344074400,4.0,4.00,...,Cantillon,1416222000,16.0,4.0,4.1,Lambic Style - Unblended,9.0,"Draught @Â Pikkulintu, Helsinki, Finland. A pr...",98624,tiong
4,6.0,4.00,4.00,81125,Drie Fonteinen Oude Geuze - Armand & Tommy,2216,Brouwerij 3 Fonteinen,1346234400,4.0,4.00,...,Brouwerij 3 Fonteinen,1345284000,16.0,4.0,4.0,Lambic Style - Gueuze,8.0,750ml bottleBottling date: 2011/02/17 - Pours ...,98624,tiong
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21959,8.5,4.50,3.50,28030,Valeir Divers,3463,Brouwerij Contreras,1323946800,4.0,4.00,...,Contreras,1323946800,15.0,4.0,3.7,Abbey Tripel,7.0,Valeir Divers 33cl bottle from www.belgianbeer...,137922,TheBeerWatcher
21960,8.0,4.25,4.25,5057,Fantôme Saison,738,Brasserie Fantôme,1484046000,4.0,4.00,...,Brasserie Fantôme,1487502000,16.0,4.0,4.0,Saison,8.0,.................................................,394232,jonj
21961,5.2,,,349,Jupiler,134,Brasserie Piedboeuf,1393239600,,,...,Brasserie Piedboeuf (InBev),1393153200,1.0,1.0,1.2,Pale Lager,2.0,"Biere, foin, pain grillÃ©, terreux, banane, ca...",304802,maxilouis
21962,6.5,3.00,3.00,127162,Redenaar,32560,Brouwerij d'Oude Maalderij,1419764400,3.0,2.75,...,dOude Maalderij,1397037600,17.0,3.0,3.9,Belgian Ale,8.0,This is the first blond ale from this brewery....,276666,simoen


In [18]:
display(ratings_BA)

Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,abv,date,user_name,user_id,appearance,aroma,palate,taste,overall,rating,text,review
43736,"Hello, My Name Is Ingrid",67199,BrewDog,16315,American Double / Imperial IPA,8.2,1370426400,aztraz,aztraz.671249,,,,,,3.50,,False
273843,Solstice D'été Aux Cerises,73716,Brasserie Dieu du Ciel!,1141,Berliner Weissbier,6.5,1444644000,hopsolutely,hopsolutely.513175,4.0,4.0,3.5,4.0,3.75,3.90,Enjoying with thanks to Phyl21ca.Rarely does a...,True
169417,O'Hanlons Organic Rye,9123,O'Hanlon Brewing Co. Ltd.,1533,Rye Beer,5.0,1214733600,aerozeppl,aerozeppl.101084,3.0,3.0,4.0,3.0,3.0,3.10,A: 2 Finger pour. White head with some piting....,True
25783,Fraoch Heather Ale,245,Williams Brothers Brewing Company,12142,Scottish Gruit / Ancient Herbed Ale,5.0,1375869600,ekalb31,ekalb31.747152,,,,,,3.50,,False
90288,Samuel Smith's Pure Brewed Organic Lager Beer,778,Samuel Smith Old Brewery (Tadcaster),113,Euro Pale Lager,5.0,1160560800,jasonjlewis,jasonjlewis.74038,4.0,4.5,4.0,4.0,4.0,4.12,This beer is really a no frills beer to relax ...,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268479,Péché Mortel,10325,Brasserie Dieu du Ciel!,1141,American Double / Imperial Stout,9.5,1310032800,gskitt,gskitt.386412,5.0,4.5,5.0,5.0,5.0,4.88,In Montreal in the brewery. Very excited. Pour...,True
156544,Belgian Pale Ale (White Wine BA With Brett),148433,Brodie's,22276,Belgian Pale Ale,5.0,1432375200,PlutonowyManiek,plutonowymaniek.897406,4.0,3.5,4.0,4.0,4.0,3.88,"White foam, maybe not great, is correct.The ar...",True
352051,Blackstone Porter,46769,Driftwood Brewery,18858,English Porter,6.0,1359716400,rodrigueZflyingf0x,rodriguezflyingf0x.680444,,,,,,4.00,,False
347540,Mountain Monk,174484,The Grizzly Paw Brewing Company,2587,Belgian Strong Dark Ale,6.4,1432893600,biboergosum,biboergosum.168458,3.5,3.5,3.75,3.5,3.5,3.53,"20oz pint at the brewpub in Canmore. Aaaah, A...",True


In [47]:
# Merge the dataframes
print(ratings_matched['user_name'].loc[ratings_matched['beer_id'] == 349])

463      BlackHaddock
2096      Huhzubendah
4793        chinchill
5641              SVD
6564        drpimento
9962           stevoj
12334         janubio
16987     BeerBelcher
21330         WillemG
21961       Maxilouis
Name: user_name, dtype: object


In [46]:
print(ratings_matched['user_name.1'].loc[ratings_matched['beer_id'] == 349])

463      BlackHaddock
2096      Huhzubendah
4793        chinchill
5641              SVD
6564        drpimento
9962           stevoj
12334         janubio
16987     BeerBelcher
21330         WillemG
21961       maxilouis
Name: user_name.1, dtype: object


In [50]:
print(ratings_BA['user_name'].loc[ratings_BA['beer_id'] == 645])

Series([], Name: user_name, dtype: object)


In [51]:
ratings_matched.columns

Index(['abv', 'appearance', 'aroma', 'beer_id', 'beer_name', 'brewery_id',
       'brewery_name', 'date', 'overall', 'palate', 'rating', 'review',
       'style', 'taste', 'text', 'user_id', 'user_name', 'abv.1',
       'appearance.1', 'aroma.1', 'beer_id.1', 'beer_name.1', 'brewery_id.1',
       'brewery_name.1', 'date.1', 'overall.1', 'palate.1', 'rating.1',
       'style.1', 'taste.1', 'text.1', 'user_id.1', 'user_name.1'],
      dtype='object')

In [89]:
ratings_matched_test = ratings_matched.copy(deep=True)
ratings_matched_test['beer_id.1'] = ratings_matched_test['beer_id']
ratings_matched_test['brewery_id.1'] = ratings_matched_test['brewery_id']
ratings_matched_test['beer_name.1'] = ratings_matched_test['beer_name']
ratings_matched_test['brewery_name.1'] = ratings_matched_test['brewery_name']
ratings_matched_test_1 = ratings_matched_test.iloc[:, :17]
ratings_matched_test_1 = ratings_matched_test_1.drop(['review'], axis=1)
ratings_matched_test_2 = ratings_matched_test.iloc[:, 17:]
ratings_matched_test_2.columns = ratings_matched_test_1.columns
ratings_matched_test = pd.concat([ratings_matched_test_1, ratings_matched_test_2], axis=0)
ratings_matched_test.sort_values(by=['beer_id', 'user_id'], inplace=True)
ratings_matched_test

Unnamed: 0,abv,appearance,aroma,beer_id,beer_name,brewery_id,brewery_name,date,overall,palate,rating,style,taste,text,user_id,user_name
9156,6.0,5.0,8.00,12,Bert Grant's Imperial Stout,5,Yakima Brewing Co. / Bert Grant's Ales,1049882400,15.00,4.0,4.00,Imperial Stout,8.00,What a magnificent site to these tired old eye...,46,Gusler
1186,6.0,5.0,7.00,12,Bert Grant's Imperial Stout,5,Yakima Brewing Co. / Bert Grant's Ales,1053338400,17.00,4.0,4.10,Imperial Stout,8.00,Thanks to Darkover for the trade. Considering ...,2714,Dogbrick
8941,6.0,3.0,5.00,12,Bert Grant's Imperial Stout,5,Yakima Brewing Co. / Bert Grant's Ales,1052560800,11.00,2.0,2.60,Imperial Stout,5.00,What a freaking joke. This was by far the wors...,3782,aracauna
13086,6.0,4.0,8.00,12,Bert Grant's Imperial Stout,5,Yakima Brewing Co. / Bert Grant's Ales,1079780400,14.00,3.0,3.50,Imperial Stout,6.00,Opaque black with a khaki brown head that stan...,9520,rajendra82
13032,6.0,5.0,7.00,12,Bert Grant's Imperial Stout,5,Yakima Brewing Co. / Bert Grant's Ales,1121335200,14.00,3.0,3.60,Imperial Stout,7.00,from the bottle. poured black with a small bro...,10189,walleye
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7838,4.3,3.5,3.25,292835,Shmash Equinox,49808,Kame & Kettle,1500631200,2.75,3.0,3.14,American IPA,3.25,,sammy.3853,Sammy
8106,3.5,3.0,6.00,292882,Moonlight Kettle Series (2017): Summertime Siesta,661,Muskoka Brewery,1500804000,16.00,3.0,3.50,Sour/Wild Ale,7.00,"Decent sour albeit it kettle sour, on tap at S...",11905,Sammy
8106,3.5,3.5,3.75,292882,Moonlight Kettle Series (2017): Summertime Siesta,661,Muskoka Brewery,1500804000,3.75,3.5,3.81,American Blonde Ale,4.00,,sammy.3853,Sammy
5192,5.5,3.0,7.00,293187,Single Hop Ale: Sorachi Ace,29894,Brevard Brewing Company,1500890400,12.00,3.0,3.00,India Pale Ale (IPA),5.00,On tap at the brewery taproom. Clear pale gold...,114306,chinchill
