In [1]:
import os
import subprocess
from pathlib import Path

# Ask git for the top-level directory of the working tree and make it the
# current working directory, so that relative paths resolve from there.
project_root = Path(
    subprocess.check_output(
        ['git', 'rev-parse', '--show-toplevel'],
        text=True,
    ).strip()  # drop the trailing newline git prints after the path
)
os.chdir(project_root)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import math

## Looking at BeerAdvocate

In [None]:
# Load the BeerAdvocate beer table, drop exact duplicate rows, and print the
# shape before and after so any removed rows are visible.
beers = pd.read_csv('data/baseData/BeerAdvocate/beers.csv')
print(beers.shape)  # shape before de-duplication

# Reassign instead of mutating in place.
beers = beers.drop_duplicates()
print(beers.columns, beers.shape, sep='\n')

# Show 10 randomly chosen rows.
beers.sample(n=10)

(280823, 15)
Index(['beer_id', 'beer_name', 'brewery_id', 'brewery_name', 'style',
       'nbr_ratings', 'nbr_reviews', 'avg', 'ba_score', 'bros_score', 'abv',
       'avg_computed', 'zscore', 'nbr_matched_valid_ratings',
       'avg_matched_valid_ratings'],
      dtype='object')
(280823, 15)


Unnamed: 0,beer_id,beer_name,brewery_id,brewery_name,style,nbr_ratings,nbr_reviews,avg,ba_score,bros_score,abv,avg_computed,zscore,nbr_matched_valid_ratings,avg_matched_valid_ratings
184548,196073,Rout Brown Ale,38649,Lupine Brewing Company,American Brown Ale,7,1,3.59,,,6.3,3.787143,,0,
232485,164091,Chocolate Stick,11814,Saugatuck Brewing Company,American Stout,1,0,3.56,,,5.0,3.56,,0,
10447,2564,The Tanner's Jack,203,Greene King / Morland Brewery,English Pale Ale,152,112,3.35,78.0,,4.4,3.3475,,0,
198086,268886,Civil Disobedience #19,22511,Hill Farmstead Brewery,Saison / Farmhouse Ale,60,12,4.32,91.0,,,4.386333,,0,
53841,235554,Atomga Cacau Aged Series,25832,Cervejaria Bodebrown Ltda.,American Double / Imperial Stout,3,0,4.26,,,10.0,4.256667,,0,
115274,58481,Harpoon Imperial IPA - Oak Aged,98,Harpoon Brewery & Beer Hall,American Double / Imperial IPA,3,3,4.65,,,10.0,4.65,,0,
132552,244277,Ulterior,28439,TRVE Brewing Co.,Witbier,1,0,4.11,,,5.3,4.11,0.274437,0,
233810,70300,Fire Dog Smoked Wheat,3098,Big Dog's Draft House,Smoked Beer,1,1,4.35,,,8.0,4.35,,0,
256909,207809,Hello...,28178,Pipeworks Brewing Company,American Porter,58,9,3.9,86.0,,9.5,3.997586,0.03125,10,3.897
17336,56209,Série Signature Sticke Alt,11056,Les Trois Mousquetaires,Altbier,109,51,3.87,86.0,,6.0,3.847339,,0,


In [None]:
# Load the BeerAdvocate brewery table, drop exact duplicate rows, and print
# the shape before and after so any removed rows are visible.
breweries = pd.read_csv('data/baseData/BeerAdvocate/breweries.csv')
print(breweries.shape)  # shape before de-duplication

# Reassign instead of mutating in place.
breweries = breweries.drop_duplicates()
print(breweries.columns, breweries.shape, sep='\n')

# Show 10 randomly chosen rows.
breweries.sample(n=10)

(16758, 4)
Index(['id', 'location', 'name', 'nbr_beers'], dtype='object')
(16758, 4)


Unnamed: 0,id,location,name,nbr_beers
15735,13658,"Canada, Ontario",Draft Horse Brewery / Navan Feed Mill Restaurant,0
15482,4384,"United States, Montana",Lightning Boy Brewery,1
13177,44401,"United States, Michigan",Kilkenny's Irish Public House,0
9612,47435,"United States, New Jersey",Troon Brewing,30
4489,16829,Germany,Bayreuther Bierbrauerei,14
15617,14916,"Canada, Quebec",Les Brasseurs du Hameau,47
14501,321,Belgium,Brasserie d'Achouffe,20
3556,7613,Germany,Wittichenauer Stadtbrauerei,1
3496,5860,Germany,Brauerei Xaver Wasserburger,0
8776,35011,"United States, California",Sundowner Brewery,32


In [None]:
# Load the BeerAdvocate user table, drop exact duplicate rows, and print the
# shape before and after so any removed rows are visible.
users = pd.read_csv('data/baseData/BeerAdvocate/users.csv')
print(users.shape)  # shape before de-duplication

# Reassign instead of mutating in place.
users = users.drop_duplicates()
print(users.columns, users.shape, sep='\n')

# Show 10 randomly chosen rows.
users.sample(n=10)

(153704, 6)
Index(['nbr_ratings', 'nbr_reviews', 'user_id', 'user_name', 'joined',
       'location'],
      dtype='object')
(153704, 6)


Unnamed: 0,nbr_ratings,nbr_reviews,user_id,user_name,joined,location
11460,4,0,crazybrody.706269,crazybrody,1354705000.0,"United States, Indiana"
5261,9,2,wwwagainsttimeca.1149511,wwwagainsttimeca,1464430000.0,Canada
22094,438,0,seaale.682156,SeaAle,1340532000.0,"United States, Oregon"
57588,2,0,jake095.919394,jake095,1420283000.0,
88755,1,0,mountainmaniac.438398,MountainManiac,1268651000.0,"United States, Colorado"
111826,3,1,jelpernw.408660,jelpernw,1261825000.0,"United States, New York"
86106,15,0,sethberry.938950,Sethberry,1423048000.0,
33526,6,2,fivebyfive.793394,FiveByFive,1397038000.0,Canada
94272,1,0,tagyoureit24824.919262,tagyoureit24824,1420283000.0,
58931,1,0,thefearedbeard.707199,TheFearedBeard,1355224000.0,


In [4]:
# There are 151 million lines of text... that's a lot, so the file is streamed
# line by line rather than read in one go.
def parse_ratings_file(file_path, limit=None):
    """Parse a blank-line-separated ``key: value`` text dump into a DataFrame.

    A record is a run of ``key: value`` lines terminated by a blank line (or
    by the end of the file). Only the values are kept, in the order in which
    they appear, so the resulting frame has positional integer column labels
    and one row per record.

    Parameters
    ----------
    file_path : str or path-like
        Path of the text file to parse. It is read as UTF-8.
    limit : int or None, optional
        Maximum number of records to return. ``None`` (the default) parses
        the whole file; a value of zero or less returns an empty frame.

    Returns
    -------
    pandas.DataFrame
        One row per record; every value is kept as a string.

    Notes
    -----
    Lines that do not contain ``': '`` are ignored. Because each line is
    stripped before that check, a field whose value is empty (for example
    ``"text: "``) is skipped as well, which shifts the remaining values of
    that record one position to the left.
    """
    if limit is not None and limit <= 0:
        return pd.DataFrame([])

    records = []
    current_record = []

    # Be explicit about the encoding: the platform default is not UTF-8 on
    # every system.
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()

            if not line:
                # A blank line closes the current record. Runs of blank lines
                # must not produce empty rows, hence the guard.
                if current_record:
                    records.append(current_record)
                    current_record = []
                    if limit is not None and len(records) >= limit:
                        break
                continue

            if ': ' in line:
                # Split on the first separator only, so values that contain
                # ': ' themselves stay intact.
                _, value = line.split(': ', 1)
                current_record.append(value)

    # Keep the final record when the file does not end with a blank line.
    # After an early break current_record is empty, so nothing extra is added.
    if current_record:
        records.append(current_record)

    return pd.DataFrame(records)


# Parse the RateBeer ratings dump into a DataFrame with positional columns.
ratings_df = parse_ratings_file(file_path='data/baseData/RateBeer/ratings.txt', limit=None)

# Since our txt file is in a predictable format we can make things easy on
# ourselves and label the positional columns directly, in file order.
new_column_names = [
    'beer_name', 'beer_id',
    'brewery_name', 'brewery_id',
    'style', 'abv', 'date',
    'user_name', 'user_id',
    'appearance', 'aroma', 'palate', 'taste', 'overall', 'rating',
    'text',
]

# Raises ValueError if the parsed frame does not have exactly 16 columns.
ratings_df.columns = new_column_names

# Show 10 randomly chosen rows.
ratings_df.sample(n=10)


Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,abv,date,user_name,user_id,appearance,aroma,palate,taste,overall,rating,text
5432722,Goose Island 312 Urban Wheat Ale,30913,Goose Island Beer Company &#40;AB-InBev&#41;,12853,Wheat Ale,4.2,1124445600,MilkmanDan,1094,2,4,2,3,5,1.6,"""Urban Wheat"" is apparently some sort of beer ..."
5864656,Beavertown Heavy Water (Longmorn 1992 Barrel A...,242378,Beavertown Brewery,14320,Imperial Stout,10.0,1396692000,abemorsten,87513,3,8,3,7,16,3.7,Small bottle kindly shared by Gunnar. Black bo...
6583700,Achel 8 Blond,10050,Brouwerij der Trappistenabdij De Achelse Kluis,1805,Belgian Strong Ale,8.0,1215338400,MuttyD,73469,5,7,4,7,16,3.9,"Soapy white head. Loads of apples, citrus, an..."
2259245,Tatra Jasne Pełne / Pils / Beer,13059,Żywiec (Żywiec Group/Heineken),6626,Pale Lager,6.0,1484391600,daab007,441531,3,5,3,5,15,3.1,"Zapach znikomy, ledwo wyczuwalny. Piana drobna..."
2787384,Terrapin Side Project Samurai Krunkles,177011,Terrapin Beer Company,2851,India Pale Ale (IPA),7.1,1351594800,Ughsmash,21554,4,7,1,7,15,3.4,Bomber. Poured cloudy orangish-golden with a c...
1709123,Monteiths Saison,13683,Monteiths Brewing Co. (DB Breweries),1505,Saison,5.5,1053684000,Sully,7208,3,6,3,6,12,3.0,A great effort. Good uplifting fruit overtones...
6258825,St Peters Old Style Porter,5874,St Peters &#40;UK&#41;,974,Porter,5.1,1056794400,PorterPounder,4759,4,7,4,8,15,3.8,Smoky! I usually do not like a smoked flavor i...
1162864,Heller Herzogenauracher Bock,66818,Privatbrauerei Heller,8130,Heller Bock,6.4,1370944800,Maria,19592,3,7,3,7,14,3.4,Itâs golden with a white head. The aroma has...
5416624,5 Rabbit 5 Rabbit,145480,5 Rabbit Cerveceria,12885,Golden Ale/Blond Ale,5.3,1311156000,User29624,29624,3,5,3,6,12,2.9,Pours gold with a white head. Light aroma of ...
1338321,Dinkelacker CD-Pils,8930,Dinkelacker-Schwaben Bräu,1709,Pilsener,4.9,1119866400,DocLock,11244,2,6,3,5,11,2.7,Straw pour with some aroma and tastes of hopci...


In [None]:
# Write the parsed ratings to CSV, omitting the row index.
ratings_df.to_csv(path_or_buf='data/baseData/RateBeer/ratings.csv', index=False)

In [None]:
# reviews.txt is ratings.txt but only where there are reviews (review: True).
# There are 45 million lines of text... that's a lot, so the file is streamed
# line by line rather than read in one go.
def parse_ratings_file(file_path, limit=None):
    """Parse a blank-line-separated ``key: value`` text dump into a DataFrame.

    A record is a run of ``key: value`` lines terminated by a blank line (or
    by the end of the file). Only the values are kept, in the order in which
    they appear, so the resulting frame has positional integer column labels
    and one row per record.

    Parameters
    ----------
    file_path : str or path-like
        Path of the text file to parse. It is read as UTF-8.
    limit : int or None, optional
        Maximum number of records to return. ``None`` (the default) parses
        the whole file; a value of zero or less returns an empty frame.

    Returns
    -------
    pandas.DataFrame
        One row per record; every value is kept as a string.

    Notes
    -----
    Lines that do not contain ``': '`` are ignored. Because each line is
    stripped before that check, a field whose value is empty (for example
    ``"text: "``) is skipped as well, which shifts the remaining values of
    that record one position to the left.
    """
    if limit is not None and limit <= 0:
        return pd.DataFrame([])

    records = []
    current_record = []

    # Be explicit about the encoding: the platform default is not UTF-8 on
    # every system.
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()

            if not line:
                # A blank line closes the current record. Runs of blank lines
                # must not produce empty rows, hence the guard.
                if current_record:
                    records.append(current_record)
                    current_record = []
                    if limit is not None and len(records) >= limit:
                        break
                continue

            if ': ' in line:
                # Split on the first separator only, so values that contain
                # ': ' themselves stay intact.
                _, value = line.split(': ', 1)
                current_record.append(value)

    # Keep the final record when the file does not end with a blank line.
    # After an early break current_record is empty, so nothing extra is added.
    if current_record:
        records.append(current_record)

    return pd.DataFrame(records)


# Parse the RateBeer reviews dump into a DataFrame with positional columns.
# The path is relative to the project root (the working directory set by
# os.chdir in the first cell), like every other data path in this notebook;
# the previous '../../Thomas/...' path pointed outside the repository.
# NOTE(review): confirm reviews.txt lives next to ratings.txt in data/baseData/RateBeer.
reviews_df = parse_ratings_file('data/baseData/RateBeer/reviews.txt', limit=None)

# Since our txt file is in a predictable format we can make things easy on
# ourselves and label the positional columns directly, in file order.
new_column_names = [
    'beer_name',
    'beer_id',
    'brewery_name',
    'brewery_id',
    'style',
    'abv',
    'date',
    'user_name',
    'user_id',
    'appearance',
    'aroma',
    'palate',
    'taste',
    'overall',
    'rating',
    'text',
]

# Raises ValueError if the parsed frame does not have exactly 16 columns.
reviews_df.columns = new_column_names

# Show 10 randomly chosen rows.
reviews_df.sample(10)


Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,abv,date,user_name,user_id,appearance,aroma,palate,taste,overall,rating,text
2164397,Three Hearts 50 år 3.5%,50771,Krönleins Bryggeri,191,Pale Lager,3.5,1207821600,KnutAlbert,26960,3,4,2,5,9,2.3,Can from a Svinesund supermarket.Decent attemp...
6662317,Duysters Loterbol Bruin,22337,Brouwerij Loterbol (was Duysters),1958,Belgian Strong Ale,8.0,1338890400,tderoeck,101220,4,5,4,8,15,3.6,29/V/2012 - 33cl bottle from Geers (Oostakker)...
3903658,Fort Collins Kidd Black Lager (Schwarzbier),30736,Fort Collins Brewery,3891,Schwarzbier,4.5,1240826400,LtDan,74534,3,5,4,8,14,3.4,Appearance: Pours a midnight amber letting al...
5886321,Crate I.P.A.,179853,Crate Brewery,14881,India Pale Ale (IPA),5.8,1395140400,rodbod,294971,4,7,4,8,16,3.9,"Bottle from the Good Spirits Co, Glasgow. Ambe..."
1177510,Pyraser Kellerbier,68754,Pyraser Landbrauerei,4623,Zwickel/Keller/Landbier,4.8,1353322800,3fourths,6533,4,7,3,6,13,3.3,"steinkrug at the new Huttân, Nuremberg.spicy..."
9338,Lucky Beer (Lucky Buddha),51355,Cheerday Hangzhou Qiandaohu Beer Co.,11483,Pale Lager,4.8,1334916000,anders8000,147471,4,3,3,5,12,2.7,Light aromas of hops with nuances of honey & m...
929021,Lidl Perlenbacher Schwarzbier / Black Beer,289391,Mauritius Privatbrauerei,1851,Schwarzbier,4.9,1486724400,Calisky,106177,3,6,3,5,11,2.8,"Couleur brune sombre, trÃ¨s lÃ©gÃ¨re mousse. N..."
5511531,Two Brothers Cane and Ebel,59141,Two Brothers Brewing Company,507,Specialty Grain,7.0,1207389600,Juelze,20180,4,7,3,7,17,3.8,Pours a dark amber color with thick white foam...
556963,Ebrius Brewing IPA,155546,Ebrius Brewing,13678,India Pale Ale (IPA),4.9,1329476400,dreadnord,146208,4,8,4,8,16,4.0,Pours darker yellow with a big white head. Nos...
5900835,Fullers Imperial Stout (10.7%),226113,Fullers,55,Imperial Stout,10.7,1427540400,The_Osprey,249130,4,7,4,7,16,3.8,"Mar 2015 - 500ml, boxed, 10.7%, 2013 vintage. ..."


In [None]:
# Write the parsed reviews to CSV, omitting the row index.
reviews_df.to_csv(path_or_buf='data/baseData/RateBeer/reviews.csv', index=False)