In [1]:
import os
import subprocess
from pathlib import Path

# Ask git for the repository's top-level directory (the one holding .git) and
# switch the working directory to it, so the relative data paths used by the
# later cells are resolved from the project root.
project_root = Path(
    subprocess.check_output(
        ['git', 'rev-parse', '--show-toplevel'],
        text=True,
    ).strip()
)
os.chdir(project_root)

In [2]:
import pandas as pd

## Looking at RateBeer

In [None]:
# Load the RateBeer beer table and report its shape before and after removing
# exact duplicate rows, then preview ten random rows.
beers = pd.read_csv('data/baseData/RateBeer/beers.csv')
print(beers.shape)

# Reassign instead of mutating in place; the resulting frame is the same.
beers = beers.drop_duplicates()
print(beers.columns)
print(beers.shape)

beers.sample(10)

(442081, 14)
Index(['beer_id', 'beer_name', 'brewery_id', 'brewery_name', 'style',
       'nbr_ratings', 'overall_score', 'style_score', 'avg', 'abv',
       'avg_computed', 'zscore', 'nbr_matched_valid_ratings',
       'avg_matched_valid_ratings'],
      dtype='object')
(442081, 14)


Unnamed: 0,beer_id,beer_name,brewery_id,brewery_name,style,nbr_ratings,overall_score,style_score,avg,abv,avg_computed,zscore,nbr_matched_valid_ratings,avg_matched_valid_ratings
70555,457059,Freigeist Veedels Ale,11414,Freigeist Bierkultur,Kölsch,5,,,3.18,5.5,3.38,,0,
373549,186029,Yardy Shandy Kola,3635,Big City Brewing Company,Radler/Shandy,2,,,2.79,1.2,2.9,,0,
308165,371864,Angry Chair Charon Porter - Coffee And Hazelnut,16719,Angry Chair Brewing,Imperial Porter,2,,,3.23,8.2,3.8,,0,
354865,491670,Buffalo Bayou Thin Mint Stout,14108,Buffalo Bayou Brewing,Sweet Stout,1,,,3.04,9.4,3.5,,0,
65249,229302,Antoniter Bayerisch Weisse,1340,Memminger Brauerei,German Hefeweizen,6,,,2.73,5.0,2.7,,0,
199657,176036,Big Al Aztlan,9766,Big Al Brewing,Spice/Herb/Vegetable,1,,,2.91,,3.9,,0,
111340,385674,The 400 Frost Hammer,17628,The 400 Brewing Company,Golden Ale/Blond Ale,2,,,3.11,4.7,3.45,,0,
172297,187468,Hermitage Fruit Crate Pumpkin Ale,11753,Hermitage Brewing Company,Spice/Herb/Vegetable,24,46.0,54.0,3.1,9.0,3.125,-0.488175,24,3.125
285442,318105,Indeed Margarita Mexican Cousin Imperial Lager,14786,Indeed Brewing Company,Imperial Pils/Strong Pale Lager,2,,,2.95,8.7,3.55,,0,
161158,447455,Parkersburg On The Lam Oatmeal Stout,28555,Parkersburg Brewing Company,Dry Stout,4,,,3.26,4.7,3.575,,0,


In [None]:
# Load the RateBeer brewery table and report its shape before and after
# removing exact duplicate rows, then preview ten random rows.
breweries = pd.read_csv('data/baseData/RateBeer/breweries.csv')
print(breweries.shape)

# Reassign instead of mutating in place; the resulting frame is the same.
breweries = breweries.drop_duplicates()
print(breweries.columns)
print(breweries.shape)

breweries.sample(10)

(24189, 4)
Index(['id', 'location', 'name', 'nbr_beers'], dtype='object')
(24189, 4)


Unnamed: 0,id,location,name,nbr_beers
14157,15790,"United States, California",Tent City Beer Company,9
7082,11633,Russia,Argus,4
20275,17451,England,Atom,44
23426,17490,France,Ferme-Brasserie Cordoeil,3
22472,9336,South Korea,Herzen Brau Bar,3
19784,20128,"United States, Illinois",Moody Tongue Brewing Company,22
4047,5043,Germany,Brauhaus Joh. Albrecht Soltau,9
17684,152,"United States, North Carolina",Chesapeake Bay Brewing &#40;formerly Rock Cree...,14
14016,18105,"United States, California",Progress Brewing,37
17315,14879,"United States, Minnesota",HammerHeart Brewing Company,76


In [None]:
# Load the RateBeer user table and report its shape before and after removing
# exact duplicate rows, then preview ten random rows.
users = pd.read_csv('data/baseData/RateBeer/users.csv')
print(users.shape)

# Reassign instead of mutating in place; the resulting frame is the same.
users = users.drop_duplicates()
print(users.columns)
print(users.shape)

users.sample(10)

(70174, 5)
Index(['nbr_ratings', 'user_id', 'user_name', 'joined', 'location'], dtype='object')
(70174, 5)


Unnamed: 0,nbr_ratings,user_id,user_name,joined,location
13379,11,101760,smithjay,1266232000.0,
48643,1,14693,afr0byte,1093255000.0,"United States, Vermont"
41437,2,459127,BeerGeek69,1492250000.0,
54289,2,121036,idiotbox,1294830000.0,
59200,2,283308,robw,1381399000.0,"United States, Rhode Island"
13931,1,147317,segir52,1324638000.0,
53581,1,381236,Brian1176,1439719000.0,"United States, New Jersey"
45043,3,335040,mlong,1410430000.0,
2777,63,77818,NewBelgium5,1215079000.0,"United States, Pennsylvania"
61555,1,206482,emcmillan,1342951000.0,


In [None]:
# There are 121 million lines of text... that's a lot.
def parse_ratings_file(file_path, limit=None):
    """Parse a blank-line-delimited ``key: value`` text file into a DataFrame.

    A record is a run of consecutive ``key: value`` lines; a blank line closes
    it. Only the values are kept, in the order they appear, so each row of the
    result is one record and the columns are positional (0, 1, 2, ...).

    Parameters
    ----------
    file_path : str or path-like
        Text file to read.
    limit : int or None, optional
        Maximum number of physical lines to read. When the limit is reached,
        the record that is still open is discarded because it may be
        incomplete. ``None`` (the default) reads the whole file.

    Returns
    -------
    pandas.DataFrame
        One row per record, with string values.
    """
    records = []
    current_block = []
    truncated = False

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for i, raw_line in enumerate(file, start=1):
            if i % 1_000_000 == 0:
                print('line x million: ', i / 1_000_000)
            if limit is not None and i > limit:
                print('Saved first ' + str(limit / 1_000_000) + ' million lines')
                truncated = True
                break

            if not raw_line.strip():
                # A blank line closes the current record. Leading or repeated
                # blank lines must not produce empty rows.
                if current_block:
                    records.append(current_block)
                    current_block = []
                continue

            # Test the unstripped line: stripping first would turn an
            # empty-valued field ("key: ") into "key:", drop it, and shift
            # every later value of the record one column to the left.
            if ': ' in raw_line:
                _, value = raw_line.split(': ', 1)
                current_block.append(value.rstrip())

    # A file that ends without a trailing blank line still has a complete
    # final record. A record cut off by `limit` is not kept.
    if current_block and not truncated:
        records.append(current_block)

    return pd.DataFrame(records)


# Parse only the first 10 million lines of the ratings dump.
ratings_df = parse_ratings_file('data/baseData/RateBeer/ratings.txt', limit=10_000_000)

# The text file is assumed to list the fields of every record in the same
# fixed order, so the positional columns are labelled in that order.
new_column_names = [
    'beer_name', 'beer_id',
    'brewery_name', 'brewery_id',
    'style', 'abv', 'date',
    'user_name', 'user_id',
    'appearance', 'aroma', 'palate', 'taste', 'overall', 'rating',
    'text',
]

ratings_df.columns = new_column_names
ratings_df.sample(10)


line x million:  1.0
line x million:  2.0
line x million:  3.0
line x million:  4.0
line x million:  5.0
line x million:  6.0
line x million:  7.0
line x million:  8.0
line x million:  9.0
line x million:  10.0
Saved first 10.0 million lines


Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,abv,date,user_name,user_id,appearance,aroma,palate,taste,overall,rating,text
30608,Pivovara Medvedgrad Zlatni Medvjed,11471,Pivovara Medvedgrad,2124,Pilsener,4.4,1399024800,Iznogud,296471,2,5,2,4,10,2.3,"Draught at Pivnica Medvedgrad, Zagreb. Pours g..."
273257,Central City Red Racer Imperial IPA (Red Betty),77959,Central City Brewers + Distillers,4327,Imperial IPA,9.0,1189850400,presario,5348,3,7,3,7,15,3.5,GCBF cask conditioned sample. Big bubbles. Ora...
7334,Yanjing 11º Premium Extra,12045,Beijing Yanjing Brewery Co.,2244,Pale Lager,5.5,1134903600,grant,5938,1,2,1,1,1,0.6,Appears as expected. Aroma of sweetish malt - ...
162783,Le Trèfle Noir California,329076,Le Trèfle Noir,10587,India Pale Ale (IPA),5.8,1439978400,BeerGolem,169857,4,8,4,7,16,3.9,Verre au Brouhaha (MontrÃ©al). Robe couleur mi...
466048,Shimono Loco Kolsch (Sakura Kaori no Nama),46782,Shimono Loco Beer,3543,Kölsch,,1277200800,Ratman197,18957,4,6,5,6,13,3.4,Bottle poured a clear yellow with a small ling...
196477,Le Corsaire La Perruche,121206,Microbrasserie Le Corsaire,9465,Fruit Beer,4.6,1463133600,rodenbach99,61913,4,7,3,7,14,3.5,Style et rÃ©sumÃ©: Blanche au sureau. Comment:...
54681,Corona Light,743,Grupo Modelo (Corona),119,Pale Lager,3.7,1071486000,whaleman,5299,2,2,2,3,5,1.4,Pale golden yellow body with piddly carbonatio...
14636,Tsingtao,730,Tsingtao Brewery,116,Pale Lager,4.8,1363518000,allanbowers,132401,4,5,3,4,8,2.4,"fizzy lager, chinese style.on a warm day this ..."
118540,Brouemont India Pale Ale,25032,Brouemont,3788,India Pale Ale (IPA),6.0,1431770400,tom10101,126758,4,7,3,7,14,3.5,On tap @ Brouemont.Appearance: Slightly hazy b...
226613,Unibroue Éphémère Framboises,50684,Unibroue &#40;Sapporo&#41;,180,Fruit Beer,5.5,1133089200,Siroy,29749,2,5,3,5,8,2.3,"Bottle. Pink/orange, bubbles-carbonation - Loo..."
