In [1]:
import pandas as pd
import gc

In [2]:
new_column_names = [
    'beer_name', 'beer_id', 'brewery_name', 'brewery_id', 'style', 'abv', 'date',
    'user_name', 'user_id', 'appearance', 'aroma', 'palate', 'taste', 'overall',
    'rating', 'text', 'review'
]

output_file_path = '../../minimizedData/BA_ratings_small.csv'

def parse_ratings_file(file_path, chunk_size=15_000_012, start=0, stop=200_000_000): #chunk is a mulitple of 18 since our text 'block' size is 18
    #initialise the csv
    pd.DataFrame(columns=new_column_names).drop(['user_name','abv','date','text','review'], axis=1).to_csv(
        output_file_path, index=False
    )

    current_block = []
    data = []
    i = 0
    k = 0 
    chunk = 0
    
    with open(file_path, 'r') as file:
        for line in file:
            i += 1
            k += 1
            if i<start: continue
           
            if i+18>stop: break
           
            if i % 10_000_000 == 0: print('Processing line: ', i)

            line = line.strip()  #remove whitespace

            if not line:  #empty line signifies end of block
                data.append(current_block)
                current_block = []
                
            if k >= chunk_size:
                k = 0
                chunk += 1
                print('Processing chunk :', chunk)
                
                df = pd.DataFrame(data, columns=new_column_names)
                df = df.drop(['user_name', 'abv', 'date', 'text', 'review'], axis=1)
                df.to_csv(output_file_path, mode='a', index=False, header=False)
                data = []  
                gc.collect()  #manually invoke garbage collection
                    
                    
            if ': ' in line:  
                key, value = line.split(': ', 1)
                current_block.append(value)

        if data: #save any leftover data at the end
            data.append(current_block)
            current_block = []
            print(k)
            k = 0
            chunk += 1 
            print('Processing chunk :', chunk)
            
            df = pd.DataFrame(data, columns=new_column_names)
            df = df.drop(['user_name', 'abv', 'date', 'text', 'review'], axis=1)
            df.dropna()
            df.to_csv(output_file_path, mode='a', index=False, header=False)
            data = []  
            gc.collect() 
            
    print('Done!')




In [3]:
#takes 4 mins
parse_ratings_file('../../baseData/BeerAdvocate/ratings.txt')

Processing line:  10000000
Processing chunk : 1
Processing line:  20000000
Processing line:  30000000
Processing chunk : 2
Processing line:  40000000
Processing chunk : 3
Processing line:  50000000
Processing line:  60000000
Processing chunk : 4
Processing line:  70000000
Processing chunk : 5
Processing line:  80000000
Processing line:  90000000
Processing chunk : 6
Processing line:  100000000
Processing chunk : 7
Processing line:  110000000
Processing line:  120000000
Processing chunk : 8
Processing line:  130000000
Processing chunk : 9
Processing line:  140000000
Processing line:  150000000
Processing chunk : 10
1074456
Processing chunk : 11
Done!


In [4]:
df = pd.read_csv('../../minimizedData/BA_ratings_small.csv')
df = df.drop(df.index[-1])
df.to_csv('../../minimizedData/BA_ratings_small.csv', index=False)
df = pd.read_csv('../../minimizedData/BA_ratings_small.csv')
df.head(2)

Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,user_id,appearance,aroma,palate,taste,overall,rating
0,Régab,142544.0,Societe des Brasseries du Gabon (SOBRAGA),37262.0,Euro Pale Lager,nmann08.184925,3.25,2.75,3.25,2.75,3.0,2.88
1,Barelegs Brew,19590.0,Strangford Lough Brewing Company Ltd,10093.0,English Pale Ale,stjamesgate.163714,3.0,3.5,3.5,4.0,3.5,3.67


In [7]:
df.tail(2)

Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,user_id,appearance,aroma,palate,taste,overall,rating
0,Régab,142544.0,Societe des Brasseries du Gabon (SOBRAGA),37262.0,Euro Pale Lager,nmann08.184925,3.25,2.75,3.25,2.75,3.0,2.88
1,Barelegs Brew,19590.0,Strangford Lough Brewing Company Ltd,10093.0,English Pale Ale,stjamesgate.163714,3.0,3.5,3.5,4.0,3.5,3.67


In [6]:
df.sample(3)

Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,user_id,appearance,aroma,palate,taste,overall,rating
6655121,Founders Centennial IPA,5441.0,Founders Brewing Company,1199.0,American IPA,josh012012.779943,,,,,,4.0
7840892,Prairie Standard,88169.0,Prairie Artisan Ales,30356.0,Saison / Farmhouse Ale,mattmorin.982377,3.5,3.5,3.5,3.5,3.5,3.5
7994603,Houblon Chouffe Dobbelen IPA Tripel,27804.0,Brasserie d'Achouffe,321.0,Belgian IPA,albinoh.31728,5.0,4.5,4.0,4.5,4.0,4.38
