# Place the file `reviews.txt` from the BeerAdvocate folder in your working directory for the code below to work; otherwise, provide the absolute path to `reviews.txt`.

In [None]:
import pandas as pd
import math

In [2]:
def read_data(filepath="reviews.txt", num_rows=200, extract_text_reviews=False):
    """
    Extracts reviews from a .txt file and stores them in a dataframe.

    Every non-blank line of the file is expected to look like "attribute: value";
    blank lines are ignored. Numeric attributes are stored as int/float and the
    unix timestamp in "date" is converted to a readable datetime.

    Input:
        filepath: str, path of dataset ("reviews.txt" by default)
        num_rows: int, maximum number of reviews to extract (200 by default).
            Pass math.inf to read the whole file; a value <= 0 yields an empty dataframe.
        extract_text_reviews: boolean, set to True to extract text reviews as well (False by default)

    Return:
        df: shape: (min(num_rows, num_datapoints), 15) if extract_text_reviews=False
                   (min(num_rows, num_datapoints), 16) if extract_text_reviews=True
            contains the content of file pointed to by filepath.

    Raises:
        ValueError: if a non-blank line has no ':' separator, if a numeric value
            cannot be converted, or if the columns end up with different lengths.
        KeyError: if a line names an attribute that is not one of the expected columns.
    """
    # set column names
    column_names = ["beer_name", "beer_id", "brewery_name", "brewery_id", "style", "abv", "date", "user_name", "user_id", "appearance", "aroma", "palate", "taste", "overall", "rating"]
    if extract_text_reviews:
        column_names.append("text")

    # one list of raw string values per column
    data_dict = {col: [] for col in column_names}
    # number of columns that already hold num_rows values
    full_columns = 0

    if num_rows > 0:
        # the dataset contains non-ASCII names, so do not rely on the platform default encoding
        with open(filepath, encoding="utf-8") as data_file:
            for line_number, raw_line in enumerate(data_file, start=1):
                # strip only the line terminator so the last line of a file
                # without trailing newline keeps its final character
                line = raw_line.rstrip("\n")

                # skip if line is empty
                if not line.strip():
                    continue

                # split "attribute: value" on the first colon only (values may contain colons)
                attribute, separator, value = line.partition(":")
                if not separator:
                    raise ValueError(
                        "line {} of {!r} has no ':' separator: {!r}".format(line_number, filepath, line)
                    )

                # skip if attribute is text
                if attribute == "text" and not extract_text_reviews:
                    continue

                # drop the single space that follows the colon
                if value.startswith(" "):
                    value = value[1:]

                # add value of attribute to the corresponding list
                column = data_dict[attribute]
                column.append(value)

                # stop only once EVERY requested column holds num_rows values, so the
                # last review is complete whatever the order of its attributes
                if len(column) == num_rows:
                    full_columns += 1
                    if full_columns == len(column_names):
                        break

    # convert to dataframe
    df = pd.DataFrame(data_dict)
    # retrieve numerical value of ratings (from string to float/int);
    # astype returns a new dataframe, so the result must be assigned back
    df = df.astype({
        'beer_id': 'int32',
        'brewery_id': 'int32',
        'abv': 'float',
        'date': 'int64',  # 64-bit so timestamps after 2038 do not overflow
        'appearance': 'float',
        'aroma': 'float',
        'palate': 'float',
        'taste': 'float',
        'overall': 'float',
        'rating': 'float',
    })
    # convert unix time to readable format
    df['date'] = pd.to_datetime(df['date'], unit='s')

    return df

In [3]:
# Load every review without its free-text body: math.inf means the row limit is never reached.
# Lower num_rows (e.g. to 100) if this cell takes too long to run.
df_no_text = read_data(
    filepath="reviews.txt",
    num_rows=math.inf,
    extract_text_reviews=False,
)

# preview the first 5 reviews
df_no_text.head()

Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,abv,date,user_name,user_id,appearance,aroma,palate,taste,overall,rating
0,Régab,142544,Societe des Brasseries du Gabon (SOBRAGA),37262,Euro Pale Lager,4.5,2015-08-20 10:00:00,nmann08,nmann08.184925,3.25,2.75,3.25,2.75,3.0,2.88
1,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2009-02-20 11:00:00,StJamesGate,stjamesgate.163714,3.0,3.5,3.5,4.0,3.5,3.67
2,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2006-03-13 11:00:00,mdagnew,mdagnew.19527,4.0,3.5,3.5,4.0,3.5,3.73
3,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2004-12-01 11:00:00,helloloser12345,helloloser12345.10867,4.0,3.5,4.0,4.0,4.5,3.98
4,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2004-08-30 10:00:00,cypressbob,cypressbob.3708,4.0,4.0,4.0,4.0,4.0,4.0


In [4]:
# Load every review together with its free-text body: math.inf means the row limit is never reached.
# Lower num_rows (e.g. to 100) if this cell takes too long to run.
df_with_text = read_data(
    filepath="reviews.txt",
    num_rows=math.inf,
    extract_text_reviews=True,
)

# preview the first 5 reviews
df_with_text.head()

Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,abv,date,user_name,user_id,appearance,aroma,palate,taste,overall,rating,text
0,Régab,142544,Societe des Brasseries du Gabon (SOBRAGA),37262,Euro Pale Lager,4.5,2015-08-20 10:00:00,nmann08,nmann08.184925,3.25,2.75,3.25,2.75,3.0,2.88,"From a bottle, pours a piss yellow color with ..."
1,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2009-02-20 11:00:00,StJamesGate,stjamesgate.163714,3.0,3.5,3.5,4.0,3.5,3.67,Pours pale copper with a thin head that quickl...
2,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2006-03-13 11:00:00,mdagnew,mdagnew.19527,4.0,3.5,3.5,4.0,3.5,3.73,"500ml Bottle bought from The Vintage, Antrim....."
3,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2004-12-01 11:00:00,helloloser12345,helloloser12345.10867,4.0,3.5,4.0,4.0,4.5,3.98,Serving: 500ml brown bottlePour: Good head wit...
4,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2004-08-30 10:00:00,cypressbob,cypressbob.3708,4.0,4.0,4.0,4.0,4.0,4.0,"500ml bottlePours with a light, slightly hazy ..."


In [5]:
# report the (rows, columns) shape of both dataframes
print("Shape of data frame without text column: ", df_no_text.shape)
print("Shape of data frame with text column: ", df_with_text.shape)

Shape of data frame without text column:  (2589586, 15)
Shape of data frame with text column:  (2589586, 16)


# Notice that there are $2589586$ reviews in total.