In [7]:
import pandas as pd
import datetime

In [8]:
def prepare_row(raw_to_column):
    """
        Build an empty row for the next entry: every column name (i.e. every
        value of ``raw_to_column``) is mapped to None.
    """
    # dict.fromkeys defaults each value to None and keeps first-seen order.
    return dict.fromkeys(raw_to_column.values())

def _convert_field(key, val):
    """
        Convert the raw string ``val`` of the field ``key`` to the data type of
        its column. Returns None when the value cannot be converted. Fields
        without a dedicated type are returned unchanged, as strings.
    """
    try:
        if key == 'date':
            # The raw value is divided by 1000 before being handed to
            # fromtimestamp, which yields a naive local-time datetime.
            return datetime.datetime.fromtimestamp(int(val) / 1000)
        if key in ('beer_id', 'brewery_id'):
            return int(val)
        if key in ('abv', 'appearance', 'aroma', 'palate', 'taste', 'overall', 'rating'):
            return float(val)
    except (ValueError, OverflowError, OSError):
        # Malformed number or out-of-range timestamp: treat as missing.
        return None
    if key == 'review':
        # bool() of any non-empty string is True, so "False" would be read as
        # True; compare the text instead. Anything else is treated as missing.
        return {'true': True, 'false': False}.get(val.strip().lower())
    return val


def read_file(path_to_file):
    """
        Function that reads .txt file on the given path and transforms it to pandas DataFrame. It's assumed that
        an empty line is a separator between entries in the file.

        Every non-separator line has the form ``key:value``; the text before the
        first ':' selects the column and the rest of the line is the value. Any
        line that contains no ':' ends the current entry. Runs of separator
        lines and a trailing separator do not produce empty rows.

        Values are converted per column: 'date' to datetime, 'beer_id' and
        'brewery_id' to int, the rating-like fields and 'abv' to float, 'review'
        to bool. Values that fail conversion become None. All other fields are
        kept as strings exactly as they appear after the ':' (including any
        leading space).

        Parameters:
            path_to_file: path of the UTF-8 encoded text file to read.

        Returns:
            pandas DataFrame with one row per entry. Fields missing from an
            entry are None/NaN. 'Beer Id' and 'Brewery Id' use the nullable
            Int64 dtype.

        Raises:
            KeyError: if a line's key is not one of the known field names.
            OSError: if the file cannot be opened.

        Usage example:
            df = read_file("datasets/MatchedData/ratings_ba.txt")
            df.head()
    """

    raw_to_column = {
        "beer_name": "Beer Name", "beer_id": "Beer Id", "style": "Style",
        "brewery_name": "Brewery Name", "brewery_id": "Brewery Id", "abv": "Abv",
        "date": "Date", "user_name": "Username", "user_id": "User Id",
        "appearance": "Appearance", "palate": "Palate", "taste": "Taste",
        "aroma": "Aroma", "overall": "Overall", "rating": "Rating",
        "text": "Text", "review": "Review",
    }

    # Column order of the resulting DataFrame.
    columns = [
        'Beer Name', 'Beer Id', 'Brewery Name', 'Brewery Id', 'Style', 'Abv',
        'Date', 'Username', 'User Id', 'Appearance', 'Aroma', 'Palate',
        'Taste', 'Overall', 'Rating', 'Text', 'Review',
    ]
    data_frame_dict = {column: [] for column in columns}

    def flush(row):
        # Append one finished entry; fields it did not contain become None.
        for column, values in data_frame_dict.items():
            values.append(row.get(column))

    # Fields of the entry currently being read; empty means "no entry started".
    current_row = {}

    # The context manager closes the file even if parsing raises.
    with open(path_to_file, "r", encoding='utf8') as f:
        for line in f:
            # Strip only the line terminator, so a last line without a trailing
            # newline does not lose its final character.
            key, delim, val = line.rstrip("\n").partition(":")

            if not delim:  # No delimiter found <=> separator line <=> current entry has ended
                if current_row:
                    flush(current_row)
                    current_row = {}
                continue

            current_row[raw_to_column[key]] = _convert_field(key, val)

    # If file doesn't end with an empty line
    if current_row:
        flush(current_row)

    df = pd.DataFrame(data_frame_dict)
    # Pandas will cast these values to float by default - because NaN's exist
    df['Beer Id'] = df['Beer Id'].astype(pd.Int64Dtype())
    df['Brewery Id'] = df['Brewery Id'].astype(pd.Int64Dtype())

    return df