# Data import

Important functions for data import and creation of pickle files.

In [1]:
import tarfile
import os
import gzip
import shutil
import datetime
import pandas as pd

In [2]:
# Base directory that holds the dataset archives and the files extracted from
# them (despite the name, this is a directory path, not a single file).
DATA_FILE = './datasets'

In [3]:
def gunzip_shutil(source_filepath, dest_filepath, block_size=65536):
    """
    Decompresses the gzip file at source_filepath into dest_filepath
    (e.g. turns a .txt.gz into a .txt), copying block_size bytes at a time.
    The compressed source file is left in place.
    """
    with gzip.open(source_filepath, 'rb') as compressed:
        with open(dest_filepath, 'wb') as plain:
            # Stream in chunks so large files never have to fit in memory
            shutil.copyfileobj(compressed, plain, block_size)

In [4]:
def extract_tar_or_gz(filepath, to_folder='.'):
    """
    Extracts a single archive, selected by the suffix of filepath:

    - ".txt.gz": decompressed next to the archive (same path without ".gz"),
      after which the archive itself is deleted. to_folder is not used here.
    - "tar.gz" / ".tar": every member is extracted into to_folder.
    - any other suffix: nothing happens.
    """
    if filepath.endswith(".txt.gz"):
        gunzip_shutil(filepath, os.path.splitext(filepath)[0])
        os.remove(filepath)

    elif filepath.endswith("tar.gz"):
        # NOTE: extractall trusts the member paths stored in the archive;
        # only use this on archives from a trusted source.
        # The with-block closes the handle even when extraction fails.
        with tarfile.open(filepath, "r:gz") as gz_file:
            gz_file.extractall(path=to_folder)

    elif filepath.endswith(".tar"):
        with tarfile.open(filepath, "r:") as tar_file:
            tar_file.extractall(path=to_folder)

In [5]:
def extract_file(filepath, to_folder):
    """
    Fully extracts the given archive (with path filepath) into to_folder.

    After the archive itself is unpacked, every regular file located directly
    inside to_folder is passed to extract_tar_or_gz once more, so archives that
    were packed inside the outer one get unpacked as well. Nested tar archives
    are extracted into to_folder too (not into the current working directory).
    Sub-directories of to_folder are not scanned.

    The processed paths are printed as simple progress output.
    """
    extract_tar_or_gz(filepath, to_folder)
    print(filepath)
    print(to_folder)

    for filename in os.listdir(to_folder):
        file = os.path.join(to_folder, filename)
        if os.path.isfile(file):
            print(file)
            # Pass the target folder explicitly; the default '.' would unpack
            # nested tar archives into the current working directory.
            extract_tar_or_gz(file, to_folder)

Simple examples for data extraction:

In [12]:
# Unpack the BeerAdvocate archive into its own sub-folder of DATA_FILE;
# extract_file also unpacks the archives it finds directly inside that folder.
extract_file(DATA_FILE + '/BeerAdvocate.tar.gz', DATA_FILE + '/BeerAdvocate')

In [7]:
def prepare_row(raw_to_column):
    """
    Builds a blank row for the dataframe: one entry per column name
    (the values of raw_to_column), each initialised to None
    """
    return {column_name: None for column_name in raw_to_column.values()}

def read_file(path_to_file):
    """
    Reads a ratings/reviews .txt file and returns its entries as a pandas DataFrame.

    Every data line has the form "<key>: <value>"; a line without ':' (normally
    an empty line) ends the current entry. Several separator lines in a row do
    not produce empty rows, and the last entry is kept even when the file does
    not end with an empty line or a newline.
    Usage example:
        df = read_file("datasets/MatchedData/ratings_ba.txt")
        df.head()
    note: the structure of the file follows pattern present in ratings.txt or reviews.txt

    Value conversions (a value that cannot be converted is stored as None):
        date                 -> datetime.datetime via fromtimestamp (local time zone)
        beer_id, brewery_id  -> int (nullable Int64 columns in the result)
        abv, appearance, aroma, palate, taste, overall, rating -> float
        review               -> bool, True only for the exact string "True"
        text                 -> None when the value is the string "nan"
        every other key      -> the stripped string

    Parameters:
        path_to_file: path of the UTF-8 encoded text file to parse

    Returns:
        DataFrame with one row per entry; fields missing from an entry are empty

    Raises:
        KeyError: if a line uses a key that is not one of the known field names
    """
    raw_to_column = {"beer_name" : "Beer Name", "beer_id" : "Beer Id", "style" : "Style", "brewery_name" : "Brewery Name", "brewery_id" : "Brewery Id", "abv" : "Abv", "date" : "Date", "user_name" : "Username", "user_id" : "User Id", "appearance" : "Appearance", "palate" : "Palate", "taste" : "Taste", 'aroma' : 'Aroma', "overall" : "Overall", "rating" : "Rating", "text" : "Text", "review" : "Review"}

    # Column order of the resulting DataFrame
    columns = ['Beer Name', 'Beer Id', 'Brewery Name', 'Brewery Id', 'Style', 'Abv', 'Date', 'Username', 'User Id', 'Appearance', 'Aroma', 'Palate', 'Taste', 'Overall', 'Rating', 'Text', 'Review']

    int_keys = {'beer_id', 'brewery_id'}
    float_keys = {'abv', 'appearance', 'aroma', 'palate', 'taste', 'overall', 'rating'}

    dataFrameDict = {column: [] for column in columns}

    # Row under construction; every field starts out as None
    newRow = dict.fromkeys(columns)
    nonEmpty = False

    # Iterate over the file lazily instead of loading all lines at once, and
    # make sure the handle is closed even if parsing fails.
    with open(path_to_file, "r", encoding='utf8') as f:
        for line in f:
            delim = line.find(":")

            if delim == -1:
                # No delimiter found <=> separator line <=> current entry has ended.
                # Only flush when the entry holds data, so repeated separator
                # lines do not add all-None rows.
                if nonEmpty:
                    for column in columns:
                        dataFrameDict[column].append(newRow[column])
                    newRow = dict.fromkeys(columns)
                    nonEmpty = False
                continue

            key = line[:delim]
            # strip() already removes the trailing newline; slicing off the last
            # character would truncate the value on a final line without newline.
            val = line[delim + 1:].strip()
            column = raw_to_column[key]
            nonEmpty = True

            # Convert the value to the data type of its column
            if key == 'date':
                try:
                    newRow[column] = datetime.datetime.fromtimestamp(int(val))
                except (ValueError, OverflowError, OSError):
                    # Not an integer, or a timestamp outside the supported range
                    newRow[column] = None
            elif key in int_keys:
                try:
                    newRow[column] = int(val)
                except ValueError:
                    newRow[column] = None
            elif key in float_keys:
                try:
                    newRow[column] = float(val)
                except ValueError:
                    newRow[column] = None
            elif key == 'review':
                newRow[column] = (val == "True")
            elif key == 'text' and val == 'nan':
                newRow[column] = None
            else:
                newRow[column] = val

    # If file doesn't end with a separator line
    if nonEmpty:
        for column in columns:
            dataFrameDict[column].append(newRow[column])

    df = pd.DataFrame(dataFrameDict)
    # Pandas will cast these values to float by default - because NaN's exist
    df['Beer Id'] = df['Beer Id'].astype(pd.Int64Dtype())
    df['Brewery Id'] = df['Brewery Id'].astype(pd.Int64Dtype())

    return df

In [9]:
def create_pickle_from_file(filepath):
    """
    Parses the ratings/reviews text file at filepath with read_file and pickles
    the resulting DataFrame next to it, with the file extension replaced by .pkl
    """
    parsed = read_file(filepath)
    stem, _ = os.path.splitext(filepath)
    parsed.to_pickle(stem + '.pkl')

Simple examples for creating pickles: (takes **several minutes** to execute)

In [10]:
# Parse both BeerAdvocate text files and store each resulting DataFrame as a
# .pkl file next to its source (reviews.pkl and ratings.pkl).
create_pickle_from_file(DATA_FILE + '/BeerAdvocate/reviews.txt')
create_pickle_from_file(DATA_FILE + '/BeerAdvocate/ratings.txt')

How to read a pickle file:

In [11]:
# Load the DataFrames written by create_pickle_from_file.
# NOTE: unpickling can execute arbitrary code - only read pickle files that
# you created yourself or otherwise trust.
reviews = pd.read_pickle(DATA_FILE + '/BeerAdvocate/reviews.pkl')
ratings = pd.read_pickle(DATA_FILE + '/BeerAdvocate/ratings.pkl')
# Show the first rows of the ratings frame as the cell output
ratings.head()

Unnamed: 0,Beer Name,Beer Id,Brewery Name,Brewery Id,Style,Abv,Date,Username,User Id,Appearance,Aroma,Palate,Taste,Overall,Rating,Text,Review
0,Régab,142544,Societe des Brasseries du Gabon (SOBRAGA),37262,Euro Pale Lager,4.5,2015-08-20 12:00:00,nmann08,nmann08.184925,3.25,2.75,3.25,2.75,3.0,2.88,"From a bottle, pours a piss yellow color with ...",True
1,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2009-02-20 12:00:00,StJamesGate,stjamesgate.163714,3.0,3.5,3.5,4.0,3.5,3.67,Pours pale copper with a thin head that quickl...,True
2,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2006-03-13 12:00:00,mdagnew,mdagnew.19527,4.0,3.5,3.5,4.0,3.5,3.73,"500ml Bottle bought from The Vintage, Antrim.....",True
3,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2004-12-01 12:00:00,helloloser12345,helloloser12345.10867,4.0,3.5,4.0,4.0,4.5,3.98,Serving: 500ml brown bottlePour: Good head wit...,True
4,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2004-08-30 12:00:00,cypressbob,cypressbob.3708,4.0,4.0,4.0,4.0,4.0,4.0,"500ml bottlePours with a light, slightly hazy ...",True
