In [1]:
import tarfile
import gzip
import shutil

import pandas as pd
import os
import numpy as np

In [2]:
def ungzip_folder(FOLDER_NAME):
    """
    This function extracts the archive '../Data/<FOLDER_NAME>.tar.gz' into the
    new folder '../Data/<FOLDER_NAME>'
    - FOLDER_NAME: name of the folder to create from the gzip-compressed tar archive
    The function returns nothing, it only creates the new folder.
    When the folder already exists the extraction is skipped.
    """
    FOLDER_PATH = '../Data/' + FOLDER_NAME

    # nothing to do when the folder is already unzipped
    if os.path.exists(FOLDER_PATH):
        return

    # open the archive handling the tarfile type and the compression;
    # the context manager closes it even when the extraction raises
    with tarfile.open(FOLDER_PATH + '.tar.gz', "r:gz") as folder:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive - only use it on archives coming from a trusted source
        folder.extractall(FOLDER_PATH)

In [3]:
def ungzip_txt_file(FOLDER_NAME, FILE_NAME):
    """
    This function decompresses '../Data/<FOLDER_NAME>/<FILE_NAME>.gz' into the
    file '../Data/<FOLDER_NAME>/<FILE_NAME>'
    - FOLDER_NAME: name of the folder in which the gzip file is
    - FILE_NAME: name of the file to create (the gzip file name without '.gz')
    The function returns nothing, it only creates the new file.
    When the file already exists the decompression is skipped.
    """
    target = '../Data/' + FOLDER_NAME + '/' + FILE_NAME

    # the file was already unzipped: leave it untouched
    if os.path.exists(target):
        return

    # stream the decompressed bytes straight into the target file
    with gzip.open(target + '.gz', 'r') as compressed, open(target, 'wb') as plain:
        shutil.copyfileobj(compressed, plain)

In [4]:
def txt_to_dataframe(FOLDER_NAME,TXT_FILE_NAME,keys_list):
    """
    This function parses a txt file made of 'key: value' lines into a dataframe
    - FOLDER_NAME: name of the folder (inside '../Data/') in which the txt file is
    - TXT_FILE_NAME: text file in which each line is formatted as 'key: value'
    - keys_list: ordered list of keys used as columns; a line whose key equals
      the last element of this list closes the current record
    The function returns a dataframe with one row per closed record and
    keys_list as columns. Values are kept as stripped strings; a key that did
    not appear in a record is left as None.
    """
    source_path = '../Data/' + FOLDER_NAME + '/' + TXT_FILE_NAME

    records = []
    current = dict.fromkeys(keys_list)

    with open(source_path, 'r', encoding="utf8") as handle:
        for raw_line in handle:
            # drop non-breaking spaces, then cut at the first ":" only so
            # that colons inside the value are preserved
            parts = raw_line.replace('\xa0', '').split(':', 1)
            key = parts[0]

            # lines without a ":" carry no value
            if len(parts) > 1:
                current[key] = parts[1].strip()

            # the last expected key marks the end of the record
            if key == keys_list[-1]:
                records.append(current)
                # start a fresh record for the following lines
                current = dict.fromkeys(keys_list)

    # columns=keys_list fixes the column order and drops unexpected keys
    return pd.DataFrame(records, columns=keys_list)

In [5]:
def gunzip_folder(FOLDER_NAME):
    """
    This function packs every csv file of '../Data/<FOLDER_NAME>' into the
    gzip-compressed archive '../Data/<FOLDER_NAME>_CSV.tar.gz' and then deletes
    the source folder with all its files
    - FOLDER_NAME: name of the folder (inside '../Data/') holding the csv files
    The function returns nothing, it only creates the archive, prints one line
    per archived file and deletes the source folder.
    """
    # define the path of the source folder and of the archive (without extension)
    SOURCE = '../Data/' + FOLDER_NAME
    DESTINATION = SOURCE + '_CSV'
    # this is the extension you want to detect
    extension = '.csv'

    # list all csv files directly inside the source folder
    csv_files = [name for name in os.listdir(SOURCE) if name.endswith(extension)]

    # the context manager finalises and closes the archive even on error
    # NOTE(review): members are added under the path given to add(), i.e.
    # including the leading '../Data/<FOLDER_NAME>/' - kept as is because
    # consumers of the archive may rely on this layout; confirm before changing
    with tarfile.open(DESTINATION + ".tar.gz", "w:gz") as folder:
        for name in csv_files:
            folder.add(SOURCE + '/' + name)
            print('copied and compressed', SOURCE + '/' + name)

    # delete the source only once the archive is completely written and
    # closed, so a failure while archiving never loses the data
    shutil.rmtree(SOURCE)

In [6]:
"""
RateBeer pre-processing
    1. Extract the RateBeer archive
    2. Decompress the txt files (ratings.txt and reviews.txt)
    3. Parse the txt files and save them as csv files
    4. Pack all the csv files into RateBeer_CSV.tar.gz
"""

# column names of one record; the last one ('text') closes the record
reviews_header = [
    'beer_name', 'beer_id', 'brewery_name', 'brewery_id', 'style', 'abv',
    'date', 'user_name', 'user_id', 'appearance', 'aroma', 'palate',
    'taste', 'overall', 'rating', 'text',
]
# the ratings file is parsed with the very same columns
ratings_header = list(reviews_header)

# steps 1 and 2: make the raw txt files available
ungzip_folder("RateBeer")
ungzip_txt_file("RateBeer", "ratings.txt")
ungzip_txt_file("RateBeer", "reviews.txt")

# step 3: txt -> dataframe -> csv
df_reviews_RB = txt_to_dataframe("RateBeer", "reviews.txt", reviews_header)
df_ratings_RB = txt_to_dataframe("RateBeer", "ratings.txt", ratings_header)

df_reviews_RB.to_csv('../Data/RateBeer/reviews.csv', index=None)
df_ratings_RB.to_csv('../Data/RateBeer/ratings.csv', index=None)

# step 4: archive the csv files (this also deletes ../Data/RateBeer)
gunzip_folder('RateBeer')

copied and compressed ../Data/RateBeer/beers.csv
copied and compressed ../Data/RateBeer/breweries.csv
copied and compressed ../Data/RateBeer/reviews.csv
copied and compressed ../Data/RateBeer/users.csv
copied and compressed ../Data/RateBeer/ratings.csv


In [7]:
"""
BeerAdvocate pre-processing
    1. Extract the BeerAdvocate archive
    2. Decompress the txt files (ratings.txt and reviews.txt)
    3. Parse the txt files and save them as csv files
    4. Pack all the csv files into BeerAdvocate_CSV.tar.gz
"""

# steps 1 and 2: make the raw txt files available
ungzip_folder("BeerAdvocate")
ungzip_txt_file("BeerAdvocate", "ratings.txt")
ungzip_txt_file("BeerAdvocate", "reviews.txt")

# step 3: txt -> dataframe -> csv
# (reviews_header and ratings_header come from the RateBeer cell)
df_reviews_BA = txt_to_dataframe("BeerAdvocate", "reviews.txt", reviews_header)
df_ratings_BA = txt_to_dataframe("BeerAdvocate", "ratings.txt", ratings_header)

df_reviews_BA.to_csv('../Data/BeerAdvocate/reviews.csv', index=None)
df_ratings_BA.to_csv('../Data/BeerAdvocate/ratings.csv', index=None)

# step 4: archive the csv files (this also deletes ../Data/BeerAdvocate)
gunzip_folder('BeerAdvocate')

copied and compressed ../Data/BeerAdvocate/beers.csv
copied and compressed ../Data/BeerAdvocate/breweries.csv
copied and compressed ../Data/BeerAdvocate/reviews.csv
copied and compressed ../Data/BeerAdvocate/users.csv
copied and compressed ../Data/BeerAdvocate/ratings.csv
