In [13]:
# Libraries
import os
import tarfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import seaborn as sns

# Directory that holds the project's data files
base_directory = "src/data"

# Dataset archives to be extracted
filenames = [
    "matched_beer_data.tar.gz",
    "BeerAdvocate.tar.gz",
    "RateBeer.tar.gz",
]

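# Archive extraction, currently disabled (commented out): each archive would be
# unpacked into {base_directory}/data_{base_name}.
# Loop through each filename in the list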
# for fname in filenames:
#     # Create the extraction folder named data_{base_name}
#     base_name = os.path.splitext(os.path.splitext(fname)[0])[0]  # Remove both .tar and .gz
#     extract_folder = os.path.join(base_directory, f"data_{base_name}")
    
#     # Create the folder if it does not exist
#     os.makedirs(extract_folder, exist_ok=True)
    
#     # Check the file extension and open the tar file accordingly
#     if fname.endswith("tar.gz"):
#         with tarfile.open(fname, "r:gz") as tar:
#             tar.extractall(path=extract_folder)  
#             print(f"Extracted all contents from {fname} to {extract_folder}")
#     elif fname.endswith("tar"):
#         with tarfile.open(fname, "r:") as tar:
#             tar.extractall(path=extract_folder)
#             print(f"Extracted all contents from {fname} to {extract_folder}")
#     else:
#         print(f"The file {fname} is not a recognized .tar.gz or .tar file.")





In [17]:
def load_txt_data(file_path: str, fields: list[str] = None) -> pd.DataFrame:
    """
    Parse a ``field: value`` text dump into a DataFrame, one row per record.

    A record is a run of ``field: value`` lines. A new record starts every time
    the first field is seen again; blank lines and lines whose field is not in
    ``fields`` are ignored. All values are kept as stripped strings.

    :param file_path: path of the .txt file to parse (read as UTF-8)
    :param fields: field names to extract (text before the first ':' of a line,
        so names must not contain ':'). If None, the fields are detected from
        the first record, i.e. every field seen until one repeats.
    :return: a DataFrame with one row per record and one column per field found;
        empty if the file has no matching lines or no fields are known
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # If fields is None, detect them from the first record.
    if fields is None:
        fields_cpy = []
        for line in lines:
            name, sep, _ = line.partition(':')
            if not sep or not name:
                # blank line, line without a colon, or empty field name
                continue
            if name in fields_cpy:
                # first repeated field: the first record is over
                break
            fields_cpy.append(name)
    else:
        fields_cpy = list(fields)

    records = []
    if not fields_cpy:
        # Nothing to extract (empty file or empty field list).
        return pd.DataFrame(records)

    wanted = set(fields_cpy)
    first_field = fields_cpy[0]
    current = {}

    for line in lines:
        name, sep, value = line.partition(':')
        if not sep or name not in wanted:
            continue

        # The first field showing up again marks the start of a new record.
        if name == first_field and current:
            records.append(current)
            current = {}

        # Take everything after the colon; strip() handles "field: v" and "field:v".
        current[name] = value.strip()

    # Flush the record still being built when the file ends.
    if current:
        records.append(current)

    return pd.DataFrame(records)

# Convert the .txt ratings file to .csv, keeping only the columns we need.

# Columns kept in the CSV, in output order ("text" and "review" are excluded).
columns = [
    'beer_name',
    'beer_id',
    'brewery_name',
    'brewery_id',
    'style',
    'abv',
    'date',
    'user_name',
    'user_id',
    'appearance',
    'aroma',
    'palate',
    'taste',
    'overall',
    'rating',
]

# Source .txt file and destination .csv file
input_file_path = 'src/data/data_matched_beer_data/ratings_with_text_ba.txt'
output_file_path = 'src/data/data_matched_beer_data/ratings_with_text_ba.csv'

def parse_entry(lines):
    """
    Turn the lines of one entry into a ``{key: value}`` dict.

    Each line is split at its first ': '; lines without that separator are
    skipped, as are the 'text' and 'review' keys. Values are stripped, and if
    a key occurs more than once the last occurrence wins.

    :param lines: iterable of strings, one ``key: value`` pair per line
    :return: dict mapping each retained key to its stripped value
    """
    skipped_keys = ('text', 'review')
    pairs = (raw.split(': ', 1) for raw in lines if ': ' in raw)
    return {
        name: content.strip()
        for name, content in pairs
        if name not in skipped_keys
    }

# Parsed entries (one dict each) and the lines of the entry being collected.
data = []
entry_lines = []

with open(input_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        stripped = line.strip()
        if stripped:
            # Still inside an entry: keep collecting its lines.
            entry_lines.append(stripped)
            continue
        if not entry_lines:
            # Blank line with nothing pending.
            continue
        # A blank line closes the entry collected so far.
        data.append(parse_entry(entry_lines))
        entry_lines = []

    # The file may end without a trailing blank line: flush the last entry.
    if entry_lines:
        data.append(parse_entry(entry_lines))

# Keep only the selected columns, in order, and write them out as CSV.
df = pd.DataFrame(data, columns=columns)
df.to_csv(output_file_path, index=False, encoding='utf-8')

print(f"Data saved to {output_file_path}")



Data saved to src/data/data_matched_beer_data/ratings_with_text_ba.csv
