In [2]:
import os 
import re 
import pandas as pd 

# Specify the directory containing the text files
directory = 'dataset_BeerReviews/RateBeer/rates_comp/'

# Initialize an empty list to store DataFrames
dfs = []

# Loop through the files in reverse order (ratings-5.txt to ratings-1.txt)
for i in range(2, 0, -1):
    file_name = f'ratings-{i}.txt'
    file_path = os.path.join(directory, file_name)

    with open(file_path, 'r') as f:
        text = f.read()

    # Remove double quotes at the beginning of each line
    data = re.sub(r'^"', '', text, flags=re.MULTILINE)

    # Split the text into individual beer reviews
    beer_reviews = data.split('beer_name')

    # Extract the beer information from each review
    beer_data = []
    for review in beer_reviews:
        beer_info = {}
        for line in [entry.split(':', 1) for entry in review.split('\n') if ':' in entry and 'text' not in entry]:
            if line:  # Check if the list is not empty
                key, value = line[0].strip(), line[1].strip()
                beer_info[key] = value
        beer_data.append(beer_info)

    # Convert the beer data into a DataFrame
    df = pd.DataFrame(beer_data)

    # Append the DataFrame to the list
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
final_df = pd.concat(dfs, ignore_index=True)

# Rename the columns
final_df.columns = [
    'beer_name',
    'beer_id',
    'brewery_name',
    'brewery_id',
    'style',
    'abv',
    'date',
    'user_name',
    'user_id',
    'appearance',
    'aroma',
    'palate',
    'taste',
    'overall',
    'rating'
]

# Display the resulting DataFrame
print(final_df.sample(5), final_df.shape, final_df.columns)

                                            beer_name      beer_id  \
227774                         Fanø Julebryg (-2008)"  1197716400"   
716427                         Twin Sails High Socks"  1499421600"   
480560  Alexander Keiths Nova Scotia Style Brown Ale"  1307440800"   
708092                     Spinnakers Roundhouse Red"  1400925600"   
634375                               Unibroue Maudite   1474797600   

           brewery_name brewery_id style abv date user_name user_id  \
227774  HenrikSoegaard"     17613"    3"  6"   3"        7"     13"   
716427    JulienHuxley"     79809"    3"  8"   3"        7"     15"   
480560      ChadPolenz"     99523"    4"  7"   3"        7"     14"   
708092        Lagerboy"     68142"    3"  5"   3"        6"     11"   
634375           gpekar      60653     4   6    3         7      14   

       appearance    aroma                             palate   taste  \
227774      3.20"   70347"                      Fanø Bryghus"   7707"   
716427