In [6]:
import pandas as pd
import csv
import os

from beerdata_loader import BeerDataLoader

# BeerAdvocate

**Loading the data**

In [7]:
# Build a loader for the BeerAdvocate directory; force_process=False is passed
# through unchanged (the loader reports that existing processed files are skipped).
data_loader = BeerDataLoader(data_dir="../data/BeerAdvocate", force_process=False)

# Unpack the five frames returned by load_all_data() into ba_* names.
(
    ba_reviews_df,
    ba_ratings_df,
    ba_beers_df,
    ba_breweries_df,
    ba_users_df,
) = data_loader.load_all_data()

# Preview each frame: heading on one line, then the head() of the frame.
# Every heading after the first starts with a newline to leave a blank line.
print("Reviews DataFrame:", ba_reviews_df.head(), sep="\n")
print("\nRatings DataFrame:", ba_ratings_df.head(), sep="\n")
print("\nBeers DataFrame:", ba_beers_df.head(), sep="\n")
print("\nBreweries DataFrame:", ba_breweries_df.head(), sep="\n")
print("\nUsers DataFrame:", ba_users_df.head(), sep="\n")

Processed file '../data/BeerAdvocate\reviews_processed.csv' already exists. Skipping processing.
Processed file '../data/BeerAdvocate\ratings_processed.csv' already exists. Skipping processing.
Reviews DataFrame:
       beer_name  beer_id                               brewery_name  \
0          Régab   142544  Societe des Brasseries du Gabon (SOBRAGA)   
1  Barelegs Brew    19590       Strangford Lough Brewing Company Ltd   
2  Barelegs Brew    19590       Strangford Lough Brewing Company Ltd   
3  Barelegs Brew    19590       Strangford Lough Brewing Company Ltd   
4  Barelegs Brew    19590       Strangford Lough Brewing Company Ltd   

   brewery_id             style  abv        date        user_name  \
0       37262   Euro Pale Lager  4.5  1440064800          nmann08   
1       10093  English Pale Ale  4.5  1235127600      StJamesGate   
2       10093  English Pale Ale  4.5  1142247600          mdagnew   
3       10093  English Pale Ale  4.5  1101898800  helloloser12345   
4       1

In [9]:
# Function to describe and explore each dataset
def explore_data(df, df_name):
    """Print an exploratory overview of ``df`` under the banner ``df_name``.

    Sections are printed in this order: first rows, ``df.info()``, summary
    statistics for numerical columns, summary statistics for object/category
    columns (only when such columns exist), missing-value counts per column,
    and unique-value counts per column.

    Args:
        df: The pandas DataFrame to inspect. It is only read here.
        df_name: Label shown in the section banner.

    Returns:
        None. All results are written to standard output.
    """
    print(f"\n--- {df_name} ---\n")

    # Show the first few rows
    print("First few rows:")
    print(df.head())

    # Show basic information (column types, non-null counts, memory usage)
    print("\nDataFrame info:")
    df.info()

    # Describe numerical columns.
    # Guarded because describe() raises ValueError on a frame without columns,
    # and on a frame with no numeric columns it falls back to describing every
    # column, which would put non-numerical statistics under this heading.
    print("\nNumerical Data Summary:")
    has_numeric_columns = df.select_dtypes(include="number").shape[1] > 0
    if has_numeric_columns:
        print(df.describe())
    else:
        print("No numerical columns to summarise.")

    # Separate handling for object and category columns
    non_numerical_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(non_numerical_cols) > 0:
        print("\nNon-Numerical Data Summary (objects and categories):")
        print(df[non_numerical_cols].describe())

    # Check for missing values
    print("\nMissing Values:")
    print(df.isnull().sum())

    # Check the number of unique values for each column
    print("\nUnique values per column:")
    print(df.nunique())

# Run the exploration helper over every BeerAdvocate frame, in the same
# order and with the same labels as before.
for ba_frame, ba_label in (
    (ba_reviews_df, "Reviews DataFrame"),
    (ba_ratings_df, "Ratings DataFrame"),
    (ba_beers_df, "Beers DataFrame"),
    (ba_breweries_df, "Breweries DataFrame"),
    (ba_users_df, "Users DataFrame"),
):
    explore_data(ba_frame, ba_label)



--- Reviews DataFrame ---

First few rows:
       beer_name  beer_id                               brewery_name  \
0          Régab   142544  Societe des Brasseries du Gabon (SOBRAGA)   
1  Barelegs Brew    19590       Strangford Lough Brewing Company Ltd   
2  Barelegs Brew    19590       Strangford Lough Brewing Company Ltd   
3  Barelegs Brew    19590       Strangford Lough Brewing Company Ltd   
4  Barelegs Brew    19590       Strangford Lough Brewing Company Ltd   

   brewery_id             style  abv        date        user_name  \
0       37262   Euro Pale Lager  4.5  1440064800          nmann08   
1       10093  English Pale Ale  4.5  1235127600      StJamesGate   
2       10093  English Pale Ale  4.5  1142247600          mdagnew   
3       10093  English Pale Ale  4.5  1101898800  helloloser12345   
4       10093  English Pale Ale  4.5  1093860000       cypressbob   

                 user_id  appearance  aroma  palate  taste  overall  rating  \
0         nmann08.184925     

In [3]:
# Rebind data_loader to the RateBeer directory; force_process=False is passed
# through unchanged (the loader reports that existing processed files are skipped).
data_loader = BeerDataLoader(data_dir="../data/RateBeer", force_process=False)

# Unpack the five frames returned by load_all_data() into rb_* names.
(
    rb_reviews_df,
    rb_ratings_df,
    rb_beers_df,
    rb_breweries_df,
    rb_users_df,
) = data_loader.load_all_data()

# Preview each frame: heading on one line, then the head() of the frame.
# Every heading after the first starts with a newline to leave a blank line.
print("Reviews DataFrame:", rb_reviews_df.head(), sep="\n")
print("\nRatings DataFrame:", rb_ratings_df.head(), sep="\n")
print("\nBeers DataFrame:", rb_beers_df.head(), sep="\n")
print("\nBreweries DataFrame:", rb_breweries_df.head(), sep="\n")
print("\nUsers DataFrame:", rb_users_df.head(), sep="\n")

Processed file '../data/RateBeer\reviews_processed.csv' already exists. Skipping processing.
Processed file '../data/RateBeer\ratings_processed.csv' already exists. Skipping processing.
Reviews DataFrame:
             beer_name  beer_id brewery_name  brewery_id       style  abv  \
0    33 Export (Gabon)   410549      Sobraga        3198  Pale Lager  5.0   
1  Castel Beer (Gabon)   105273      Sobraga        3198  Pale Lager  5.2   
2  Castel Beer (Gabon)   105273      Sobraga        3198  Pale Lager  5.2   
3  Castel Beer (Gabon)   105273      Sobraga        3198  Pale Lager  5.2   
4  Castel Beer (Gabon)   105273      Sobraga        3198  Pale Lager  5.2   

         date     user_name  user_id  appearance  aroma  palate  taste  \
0  1461664800       Manslow   175852           2      4       2      4   
1  1487329200  MAGICuenca91   442761           2      3       2      4   
2  1466762400        Sibarh   288889           3      3       2      3   
3  1451646000       fombe89   250510