In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm as tqdm
import os

In [2]:
def parse_ratings_file(file_path, encoding='utf-8'):
    """Parse a blank-line-delimited ``key: value`` text dump into a DataFrame.

    Each record is a run of consecutive non-empty lines; one or more blank
    lines end the record. A line containing ``': '`` is split at the first
    occurrence into a key and a value. Any other non-empty line is treated as
    a continuation and appended (space-separated) to the record's ``'text'``
    field, which is created if it does not exist yet.

    Parameters:
    - file_path (str): Path of the text file to read.
    - encoding (str): Text encoding used to decode the file. Defaults to
      UTF-8 so the result does not depend on the platform locale.

    Returns:
    - pandas.DataFrame: One row per record, one column per key seen. Every
      value is kept as a string; keys absent from a record are NaN.

    Note:
    - A field with an empty value (``"text: "``) is stored as an empty
      string. As a consequence, a continuation line that itself ends in
      ``": "`` is read as an empty field rather than as text.
    """
    records = []
    current_review = {}

    with open(file_path, 'r', encoding=encoding) as file:
        for raw_line in file:
            # Strip only the line terminator and leading whitespace here:
            # stripping trailing spaces first would turn "text: " into
            # "text:", which no longer contains the ': ' delimiter.
            line = raw_line.rstrip('\r\n').lstrip()

            if not line.strip():
                # Blank line: close the record in progress (if any).
                if current_review:
                    records.append(current_review)
                current_review = {}
                continue

            key, delimiter, value = line.partition(': ')
            if delimiter:
                # Split at the first ': ' only, so values may contain ': '.
                current_review[key] = value.rstrip()
            else:
                # No delimiter: treat the line as more review text.
                extra_text = line.rstrip()
                if 'text' in current_review:
                    current_review['text'] += ' ' + extra_text
                else:
                    current_review['text'] = extra_text

    # The file may end without a trailing blank line.
    if current_review:
        records.append(current_review)

    return pd.DataFrame(records)

In [4]:
# Load the beers / breweries / users tables of both sites.
BA_path = 'baseData/BeerAdvocate/'
RB_path = 'baseData/RateBeer/'

# Each generator yields the three frames in the order of the names on the
# left-hand side of the unpacking assignment.
BA_beers, BA_breweries, BA_users = (
    pd.read_csv(BA_path + csv_name)
    for csv_name in ('beers.csv', 'breweries.csv', 'users.csv')
)

RB_beers, RB_breweries, RB_users = (
    pd.read_csv(RB_path + csv_name)
    for csv_name in ('beers.csv', 'breweries.csv', 'users.csv')
)


In [5]:
# Only the RateBeer reviews CSV is read in this cell; the three other loads
# below are kept disabled.
#BA_ratings = pd.read_csv(BA_path + 'ratings.csv', low_memory=False)
#BA_reviews = pd.read_csv(BA_path + 'reviews.csv', low_memory=False)
#RB_ratings = pd.read_csv(RB_path + 'ratings.csv', low_memory=False)

RB_reviews = pd.read_csv(f'{RB_path}reviews.csv', low_memory=False)

In [2]:
# Column -> dtype maps for the BeerAdvocate tables.

dtype_BA_beers = dict(
    beer_id='int64',
    beer_name='string',
    brewery_id='int64',
    brewery_name='string',
    style='string',
    nbr_ratings='int64',
    nbr_reviews='int64',
    avg='float64',
    ba_score='float64',
    bros_score='float64',
    abv='float64',
    avg_computed='float64',
    zscore='float64',
    nbr_matched_valid_ratings='int64',
    avg_matched_valid_ratings='float64',
)

dtype_BA_breweries = dict(
    id='int64',
    location='string',
    name='string',
    nbr_beers='int64',
)

dtype_BA_reviews = {
    # Index column written by an earlier export; to be removed once the
    # four CSVs are regenerated.
    'Unnamed: 0': 'int64',
    'beer_name': 'string',
    'beer_id': 'int64',
    'brewery_name': 'string',
    'brewery_id': 'int64',
    'style': 'string',
    'abv': 'float64',
    'date': 'int64',
    'user_name': 'string',
    # 'user_name' + '.' + an integer; kept as a string so it is easier to
    # modify.
    'user_id': 'string',
    'appearance': 'float64',
    'aroma': 'float64',
    'palate': 'float64',
    'taste': 'float64',
    'overall': 'float64',
    'rating': 'float64',
    'text': 'string',
}

# The ratings table has the same columns as the reviews table plus a trailing
# True/False 'review' flag.
dtype_BA_ratings = {**dtype_BA_reviews, 'review': 'bool'}

dtype_BA_users = dict(
    nbr_ratings='int64',
    nbr_reviews='int64',
    user_id='string',  # 'user_name' + '.' + an integer, see dtype_BA_reviews
    user_name='string',
    joined='float64',
    location='string',
)



# Column -> dtype maps for the RateBeer tables.

dtype_RB_beers = {
    'beer_id': 'int64',
    'beer_name': 'string',
    'brewery_id': 'int64',
    'brewery_name': 'string',
    'style': 'string',
    'nbr_ratings': 'int64',
    'overall_score': 'float64',
    'style_score': 'float64',
    'avg': 'float64',
    'abv': 'float64',
    'avg_computed': 'float64',
    'zscore': 'float64',
    'nbr_matched_valid_ratings': 'int64',
    'avg_matched_valid_ratings': 'float64'
}

dtype_RB_breweries = {
    'id': 'int64',
    'location': 'string',
    'name': 'string',
    'nbr_beers': 'int64'
}

dtype_RB_reviews = {
    # Index column written by an earlier export; to be removed once the
    # four CSVs are regenerated.
    'Unnamed: 0': 'int64',
    'beer_name': 'string',
    'beer_id': 'int64',
    'brewery_name': 'string',
    'brewery_id': 'int64',
    'style': 'string',
    'abv': 'float64',
    'date': 'int64',
    'user_name': 'string',
    'user_id': 'int64',
    'appearance': 'int64',
    'aroma': 'int64',
    'palate': 'int64',
    'taste': 'int64',
    'overall': 'int64',
    'rating': 'float64',
    'text': 'string',
}

# The ratings table is declared with exactly the same columns and dtypes as
# the reviews table; an independent copy is made so that editing one map
# never changes the other.
dtype_RB_ratings = dict(dtype_RB_reviews)

# Original (singular) name, kept as an alias so existing references still
# work; the plural form matches dtype_BA_ratings.
dtype_RB_rating = dtype_RB_ratings

dtype_RB_users = {
    'nbr_ratings': 'int64',
    'user_id': 'int64',     # integer here, unlike the string user_id used for BeerAdvocate
    'user_name': 'string',
    'joined': 'float64',
    'location': 'string'
}

In [6]:
# Print the column dtypes and memory usage of the RateBeer reviews frame.
RB_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7122074 entries, 0 to 7122073
Data columns (total 16 columns):
 #   Column        Dtype  
---  ------        -----  
 0   beer_name     object 
 1   beer_id       int64  
 2   brewery_name  object 
 3   brewery_id    int64  
 4   style         object 
 5   abv           float64
 6   date          int64  
 7   user_name     object 
 8   user_id       int64  
 9   appearance    int64  
 10  aroma         int64  
 11  palate        int64  
 12  taste         int64  
 13  overall       int64  
 14  rating        float64
 15  text          object 
dtypes: float64(2), int64(9), object(5)
memory usage: 869.4+ MB


In [6]:
# Look up a brewery by exact name; this name holds a raw non-ASCII character.
filtered_rows = BA_breweries.loc[BA_breweries['name'].eq('Browar Jabłonowo S.C.')]
print("Rows with exact match:\n", filtered_rows)

Rows with exact match:
         id location                   name  nbr_beers
7658  5508   Poland  Browar Jabłonowo S.C.         12


In [7]:
# Search for names holding the literal HTML numeric entity "&#322" instead of
# the decoded character. regex=False makes this a plain substring search, so
# the pattern is never interpreted as a regular expression; na=False treats
# missing names as non-matching.
filtered_rows_with_contains = BA_breweries[
    BA_breweries['name'].str.contains("Browar Jagie&#322", na=False, regex=False)
]
print("\nRows with str.contains match:\n", filtered_rows_with_contains)


Rows with str.contains match:
          id location                       name  nbr_beers
7659  23473   Poland  Browar Jagie&#322;&#322;o         12


In [47]:
# Parse the BeerAdvocate text dumps; the progress message is printed only
# after the ratings file has been parsed.
BA_ratings = parse_ratings_file(f'{BA_path}ratings.txt')
print('ratings Done')
BA_reviews = parse_ratings_file(f'{BA_path}reviews.txt')

ratings Done


In [42]:
# Flag columns that are mostly one given value or mostly missing (nothing is
# dropped here; see the callers).
def columns_to_remove(data, value, threshold):
    """
    List the columns dominated by `value` or by missing entries.

    Parameters:
    - data (pd.DataFrame): Frame whose columns are inspected.
    - value: Value compared against every cell with `==`.
    - threshold (float): Fraction of rows that must be strictly exceeded.

    Returns:
    - list: Column labels, in frame order, for which the share of cells
      equal to `value` or the share of NaN cells is greater than `threshold`.
      The two shares are tested separately; they are not added together.
    """
    return [
        label
        for label in data.columns
        if (data[label] == value).mean() > threshold
        or data[label].isna().mean() > threshold
    ]

def drop_columns(data, cols_to_remove, save_path):
    """
    Write `data` to CSV without the listed columns.

    Parameters:
    - data (pd.DataFrame): Source frame; it is not modified.
    - cols_to_remove (list): Column labels to leave out. Labels that are not
      present in `data` are ignored.
    - save_path (str): Destination of the CSV file, written without the index.
    """
    data.drop(columns=cols_to_remove, errors='ignore').to_csv(save_path, index=False)

In [40]:
#allows application of multiple values
def column_delete_save(data, values, save_path, threshold):
    """
    Drop columns dominated by any of `values` (or by NaN) and save the result.

    For each entry of `values` in turn, the columns flagged by
    `columns_to_remove` on the frame reduced so far are printed and then
    dropped. The final frame is written to `save_path` as CSV without the
    index.

    Parameters:
    - data (pd.DataFrame): Source frame; it is not modified.
    - values (iterable): Values to test one after another.
    - save_path (str): Destination of the CSV file.
    - threshold (float): Fraction passed through to `columns_to_remove`.
    """
    # No defensive copy: DataFrame.drop returns a new frame, so rebinding the
    # local name below never touches the caller's object. This avoids
    # duplicating a large input in memory.
    data_new = data

    for val in values:
        cols_remove = columns_to_remove(data_new, val, threshold)
        print(cols_remove)
        data_new = data_new.drop(columns=cols_remove, errors='ignore')

    data_new.to_csv(save_path, index=False)

In [43]:
#Test with simple df
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
data = {
    'A': [1, 2, 3, 4, np.nan],                        # one missing value
    'B': [0, 0, 0, 0, 0],                             # all zeros
    'C': [1, 2, 1, 2, 1],
    'D': [np.nan, np.nan, np.nan, np.nan, np.nan],    # entirely missing
    'E': [5, 6, 7, 8, 9]
    }

df = pd.DataFrame(data)

print(df.head())

print(columns_to_remove(df, 0, 0.8))
# The string 'NaN' equals no cell of this frame; 'D' is still reported
# because columns_to_remove also checks isna() for every column.
print(columns_to_remove(df, 'NaN', 0.8))

column_delete_save(df, [0], 'test.csv', 0.8)

     A  B  C   D  E
0  1.0  0  1 NaN  5
1  2.0  0  2 NaN  6
2  3.0  0  1 NaN  7
3  4.0  0  2 NaN  8
4  NaN  0  1 NaN  9
['B', 'D']
['D']
['B', 'D']


In [44]:
# Helper used to prepare the output folders.
def create_directory(directory_path):
    """
    Ensure a directory exists and report the outcome on stdout.

    Missing parent directories are created as well. The success message is
    printed whether the directory was newly created or already existed. Any
    exception raised while creating it is reported with a printed message
    instead of being propagated.

    Parameters:
    - directory_path (str): The path of the directory to create.
    """
    try:
        # exist_ok=True: an already existing directory is not an error.
        os.makedirs(directory_path, exist_ok=True)
        outcome = f"Directory '{directory_path}' created successfully."
    except Exception as e:
        outcome = f"Error creating directory: {e}"
    print(outcome)

# Output tree: the root folder first, then one sub-folder per site.
for new_dir in ('modData', 'modData/BeerAdvocate', 'modData/RateBeer'):
    create_directory(new_dir)

Directory 'modData' created successfully.
Directory 'modData/BeerAdvocate' created successfully.
Directory 'modData/RateBeer' created successfully.


In [46]:
# Save each metadata table without the columns that are more than 80 % zeros
# or more than 80 % NaN. Order is kept so the printed lists line up with the
# tables: BeerAdvocate first, then RateBeer.
for table, out_path in (
    (BA_beers, 'modData/BeerAdvocate/beers.csv'),
    (BA_breweries, 'modData/BeerAdvocate/breweries.csv'),
    (BA_users, 'modData/BeerAdvocate/users.csv'),
    (RB_beers, 'modData/RateBeer/beers.csv'),
    (RB_breweries, 'modData/RateBeer/breweries.csv'),
    (RB_users, 'modData/RateBeer/users.csv'),
):
    column_delete_save(table, [0], out_path, 0.8)


['bros_score', 'zscore', 'nbr_matched_valid_ratings', 'avg_matched_valid_ratings']
[]
[]
['zscore', 'nbr_matched_valid_ratings', 'avg_matched_valid_ratings']
[]
[]
