In [1]:
import pandas as pd
from src.data.some_dataloader import *
from collections import Counter

In [2]:
df_ba_ratings, df_rb_ratings = load_rating_data(ba_path="../../data/BeerAdvocate/BA_ratings.csv", rb_path="../../data/RateBeer/RB_ratings.csv")

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
df_ba_ratings.head()

In [None]:
def split_ratings_by_threshold(df, user_id, upper_threshold, lower_threshold):
    """
    Splits the ratings of a user into two DataFrames based on thresholds.
    
    Parameters:
        df (pd.DataFrame): The DataFrame containing the ratings.
        user_id (int or str): The ID of the user whose ratings are to be filtered.
        upper_threshold (float): The threshold above which ratings are considered "good".
        lower_threshold (float): The threshold below which ratings are considered "bad".
        
    Returns:
        tuple: Two DataFrames - one for ratings >= upper_threshold, and one for ratings <= lower_threshold.
    """
    # Filter DataFrame for the specific user
    user_ratings = df[df['user_id'] == user_id]
    
    # Create a DataFrame for ratings >= upper_threshold
    good_ratings = user_ratings[user_ratings['rating'] >= upper_threshold]
    
    # Create a DataFrame for ratings <= lower_threshold
    bad_ratings = user_ratings[user_ratings['rating'] <= lower_threshold]
    
    return good_ratings, bad_ratings

In [ ]:
def count_word_frequencies(df, text_column, word_list):
    """
    Counts the frequencies of words from a given word list in a text column of a DataFrame.
    
    Parameters:
        df (pd.DataFrame): The DataFrame containing the text column.
        text_column (str): The name of the column that contains the text data.
        word_list (list): A list of words whose frequencies should be counted.
    
    Returns:
        dict: A dictionary with words as keys and their frequencies as values.
    """
    # Combine all texts in the column into one large string
    all_text = " ".join(df[text_column].dropna()).lower()  # Convert to lowercase for consistent counting

    # Tokenize the text (split into words)
    words = all_text.split()
    
    # Filter only the words from the provided word list
    filtered_words = [word for word in words if word in word_list]
    
    # Count the frequencies of the filtered words
    word_frequencies = Counter(filtered_words)
    
    # Return the result as a dictionary
    return dict(word_frequencies)

In [None]:
positive_words = [
    "malty", "hoppy", "fruity", "sweet", "smooth", "crisp", "refreshing", "balanced",
    "caramel", "chocolatey", "nutty", "citrusy", "spicy",
    "creamy", "full-bodied", "light", "dry", "velvety",
    "excellent", "amazing", "delicious", "perfect", "great", "fantastic", "lovely",
    "enjoyable", "favorite", "wonderful", 
    "classic", "authentic", "well-crafted", "artisanal", "clean"
]

negative_words = [
    "bitter", "sour", "bland", "stale", "metallic", "burnt", "overpowering", 
    "flat", "watery", "cloying", "harsh", "astringent",
    "thin", "weak", "overly carbonated", "unbalanced",
    "bad", "disappointing", "boring", "unpleasant", "off-putting", "weird", 
    "mediocre", "not great", "subpar", "average",
    "artificial", "generic", "industrial", "chemical"
]

exp_words1 = [
    "Lacing",
    "Ester",
    "Diacetyl",
    "Phenol",
    "Dry Hop",
    "DMS",
    "Oxidation",
    "catty",
    "resinous",
    "astringent",
    "Effervescent",
    "Tannic",
    "Brettanomyces",
    "lactic",
    "autolysis",
    "Krausen",
]

words = list(set(positive_words + negative_words + exp_words1))

In [None]:
print(len(words))

We see that we have a |L| of 78. Let's convert distributions to a latent space with a smaller dimension, so that we can later 

In [ ]:
def pre_stats(df, style, date):
    df = df[df["date"] < date]
    mean = df["rating"].mean()
    style_mean = df[df["style"] == style]["rating"].mean()
    num_ratings = len(df)

In [ ]:
def is_experienced(user_id, exp_user_ids):
    return user_id in exp_user_ids

In [None]:
print(df_ba_ratings["style"].unique())

In [None]:
print(len(df_ba_ratings["style"].unique()))

In [None]:
def get_top_styles(df_ratings, threshold):
    return df_ratings['style'].value_counts().head(threshold)

In [None]:
print(get_top_styles(df_ba_ratings, threshold=50))

In [None]:
get_top_styles(df_ba_ratings, threshold=100)

We see that there is strong drop in the first few beer styles and then it knd of flattens out. The strongest drop can be seen in the first 3 categories. Therefore, we take these beer styles as their own categories, while grouping the others.

In [None]:
import pandas as pd

def categorize_beer_styles(df):
    # Definiere die Kategorien
    categories = {
        "Lager": ["Euro Pale Lager", "German Pilsener", "Munich Helles Lager", "Czech Pilsener", 
                  "Vienna Lager", "Light Lager", "Munich Dunkel Lager", "Schwarzbier", 
                  "Euro Dark Lager", "Märzen / Oktoberfest", "Doppelbock", "Eisbock", 
                  "Maibock / Helles Bock", "Baltic Porter", "Euro Strong Lager"],
        "Ale": ["English Pale Ale", "American Pale Ale (APA)", "English Bitter", "Extra Special / Strong Bitter (ESB)", 
                "Belgian Pale Ale", "Irish Red Ale", "American Amber / Red Ale", "Scottish Ale", 
                "English Brown Ale", "American Brown Ale", "Old Ale", "English Strong Ale", 
                "American Strong Ale", "Scotch Ale / Wee Heavy", "English Barleywine", 
                "American Barleywine", "Belgian Dark Ale", "Belgian Strong Dark Ale", 
                "Quadrupel (Quad)", "Dubbel", "Tripel"],
        "IPA": ["English India Pale Ale (IPA)", "American IPA", "American Double / Imperial IPA", "Belgian IPA"],
        "Stout": ["Irish Dry Stout", "Milk / Sweet Stout", "Oatmeal Stout", "Foreign / Export Stout", 
                  "Russian Imperial Stout", "American Stout", "English Stout"],
        "Porter": ["English Porter", "American Porter", "Baltic Porter"],
        "Wheat Beer": ["Hefeweizen", "Kristalweizen", "Dunkelweizen", "Weizenbock", "Witbier", 
                       "Berliner Weissbier", "Gose", "Roggenbier"],
        "Belgian Styles": ["Saison / Farmhouse Ale", "Bière de Garde", "Lambic - Fruit", 
                            "Lambic - Unblended", "Gueuze", "Faro"],
        "Specialty": ["Smoked Beer", "Herbed / Spiced Beer", "Pumpkin Ale", "Chile Beer", 
                      "Scottish Gruit / Ancient Herbed Ale", "American Wild Ale", "Bière de Champagne / Bière Brut", 
                      "Wheatwine", "Sahti", "Kvass", "Braggot"],
        "Hybrid": ["Kölsch", "Altbier", "California Common / Steam Beer", "Cream Ale"],
        "Light/Low Alcohol": ["Low Alcohol Beer", "American Adjunct Lager", "American Pale Lager", 
                               "Japanese Rice Lager", "Happoshu"]
    }

    # Find the 3 most frequent beer styles
    top_styles = df['style'].value_counts().nlargest(3).index.tolist()

    # Replace beer styles with their categories, except for the top 3 styles
    def map_to_category(style):
        if style in top_styles:
            return style
        for category, styles in categories.items():
            if style in styles:
                return category
        return "Other"  # Catch-all for uncategorized styles

    # Apply the mapping
    df['style'] = df['style'].apply(map_to_category)
    return df

In [None]:
df_ba_ratings_categorized = categorize_beer_styles(df_ba_ratings)

In [None]:
df_ba_ratings_categorized.head()

In [None]:
df_ba_ratings_categorized["style"].value_counts()

# Okay, from here on we start testing the feature extraction in `rating_prediction.py`

In [1]:
from src.models.rating_prediction import *

We calculate the distribution for 77 words


In [2]:
# had a problem that I picked a guy with only NaN texts so here we go
df_valid_texts = df_ba_ratings[df_ba_ratings["text"].notna()]
user_rating_counts = df_valid_texts["user_id"].value_counts()
user_ids = user_rating_counts[user_rating_counts > 100].index.tolist()
print(len(user_ids))

4507


In [3]:
import random
random.seed(42)

random_user_id = random.choice(user_ids)
print(random_user_id)

deebo.102660


In [4]:
print(len(df_ba_ratings[df_ba_ratings["user_id"] == random_user_id]))

927


In [5]:
beer_ids = list(df_ba_ratings[df_ba_ratings["user_id"] == random_user_id]["beer_id"])

In [6]:
from src.models.experience_words import  *
exp_user_ids_ba = get_experienced_users2(df_ba_ratings, exp_words1)

In [7]:
user_stats, beer_stats = init_features(df_ba_ratings, [random_user_id], beer_ids, exp_user_ids_ba, words, lower_threshold=2.7, upper_threshold=3.8)

In [8]:
print(user_stats)

{'deebo.102660': {'good_distr': [0.02516940948693127, 0.0968054211035818, 0, 0, 0, 0.05517909002904162, 0.006776379477250726, 0, 0.008712487899322363, 0.001936108422071636, 0.030977734753146177, 0, 0.00968054211035818, 0.000968054211035818, 0, 0.002904162633107454, 0.027105517909002903, 0.002904162633107454, 0, 0.007744433688286544, 0, 0, 0, 0, 0, 0.008712487899322363, 0.007744433688286544, 0.001936108422071636, 0.02904162633107454, 0, 0.030009680542110357, 0, 0.03969022265246854, 0.005808325266214908, 0, 0.015488867376573089, 0.005808325266214908, 0.005808325266214908, 0.000968054211035818, 0.05227492739593417, 0, 0, 0, 0.0590513068731849, 0.006776379477250726, 0.005808325266214908, 0, 0.030009680542110357, 0, 0, 0.026137463697967087, 0.001936108422071636, 0.05227492739593417, 0, 0.007744433688286544, 0, 0, 0.05808325266214908, 0.023233301064859633, 0, 0, 0.02032913843175218, 0.00484027105517909, 0.12778315585672798, 0.04549854791868345, 0.000968054211035818, 0, 0, 0.00193610842207163

In [9]:
print(beer_stats)

{12719: {'distr': [0.026767676767676767, 0.16717171717171717, 0, 0, 0, 0.021717171717171718, 0.007575757575757576, 0.000505050505050505, 0.0065656565656565654, 0.00808080808080808, 0.006060606060606061, 0.007070707070707071, 0.008585858585858586, 0, 0.000505050505050505, 0.0065656565656565654, 0.005050505050505051, 0.000505050505050505, 0, 0.006060606060606061, 0.00202020202020202, 0.000505050505050505, 0, 0, 0.00202020202020202, 0.012626262626262626, 0.005050505050505051, 0.004545454545454545, 0.01616161616161616, 0.0015151515151515152, 0.012626262626262626, 0.000505050505050505, 0.04242424242424243, 0, 0.000505050505050505, 0.07474747474747474, 0.012121212121212121, 0.004545454545454545, 0.0015151515151515152, 0.047474747474747475, 0, 0, 0, 0.00909090909090909, 0.011111111111111112, 0.006060606060606061, 0, 0.04090909090909091, 0, 0, 0.007070707070707071, 0.0025252525252525255, 0.016666666666666666, 0, 0.000505050505050505, 0.007575757575757576, 0, 0.07121212121212121, 0.006565656565

In [10]:
user_rows = df_ba_ratings[df_ba_ratings["user_id"] == random_user_id]
random_index = random.choice(user_rows.index)
print(random_index)

296536


In [11]:
features = get_features(random_index, user_stats, beer_stats)

Distance:  296536    1887.08679
Name: distance_user_brewery, dtype: float64


In [12]:
print(features)

[3.605017301038062, 4.177142857142857, 289, 1, 296536    1887.08679
Name: distance_user_brewery, dtype: float64, 4.318389408719889, 0.02516940948693127, 0.0968054211035818, 0, 0, 0, 0.05517909002904162, 0.006776379477250726, 0, 0.008712487899322363, 0.001936108422071636, 0.030977734753146177, 0, 0.00968054211035818, 0.000968054211035818, 0, 0.002904162633107454, 0.027105517909002903, 0.002904162633107454, 0, 0.007744433688286544, 0, 0, 0, 0, 0, 0.008712487899322363, 0.007744433688286544, 0.001936108422071636, 0.02904162633107454, 0, 0.030009680542110357, 0, 0.03969022265246854, 0.005808325266214908, 0, 0.015488867376573089, 0.005808325266214908, 0.005808325266214908, 0.000968054211035818, 0.05227492739593417, 0, 0, 0, 0.0590513068731849, 0.006776379477250726, 0.005808325266214908, 0, 0.030009680542110357, 0, 0, 0.026137463697967087, 0.001936108422071636, 0.05227492739593417, 0, 0.007744433688286544, 0, 0, 0.05808325266214908, 0.023233301064859633, 0, 0, 0.02032913843175218, 0.004840271