In [5]:
import pandas as pd
from src.data.some_dataloader import *
from collections import Counter

In [None]:
df_ba_ratings, df_rb_ratings = load_rating_data(ba_path="../../data/BeerAdvocate/BA_ratings.csv", rb_path="../../data/RateBeer/RB_ratings.csv")

In [3]:
df_ba_ratings.head()

Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,abv,date,user_name,user_id,appearance,aroma,palate,taste,overall,rating,text,review
0,Régab,142544,Societe des Brasseries du Gabon (SOBRAGA),37262,Euro Pale Lager,4.5,1440064800,nmann08,nmann08.184925,3.25,2.75,3.25,2.75,3.0,2.88,"From a bottle, pours a piss yellow color with ...",True
1,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,1235127600,StJamesGate,stjamesgate.163714,3.0,3.5,3.5,4.0,3.5,3.67,Pours pale copper with a thin head that quickl...,True
2,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,1142247600,mdagnew,mdagnew.19527,4.0,3.5,3.5,4.0,3.5,3.73,"500ml Bottle bought from The Vintage, Antrim.....",True
3,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,1101898800,helloloser12345,helloloser12345.10867,4.0,3.5,4.0,4.0,4.5,3.98,Serving: 500ml brown bottlePour: Good head wit...,True
4,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,1093860000,cypressbob,cypressbob.3708,4.0,4.0,4.0,4.0,4.0,4.0,"500ml bottlePours with a light, slightly hazy ...",True


In [4]:
def split_ratings_by_threshold(df, user_id, upper_threshold, lower_threshold):
    """
    Partition one user's ratings into a "good" and a "bad" subset.

    Parameters:
        df (pd.DataFrame): Ratings table; must provide 'user_id' and 'rating' columns.
        user_id (int or str): Identifier of the user whose rows are selected.
        upper_threshold (float): Ratings at or above this value count as "good".
        lower_threshold (float): Ratings at or below this value count as "bad".

    Returns:
        tuple: (good_ratings, bad_ratings) - the user's rows with
        rating >= upper_threshold, and the user's rows with rating <= lower_threshold.
        Rows strictly between the two thresholds appear in neither frame.
    """
    # Restrict the table to the requested user first
    belongs_to_user = df['user_id'] == user_id
    user_ratings = df.loc[belongs_to_user]

    # Boolean masks for both ends of the rating scale
    is_good = user_ratings['rating'].ge(upper_threshold)
    is_bad = user_ratings['rating'].le(lower_threshold)

    return user_ratings.loc[is_good], user_ratings.loc[is_bad]

In [ ]:
def count_word_frequencies(df, text_column, word_list):
    """
    Counts how often the entries of a word list occur in a text column of a DataFrame.

    Matching is case-insensitive on both sides: the texts and the entries of
    ``word_list`` are lower-cased before comparison. Punctuation at the edges of a
    token is ignored (so "malty," counts as "malty"), while inner characters such as
    the hyphen in "full-bodied" are kept. Entries consisting of several words
    (e.g. "dry hop") are matched against consecutive tokens within a single text.
    A single-word entry is also counted when it occurs inside a longer matched
    phrase (e.g. "dry" inside "dry hop").

    Parameters:
        df (pd.DataFrame): The DataFrame containing the text column.
        text_column (str): The name of the column that contains the text data.
        word_list (list): Words or phrases whose frequencies should be counted.

    Returns:
        dict: Lower-cased words/phrases as keys and their frequencies as values.
        Entries that never occur are omitted.
    """
    # Characters removed from the start/end of each whitespace-separated token
    edge_punctuation = ".,;:!?\"'()[]{}"

    # Group the normalised targets by their number of words so that phrases can be
    # compared against n-grams of the same length; sets give O(1) membership tests.
    targets_by_length = {}
    for entry in word_list:
        parts = entry.lower().split()
        if parts:
            targets_by_length.setdefault(len(parts), set()).add(" ".join(parts))

    frequencies = Counter()
    for text in df[text_column].dropna():
        # Tokenise each text on its own so phrases never span two different texts
        tokens = [token.strip(edge_punctuation) for token in str(text).lower().split()]
        tokens = [token for token in tokens if token]

        for length, targets in targets_by_length.items():
            ngrams = (
                " ".join(tokens[start:start + length])
                for start in range(len(tokens) - length + 1)
            )
            frequencies.update(ngram for ngram in ngrams if ngram in targets)

    # Return the result as a plain dictionary
    return dict(frequencies)

In [6]:
# Descriptors that typically appear in favourable reviews
positive_words = [
    "malty", "hoppy", "fruity", "sweet", "smooth", "crisp", "refreshing", "balanced",
    "caramel", "chocolatey", "nutty", "citrusy", "spicy",
    "creamy", "full-bodied", "light", "dry", "velvety",
    "excellent", "amazing", "delicious", "perfect", "great", "fantastic", "lovely",
    "enjoyable", "favorite", "wonderful", 
    "classic", "authentic", "well-crafted", "artisanal", "clean"
]

# Descriptors that typically appear in unfavourable reviews
negative_words = [
    "bitter", "sour", "bland", "stale", "metallic", "burnt", "overpowering", 
    "flat", "watery", "cloying", "harsh", "astringent",
    "thin", "weak", "overly carbonated", "unbalanced",
    "bad", "disappointing", "boring", "unpleasant", "off-putting", "weird", 
    "mediocre", "not great", "subpar", "average",
    "artificial", "generic", "industrial", "chemical"
]

# Technical brewing / tasting vocabulary ("astringent" also appears in negative_words)
exp_words1 = [
    "Lacing",
    "Ester",
    "Diacetyl",
    "Phenol",
    "Dry Hop",
    "DMS",
    "Oxidation",
    "catty",
    "resinous",
    "astringent",
    "Effervescent",
    "Tannic",
    "Brettanomyces",
    "lactic",
    "autolysis",
    "Krausen",
]

# Combined vocabulary: lower-cased so it can match lower-cased review text,
# de-duplicated, and sorted so that every word keeps the same position across
# kernel restarts (plain set iteration order is not stable between runs).
words = sorted({word.lower() for word in positive_words + negative_words + exp_words1})

In [7]:
print(len(words))

78


Our vocabulary L has size |L| = 78. Let's convert the word-frequency distributions to a latent space with a smaller dimension, so that we can use them in the later analysis.

In [ ]:
def pre_stats(df, style, date):
    """
    Summarise the ratings recorded strictly before a given date.

    Parameters:
        df (pd.DataFrame): Ratings table with 'date', 'rating' and 'style' columns.
        style (str): Beer style for which a separate mean rating is computed.
        date: Cut-off value; only rows whose 'date' is strictly smaller are used.
            Must be comparable with the values in the 'date' column.

    Returns:
        dict: {
            "mean": mean rating over all earlier rows,
            "style_mean": mean rating over the earlier rows of the given style,
            "num_ratings": number of earlier rows
        }
        The means are NaN when there are no matching rows.
    """
    # Keep the caller's frame untouched; work on the filtered view only
    earlier = df[df["date"] < date]

    mean = earlier["rating"].mean()
    style_mean = earlier[earlier["style"] == style]["rating"].mean()
    num_ratings = len(earlier)

    # The statistics were previously computed and then discarded; hand them back
    return {"mean": mean, "style_mean": style_mean, "num_ratings": num_ratings}

In [ ]:
def is_experienced(user_id, exp_user_ids):
    """
    Tell whether a user belongs to the collection of experienced users.

    Parameters:
        user_id: Identifier to look up.
        exp_user_ids: Container of identifiers supporting the ``in`` operator.

    Returns:
        bool: True if ``user_id`` is contained in ``exp_user_ids``, else False.
    """
    if user_id in exp_user_ids:
        return True
    return False

In [8]:
print(df_ba_ratings["style"].unique())

['Euro Pale Lager' 'English Pale Ale' 'English Bitter'
 'American Pale Wheat Ale' 'American Blonde Ale' 'Irish Red Ale'
 'American Stout' 'American Pale Ale (APA)' 'Milk / Sweet Stout'
 'American Double / Imperial IPA' 'German Pilsener' 'American IPA'
 'Irish Dry Stout' 'Munich Helles Lager' 'English Brown Ale'
 'English India Pale Ale (IPA)' 'English Porter' 'Saison / Farmhouse Ale'
 'Foreign / Export Stout' 'Fruit / Vegetable Beer'
 'American Double / Imperial Stout' 'American Porter'
 'English Dark Mild Ale' 'Berliner Weissbier' 'Hefeweizen' 'English Stout'
 'California Common / Steam Beer' 'American Black Ale'
 'American Amber / Red Ale' 'Tripel' 'Kölsch' 'Altbier' 'Smoked Beer'
 'Bière de Garde' 'Oatmeal Stout' 'Extra Special / Strong Bitter (ESB)'
 'Rye Beer' 'Russian Imperial Stout' 'American Wild Ale' 'Winter Warmer'
 'English Pale Mild Ale' 'Belgian Pale Ale' 'Belgian IPA' 'Czech Pilsener'
 'Belgian Strong Pale Ale' 'Old Ale' 'Dunkelweizen' 'English Strong Ale'
 'Schwarzbier' 

In [13]:
print(len(df_ba_ratings["style"].unique()))

104


In [20]:
def get_top_styles(df_ratings, threshold):
    """
    Return the most frequently rated beer styles.

    Parameters:
        df_ratings (pd.DataFrame): Ratings table with a 'style' column.
        threshold (int): Number of leading styles to keep.

    Returns:
        pd.Series: Rating counts indexed by style, ordered from most to least
        frequent and truncated to the first ``threshold`` entries.
    """
    # value_counts() already orders the styles by descending frequency
    style_counts = df_ratings['style'].value_counts()
    return style_counts.head(threshold)

In [31]:
print(get_top_styles(df_ba_ratings, threshold=50))

style
American IPA                           997814
American Double / Imperial IPA         739982
American Double / Imperial Stout       500265
American Pale Ale (APA)                388139
Saison / Farmhouse Ale                 301953
American Wild Ale                      269132
American Porter                        257758
Russian Imperial Stout                 256330
American Amber / Red Ale               202389
Fruit / Vegetable Beer                 167968
American Brown Ale                     126663
Witbier                                124850
American Stout                         124615
Belgian Strong Dark Ale                123882
American Strong Ale                    123066
Tripel                                 114549
Milk / Sweet Stout                     114248
Belgian Strong Pale Ale                108466
American Adjunct Lager                 107107
American Pale Wheat Ale                106749
Hefeweizen                             100584
American Barleywine         

In [32]:
get_top_styles(df_ba_ratings, threshold=100)

style
American IPA                        997814
American Double / Imperial IPA      739982
American Double / Imperial Stout    500265
American Pale Ale (APA)             388139
Saison / Farmhouse Ale              301953
                                     ...  
Low Alcohol Beer                      3993
Sahti                                 3832
Japanese Rice Lager                   3827
Bière de Champagne / Bière Brut       3348
English Pale Mild Ale                 2443
Name: count, Length: 100, dtype: int64

We see that there is a strong drop over the first few beer styles, after which the counts kind of flatten out. The strongest drop can be seen within the first 3 categories. Therefore, we keep these three beer styles as their own categories, while grouping the others.

In [33]:
import pandas as pd

def categorize_beer_styles(df, n_top=3):
    """
    Map fine-grained beer styles to a small set of broader categories.

    The ``n_top`` most frequent styles keep their original name; every other
    style is replaced by the name of the category it is listed under, or by
    "Other" when it is not listed in any category (this includes missing values).

    The input frame is not modified: a new DataFrame with the replaced 'style'
    column is returned, so the function can safely be called repeatedly on the
    same raw ratings.

    Parameters:
        df (pd.DataFrame): Ratings table with a 'style' column.
        n_top (int): Number of most frequent styles to keep as their own
            category. Defaults to 3.

    Returns:
        pd.DataFrame: Copy of ``df`` whose 'style' column holds the categories.
    """
    # Define the categories
    categories = {
        "Lager": ["Euro Pale Lager", "German Pilsener", "Munich Helles Lager", "Czech Pilsener", 
                  "Vienna Lager", "Light Lager", "Munich Dunkel Lager", "Schwarzbier", 
                  "Euro Dark Lager", "Märzen / Oktoberfest", "Doppelbock", "Eisbock", 
                  "Maibock / Helles Bock", "Baltic Porter", "Euro Strong Lager"],
        "Ale": ["English Pale Ale", "American Pale Ale (APA)", "English Bitter", "Extra Special / Strong Bitter (ESB)", 
                "Belgian Pale Ale", "Irish Red Ale", "American Amber / Red Ale", "Scottish Ale", 
                "English Brown Ale", "American Brown Ale", "Old Ale", "English Strong Ale", 
                "American Strong Ale", "Scotch Ale / Wee Heavy", "English Barleywine", 
                "American Barleywine", "Belgian Dark Ale", "Belgian Strong Dark Ale", 
                "Quadrupel (Quad)", "Dubbel", "Tripel"],
        "IPA": ["English India Pale Ale (IPA)", "American IPA", "American Double / Imperial IPA", "Belgian IPA"],
        "Stout": ["Irish Dry Stout", "Milk / Sweet Stout", "Oatmeal Stout", "Foreign / Export Stout", 
                  "Russian Imperial Stout", "American Stout", "English Stout"],
        "Porter": ["English Porter", "American Porter", "Baltic Porter"],
        "Wheat Beer": ["Hefeweizen", "Kristalweizen", "Dunkelweizen", "Weizenbock", "Witbier", 
                       "Berliner Weissbier", "Gose", "Roggenbier"],
        "Belgian Styles": ["Saison / Farmhouse Ale", "Bière de Garde", "Lambic - Fruit", 
                            "Lambic - Unblended", "Gueuze", "Faro"],
        "Specialty": ["Smoked Beer", "Herbed / Spiced Beer", "Pumpkin Ale", "Chile Beer", 
                      "Scottish Gruit / Ancient Herbed Ale", "American Wild Ale", "Bière de Champagne / Bière Brut", 
                      "Wheatwine", "Sahti", "Kvass", "Braggot"],
        "Hybrid": ["Kölsch", "Altbier", "California Common / Steam Beer", "Cream Ale"],
        "Light/Low Alcohol": ["Low Alcohol Beer", "American Adjunct Lager", "American Pale Lager", 
                               "Japanese Rice Lager", "Happoshu"]
    }

    # Flatten the category lists into one style -> category lookup.
    # setdefault keeps the FIRST category a style is listed under, so a style that
    # appears twice ("Baltic Porter" is in both "Lager" and "Porter") resolves to
    # the earlier category ("Lager").
    style_to_category = {}
    for category, styles in categories.items():
        for style in styles:
            style_to_category.setdefault(style, category)

    # The n_top most frequent beer styles stay their own category
    top_styles = df['style'].value_counts().nlargest(n_top).index.tolist()
    for style in top_styles:
        style_to_category[style] = style

    # One dictionary lookup per row; anything unlisted falls back to "Other"
    categorized = df['style'].map(lambda style: style_to_category.get(style, "Other"))

    # assign() returns a new frame, leaving the caller's DataFrame unchanged
    return df.assign(style=categorized)

In [34]:
df_ba_ratings_categorized = categorize_beer_styles(df_ba_ratings)

In [35]:
df_ba_ratings_categorized.head()

Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,abv,date,user_name,user_id,appearance,aroma,palate,taste,overall,rating,text,review
0,Régab,142544,Societe des Brasseries du Gabon (SOBRAGA),37262,Lager,4.5,1440064800,nmann08,nmann08.184925,3.25,2.75,3.25,2.75,3.0,2.88,"From a bottle, pours a piss yellow color with ...",True
1,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,Ale,4.5,1235127600,StJamesGate,stjamesgate.163714,3.0,3.5,3.5,4.0,3.5,3.67,Pours pale copper with a thin head that quickl...,True
2,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,Ale,4.5,1142247600,mdagnew,mdagnew.19527,4.0,3.5,3.5,4.0,3.5,3.73,"500ml Bottle bought from The Vintage, Antrim.....",True
3,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,Ale,4.5,1101898800,helloloser12345,helloloser12345.10867,4.0,3.5,4.0,4.0,4.5,3.98,Serving: 500ml brown bottlePour: Good head wit...,True
4,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,Ale,4.5,1093860000,cypressbob,cypressbob.3708,4.0,4.0,4.0,4.0,4.0,4.0,"500ml bottlePours with a light, slightly hazy ...",True


In [40]:
df_ba_ratings_categorized["style"].value_counts()

style
Ale                                 1926279
American IPA                         997814
Other                                898540
American Double / Imperial IPA       739982
Stout                                655963
Lager                                629722
American Double / Imperial Stout     500265
Specialty                            496980
Belgian Styles                       428267
Wheat Beer                           390873
Porter                               298594
Light/Low Alcohol                    178592
IPA                                  137468
Hybrid                               113693
Name: count, dtype: int64