# Beer Recommender System
This notebook implements data preprocessing and modeling techniques to create a beer recommender system. I

In [1]:
#Imports
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt





### Preprocessing Functions
These functions clean the dataset by handling duplicates, missing values, and incorrect formats. They prepare the data for splitting and analysis.

In [2]:
def preprocess_data(df):
    df_filtered = df.drop_duplicates(["name", "reviewer", "review_text"]) # Remove duplicate entries
    print("Size after drop_duplicates: ", len(df_filtered))
    
    df_filtered['rating'] = pd.to_numeric(df_filtered['rating'], errors='coerce')  # Set erros to NaN
    df_filtered = df_filtered.dropna(subset=['rating'])  # Drop rows where 'rating' is NaN
    print("Size after drop rating NA: ", len(df_filtered))
    
    df_filtered['abv'] = pd.to_numeric(df_filtered['abv'].str.rstrip('%'), errors='coerce') 
    df_filtered = df_filtered.dropna(subset=['abv'])
    print("Size after drop abv NA: ", len(df_filtered))
    
    return df_filtered

def create_test_train(df, reviewer_col="reviewer", random_state=7, test_size=100, mask_percentage=0.10):
    """
    Splits a dataset into training and test sets, masking a portion of test set entries.
    
    Parameters:
    - df (pd.DataFrame): The dataset to split.
    - reviewer_col (str): The column name containing reviewer IDs.
    - random_state (int): The random state for reproducibility.
    - test_size (int): The number of reviewers to sample for the test set.
    - mask_percentage (float): The percentage of beers to mask for each reviewer in the test set.

    Returns:
    - df_train (pd.DataFrame): The training set.
    - df_test_masked (pd.DataFrame): The test set with masked entries.
    """
    # Randomly sample reviewers
    sampled_reviewers = df[reviewer_col].sample(n=test_size, random_state=random_state)
    
    # Get reviews from the sampled reviewers
    df_test = df[df[reviewer_col].isin(sampled_reviewers)]
    
    # Group by reviewer to get each user's beers
    df_test_grouped = df_test.groupby(reviewer_col)
    
    # Randomly mask a percentage of beers for each reviewer
    test_set_masked = []
    for reviewer, group in df_test_grouped:
        # Calculate how many beers to mask
        num_to_mask = max(int(len(group) * mask_percentage), 1)
        
        # Sample the calculated number of beers
        masked_group = group.sample(n=num_to_mask, random_state=random_state)
        test_set_masked.append(masked_group)
    
    # Combine masked reviews into a single DataFrame
    df_test_masked = pd.concat(test_set_masked)
    
    # Remove masked reviews from the training data
    df_train = df.drop(df_test_masked.index)
    
    # Display dataset summaries
    print("\n### Dataset Summary ###")
    print(f"Total reviewers sampled: {len(sampled_reviewers)}")
    print(f"Training set size: {df_train.shape}")
    print(f"Test set size: {df_test_masked.shape}")
    
    return df_train, df_test_masked

In [None]:
# Load data and preprocess
df = pd.read_pickle('encoded_beers_SBERT.pkl')

df_filtered = preprocess_data(df)
del df

Size after drop_duplicates:  1157819


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['rating'] = pd.to_numeric(df_filtered['rating'], errors='coerce')  # Set erros to NaN


Size after drop rating NA:  1157807
Size after drop abv NA:  1154739


In [4]:
df_filtered.head()

Unnamed: 0,id,name,brewery,subgenre,abv,location,rating,average_rating,reviewer,review_date,review_text,algorithm_rating,total_reviews,sbert_embedding
0,1,Wild Dog Pale Ale,Wild Dog (Tiemann Beer),American Pale Ale,5.2,🇯🇪Jersey,3.5,2.99,Jerseyislandbeer,"December 14, 2023",330ml can from Shoprite in Livingstone. At hom...,28.0,11,"[0.037878353, 0.00593541, 0.0062317043, -0.011..."
1,2,Wild Dog Pale Ale,Wild Dog (Tiemann Beer),American Pale Ale,5.2,"🇬🇧Ipswich, England",3.2,2.99,Grumbo,"February 28, 2022","18/2/2022. Can sample courtesy of fonefan, che...",28.0,11,"[-0.037820198, -0.044825517, 0.07764052, 0.065..."
2,3,Wild Dog Pale Ale,Wild Dog (Tiemann Beer),American Pale Ale,5.2,"🇸🇪Tyresö, Sweden",3.5,2.99,omhper,"February 19, 2022","--Sample, thanks fonefan! -- Hazy deep golden,...",28.0,11,"[0.056960188, -0.00059301173, 0.11057871, 0.02..."
3,4,Wild Dog Pale Ale,Wild Dog (Tiemann Beer),American Pale Ale,5.2,"🇫🇮Vasa, Finland",2.8,2.99,oh6gdx,"January 31, 2022","Panda from a can, thanks fonefan!. Golden colo...",28.0,11,"[0.003549767, -0.010705345, 0.02083684, 0.0106..."
4,6,Wild Dog Pale Ale,Wild Dog (Tiemann Beer),American Pale Ale,5.2,"🇩🇰Haderslev, Denmark",2.6,2.99,martin00sr,"January 8, 2022","Can @Ulfborg. Cloudy amber, white head. Malty ...",28.0,11,"[-0.01005388, -0.02942978, 0.0016338513, 0.017..."


In [5]:
df_train, df_test_masked = create_test_train(df_filtered)


### Dataset Summary ###
Total reviewers sampled: 100
Training set size: (1149910, 14)
Test set size: (4829, 14)


### Create locality-sensitive hashing (LSH)

In [6]:
# Initialize a variable to store the model
sbert_model = None

def encode_sbert(query, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """
    Encodes a query using SBERT. Loads the model if not already loaded.
    
    Parameters:
        query (str or list of str): The query or list of queries to encode.
        model_name (str): The name of the SBERT model to load (default is 'all-MiniLM-L6-v2').
    
    Returns:
        numpy.ndarray: The embedding(s) for the input query/queries.
    """
    global sbert_model  # Use the global variable to store the model
    
    # Load the model if it's not already loaded
    if sbert_model is None:
        sbert_model = SentenceTransformer(model_name)
    
    # Encode the query and return the embeddings
    return sbert_model.encode(query)

def generate_hyperplanes(dim, num_hash_functions):
    """
    Generate random hyperplanes for hash functions.
    
    Parameters:
    - dim: Dimensionality of the embeddings.
    - num_hash_functions: Number of hash functions per table.
    
    Returns:
    - A matrix of shape (num_hash_functions, dim) where each row is a hyperplane.
    """
    return np.random.randn(num_hash_functions, dim)

def hash_vectors(vectors, hyperplanes):
    """
    Hash a batch of vectors using a set of hyperplanes.

    Parameters:
    - vectors: Input vectors (2D array of shape [n_samples, d]).
    - hyperplanes: Matrix of hyperplanes (2D array of shape [k, d]).

    Returns:
    - A matrix of binary hash values (shape [n_samples, k]).
    """
    # Compute dot products and return binary hash values
    return (np.dot(vectors, hyperplanes.T) > 0).astype(int)

class LSHVectorized:
    def __init__(self, d, k, L):
        """
        Initialize the LSH scheme with vectorized support.

        Parameters:
        - d: Dimensionality of the input vectors.
        - k: Number of hash functions per table.
        - L: Number of hash tables.
        """
        self.L = L
        self.tables = [defaultdict(list) for _ in range(L)]
        self.hyperplanes = [generate_hyperplanes(d, k) for _ in range(L)]

    def add_vectors(self, vectors, identifiers):
        """
        Add a batch of vectors to the LSH index.

        Parameters:
        - vectors: Input vectors (2D array of shape [n_samples, d]).
        - identifiers: A list of unique identifiers for the vectors.
        """
        for table, hyperplanes in zip(self.tables, self.hyperplanes):
            # Compute hash values for all vectors at once
            hash_values = hash_vectors(vectors, hyperplanes)
            
            # Convert binary hash values to tuples for dictionary keys
            hash_keys = [tuple(h) for h in hash_values]
            
            # Add vectors to their corresponding buckets
            for identifier, key in zip(identifiers, hash_keys):
                table[key].append(identifier)

    def query(self, vectors):
        """
        Query the LSH index to find similar items for a batch of vectors.

        Parameters:
        - vectors: Query vectors (2D array of shape [n_samples, d]).

        Returns:
        - A list of sets, where each set contains the candidates for a query vector.
        """
        candidates = [set() for _ in range(len(vectors))]
        for table, hyperplanes in zip(self.tables, self.hyperplanes):
            # Compute hash values for all query vectors
            hash_values = hash_vectors(vectors, hyperplanes)
            
            # Convert binary hash values to tuples for dictionary keys
            hash_keys = [tuple(h) for h in hash_values]
            
            # Retrieve candidates for each query
            for i, key in enumerate(hash_keys):
                candidates[i].update(table.get(key, []))
        return candidates

In [7]:
vectors = np.vstack(df_filtered["sbert_embedding"].values)  # Combine embeddings into a 2D array
identifiers = df_filtered.index.tolist()  # Use review IDs as identifiers

In [8]:
## Run LSH ##
# Initialize LSH scheme
d = 384
k = 14 
L = 7

lsh = LSHVectorized(d, k, L)

In [9]:
# Add vectors to the LSH index
lsh.add_vectors(vectors, identifiers)

## Make reccomendations

In [10]:

def predict_ratings_user_based(user_item_matrix, similarity_matrix):
    # TODO Hvordan håndterer vi øl personen allerede har rated
    # Convert to a numpy array for computation
    user_item_matrix = user_item_matrix.values

    # Compute mean ratings for each user
    user_means = np.ma.masked_equal(user_item_matrix, 0).mean(axis=1).filled(0)
    
    # Center the matrix by subtracting user means
    ratings_diff = user_item_matrix - user_means[:, None]
    ratings_diff[np.isnan(ratings_diff)] = 0  # Replace NaN deviations with 0

    # Compute predictions
    similarity_sum = np.abs(similarity_matrix).sum(axis=1)[:, None]
    pred = user_means[:, None] + np.dot(similarity_matrix, ratings_diff) / (similarity_sum + 1e-8)
    
    return pred

def collaborative_filtering(df_user):
    
    user_matrix = df_user.pivot_table(
        index="reviewer",     # Rows: Reviewers
        columns="name",       # Columns: Beer names
        values="rating",      # Values: Ratings
        fill_value=0          # Fill missing ratings with 0
    )
    
    # Compute cosine similarity
    cosine_similarity_matrix = cosine_similarity(user_matrix)
    
    # Predict ratings
    predicted_ratings = predict_ratings_user_based(user_matrix, cosine_similarity_matrix)

    df_out = pd.DataFrame(predicted_ratings, index=user_matrix.index, columns=user_matrix.columns)


    return df_out

collab_df = collaborative_filtering(df_filtered)


In [33]:

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from matplotlib.colors import ListedColormap
from sklearn.feature_extraction.text import CountVectorizer

# Define reference context for flavor-related words. The first 20 is from a aromatic kit used for sommeliers, the rest is ai-generated.
context_words = [
    "bitter", "sweet", "salt", "sour", "umami",
    "lemon", "grapefruit", "apple", "pear", "blackcurrant", "prune", "melon", 
    "banana", "acacia", "rose", "cut grass", "hay", "bay leaf", "thyme", 
    "tomato", "pepper", "nutmeg", "clove", "bread", "butter", "vanilla", 
    "hazelnut", "toast", "malt", "caramel", "honey", "coffee", "licorice",
    "pine", "grass", "resin", "floral", "perfume", "incense", "cinnamon",
    "ginger", "anise", "nut", "almond", "walnut", "chestnut", "peanut",
    "soy", "mushroom", "earth", "dust", "wood", "barnyard", "horse",
    "wet", "dry", "metallic", "sulfur", "fish", "cheese", "butter",
    "cream", "leather", "silk", "rubber", "barnyard", "ammonia",
    "rotten", "acid"
]
custom_stop_words = ["beer", "beers", "bottle", "taste", "nice", "aroma", "like", "good", "great", "head"]
context_embeddings = encode_sbert(context_words)

# Function to filter terms dynamically
def is_flavor_related(term, context_embeddings, threshold=0.35):
    term_embedding = sbert_model.encode([term])[0]
    cosine_similarity = lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
    max_similarity = max(cosine_similarity(term_embedding, context) for context in context_embeddings)
    return max_similarity > threshold

def plot_bucket(bucket_vectors, cluster_labels, subgenres, perplexity=30, n_iter=5000, learning_rate=200):
    """
    Visualizes differences within an LSH bucket using t-SNE with configurable parameters.
    
    Args:
        bucket_vectors (np.ndarray): High-dimensional vectors of beers in the bucket.
        cluster_labels (np.ndarray): Cluster labels assigned to each vector.
        subgenres (np.ndarray): Subgenre or categorical labels for each beer.
        perplexity (int): The t-SNE perplexity parameter, balancing local/global data views.
        n_iter (int): Number of iterations for t-SNE optimization.
        learning_rate (float): Learning rate for t-SNE optimization.
    """
    # t-SNE reducer with tuned parameters
    reducer = TSNE(
        n_components=2, 
        random_state=42, 
        perplexity=perplexity, 
        n_iter=n_iter, 
        learning_rate=learning_rate
    )
    reduced_vectors = reducer.fit_transform(bucket_vectors)
    
    # Plotting
    plt.figure(figsize=(16, 10))
    scatter = plt.scatter(
        reduced_vectors[:, 0],
        reduced_vectors[:, 1],
        c=cluster_labels,
        cmap='plasma',
        alpha=0.7
    )
    plt.colorbar(scatter, label='Cluster Label')
    plt.title(f"t-SNE Visualization (Perplexity={perplexity}, n_iter={n_iter}, LR={learning_rate})")
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.grid(alpha=0.3)
    plt.show()

def recommend_beer(query, user_name, abv_desired, n_clusters=15):
    # Encode the query
    query_embedding = encode_sbert(query).reshape(1, -1)
    
    # Query the LSH index
    candidates = lsh.query(query_embedding)

    # Filter bucket vectors and metadata
    bucket_data = df_filtered[df_filtered["id"].isin(list(candidates[0]))]
    bucket_vectors = np.vstack(bucket_data["sbert_embedding"].to_numpy())
    
    # Extract subgenre information
    subgenres = bucket_data["subgenre"].values  # Adjust column name as necessary
    
    # Perform clustering on bucket vectors
    kmeans = KMeans(n_clusters=n_clusters, n_init='auto', random_state=42)
    cluster_labels = kmeans.fit_predict(bucket_vectors)
    
    # Assign query to the nearest cluster
    query_cluster = kmeans.predict(query_embedding)[0]
    
    perplexities = [50]
    n_iters = [10000]
    learning_rates = [100, 200]
    
    param_combinations = [(p, n, lr) for p in perplexities for n in n_iters for lr in learning_rates]
    
    #for perplexity, n_iter, learning_rate in param_combinations:
    #    plot_bucket(bucket_vectors, cluster_labels, subgenres, perplexity, n_iter, learning_rate)
        
    # Filter beers in the same cluster as the query
    cluster_indices = np.where(cluster_labels == query_cluster)[0]
    cluster_vectors = bucket_vectors[cluster_indices]
    cluster_beers = bucket_data.iloc[cluster_indices]
    
    # Initialize CountVectorizer with custom stopwords
    default_stop_words = CountVectorizer(stop_words='english').get_stop_words()
    all_stop_words = list(set(default_stop_words).union(custom_stop_words))

    # Pass the combined stop words to CountVectorizer
    vectorizer = CountVectorizer(max_features=100, stop_words=all_stop_words)
        
    # Extract top terms from cluster reviews
    term_matrix = vectorizer.fit_transform(cluster_beers["review_text"])
    terms = vectorizer.get_feature_names_out()
    term_counts = np.array(term_matrix.sum(axis=0)).flatten()
    top_terms = [terms[i] for i in term_counts.argsort()[-5:]]  # Top 5 terms
    
    # Compute similarities within the selected cluster
    sims = cosine_similarity(query_embedding, cluster_vectors)[0]
    
    filtered_top_terms = [term for term in top_terms if is_flavor_related(term, context_embeddings)]

    # Perform collaborative filtering
    predcicted_rating_user = collab_df.loc[user_name]
    
    beer_LSH = pd.DataFrame({
        'similarity': sims,
        'beer': cluster_beers["name"].values,  # Adjust column name if necessary
    })
    
    LSH_score = beer_LSH.groupby('beer')['similarity'].mean()
    collab_filtering_scores = predcicted_rating_user[LSH_score.index.tolist()]

    abv = cluster_beers.set_index("name").loc[LSH_score.index, "abv"]
    
    alpha = 0.03
    if abv_desired == 0:
        ABV_weight = - 2 * abs(abv - abv_desired) # Ensure zero percent alchol
    else:
        ABV_weight =  - alpha * ((abv - abv_desired)**2) / (abv_desired**1.5 + 1)
    
    weighted_score = 0.85*LSH_score + 0.15*collab_filtering_scores + ABV_weight
    
    beer_weighted_score= pd.DataFrame({
        'score': weighted_score,
        'beer': cluster_beers["name"].values,  # Adjust column name if necessary
    })

    return beer_weighted_score.sort_values(by='score', ascending=False), filtered_top_terms


# Create a query
test_query = "I like dark beers with a sweet chocolate with a hint of cherry"
user_name = "Jerseyislandbeer"

beer_recommendations, theme = recommend_beer(test_query, user_name, 7)

print("Top 5 recommended beers:")
print(beer_recommendations.head())
print("\nTop flavor-related terms:")
print(theme)

Top 5 recommended beers:
                                        score                       beer
beer                                                                    
Schneider Weisse Tap 06 - Aventinus  0.649334  Firestone Walker Parabola
Unibroue Trois Pistoles              0.599655             Brooklyn Lager
Ayinger Celebrator Doppelbock        0.586081   Hitachino Nest White Ale
Trappistes Rochefort 10              0.559206             Maine Beer Zoe
Trappistes Rochefort 10              0.559206             Maine Beer Zoe

Top flavor-related terms:
['dark', 'flavor', 'sweet']




### Example:

### Evaluation settup