In [2]:
#Imports
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict


### Data preprocessing

In [3]:
# Load data and preprocess
df = pd.read_pickle('encoded_beers_SBERT.pkl')

df_filtered = df.drop_duplicates(["name", "reviewer", "review_text"]) # Remove duplicate entries
print("Size after drop_duplicates: ", len(df_filtered))

#convert numeric values to floats
df_filtered['rating'] = pd.to_numeric(df_filtered['rating'], errors='coerce')  # Set erros to NaN
df_filtered = df_filtered.dropna(subset=['rating'])  # Drop rows where 'rating' is NaN
print("Size after drop rating NA: ", len(df_filtered))
df_filtered['abv'] = pd.to_numeric(df_filtered['abv'].str.rstrip('%'), errors='coerce') 
df_filtered = df_filtered.dropna(subset=['abv'])
print("Size after drop abv NA: ", len(df_filtered))

Size after drop_duplicates:  1157819


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['rating'] = pd.to_numeric(df_filtered['rating'], errors='coerce')  # Set erros to NaN


Size after drop rating NA:  1157807
Size after drop abv NA:  1154739


In [8]:
# Create test set

# Randomly sample 100 reviewers from the dataset
sampled_reviewers = df_filtered["reviewer"].sample(n=100, random_state=7)
# Get reviews from the sampled reviewers
df_test = df_filtered[df_filtered['reviewer'].isin(sampled_reviewers)]
# Remove the sampled reviews
#df_filtered = df_filtered[~df_filtered['reviewer'].isin(sampled_reviewers)]

# Group by reviewer to get each user's beers
df_test_grouped = df_test.groupby('reviewer')

# Randomly mask 10% of beers for each reviewer
test_set_masked = []

for reviewer, group in df_test_grouped:
    # Calculate how many beers to mask (10% of the beers this reviewer has rated or 1 in case 10 % is less than 1)
    num_to_mask = max(int(len(group) * 0.10), 1)
    
    # Sample 10% of the beers for this reviewer
    masked_group = group.sample(n=num_to_mask, random_state=7)
    test_set_masked.append(masked_group)
    
df_test_masked = pd.concat(test_set_masked)

# remove test reviews from train data
df_filtered = df_filtered.drop(df_test_masked.index)

# Display dataset summaries
print("\n### Dataset Summary ###")
print(f"Total reviewers sampled: {len(sampled_reviewers)}")
print(f"Training set size: {df_filtered.shape}")
print(f"Test set size: {df_test_masked.shape}")



### Dataset Summary ###
Total reviewers sampled: 100
Training set size: (1132916, 14)
Test set size: (5746, 14)


In [72]:
# Create dataframe for retrieving beer information
beer_info = df_filtered[['name', 'abv', 'subgenre']]

# Drop duplicate rows based on the 'name' column (i.e. beers)
beer_info = beer_info.drop_duplicates(subset='name')

beer_info.set_index('name', inplace=True)

### Create locality-sensitive hashing (LSH)

In [11]:
# Make LSH
# Extract vectors and identifiers
vectors = np.vstack(df_filtered["sbert_embedding"].values)  # Combine embeddings into a 2D array
identifiers = df_filtered.index.tolist()  # Use review IDs as identifiers

In [12]:
def generate_hyperplanes(dim, num_hash_functions):
    """
    Generate random hyperplanes for hash functions.
    
    Parameters:
    - dim: Dimensionality of the embeddings.
    - num_hash_functions: Number of hash functions per table.
    
    Returns:
    - A matrix of shape (num_hash_functions, dim) where each row is a hyperplane.
    """
    return np.random.randn(num_hash_functions, dim)

def hash_vectors(vectors, hyperplanes):
    """
    Hash a batch of vectors using a set of hyperplanes.

    Parameters:
    - vectors: Input vectors (2D array of shape [n_samples, d]).
    - hyperplanes: Matrix of hyperplanes (2D array of shape [k, d]).

    Returns:
    - A matrix of binary hash values (shape [n_samples, k]).
    """
    # Compute dot products and return binary hash values
    return (np.dot(vectors, hyperplanes.T) > 0).astype(int)

In [13]:
# Build LSH framework
class LSHVectorized:
    def __init__(self, d, k, L):
        """
        Initialize the LSH scheme with vectorized support.

        Parameters:
        - d: Dimensionality of the input vectors.
        - k: Number of hash functions per table.
        - L: Number of hash tables.
        """
        self.L = L
        self.tables = [defaultdict(list) for _ in range(L)]
        self.hyperplanes = [generate_hyperplanes(d, k) for _ in range(L)]

    def add_vectors(self, vectors, identifiers):
        """
        Add a batch of vectors to the LSH index.

        Parameters:
        - vectors: Input vectors (2D array of shape [n_samples, d]).
        - identifiers: A list of unique identifiers for the vectors.
        """
        for table, hyperplanes in zip(self.tables, self.hyperplanes):
            # Compute hash values for all vectors at once
            hash_values = hash_vectors(vectors, hyperplanes)
            
            # Convert binary hash values to tuples for dictionary keys
            hash_keys = [tuple(h) for h in hash_values]
            
            # Add vectors to their corresponding buckets
            for identifier, key in zip(identifiers, hash_keys):
                table[key].append(identifier)

    def query(self, vectors):
        """
        Query the LSH index to find similar items for a batch of vectors.

        Parameters:
        - vectors: Query vectors (2D array of shape [n_samples, d]).

        Returns:
        - A list of sets, where each set contains the candidates for a query vector.
        """
        candidates = [set() for _ in range(len(vectors))]
        for table, hyperplanes in zip(self.tables, self.hyperplanes):
            # Compute hash values for all query vectors
            hash_values = hash_vectors(vectors, hyperplanes)
            
            # Convert binary hash values to tuples for dictionary keys
            hash_keys = [tuple(h) for h in hash_values]
            
            # Retrieve candidates for each query
            for i, key in enumerate(hash_keys):
                candidates[i].update(table.get(key, []))
        return candidates

In [14]:
## Run LSH ##
# Initialize LSH scheme
d = 384
k = 14 
L = 7

lsh = LSHVectorized(d, k, L)

In [15]:
# Add vectors to the LSH index
lsh.add_vectors(vectors, identifiers)

### Create Collaborative filtering (CF)

In [43]:
# TODO lÃ¦s hele igennem og lav kommentarer
def predict_ratings_user_based(user_item_matrix, similarity_matrix):
       # TODO Hvordan hÃ¥ndterer vi Ã¸l personen allerede har rated

    # Compute predictions
    similarity_sum = np.abs(similarity_matrix).sum(axis=1)[:, None]
    pred = pred = np.dot(similarity_matrix, user_item_matrix) / (similarity_sum + 1e-8)

    return pred


def collaborative_filtering(df, drop_cols = ["brewery", "subgenre", "abv", "sbert_embedding"]):
    df_colab = df


    user_item_matrix = df_colab.pivot_table(
    index="reviewer",     # Rows: Reviewers
    columns="name",       # Columns: Beer names
    values="rating",      # Values: Ratings
    fill_value=0          # Fill missing ratings with 0
    )
    

    user_means = user_item_matrix.replace(0, np.nan).mean(axis=1).fillna(0).to_numpy()
    
    user_item_np = np.where(user_item_matrix != 0, user_item_matrix - user_means[:, None], 0)

    user_item_matrix = pd.DataFrame(user_item_np, index=user_item_matrix.index, columns=user_item_matrix.columns)


    # Compute cosine similarity
    #cosine_similarity = compute_cosine_similarity_manual(utility_matrix.values)
    cosine_similarity_matrix = cosine_similarity(user_item_matrix)
    
    # Predict ratings
    predicted_ratings = predict_ratings_user_based(user_item_matrix, cosine_similarity_matrix)

    pr_df = pd.DataFrame(predicted_ratings, index=user_item_matrix.index, columns=user_item_matrix.columns)
   
    return pr_df

In [44]:
# Make collaborative filtering and achieve "predicted" normalised rating for each beer for each user
pr_df = collaborative_filtering(df_filtered)

In [45]:
# Caculate fraction of beers per user which is zero
zero_percentage = (pr_df == 0).mean(axis=1) * 100
zero_percentage.describe()

count    33526.000000
mean        43.932347
std         49.235832
min          0.008544
25%          0.111073
50%          0.845865
75%        100.000000
max        100.000000
dtype: float64

## Make reccomendations

In [16]:
test_vectors = np.vstack(df_test_masked["sbert_embedding"].values) 

### Example:

In [28]:
# Get a single example that we will try and reccomend a beer to based on review_text, abv, and subgenre
df_test_masked.iloc[1]

id                                                            2233306
name                                         Vrijstaat Vanmol Janimal
brewery                                      Vrijstaat Vanmol Brewing
subgenre                                                  Session IPA
abv                                                               3.3
location                                         ðŸ‡§ðŸ‡ªAntwerpen, Belgium
rating                                                            3.9
average_rating                                                   3.57
reviewer                                                      77ships
review_date                                               May 7, 2017
review_text         Thank you Kraddel! 330 ml. bottle sampled @ In...
algorithm_rating                                                 89.0
total_reviews                                                      49
sbert_embedding     [-0.01595407, 0.010769329, -0.020604532, -0.01...
Name: 1124265,

In [35]:
# First use LSH to find similar candidates
test_vector = test_vectors[1].reshape(1, -1)  # Use the first vector as an example query
candidates = lsh.query(test_vector)
len(candidates[0]) # Check number of reviews in bucket

4508

In [41]:
print("Number of fifferent beer in bucket: ", len(set(df_filtered[df_filtered["id"].isin(list(candidates[0]))]["name"]))) 
bucket_vectors = df_filtered[df_filtered["id"].isin(list(candidates[0]))]["sbert_embedding"]
bucket_vectors = np.vstack(bucket_vectors.to_numpy())
# Calculate the similarity between each word embedding vector in bucket and example review_text:
similarities = cosine_similarity(test_vector, bucket_vectors)[0]

beer_similarities = pd.DataFrame({
    'similarity': similarities,
    'beer': df_filtered[df_filtered["id"].isin(list(candidates[0]))]["name"].values 
})
# Calculate average similarity between promt (i.e. example review_text) and each beer in bucket
average_similarities_per_beer = beer_similarities.groupby('beer')['similarity'].mean()

Number of fifferent beer in bucket:  1475


In [42]:
# Show most similar beers based on LSH
average_similarities_per_beer.nlargest(5)

beer
5th Element BONNER                 0.689045
Domaine Neige Bulle de Neige       0.674373
Kehrwieder Ã¼.NN IPA alkoholfrei    0.674135
Birrificio Rurale 405040           0.673291
Craftwerk Holy Cowl                0.669864
Name: similarity, dtype: float32

In [46]:
# Next use collaborative filtering output to get cf-scores
cf_scores = pr_df[average_similarities_per_beer.index.tolist()] # Only extract similarities for the beers from the LSH bucket
cf_scores_ex = cf_scores.loc["77ships"] # Get cf_scores for our example user

In [47]:
# Show most similar beers based on CF
cf_scores_ex.nlargest(5)

name
Westvleteren 12                            0.258554
Trappistes Rochefort 10                    0.243262
AleSmith Speedway Stout                    0.206176
Goose Island Bourbon County Stout          0.180837
Founders KBS (Kentucky Breakfast Stout)    0.176311
Name: 77ships, dtype: float64

**Combine CF and LSH + extra attributes into a final score**

In [64]:
beta = 0.4 + 0.006*zero_percentage.loc["77ships"] # Linear function to scale beta such that persons with few reviews rely more on LSH and vice versa
total_score = beta*average_similarities_per_beer  + (1-beta)* cf_scores_ex

In [56]:
total_score.nlargest(5)

beer
Westvleteren 12            0.360347
Trappistes Rochefort 10    0.333341
AleSmith Speedway Stout    0.313445
St. Bernardus Abt 12       0.305852
Struise Pannepot           0.302391
dtype: float64

In [65]:
# Penalise difference in abv by using a nonlinear function penalising greater differences more
abv = beer_info.loc[average_similarities_per_beer.index.tolist()]["abv"]
abv_2 = df_test_masked.iloc[1]["abv"] # Extract from example
alpha = 0.01

if abv_2 == 0:
    total_score = total_score - 2 * abs(abv - abv_2) # Ensure zero percent alchol
else:
    total_score = total_score - alpha * ((abv - abv_2)**2) / (abv_2**1.5 + 1)

In [58]:
total_score.nlargest(5)

beer
Westvleteren 12                   0.292282
Ayinger Celebrator Doppelbock     0.277752
5th Element BONNER                0.274321
3 Fonteinen Schaarbeekse Kriek    0.273181
Westvleteren 8                    0.268034
dtype: float64

In [66]:
# Add bonus for match in style
relevant_styles = beer_info.loc[average_similarities_per_beer.index.tolist()]["subgenre"]
style = df_test_masked.iloc[1]["subgenre"] # Extract from example
style_mask = relevant_styles == style
style_bonus = np.zeros(len(total_score))
style_bonus[style_mask] = 0.05

total_score += style_bonus

In [67]:
# print final scores which are the resulting beer reccomendations
total_score.nlargest(10)

beer
Pulfer Landsky Session IPA       0.307425
Kees Session IPA                 0.301117
Westvleteren 12                  0.292282
Oersoep Hopfather                0.289826
LOC Dinky Mosaic                 0.279713
Ayinger Celebrator Doppelbock    0.277752
Bevog Zo Session IPA             0.276543
Garage Triangles                 0.275013
5th Element BONNER               0.274321
Mikkeller Session IPA Nelson     0.274057
dtype: float64

### Evaluation settup

In [73]:
def give_reccomendation(lsh, test_vector, df_filtered, pr_df, user, beer_info, abv_2, style):
    candidates = lsh.query(test_vector)
    bucket_vectors = df_filtered[df_filtered["id"].isin(list(candidates[0]))]["sbert_embedding"]
    bucket_vectors = np.vstack(bucket_vectors.to_numpy())

    # Calculate the similarity between each word embedding vector in bucket and example review_text:
    similarities = cosine_similarity(test_vector, bucket_vectors)[0]

    beer_similarities = pd.DataFrame({
        'similarity': similarities,
        'beer': df_filtered[df_filtered["id"].isin(list(candidates[0]))]["name"].values 
    })
    # Calculate average similarity between prompt (i.e. example review_text) and each beer in bucket
    average_similarities_per_beer = beer_similarities.groupby('beer')['similarity'].mean()

    # Next use collaborative filtering output to get cf-scores
    cf_scores = pr_df[average_similarities_per_beer.index.tolist()] # Only extract similarities for the beers from the LSH bucket
    cf_scores_ex = cf_scores.loc[user] # Get cf_scores for our example user

    beta = 0.4 + 0.006*zero_percentage.loc[user] # Linear function to scale beta such that persons with few reviews rely more on LSH and vice versa
    total_score = beta*average_similarities_per_beer  + (1-beta)* cf_scores_ex

    # Penalise difference in abv by using a nonlinear function penalising greater differences more
    abv = beer_info.loc[average_similarities_per_beer.index.tolist()]["abv"]
    alpha = 0.01
    if abv_2 == None:
        total_score = total_score
    elif abv_2 == 0:
        total_score = total_score - 2 * abs(abv - abv_2) # Ensure zero percent alchol
    else:
        total_score = total_score - alpha * ((abv - abv_2)**2) / (abv_2**1.5 + 1)
    
    
    # Add bonus for match in style
    if style:
        relevant_styles = beer_info.loc[average_similarities_per_beer.index.tolist()]["subgenre"]
        style_mask = relevant_styles == style
        style_bonus = np.zeros(len(total_score))
        style_bonus[style_mask] = 0.05

        total_score += style_bonus
        
    return total_score

In [None]:
# TODO Make code to loop through examples