In [2]:
#Imports
import pandas as pd
import numpy as np


In [3]:
# Load data and preprocess
df = pd.read_pickle('encoded_beers_SBERT.pkl')


In [4]:
df_filtered = df.drop_duplicates(["name", "reviewer", "review_text"])
print("Size after drop_duplicates: ", len(df_filtered))
df_filtered['rating'] = pd.to_numeric(df_filtered['rating'], errors='coerce')  # set erros to NaN
df_filtered = df_filtered.dropna(subset=['rating'])  # Drop rows where 'rating' is NaN
print("Size after drop rating NA: ", len(df_filtered))
df_filtered['abv'] = pd.to_numeric(df_filtered['abv'].str.rstrip('%'), errors='coerce') 
df_filtered = df_filtered.dropna(subset=['abv'])
print("Size after drop abv NA: ", len(df_filtered))


Size after drop_duplicates:  1157819


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['rating'] = pd.to_numeric(df_filtered['rating'], errors='coerce')  # set erros to NaN


Size after drop rating NA:  1157807
Size after drop abv NA:  1154739


In [5]:
# Preprocessing
# TODO: Remove duplicates, lav ny id kollonne, lav ratings og abv om til floats

In [4]:
#drop_cols = ["brewery", "subgenre", "abv", "sbert_embedding"]
#df_collab = df.drop(drop_cols, axis=1)

In [6]:
# Make LSH
# Extract vectors and identifiers
vectors = np.vstack(df_filtered["sbert_embedding"].values)  # Combine embeddings into a 2D array
identifiers = df_filtered.index.tolist()  # Use review IDs as identifiers

In [7]:
def generate_hyperplanes(dim, num_hash_functions):
    """
    Generate random hyperplanes for hash functions.
    
    Parameters:
    - dim: Dimensionality of the embeddings.
    - num_hash_functions: Number of hash functions per table.
    
    Returns:
    - A matrix of shape (num_hash_functions, dim) where each row is a hyperplane.
    """
    return np.random.randn(num_hash_functions, dim)

def hash_vectors(vectors, hyperplanes):
    """
    Hash a batch of vectors using a set of hyperplanes.

    Parameters:
    - vectors: Input vectors (2D array of shape [n_samples, d]).
    - hyperplanes: Matrix of hyperplanes (2D array of shape [k, d]).

    Returns:
    - A matrix of binary hash values (shape [n_samples, k]).
    """
    # Compute dot products and return binary hash values
    return (np.dot(vectors, hyperplanes.T) > 0).astype(int)

In [8]:
# Build LSH framework
from collections import defaultdict

class LSHVectorized:
    def __init__(self, d, k, L):
        """
        Initialize the LSH scheme with vectorized support.

        Parameters:
        - d: Dimensionality of the input vectors.
        - k: Number of hash functions per table.
        - L: Number of hash tables.
        """
        self.L = L
        self.tables = [defaultdict(list) for _ in range(L)]
        self.hyperplanes = [generate_hyperplanes(d, k) for _ in range(L)]

    def add_vectors(self, vectors, identifiers):
        """
        Add a batch of vectors to the LSH index.

        Parameters:
        - vectors: Input vectors (2D array of shape [n_samples, d]).
        - identifiers: A list of unique identifiers for the vectors.
        """
        for table, hyperplanes in zip(self.tables, self.hyperplanes):
            # Compute hash values for all vectors at once
            hash_values = hash_vectors(vectors, hyperplanes)
            
            # Convert binary hash values to tuples for dictionary keys
            hash_keys = [tuple(h) for h in hash_values]
            
            # Add vectors to their corresponding buckets
            for identifier, key in zip(identifiers, hash_keys):
                table[key].append(identifier)

    def query(self, vectors):
        """
        Query the LSH index to find similar items for a batch of vectors.

        Parameters:
        - vectors: Query vectors (2D array of shape [n_samples, d]).

        Returns:
        - A list of sets, where each set contains the candidates for a query vector.
        """
        candidates = [set() for _ in range(len(vectors))]
        for table, hyperplanes in zip(self.tables, self.hyperplanes):
            # Compute hash values for all query vectors
            hash_values = hash_vectors(vectors, hyperplanes)
            
            # Convert binary hash values to tuples for dictionary keys
            hash_keys = [tuple(h) for h in hash_values]
            
            # Retrieve candidates for each query
            for i, key in enumerate(hash_keys):
                candidates[i].update(table.get(key, []))
        return candidates


In [9]:
## Run LSH ##
# Initialize LSH scheme
d = 384
k = 14 
L = 7

lsh = LSHVectorized(d, k, L)

In [10]:
# Add vectors to the LSH index
lsh.add_vectors(vectors[1:], identifiers)

In [73]:
# Test query
test_vector = vectors[0].reshape(1, -1)  # Use the first vector as an example query
candidates = lsh.query(test_vector)
len(candidates[0])

4104

In [12]:
len(set(df_filtered[df_filtered["id"].isin(list(candidates[0]))]["name"])) # Different beer in bucket

1109

In [13]:
bucket_vectors = df_filtered[df_filtered["id"].isin(list(candidates[0]))]["sbert_embedding"]

In [14]:
index = df_filtered[df_filtered["id"].isin(list(candidates[0]))].index

In [15]:
bucket_vectors = np.vstack(bucket_vectors.to_numpy())

In [16]:
# Calculate the similarity in bucket:
from sklearn.metrics.pairwise import cosine_similarity

sims = cosine_similarity(test_vector, bucket_vectors)[0]

In [17]:
beer_similarities = pd.DataFrame({
    'similarity': sims,
    'beer': df_filtered[df_filtered["id"].isin(list(candidates[0]))]["name"].values  # Ensure this corresponds to your bucket_vectors
})

In [56]:
beer_similarities

Unnamed: 0,similarity,beer
0,0.322743,Oitava Colina Vila Berta
1,0.387336,Dois Corvos / Frontaal Guanabana Mañana Smooth...
2,0.240947,Dois Corvos Creature
3,0.447152,Dois Corvos Creature
4,0.344319,Dois Corvos Matiné Session IPA
...,...,...
2012,0.214279,Leroy Stout
2013,0.303932,Gruut Amber Ale
2014,0.501188,Mongozo Palmnut
2015,0.496383,Seefbier


In [18]:
average_similarities_per_beer = beer_similarities.groupby('beer')['similarity'].mean()
# TODO Kommenter på at mean kan ødelægge modsættende reviews

In [54]:
sims[:-20]

array([0.32274258, 0.38733554, 0.24094652, ..., 0.48198116, 0.4326756 ,
       0.32503977], dtype=float32)

In [60]:
df_filtered.iloc[0]

id                                                                  1
name                                                Wild Dog Pale Ale
brewery                                       Wild Dog (Tiemann Beer)
subgenre                                            American Pale Ale
abv                                                               5.2
location                                                     🇯🇪Jersey
rating                                                            3.5
average_rating                                                   2.99
reviewer                                             Jerseyislandbeer
review_date                                         December 14, 2023
review_text         330ml can from Shoprite in Livingstone. At hom...
algorithm_rating                                                 28.0
total_reviews                                                      11
sbert_embedding     [0.037878353, 0.00593541, 0.0062317043, -0.011...
Name: 0, dtype: obje

In [59]:
average_similarities_per_beer.nlargest(5)

beer
Protokoll Billie is a Punk Rocker                           0.649359
CRAK Mundaka Session IPA                                    0.632073
Hoppin' Frog Frogichlaus                                    0.619754
Lough Gill Spear                                            0.617472
De Dochter van de Korenaar L'Ensemble Double Barrel-Aged    0.614338
Name: similarity, dtype: float32

In [None]:
# Make CF
def collaborative_filtering(df, user_id,  k,  drop_cols = ["brewery", "subgenre", "abv", "sbert_embedding"]):
    df_colab = df.drop(drop_cols, axis=1)

    # TODO Tænk over om normalisering giver mening ift. at man måske bare godt kan lide alle øl?
    # For man har ikke alle personens anmeldelser kun dens anmeldelser af gode øl
    # Alternativt "normaliser" ved at ændre skal ved threshold, så dårlig rating bliver negativ (hvis man bruger cosine)
    # Evt. normalise mellem 0 og 5
    threshold = 3 # Threshold for like/hate
    # TODO Maybe when inserting threshold determine it based on the average rating of a user?
    df_colab["like/hate"] = df_colab["rating"].apply(lambda x: 1 if x >= threshold else 0)
    utility_matrix = df_colab.pivot_table(index='reviewer', columns='name', values='like/hate')
    # fill missing values with NaN or 0 (change depending on  approach)
    utility_matrix = utility_matrix.fillna(0)
    binary_matrix = utility_matrix.values
    user = binary_matrix[user_id,:]
    # Compute intersections
    intersection = np.dot(user, binary_matrix.T)

    # Compute union
    union = np.bitwise_or(user.reshape(1, -1).astype(int), binary_matrix.astype(int)).sum(axis=1)

    # Compute Jaccard similarity
    jaccard_similarity = intersection / union
    similarity_df = pd.DataFrame(jaccard_similarity, index=utility_matrix.index, columns=['Jaccard Similarity'])

    # Compute cosine
    norm_product = np.linalg.norm(user) * np.linalg.norm(binary_matrix, axis=1)
    cosine_similarity = intersection / norm_product
    similarity_df["Cosine"] = cosine_similarity

    # Get the top-k similar users for each user
    # TODO filter away all similarities below a certain threshold
    # K Number of neighbors
    top_k_neighbors_jaccard = similarity_df.nlargest(k, "Jaccard Similarity")
    top_k_neighbors_cosine = similarity_df.nlargest(k, "Cosine Similarity")
    return top_k_neighbors_jaccard, top_k_neighbors_cosine # Top k users with most similar taste

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def predict_ratings_user_based(user_item_matrix, similarity_matrix):
    # TODO Hvordan håndterer vi øl personen allerede har rated
    # Convert to a numpy array for computation
    user_item_matrix = user_item_matrix.values

    # Compute mean ratings for each user
    user_means = np.ma.masked_equal(user_item_matrix, 0).mean(axis=1).filled(0)
    
    # Center the matrix by subtracting user means
    ratings_diff = user_item_matrix - user_means[:, None]
    ratings_diff[np.isnan(ratings_diff)] = 0  # Replace NaN deviations with 0

    # Compute predictions
    similarity_sum = np.abs(similarity_matrix).sum(axis=1)[:, None]
    pred = user_means[:, None] + np.dot(similarity_matrix, ratings_diff) / (similarity_sum + 1e-8)
    
    return pred


def collaborative_filtering(df, drop_cols = ["brewery", "subgenre", "abv", "sbert_embedding"]):
    #df_colab = df.drop(drop_cols, axis=1)
    df_colab = df
    #df_colab['rating'] = pd.to_numeric(df_colab['rating'], errors='coerce')  # Coerce invalid parsing to NaN
    #df_colab = df_colab.dropna(subset=['rating'])  # Drop rows where 'rating' is NaN


    user_item_matrix = df_colab.pivot_table(
        index="reviewer",     # Rows: Reviewers
        columns="name",       # Columns: Beer names
        values="rating",      # Values: Ratings
        fill_value=0          # Fill missing ratings with 0
    )
    

    # Compute cosine similarity
    #cosine_similarity = compute_cosine_similarity_manual(utility_matrix.values)
    cosine_similarity_matrix = cosine_similarity(user_item_matrix)
    
    # Predict ratings
    predicted_ratings = predict_ratings_user_based(user_item_matrix, cosine_similarity_matrix)

    pr_df = pd.DataFrame(predicted_ratings, index=user_item_matrix.index, columns=user_item_matrix.columns)


    return pr_df

In [None]:
pr_df = collaborative_filtering(df_filtered).loc["Jerseyislandbeer"]

In [23]:
df_filtered.iloc[0]

id                                                                  1
name                                                Wild Dog Pale Ale
brewery                                       Wild Dog (Tiemann Beer)
subgenre                                            American Pale Ale
abv                                                               5.2
location                                                     🇯🇪Jersey
rating                                                            3.5
average_rating                                                   2.99
reviewer                                             Jerseyislandbeer
review_date                                         December 14, 2023
review_text         330ml can from Shoprite in Livingstone. At hom...
algorithm_rating                                                 28.0
total_reviews                                                      11
sbert_embedding     [0.037878353, 0.00593541, 0.0062317043, -0.011...
Name: 0, dtype: obje

In [24]:
pr_df.loc["Jerseyislandbeer"].nlargest(5)

name
Trappistes Rochefort 10                 1.272651
Chimay Grande Réserve (Bleue / Blue)    1.253129
St. Bernardus Abt 12                    1.171847
Orval                                   1.135382
Duvel                                   1.068443
Name: Jerseyislandbeer, dtype: float64

In [None]:
# Make reccomendation

In [25]:
cf_scores = pr_df[average_similarities_per_beer.index.tolist()]


In [29]:
average_similarities_per_beer.nlargest(10)

beer
Protokoll Billie is a Punk Rocker                           0.649359
CRAK Mundaka Session IPA                                    0.632073
Hoppin' Frog Frogichlaus                                    0.619754
Lough Gill Spear                                            0.617472
De Dochter van de Korenaar L'Ensemble Double Barrel-Aged    0.614338
Camba Bavaria Pale Ale                                      0.611723
Hill Farmstead Aaron                                        0.604761
Hop Hooligans Royal Execution                               0.602853
Galway Bay / Pühaste Tharapita                              0.601325
Aecht Schlenkerla Weichsel Rotbier                          0.599909
Name: similarity, dtype: float32

In [28]:
cf_scores.loc["Jerseyislandbeer"].nlargest(10)
# TODO normalization works? values larger tahn 1

name
Trappistes Rochefort 10                1.272651
St. Bernardus Abt 12                   1.171847
Ayinger Celebrator Doppelbock          1.003624
Schneider Weisse Tap 06 - Aventinus    0.962352
Westmalle Dubbel                       0.808483
Hoegaarden                             0.793364
Weihenstephaner Hefeweissbier          0.762673
Westvleteren 12                        0.727192
Founders Breakfast Stout               0.703024
AleSmith Speedway Stout                0.675715
Name: Jerseyislandbeer, dtype: float64

In [30]:
totoal_score = 1* cf_scores.loc["Jerseyislandbeer"] + 1* average_similarities_per_beer
# TODO Tilføj alpha og gang med hhv. a og (1-a)

In [31]:
totoal_score.nlargest(20)

name
Trappistes Rochefort 10                        1.684413
St. Bernardus Abt 12                           1.575763
Ayinger Celebrator Doppelbock                  1.366831
Schneider Weisse Tap 06 - Aventinus            1.344995
Westmalle Dubbel                               1.238715
Hoegaarden                                     1.144097
Westvleteren 12                                1.138287
Weihenstephaner Hefeweissbier                  1.116987
Bell's Expedition Stout                        1.089095
Paulaner Salvator                              1.081327
Rodenbach Grand Cru                            1.059474
Unibroue Trois Pistoles                        1.046831
Saison Dupont                                  1.043657
Schneider Weisse Tap 09 - Aventinus Eisbock    1.033184
AleSmith Speedway Stout                        1.025854
Founders Breakfast Stout                       1.009977
Founders KBS (Kentucky Breakfast Stout)        1.000817
Stone Imperial Russian Stout               

In [32]:
beer_info = df_filtered[['name', 'abv', 'subgenre']]

# Drop duplicate rows based on the 'name' column
beer_info = beer_info.drop_duplicates(subset='name')

# Set the 'name' column as the index
beer_info.set_index('name', inplace=True)

In [33]:
# Penalise difference in abv
abv = beer_info.loc[average_similarities_per_beer.index.tolist()]["abv"]
abv_2 = 5.2
alpha = 0.03
if abv_2 == 0:
    total_score_plus = totoal_score - 2 * abs(abv - abv_2) # Ensure zero percent alchol
else:
    total_score_plus = totoal_score - alpha * ((abv - abv_2)**2) / (abv_2**1.5 + 1)
# TODO how to penalise abv: linearly, logarithmic, squared?

In [269]:
# Test abv scaling
alpha = 0.02
abv_1 = 9
abv_2 = 6

print("squared: ", alpha * (abv_1 - abv_2)**2)
print("linear", alpha*(abv_1 - abv_2))
print("logarithmic: ", alpha * np.log(1 + abs(abv_1 - abv_2)))
print("logarithmic scaled: ", alpha * np.log(1 + abs(abv_1 - abv_2) / abv_2))
print("linear scaled: ", alpha * abs(abv_1 - abv_2) / abv_2)
print("Squared scaled: ", alpha * (abs(abv_1 - abv_2)**2) / (abv_2**1.5 + 1))

squared:  0.18
linear 0.06
logarithmic:  0.027725887222397813
logarithmic scaled:  0.008109302162163289
linear scaled:  0.01
Squared scaled:  0.011467204289329453


In [35]:
# Add bonus for match in style
relevant_styles = beer_info.loc[average_similarities_per_beer.index.tolist()]["subgenre"]
style = "American Pale Ale"
style_mask = relevant_styles == style
style_bonus = np.zeros(len(totoal_score))
style_bonus[style_mask] = 0.1 

total_score_plus += style_bonus

In [58]:
total_score_plus.nlargest(10)

name
Trappistes Rochefort 10                1.597595
St. Bernardus Abt 12                   1.522006
Ayinger Celebrator Doppelbock          1.361581
Schneider Weisse Tap 06 - Aventinus    1.323996
Westmalle Dubbel                       1.231155
Hoegaarden                             1.143887
Weihenstephaner Hefeweissbier          1.116893
Westvleteren 12                        1.079957
Paulaner Salvator                      1.064318
Rodenbach Grand Cru                    1.057981
dtype: float64

In [52]:
# TODO Tilføj overkategorier

{'Abbey Dubbel',
 'Abbey Tripel',
 'Abt/Quadrupel',
 'All Cider Styles',
 'All Sake Styles',
 'Altbier',
 'Amber Ale',
 'Amber Lager/Vienna',
 'American Pale Ale',
 'American Strong Ale',
 'Anglo-American Ales',
 'Baltic Porter',
 'Barley Wine',
 'Belgian Ale',
 'Belgian Strong Ale',
 'Belgian-Style Ales',
 'Berliner Weisse',
 'Bitter',
 'Bière de Garde',
 'Black IPA',
 'Brown Ale',
 'California Common',
 'Cider',
 'Cream Ale',
 'Czech Pilsner (Svetlý)',
 'Doppelbock',
 'Dortmunder/Helles',
 'Dry Stout',
 'Dunkel/Tmavý',
 'Dunkelweizen',
 'Dunkler Bock',
 'Eisbock',
 'English Pale Ale',
 'English Strong Ale',
 'Foreign Stout',
 'Fruit Beer',
 'German Hefeweizen',
 'German Kristallweizen',
 'Golden Ale/Blond Ale',
 'Grodziskie/Gose/Lichtenhainer',
 'Heller Bock',
 'Ice Cider/Perry',
 'Imperial Pils/Strong Pale Lager',
 'Imperial Stout',
 'Imperial/Double IPA',
 'Imperial/Strong Porter',
 'India Pale Ale (IPA)',
 'Irish Ale',
 'Kölsch',
 'Lagers',
 'Lambic and Sour Ale',
 'Low Alcohol',


In [None]:
# Evaluate
# Evaluer tid det tager
# Evalear precision og recall på reccomendations
# Evaluer og tune paramteres LSH

