Let's start by importing the data for beers and from user

In [1]:
import pandas as pd

# Load the datasets
beer_df = pd.read_csv('archive/beer_profile_and_ratings.csv')
users_df = pd.read_csv('archive/users_Y.csv')

# Display the first few rows of each dataframe to understand their structure
beer_df.head(), users_df.head()


(                           Name    Style  \
 0                         Amber  Altbier   
 1                    Double Bag  Altbier   
 2                Long Trail Ale  Altbier   
 3                  Doppelsticke  Altbier   
 4  Sleigh'r Dark Doüble Alt Ale  Altbier   
 
                                             Brewery  \
 0                               Alaskan Brewing Co.   
 1                            Long Trail Brewing Co.   
 2                            Long Trail Brewing Co.   
 3  Uerige Obergärige Hausbrauerei GmbH / Zum Uerige   
 4                           Ninkasi Brewing Company   
 
                                     Beer Name (Full)  \
 0                  Alaskan Brewing Co. Alaskan Amber   
 1                  Long Trail Brewing Co. Double Bag   
 2              Long Trail Brewing Co. Long Trail Ale   
 3  Uerige Obergärige Hausbrauerei GmbH / Zum Ueri...   
 4  Ninkasi Brewing Company Sleigh'r Dark Doüble A...   
 
                                          Desc

In [2]:
beer_df.shape, beer_df.columns, users_df.shape, users_df.columns

((3197, 25),
 Index(['Name', 'Style', 'Brewery', 'Beer Name (Full)', 'Description', 'ABV',
        'Min IBU', 'Max IBU', 'Astringency', 'Body', 'Alcohol', 'Bitter',
        'Sweet', 'Sour', 'Salty', 'Fruits', 'Hoppy', 'Spices', 'Malty',
        'review_aroma', 'review_appearance', 'review_palate', 'review_taste',
        'review_overall', 'number_of_reviews'],
       dtype='object'),
 (3197, 10),
 Index(['Column_1', 'Column_2', 'Column_3', 'Column_4', 'Column_5', 'Column_6',
        'Column_7', 'Column_8', 'Column_9', 'Column_10'],
       dtype='object'))

Modify the user dataframe to make it clearer

In [3]:
# Rename the columns in users_df to more meaningful names
users_df.columns = [f'User {i+1}' for i in range(users_df.shape[1])]

# Display the first few rows of the renamed users_df
users_df.head()

Unnamed: 0,User 1,User 2,User 3,User 4,User 5,User 6,User 7,User 8,User 9,User 10
0,0,4,2,2,0,0,4,0,0,1
1,3,4,0,0,0,3,0,0,4,0
2,0,0,0,0,0,4,0,2,0,1
3,0,0,4,0,4,0,0,5,0,2
4,5,5,3,0,0,0,0,3,0,0


In [4]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Prepare data for the Surprise library
# Convert users_df to long format (user, item, rating)
ratings = users_df.reset_index().melt(id_vars=['index'], var_name='user', value_name='rating')
ratings.columns = ['beer_id', 'user', 'rating']

# Remove rows where rating is 0 (assuming 0 means no rating)
ratings = ratings[ratings['rating'] != 0]

# Define a Reader object and load the data
reader = Reader(rating_scale=(1, 5))  
data = Dataset.load_from_df(ratings[['user', 'beer_id', 'rating']], reader)

# Split the data into training and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Train the SVD model
algo = SVD()
algo.fit(trainset)

# Evaluate the model
predictions = algo.test(testset)
accuracy.rmse(predictions)


RMSE: 1.4744


1.4744395505537649

In [5]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Assuming users_df is already loaded and columns renamed
ratings = users_df.reset_index().melt(id_vars=['index'], var_name='user', value_name='rating')
ratings.columns = ['beer_id', 'user', 'rating']
ratings = ratings[ratings['rating'] != 0]

# Define a Reader object and load the data
reader = Reader(rating_scale=(1, 5))  # Binary rating scale
data = Dataset.load_from_df(ratings[['user', 'beer_id', 'rating']], reader)

# Split the data into training and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Train the SVD model
algo = SVD()
algo.fit(trainset)

# Evaluate the model
predictions = algo.test(testset)
print(f"RMSE: {accuracy.rmse(predictions)}")


RMSE: 1.4866
RMSE: 1.4865652960981417


In [8]:
def get_top_n(predictions, n=10, id_to_name=None):
    from collections import defaultdict
    #create a dictionary containing as keys the users
    #each item contains an item id and the predicted rating
    top_n = defaultdict(list)
    #uid: User ID
    #iid: Item ID (beer ID in this case)
    #true_r: True rating (provided by the user)
    #est: Estimated rating (predicted by the model)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    
    #Sort the items
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        if id_to_name:
            user_ratings = [(id_to_name[iid], est) for (iid, est) in user_ratings[:n]]
        else:
            user_ratings = user_ratings[:n]
        top_n[uid] = user_ratings
    
    return top_n
# Generate recommendations for all users
#build_anti_testset(): This method generates an "anti-testset" from the training set. The anti-testset consists of all user-item pairs that are not present in the training set, i.e., all items that a user has not rated. Essentially, it includes every possible user-item pair that could be recommended, minus the pairs that were in the training set.
#algo: This is the trained collaborative filtering model (in this case, an instance of SVD).
# The test method takes a set of user-item pairs and predicts ratings for them. The input should be a list of tuples 
predictions = algo.test(trainset.build_anti_testset())
beer_id_to_name = beer_df.reset_index().set_index('index')['Name'].to_dict()
top_n = get_top_n(predictions, n=10, id_to_name=beer_id_to_name)

# Print the recommended items for each user
for user, user_ratings in top_n.items():
    print(f"User {user}: {user_ratings}")


User User 10: [('Cuvee Freddy', 4.233329338935087), ('Heineken Tarwebok', 4.2243549168586), ("Milwaukee's Best Light", 4.176924492735365), ('Saranac Chocolate Lager', 4.169674555757348), ('De Koninck APA (Antwaarpse Pale Ale)', 4.142330068784463), ('Speedway Stout', 4.1182297526512865), ('Wolters Pilsener', 4.117519083791828), ('Koko Brown', 4.114865430573337), ('Ola Dubh Special Reserve 12', 4.0824672241846285), ('Imperial Stout', 4.075612606325105)]
User User 5: [('Dos Equis Ambar', 4.43420830920791), ('The Saints Whisky Beer', 4.342168563309224), ('White Birch Hooksett Ale', 4.272017556853708), ('Oude Lambik', 4.256099147652705), ('Strike Out Stout', 4.249504283042972), ('Estrella Damm Inedit', 4.202355689496885), ('Witkap Pater Singel / Stimulo', 4.194083525751382), ('Lou Pepe - Gueuze', 4.173838233828795), ('Pipeline Porter', 4.161166880172982), ('Bone Warmer Imperial Amber Ale', 4.16040395160125)]
User User 1: [('Old Man Winter', 4.197204411203563), ('Westmalle Trappist Dubbel', 