> Chalkiopoulos Georgios, Electrical and Computer Engineer NTUA <br />
> Data Science postgraduate Student <br />
> gchalkiopoulos@aueb.gr

# Import Libraries

In [45]:
from logging import INFO, WARNING, DEBUG, ERROR

import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds

# utils
from utils import helper_functions
from utils import lda
from utils import reader
from utils import svd
from utils.Loggers import BaseLogger
from utils.reader import ReviewReader

# Set Logging level

In [2]:
# Set the class-level `level` attribute of BaseLogger to logging.INFO.
# DEBUG / WARNING / ERROR are imported alongside INFO and can be swapped in here.
BaseLogger.level = INFO

# Read data

In [3]:
# Path of the raw reviews CSV, relative to the notebook's working directory.
file_path: str = "beer_reviews.csv"

# Build the reader from the class itself instead of `reader.ReviewReader(...)`.
# The original expression looked the class up on the name `reader` and then
# rebound that same name to the instance, so running this cell a second time
# raised AttributeError (an instance has no `ReviewReader` attribute).
# `reader` is still bound to the instance because later cells read
# `reader.train`, `reader.beer_mapping` and `reader.user_mapping`.
reader = ReviewReader(file_path=file_path)
reviews: pd.DataFrame = reader.read_reviews()

[2023-03-04 19:10:58,907] INFO [ReviewReader] - Loading beer_reviews.csv
[2023-03-04 19:11:06,287] INFO [ReviewReader] - 300000 reviews loaded.
[2023-03-04 19:11:12,490] INFO [ReviewReader] - 600000 reviews loaded.
[2023-03-04 19:11:18,856] INFO [ReviewReader] - 900000 reviews loaded.
[2023-03-04 19:11:25,028] INFO [ReviewReader] - 1200000 reviews loaded.
[2023-03-04 19:11:31,426] INFO [ReviewReader] - 1500000 reviews loaded.
[2023-03-04 19:11:33,258] INFO [ReviewReader] - All reviews loaded. Total reviews: 1586614


In [4]:
# Print the (rows, columns) of the loaded frame, then show the first rows as
# the cell's rich output.
print("Shape: ", reviews.shape)
reviews.head()

Shape:  (1399623, 12)


Unnamed: 0,user_id,user_name,beer_id,beer_name,beer_style,review_overall,review_aroma,review_appearance,review_taste,review_palate,brewery_id,rating
10,7,fodeeoz,436,Amstel Light,Light Lager,3.0,2.0,3.0,2.5,2.5,163,A
18,15,jdhilt,436,Amstel Light,Light Lager,2.5,3.0,3.0,2.0,2.0,163,A
19,16,UCLABrewN84,58046,Rauch Ür Bock,Rauchbier,4.5,4.5,3.0,4.5,4.0,1075,P
20,17,zaphodchak,58046,Rauch Ür Bock,Rauchbier,4.0,4.0,4.0,4.0,3.0,1075,P
21,18,Tilley4,58046,Rauch Ür Bock,Rauchbier,4.0,4.5,4.0,4.0,3.5,1075,P


## Item-Based and User-Based Recommendations with Hashing

* Due to the size of the data, we use hashing to keep the calculations feasible.

In [5]:
# Keyword arguments common to both indexers: the training reviews, the names of
# the id columns, and the id mappings exposed by the reader.
_shared_indexer_args = dict(
    reviews=reader.train,
    beer_id_col='beer_id',
    user_id_col='user_id',
    beer_mapping=reader.beer_mapping,
    user_mapping=reader.user_mapping,
)

# User-focused index, loaded from / saved to its own pickle file.
# NOTE(review): the helper logs that it tries to load the .pkl first; only
# load pickle files you produced yourself.
indexer_user = helper_functions.load_indexer(
    focus="usr",
    indexer_path="indexer_usr_subset_test.pkl",
    **_shared_indexer_args,
)

# Beer-focused index, built the same way with its own pickle file.
indexer_beer = helper_functions.load_indexer(
    focus="beer",
    indexer_path="indexer_beer_subset_test.pkl",
    **_shared_indexer_args,
)

[2023-03-04 19:12:21,536] INFO [BaseLogger] - Trying to load indexer_usr_subset_test.pkl file.


Data loaded
Loading ratings: [████████████████████████████████████████████████████████████] 10596/10596

user ratings processed
10500 out of 10596 entities indexed.

[2023-03-04 19:15:17,755] INFO [BaseLogger] - Saving indexer_usr_subset_test.pkl file.


usr-based index created


[2023-03-04 19:15:22,077] INFO [BaseLogger] - indexer_usr_subset_test.pkl file saved.
[2023-03-04 19:15:22,079] INFO [BaseLogger] - Trying to load indexer_beer_subset_test.pkl file.


Data loaded
Loading ratings: [████████████████████████████████████████████████████████████] 3929/3929

beer ratings processed
3500 out of 3929 entities indexed.

[2023-03-04 19:16:32,459] INFO [BaseLogger] - Saving indexer_beer_subset_test.pkl file.


beer-based index created


[2023-03-04 19:16:34,865] INFO [BaseLogger] - indexer_beer_subset_test.pkl file saved.


In [10]:
# User-based recommendations for user 10 (top 10). The helper prints the
# suggestions itself (verbose=1) and returns a dict whose values are rating
# labels; "P" is the label counted as positive below.
already_rated = helper_functions.recommend_ub(indexer_user, user=10, rec_num=10, verbose=1)

rated_labels = list(already_rated.values())

# Guard the ratio: an empty result would otherwise raise ZeroDivisionError.
positive_ratio = (
    f'{rated_labels.count("P")/len(rated_labels)*100:.2f}%' if rated_labels else "n/a"
)

print("\n\nPositive ratio:", positive_ratio)
print("Already Rated number of beers:", f'{len(rated_labels)}')


I suggest the following beers because they have received positive ratings
from users who tend to like what you like:

15881 Tröegs Nugget Nectar 27.07
21822 Founders Imperial Stout 25.81
33644 B.O.R.I.S. The Crusher Oatmeal-Imperial Stout 24.27
5441 Founders Centennial IPA 23.89
34146 Founders Double Trouble 20.92
7348 Founders Porter 20.52
6368 Masala Mama India Pale Ale 19.36
30288 Double Simcoe IPA 18.86
3635 La Terrible 18.78
35036 Founders Backwoods Bastard 18.57


Positive ratio: 73.21%
Already Rated number of beers: 56


In [11]:
# Beer-based recommendations for user 10 (top 10), using the beer-focused
# index. The helper prints the suggestions itself (verbose=1) and returns a
# dict whose values are rating labels; "P" is the label counted as positive.
already_rated = helper_functions.recommend_mb(indexer_beer, user=10, rec_num=10, verbose=1)

rated_labels = list(already_rated.values())

# Guard the ratio: an empty result would otherwise raise ZeroDivisionError.
positive_ratio = (
    f'{rated_labels.count("P")/len(rated_labels)*100:.2f}%' if rated_labels else "n/a"
)

print("\n\nPositive ratio:", positive_ratio)
print("Already Rated number of beers:", f'{len(rated_labels)}')


I suggest the following beers because they are similar to the beers you already like:


 21822 Founders Imperial Stout 10.09

 5441 Founders Centennial IPA 9.82

 33644 B.O.R.I.S. The Crusher Oatmeal-Imperial Stout 9.58

 15881 Tröegs Nugget Nectar 9.12

 30288 Double Simcoe IPA 8.01

 7348 Founders Porter 7.92

 8023 Siberian Night Imperial Stout 7.91

 6368 Masala Mama India Pale Ale 7.82

 34146 Founders Double Trouble 7.78

 7463 Founders Dirty Bastard 7.24


Positive ratio: 76.00%
Already Rated number of beers: 50


# LDA

In [12]:


# Build one "document" per user: the list of that user's reviews, each encoded
# as a single token "<beer_id><rating>" (beer id as text followed by the rating label).
BaseLogger().logger.info("Creating list of reviews per user.")

# Concatenate the two columns once for the whole frame, then collect the
# tokens per user. This replaces a per-group `groupby(...).apply(lambda ...)`,
# which runs Python code once per user and, on pandas >= 2.2, warns because the
# grouping column is part of the frame handed to `apply`.
review_tokens = reviews["beer_id"].astype(str).str.cat(reviews["rating"])
reviews_user = (
    pd.DataFrame({"user_id": reviews["user_id"], "token": review_tokens})
    .groupby("user_id")["token"]
    .agg(list)
)

print("Number of users:", reviews_user.shape[0])

# Map user_id -> list of tokens; this is the `docs` input of the LDA setup cell.
BaseLogger().logger.info("Creating User docs.")
docs: dict = dict(zip(reviews_user.index, reviews_user.values))
BaseLogger().logger.info("Docs created.")

[2023-03-04 19:20:07,823] INFO [BaseLogger] - Creating list of reviews per user.
[2023-03-04 19:20:17,154] INFO [BaseLogger] - Creating User docs.
[2023-03-04 19:20:17,159] INFO [BaseLogger] - Docs created.


Number of users: 10707


In [14]:
# Fit LDA on the per-user documents and keep the result in `groups`, which the
# recommendation cell below passes to `lda.recommend_lda`.
# NOTE(review): presumably sweeps the topic count over k_range in steps of
# k_step and checks the log-likelihood every iterations_step iterations, as the
# log output suggests - confirm against utils.lda.
# NOTE(review): no random seed is passed here; check whether utils.lda fixes
# one, otherwise results may differ between runs.
groups = lda.setup_LDA(docs=docs, k_range=(3, 7), k_step=2,
                   iteration_range=(0, 500), iterations_step=20,
                   top_n=100, reviews=reviews, verbose=0)

[2023-03-04 19:21:46,644] INFO [BaseLogger] - Iteration: 0	Log-likelihood: -9.507766652569602	k: 3
[2023-03-04 19:21:47,175] INFO [BaseLogger] - Iteration: 20	Log-likelihood: -9.42487249723757	k: 3
[2023-03-04 19:21:47,632] INFO [BaseLogger] - Iteration: 40	Log-likelihood: -9.39951295040399	k: 3
[2023-03-04 19:21:48,073] INFO [BaseLogger] - Iteration: 60	Log-likelihood: -9.388558425373347	k: 3
[2023-03-04 19:21:48,507] INFO [BaseLogger] - Iteration: 80	Log-likelihood: -9.383892648196417	k: 3
[2023-03-04 19:21:48,930] INFO [BaseLogger] - Iteration: 100	Log-likelihood: -9.381045812750724	k: 3
[2023-03-04 19:21:49,356] INFO [BaseLogger] - Iteration: 120	Log-likelihood: -9.380649582809836	k: 3
[2023-03-04 19:21:49,781] INFO [BaseLogger] - Iteration: 140	Log-likelihood: -9.380847096122544	k: 3
[2023-03-04 19:21:50,216] INFO [BaseLogger] - Iteration: 160	Log-likelihood: -9.381494160803975	k: 3
[2023-03-04 19:21:50,631] INFO [BaseLogger] - Iteration: 180	Log-likelihood: -9.381604493704442	k: 

Best Model: k=3, iterations=120, Log-likelihood=-9.38


In [17]:
# LDA-based recommendations for user 10 (up to 10 beers), using the user
# ratings held by the user-focused indexer and the reader's beer mapping.
best_recommendations = lda.recommend_lda(groups=groups, user=10, rec_num=10, _user_ratings=indexer_user.user_ratings, _beer_mapping=reader.beer_mapping)

# Each item unpacks into (beer_id, beer name); the id is left-aligned in a
# 6-character column.
print("Best Recommendations\n")
for beer_id, beer in best_recommendations:
    print(f"{beer_id:<6}: {beer}")

Group: 1
Positive ratio: 50.00%
Already Rated number of beers: 4

Group: 2
Positive ratio: 77.78%
Already Rated number of beers: 9

Group: 3
Positive ratio: 100.00%
Already Rated number of beers: 2

Best Recommendations

7971  : Pliny The Elder
17112 : Bell's Hopslam Ale
34483 : Ten FIDY
34420 : The Abyss
29619 : Sculpin India Pale Ale
10672 : Bourbon County Brand Stout
15881 : Tröegs Nugget Nectar
35738 : Hop Stoopid
28203 : Furious
1093  : Two Hearted Ale


# Model Based

In [49]:
# --- Loop-invariant preparation (previously recomputed for every k) ---------

# Convert the ratings frame to a dense user x beer matrix; cells without a
# rating are filled with the global mean of review_overall.
ratings_matrix = (
    reviews.sort_values(by=["user_id", "beer_id"])[["user_id", "beer_id", "review_overall"]]
    .drop_duplicates()
    .pivot_table(index='user_id', columns='beer_id', values='review_overall')
    .fillna(reviews.review_overall.mean())
)

# Convert the matrix into an array
ratings_np = ratings_matrix.values
# Compute the average rating per user (over the filled matrix)
user_means = np.mean(ratings_np, axis=1)
# Subtract each user's mean from that user's row
ratings_np_centered = ratings_np - user_means.reshape(-1, 1)

# Unique user / beer counts and the resulting sparsity of the ratings matrix.
# (`movie_num` holds the number of beers; the name is kept for compatibility.)
user_num = reviews.user_id.unique().shape[0]
movie_num = reviews.beer_id.unique().shape[0]
sparsity = round(1.0 - len(reviews) / float(user_num * movie_num), 3)

# --- Sweep over the number of singular values --------------------------------
for k in range(1, 302, 50):
    # Truncated SVD with k singular values (latent features)
    U, sigma, Vt = svds(ratings_np_centered, k=k)
    sigma = np.diag(sigma)

    for user in [5, 10, 20, 30]:
        try:
            print(k, user, svd.validate(user, reviews, U, sigma, Vt, user_means, ratings_matrix))
        except Exception as exc:
            # Catch Exception rather than using a bare `except:` so that
            # KeyboardInterrupt still stops the sweep, and show the actual
            # error instead of assuming the user is missing.
            print("user not found", user, repr(exc))

1 0.5749482085053034
51 0.5621308443837764
101 0.5707728964123
151 0.5783694398844864
201 0.5857972296143055
251 0.5863823845452202


KeyboardInterrupt: 

In [47]:
# Top-10 recommendations for user 5 from the truncated-SVD factors. U, sigma
# and Vt are whatever the sweep cell above assigned last, i.e. the last k it
# reached.
# NOTE(review): `svd` must still be the utils.svd module here; the Surprise
# cross-validation cell further down rebinds `svd` to a model object, after
# which this cell no longer runs.
svd.recommend(uid=5, reviews=reviews,U=U,sigma=sigma,Vt=Vt,user_means=user_means,rec_num=10, ratings_matrix=ratings_matrix)


Unnamed: 0,beer_id,beer_name,beer_style,predicted_rating
156621,689,Red Stripe Jamaican Lager,American Adjunct Lager,5.182655
661937,2435,Beck's,German Pilsener,5.136614
1050868,630,Blue Heron Pale Ale,American Pale Ale (APA),5.135503
1246101,406,ACME California Pale Ale,American Pale Ale (APA),5.133698
1254548,403,ACME California IPA,American IPA,5.131946
1054684,4970,White Hawk Original IPA,American IPA,5.124118
1448674,33609,St. Peter's India Pale Ale,English India Pale Ale (IPA),5.117401
276553,27243,Atwater Salvation IPA,American IPA,5.110951
390113,26497,Fleur-de-lis Restoration Ale,American Pale Ale (APA),5.107196
1000753,1337,Berghoff Famous Red Ale,American Amber / Red Ale,5.10589


0.9210697086963819

In [44]:
# Validation score of the current SVD factors for user 5 (same call the sweep
# cell makes for each user).
# NOTE(review): relies on `svd` still being the utils.svd module - see the
# Surprise cross-validation cell below, which rebinds the name.
svd.validate(5,reviews,U,sigma,Vt,user_means,ratings_matrix)

0.5621307685626032

In [None]:


# Import libraries from Surprise package
from surprise import Reader, Dataset, SVD

# Surprise reader describing the rating scale of review_overall.
# Bound to `surprise_reader` rather than `reader`: `reader` is the ReviewReader
# instance whose `train`, `beer_mapping` and `user_mapping` attributes are used
# by the indexer and LDA cells, and rebinding it broke re-running those cells.
surprise_reader = Reader(rating_scale=(0.5, 5))

# Load the de-duplicated (user, beer, rating) triples as a Surprise dataset
data = Dataset.load_from_df(
    reviews[['user_id', 'beer_id', 'review_overall']].drop_duplicates(),
    surprise_reader,
)

In [None]:
from surprise.model_selection.validation import cross_validate


# 5-fold cross-validated RMSE of Surprise's SVD for several latent-factor
# counts. The per-fold results are printed by cross_validate (verbose=True);
# its return value is discarded.
for factors in [10, 25, 50]:
    print(factors)
    # Use the SVD algorithm.
    # NOTE(review): this rebinds `svd`, which the import cell bound to the
    # utils.svd module; the earlier svd.validate / svd.recommend cells fail if
    # re-run after this one. Later cells use `svd` as the model, so a rename
    # has to be made in all of them together.
    svd = SVD(n_factors=factors)

    # Compute the RMSE of the SVD algorithm.
    cross_validate(svd, data, measures=['RMSE'],cv=5,verbose=True)
    print("done")

In [None]:
# Build a training set from the whole dataset (no hold-out split)
trainset = data.build_full_trainset()

# `svd` is the model left over from the cross-validation loop, i.e. the one
# created for the last value in its factor list.
svd.fit(trainset)# fit the svd on the full training set

In [None]:
def recommend_surprise(uid:int,
              reviews:pd.core.frame.DataFrame,
              model,
              rec_num:int
             ) -> pd.core.frame.DataFrame:
    """Recommend the beers with the highest predicted rating for one user.

    Parameters
    ----------
    uid : int
        Id of the user, matched against ``reviews.user_id``.
    reviews : pandas.DataFrame
        Reviews with at least the columns ``user_id``, ``beer_id``,
        ``beer_name`` and ``beer_style``.
    model
        Fitted model exposing ``predict(uid=..., iid=...)`` whose result has an
        ``est`` attribute (the predicted rating).
    rec_num : int
        Maximum number of beers to recommend. Values <= 0 give an empty frame
        (previously they returned every unrated beer, because the stop
        condition was never met).

    Returns
    -------
    pandas.DataFrame
        Rows of the unique (beer_id, beer_name, beer_style) combinations for
        the recommended beers, with an added ``predicted_rating`` column,
        sorted by that column in descending order.
    """
    # One row per distinct (id, name, style) combination.
    beers_df = reviews[["beer_id", "beer_name", "beer_style"]].drop_duplicates()

    # Beers this user has already reviewed are never recommended.
    already_rated = set(reviews.loc[reviews.user_id == uid, "beer_id"])

    # Predict only for candidate (unrated) beers, iterating the unique ids
    # directly instead of walking the frame row by row with iterrows().
    pred_dict = {
        beer_id: model.predict(uid=uid, iid=beer_id).est
        for beer_id in beers_df["beer_id"].unique().tolist()
        if beer_id not in already_rated
    }

    # sorted() is stable, so ties keep the first-appearance order of the beers.
    ranked = sorted(pred_dict.items(), key=lambda item: item[1], reverse=True)
    rec_ids = [beer_id for beer_id, _ in ranked[:max(rec_num, 0)]]

    return (
        beers_df[beers_df.beer_id.isin(rec_ids)]
        .assign(predicted_rating=lambda frame: frame["beer_id"].map(pred_dict))
        .sort_values(["predicted_rating"], ascending=False)
    )

In [None]:
# Top-20 recommendations for user 5 from the Surprise SVD model fitted above.
recommend_surprise(5,
              reviews,
              svd,
              20
             )

In [None]:
def validate(uid:int,
              reviews:pd.core.frame.DataFrame,
              model
             ) -> float:
    """Root-mean-squared error of the model on one user's own ratings.

    Parameters
    ----------
    uid : int
        Id of the user, matched against ``reviews.user_id``.
    reviews : pandas.DataFrame
        Reviews with at least the columns ``user_id``, ``beer_id`` and
        ``review_overall``.
    model
        Fitted model exposing ``predict(uid=..., iid=...)`` whose result has an
        ``est`` attribute (the predicted rating).

    Returns
    -------
    float
        RMSE between the user's recorded ``review_overall`` values and the
        model's predictions for the same beers.

    Raises
    ------
    ValueError
        If the user has no ratings in ``reviews``.
    """
    my_ratings = reviews.loc[reviews.user_id == uid, ["beer_id", "review_overall"]]
    if my_ratings.empty:
        raise ValueError(f"user {uid} has no ratings in `reviews`")

    # beer_id -> rating; if a beer appears more than once the last rating wins.
    already_rated = dict(zip(my_ratings.beer_id, my_ratings.review_overall))

    # Only the beers the user actually rated are needed for the error, so
    # predictions are made for those alone rather than for every beer.
    actual = np.array(list(already_rated.values()), dtype=float)
    pred = np.array(
        [model.predict(uid=uid, iid=beer_id).est for beer_id in already_rated],
        dtype=float,
    )

    # RMSE computed with numpy: the original called `mean_squared_error`, which
    # this notebook never imports, with a `squared=False` argument that recent
    # scikit-learn releases no longer accept.
    return float(np.sqrt(np.mean((actual - pred) ** 2)))

In [None]:
# Error of the fitted Surprise SVD model on user 5's own ratings.
validate(5,
          reviews,
          svd
         )

In [None]:
from surprise.prediction_algorithms.knns import KNNBasic

In [None]:
# Item-based KNN (user_based=False) with up to 40 neighbours and a
# min_support of 5 passed through sim_options.
knn=KNNBasic(k=40, sim_options={'user_based':False,'min_support':5})
knn.fit(trainset)# fit the KNN model on the full training set

In [None]:
# Compute the 5-fold cross-validated RMSE of the item-based KNN algorithm;
# per-fold results are printed (verbose=True) and the return value is discarded.
cross_validate(knn, data, measures=['RMSE'],cv=5,verbose=True)

In [None]:
# NOTE(review): this cell repeats the item-based KNN cell two cells above
# verbatim (same parameters, same `knn` name); it looks like a leftover and
# could be deleted.
knn=KNNBasic(k=40, sim_options={'user_based':False,'min_support':5})
knn.fit(trainset)# fit the KNN model on the full training set

In [None]:
# Compute the RMSE of the KNN algorithm.
# NOTE(review): repeats the cross-validation cell two cells above verbatim;
# it looks like a leftover and could be deleted.
cross_validate(knn, data, measures=['RMSE'],cv=5,verbose=True)

In [None]:
# User-based KNN with up to 40 neighbours and a min_support of 5.
# Surprise selects user- vs item-based similarity through the `user_based` key
# of sim_options. The original passed 'item_based': False, which is not a key
# Surprise reads, so the model only happened to be user-based because that is
# the default. Spell the intent out explicitly.
knn2=KNNBasic(k=40, sim_options={'user_based':True,'min_support':5})
knn2.fit(trainset)# fit the user-based KNN model on the full training set

In [None]:
# Compute the 5-fold cross-validated RMSE of the second KNN model (knn2);
# per-fold results are printed (verbose=True) and the return value is discarded.
cross_validate(knn2, data, measures=['RMSE'],cv=5,verbose=True)