> Chalkiopoulos Georgios, Electrical and Computer Engineer NTUA <br />
> Data Science postgraduate Student <br />
> gchalkiopoulos@aueb.gr

# Import Libraries

In [1]:
import csv
from pathlib import Path
from dataclasses import dataclass, fields
from datetime import datetime

from typing import List, Optional, Union, Set


import pandas as pd

import pickle
from collections import defaultdict
from logging import INFO, WARNING, DEBUG, ERROR

# utils
from utils.Loggers import BaseLogger
from utils.esim import Esim
from utils.helper_functions import *

# Set Logging level

In [2]:
BaseLogger.level = INFO

# Read data

In [3]:
@dataclass
class BeerReview(object):
    index: int
    brewery_id: str
    brewery_name: str
    review_time: datetime.timestamp
    review_overall: float
    review_aroma: float
    review_appearance: float
    review_profilename: str
    beer_style: str
    review_palate: float
    review_taste: float
    beer_name: str
    beer_abv: float
    beer_beerid: int
    user_id: Optional[Union[int, None]] = None

In [106]:
class ReviewReader(BaseLogger):
    """Read reviews based on an input file"""
    reviews: List[BeerReview] = []

    def __init__(self, file_path: str,
                 user_reviews_threshold: int = 10,
                 beer_reviews_threshold: int = 10,
                 subset: float = 1.0,
                 *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.file_path = file_path
        self.user_reviews_threshold = user_reviews_threshold
        self.beer_reviews_threshold = beer_reviews_threshold
        self.subset = subset

        # initialize users and beers dict
        self.users: defaultdict = defaultdict(int) # to keep track of user reviews counts
        self.beers: defaultdict = defaultdict(int) # to keep track of beer reviews counts
            
        self.user_mapping: defaultdict = defaultdict(str)
        self.beer_mapping: defaultdict = defaultdict(str)



    @property
    def input_file(self) -> Path:
        """
        Creates a Path object containing the input file.
        Raises an exception if the file doesn't exist

        Returns:
            Path object
        """
        input_file: Path = Path(self.file_path)
        if not input_file.is_file():
            self.logger.error(f"{input_file.name} file doesn't exist.")

        return input_file

    @property
    def valid_users(self) -> list:
        """Returns that have more than user_reviews_threshold reviews"""
        return [user for user, total_ratings in self.users.items() if total_ratings >= self.user_reviews_threshold]

    @property
    def valid_beers(self) -> list:
        """Returns that have more than user_reviews_threshold reviews"""
        return [beer for beer, total_ratings in self.beers.items() if total_ratings >= self.beer_reviews_threshold]
    
    @staticmethod
    def make_array(_review: BeerReview):
        """Select the necessary columns needed from the BeerReview object
        user_id, review_profilename
        beer_beerid, review_overall
        review_aroma, review_appearance, review_taste, review_palate"""

        return [_review.user_id, _review.review_profilename,
                _review.beer_beerid,_review.beer_name, _review.beer_style,
                _review.review_overall, _review.review_aroma, _review.review_appearance, _review.review_taste, _review.review_palate,
                _review.brewery_id]


    def filtered_reviews(self, reviews_df: pd.DataFrame) -> pd.DataFrame:
        """Filters the reviews dataframe to keep users based on defined thresholds"""

        # filter users
        return reviews_df.loc[(reviews_df.user_name.isin(self.valid_users))
                              & (reviews_df.beer_id.isin(self.valid_beers))
                            & (reviews_df.user_name != "")].iloc[:int((reviews_df.shape[0]*self.subset)), :]


    def read_reviews(self) -> pd.DataFrame:
        """
        Read the reviews based on the input file. Returns a list of reviews.        """
        with open(self.input_file, encoding="utf8") as f:

            self.logger.info(f"Loading {self.input_file}")

            # initialize user_id
            user_id: dict = {}
            id: int = 0

            for i, row in enumerate(csv.DictReader(f)):

                # add user_id and user_mapping
                if user_id.get(row["review_profilename"].strip()) is None:
                    user_id[row["review_profilename"].strip()] = id                    
                    self.user_mapping[id] = row["review_profilename"].strip()
                    id += 1
                
                # add beer_mapping
                if not self.beer_mapping.get(row["beer_beerid"]):
                    self.beer_mapping[row["beer_beerid"]] = row["beer_name"]
                
                
                # create a review object
                review: BeerReview = BeerReview(
                    int(row["index"]) if row["index"].isnumeric() else None,
                    row["brewery_id"],
                    row["brewery_name"].strip(),
                    datetime.fromtimestamp(int(row["review_time"])),
                    float(row["review_overall"]),
                    float(row["review_aroma"]),
                    float(row["review_appearance"]),
                    row["review_profilename"].strip(),
                    row["beer_style"].strip(),
                    row["review_palate"],
                    row["review_taste"],
                    row["beer_name"],
                    row["beer_abv"],
                    int(row["beer_beerid"]),
                    user_id[row["review_profilename"]])

                # add the review object to the total reviews
                self.reviews.append(self.make_array(review))

                # Keep user count
                self.users[review.review_profilename] += 1
                self.beers[review.beer_beerid] += 1

                if i % 300000 == 0 and i != 0:
                    self.logger.info(f"{i} reviews loaded.")


        self.logger.info(f"All reviews loaded. Total reviews: {len(self.reviews)}")
        f.close()

        # Convert to Pandas Dataframe
        reviews_df = pd.DataFrame(self.reviews,
                                  columns=["user_id", "user_name",
                                            "beer_id","beer_name", "beer_style",
                                           "review_overall", "review_aroma", "review_appearance", "review_taste", "review_palate",
                                            "brewery_id"])
        reviews_df.drop_duplicates(inplace=True)

        return self.filtered_reviews(reviews_df)


In [107]:
file_path: str = "beer_reviews.csv"
reader = ReviewReader(file_path=file_path, subset=0.1)
reviews: pd.DataFrame = reader.read_reviews()

[2023-03-01 00:03:36,569] INFO [ReviewReader] - Loading beer_reviews.csv
[2023-03-01 00:03:43,269] INFO [ReviewReader] - 300000 reviews loaded.
[2023-03-01 00:03:50,532] INFO [ReviewReader] - 600000 reviews loaded.
[2023-03-01 00:03:56,293] INFO [ReviewReader] - 900000 reviews loaded.
[2023-03-01 00:04:02,039] INFO [ReviewReader] - 1200000 reviews loaded.
[2023-03-01 00:04:07,727] INFO [ReviewReader] - 1500000 reviews loaded.
[2023-03-01 00:04:11,273] INFO [ReviewReader] - All reviews loaded. Total reviews: 1586614


In [108]:
print("Shape: ", reviews.shape)
reviews.head()

Shape:  (158584, 11)


Unnamed: 0,user_id,user_name,beer_id,beer_name,beer_style,review_overall,review_aroma,review_appearance,review_taste,review_palate,brewery_id
10,7,fodeeoz,436,Amstel Light,Light Lager,3.0,2.0,3.0,2.5,2.5,163
18,15,jdhilt,436,Amstel Light,Light Lager,2.5,3.0,3.0,2.0,2.0,163
19,16,UCLABrewN84,58046,Rauch Ür Bock,Rauchbier,4.5,4.5,3.0,4.5,4.0,1075
20,17,zaphodchak,58046,Rauch Ür Bock,Rauchbier,4.0,4.0,4.0,4.0,3.0,1075
21,18,Tilley4,58046,Rauch Ür Bock,Rauchbier,4.0,4.5,4.0,4.0,3.5,1075


## Item-Based and User-Based Recommendations with Hashing

* Due to the size of the data, we will use Hashing in order to be able to perform calculations

In [7]:
indexer_beer = load_indexer(focus="beer",
                             reviews=reviews, beer_id_col='beer_id', user_id_col='user_id',
                             beer_mapping=reader.beer_mapping, user_mapping=reader.user_mapping,
                             indexer_path="indexer_beer_subset.pkl")

indexer_user = load_indexer(focus="usr",
                             reviews=reviews, beer_id_col='beer_id', user_id_col='user_id',
                             beer_mapping=reader.beer_mapping, user_mapping=reader.user_mapping,
                             indexer_path="indexer_usr_subset.pkl")

[2023-02-28 22:24:49,131] INFO [BaseLogger] - Trying to load indexer_beer_subset.pkl file.


Data loaded
Loading ratings: [████████████████████████████████████████████████████████████] 1566/1566

beer ratings processed
beer-based index created indexed.

[2023-02-28 22:25:41,447] INFO [BaseLogger] - Saving indexer_beer_subset.pkl file.
[2023-02-28 22:25:42,691] INFO [BaseLogger] - indexer_beer_subset.pkl file saved.
[2023-02-28 22:25:42,692] INFO [BaseLogger] - Trying to load indexer_usr_subset.pkl file.


Data loaded
Loading ratings: [████████████████████████████████████████████████████████████] 9583/9583

user ratings processed
usr-based index createds indexed.

[2023-02-28 22:28:01,343] INFO [BaseLogger] - Saving indexer_usr_subset.pkl file.
[2023-02-28 22:28:04,632] INFO [BaseLogger] - indexer_usr_subset.pkl file saved.


In [8]:
already_rated = recommend_ub(indexer_user, user=5, rec_num=10, verbose=1)


I suggest the following beers because they have received positive ratings
from users who tend to like what you like:


 1904 Sierra Nevada Celebration Ale 173.26021184610593

 2751 Racer 5 India Pale Ale 159.4540317927605

 6549 Northern Hemisphere Harvest Wet Hop Ale 125.42356879863041

 7348 Founders Porter 120.33105725423805

 33644 B.O.R.I.S. The Crusher Oatmeal-Imperial Stout 106.89627620980662

 48434 Sierra Nevada Kellerweis Hefeweizen 101.81794420610073

 7463 Founders Dirty Bastard 98.8171774678231

 283 Sierra Nevada Stout 92.61508135562863

 47658 Founders CBS Imperial Stout 88.0095267433147

 1658 Big Bear Black Stout 87.52241823901221


In [9]:
already_rated = recommend_mb(indexer_beer, user=5, rec_num=10, verbose=1)


I suggest the following beers because they are similar to the beers you already like:


 33644 B.O.R.I.S. The Crusher Oatmeal-Imperial Stout 5.292497785676612

 7348 Founders Porter 4.755895965254549

 6549 Northern Hemisphere Harvest Wet Hop Ale 4.595827154944165

 2751 Racer 5 India Pale Ale 4.2599818067698365

 48434 Sierra Nevada Kellerweis Hefeweizen 3.6296267722769553

 47658 Founders CBS Imperial Stout 3.6202756962742395

 1658 Big Bear Black Stout 3.5124413421421354

 49286 Mokah 3.4997675126927885

 7463 Founders Dirty Bastard 3.355135886452315

 1904 Sierra Nevada Celebration Ale 3.1824369219308495


# LDA

In [12]:
import tomotopy as tp

# discretize ratings
reviews["rating"] = reviews.apply(lambda x: Esim.discretize_rating(x["review_overall"]), axis=1)

reivews_user = reviews[["user_id", "beer_id", "rating"]].groupby('user_id').apply(lambda x: list(x['beer_id'].str.cat(x['rating'])))

print("Number of users:", reivews_user.shape[0])
reivews_user.head()

docs: dict = dict(zip(reivews_user.index, reivews_user.values))
BaseLogger().logger.info("Docs created.")

[2023-02-28 22:30:19,893] INFO [BaseLogger] - Docs created.


Number of users: 9583


In [23]:
from copy import deepcopy
min_likelihood = -100
for k in range(3, 7, 2):
    #new LDA model
    lda = tp.LDAModel(k=k)

    for doc in docs:
        lda.add_doc(docs[doc])

    #train LDA model
    for i in range(0, 500, 20):
        lda.train(20)
        BaseLogger().logger.info('Iteration: {}\tLog-likelihood: {}\tk: {}'.format(i, lda.ll_per_word, k))
        if lda.ll_per_word > min_likelihood:
            min_data = (i, lda.ll_per_word, k)
            min_likelihood = lda.ll_per_word
            best_lda = lda
min_data

[2023-02-28 22:39:02,879] INFO [BaseLogger] - Iteration: 0	Log-likelihood: -7.269106293037566	k: 3
[2023-02-28 22:39:03,029] INFO [BaseLogger] - Iteration: 20	Log-likelihood: -7.185922215075826	k: 3
[2023-02-28 22:39:03,218] INFO [BaseLogger] - Iteration: 40	Log-likelihood: -7.151881912767065	k: 3
[2023-02-28 22:39:03,433] INFO [BaseLogger] - Iteration: 60	Log-likelihood: -7.138551838631168	k: 3
[2023-02-28 22:39:03,613] INFO [BaseLogger] - Iteration: 80	Log-likelihood: -7.134495909027678	k: 3
[2023-02-28 22:39:03,788] INFO [BaseLogger] - Iteration: 100	Log-likelihood: -7.129393466905827	k: 3
[2023-02-28 22:39:03,965] INFO [BaseLogger] - Iteration: 120	Log-likelihood: -7.1297267411696845	k: 3
[2023-02-28 22:39:04,124] INFO [BaseLogger] - Iteration: 140	Log-likelihood: -7.125713644413285	k: 3
[2023-02-28 22:39:04,291] INFO [BaseLogger] - Iteration: 160	Log-likelihood: -7.1243277112538745	k: 3
[2023-02-28 22:39:04,439] INFO [BaseLogger] - Iteration: 180	Log-likelihood: -7.123485392196994

(440, -7.1045615516585325, 5)

In [24]:
#print topic info
for k in range(best_lda.k):

    topk_words=[pair[0] for pair in best_lda.get_topic_words(k, top_n=100)]

    titles=[(reviews[reviews.beer_id==label[:-1]].beer_name.array[0],reviews[reviews.beer_id==label[:-1]].beer_style.array[0],label[-1]) for label in topk_words]
    print(k)
    for title in titles[:10]:
        print(title)
    print('--------------------------------------')


    print()



0
('Sierra Nevada Pale Ale', 'American Pale Ale (APA)', 'P')
('Sierra Nevada Celebration Ale', 'American IPA', 'P')
('Sierra Nevada Porter', 'American Porter', 'P')
('Sierra Nevada Summerfest Lager', 'Czech Pilsener', 'P')
('Heineken Lager Beer', 'Euro Pale Lager', 'A')
('Sierra Nevada Bigfoot Barleywine Style Ale', 'American Barleywine', 'P')
('Sierra Nevada Stout', 'American Stout', 'P')
('Pilsner Urquell', 'Czech Pilsener', 'P')
('Hop Rod Rye', 'Rye Beer', 'P')
('Red Stripe Jamaican Lager', 'American Adjunct Lager', 'A')
--------------------------------------

1
('High Tide Fresh Hop IPA', 'American IPA', 'P')
('Hop 15', 'American Double / Imperial IPA', 'P')
('Old Viscosity', 'American Double / Imperial Stout', 'P')
('Wipeout I.P.A.', 'American IPA', 'P')
('Racer X', 'American Double / Imperial IPA', 'P')
('Older Viscosity', 'American Double / Imperial Stout', 'P')
("Santa's Little Helper", 'Russian Imperial Stout', 'P')
('Racer 5 India Pale Ale', 'American IPA', 'P')
('Hop Rod Rye

# Model Based

In [110]:
# convert the ratings frame to a user X movies matrix
ratings_matrix = reviews.sort_values(by=["user_id", "beer_id"])[["user_id", "beer_id", "review_overall"]].drop_duplicates().pivot_table(index = 'user_id', columns ='beer_id', values = 'review_overall').fillna(reviews.review_overall.mean())
ratings_matrix.head()

beer_id,4,11,12,13,80,81,82,175,176,178,...,74898,75087,75160,75491,75559,75907,76028,76348,76440,76816
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,...,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045
1,3.874045,3.874045,3.874045,3.874045,3.874045,5.0,3.874045,3.874045,3.874045,3.874045,...,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,4.0,3.874045,3.874045,3.874045
2,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,...,3.5,2.5,3.874045,3.874045,3.874045,3.5,4.0,3.874045,3.874045,3.874045
3,3.874045,3.5,4.5,3.0,4.0,4.0,3.0,3.874045,3.874045,3.874045,...,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045
4,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,3.874045,...,3.874045,3.874045,3.874045,3.874045,3.5,3.874045,3.874045,3.874045,3.874045,3.874045


In [111]:
#convert the matrix into an array
ratings_np = ratings_matrix.values
ratings_np

array([[3.87404467, 3.87404467, 3.87404467, ..., 3.87404467, 3.87404467,
        3.87404467],
       [3.87404467, 3.87404467, 3.87404467, ..., 3.87404467, 3.87404467,
        3.87404467],
       [3.87404467, 3.87404467, 3.87404467, ..., 3.87404467, 3.87404467,
        3.87404467],
       ...,
       [3.87404467, 3.87404467, 3.87404467, ..., 3.87404467, 3.87404467,
        3.87404467],
       [3.87404467, 3.87404467, 3.87404467, ..., 3.87404467, 3.87404467,
        3.87404467],
       [3.87404467, 3.87404467, 3.87404467, ..., 3.87404467, 3.87404467,
        3.87404467]])

In [112]:
import numpy as np

#compute the average rating per user
user_means=np.mean(ratings_np, axis = 1)
user_means.shape

(9583,)

In [113]:
#subtract the mean from each rating
ratings_np_centered = ratings_np - user_means.reshape(-1, 1)
ratings_np_centered

#unique user and movie num
user_num = reviews.user_id.unique().shape[0]
movie_num = reviews.beer_id.unique().shape[0]

user_num,movie_num

(9583, 1566)

In [114]:
sparsity = round(1.0 - len(reviews) / float(user_num * movie_num), 3)
print (sparsity)

0.989


In [115]:
#svd library
from scipy.sparse.linalg import svds

#perform svd with k singular values (features)
U, sigma, Vt = svds(ratings_np_centered, k = 50)

In [116]:
U.shape

(9583, 50)

In [117]:
sigma.shape

(50,)

In [118]:
Vt.shape

(50, 1566)

In [119]:
sigma = np.diag(sigma) # put the singular values in a diag matrix

In [120]:
sigma.shape

(50, 50)

In [121]:
sigma

array([[16.56594438,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , 16.61405619,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        , 16.70747356, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., 34.4742787 ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
        40.71782598,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , 60.95235791]])

In [155]:
def recommend(uid:int,
              reviews:pd.DataFrame,
              U:np.ndarray,
              sigma:np.ndarray,
              Vt:np.ndarray,
              user_means:np.ndarray,
              rec_num:int,
              ratings_matrix:pd.DataFrame
             ):

    #get all the ratings by this user
    my_ratings=reviews[["user_id", "beer_id", "review_overall"]][reviews.user_id==uid]

    # beers df
    beers_df = reviews[["beer_id", "beer_name", "beer_style"]].drop_duplicates()

    #zip the ratings into a dict
    already_rated=dict(zip(my_ratings.beer_id,my_ratings.review_overall))

    #predict the rating of this user for all movies
    predicted_ratings=np.dot(np.dot(U[uid-1], sigma),Vt)+user_means[uid-1]

    # get the indexes of the ratings sorted in descending order
    indexes_sorted=np.argsort(predicted_ratings)[::-1]

    # get the scores for the sorted indexes

    predicted_ratings_sorted=predicted_ratings[indexes_sorted]

    # get the original movie indexes
    original_indexes_sorted=[ratings_matrix.columns[i] for i in indexes_sorted]

    pred_dict=dict(zip(original_indexes_sorted,predicted_ratings_sorted))

    rec_set=set()# set of movie ids to be recommended

    for mid in original_indexes_sorted: # for each movie id
        if mid not in already_rated: # movie has not already been rated
            rec_set.add(mid) # add to the set

            if len(rec_set)==rec_num:break

    # make a data frame with only the recommended movies
    rec_df=pd.DataFrame(beers_df[beers_df.beer_id.isin(rec_set)])

    #add the predicted rating as a new column
    rec_df['predicted_rating']=rec_df['beer_id'].map(pred_dict)

    #sort the df by the new col
    rec_df=rec_df.sort_values(['predicted_rating'], ascending=False)

    return rec_df

In [156]:
recommend(5, reviews,U,sigma,Vt,user_means,10, ratings_matrix)

Unnamed: 0,beer_id,beer_name,beer_style,predicted_rating
140067,47022,Hunahpu's Imperial Stout,American Double / Imperial Stout,4.16155
124964,50081,Jai Alai IPA - Cedar Aged (Humidor Series),American IPA,4.075355
58570,51257,Black Tuesday,American Double / Imperial Stout,4.074444
132191,47731,Maduro Oatmeal Brown Ale,American Brown Ale,4.063899
36969,16638,Cherry Chocolate Beer,Fruit / Vegetable Beer,4.033394
82961,6549,Northern Hemisphere Harvest Wet Hop Ale,American IPA,4.003618
115194,45973,Marshal Zhukov's Imperial Stout,Russian Imperial Stout,3.999396
68590,3981,Hite,Euro Pale Lager,3.986999
154433,1658,Big Bear Black Stout,American Double / Imperial Stout,3.978038
56140,42434,Saison Rue,Saison / Farmhouse Ale,3.972606


In [161]:
from sklearn.metrics import mean_squared_error

def validate(uid:int,
              reviews:pd.DataFrame,
              U:np.ndarray,
              sigma:np.ndarray,
              Vt:np.ndarray,
              user_means:np.ndarray,
              ratings_matrix:pd.DataFrame
             ):

    #get all the ratings by this user
    my_ratings=reviews[["user_id", "beer_id", "review_overall"]][reviews.user_id==uid]

    # beers df
    beers_df = reviews[["beer_id", "beer_name", "beer_style"]].drop_duplicates()

    #zip the ratings into a dict
    already_rated=dict(zip(my_ratings.beer_id,my_ratings.review_overall))

    #predict the rating of this user for all movies
    predicted_ratings=np.dot(np.dot(U[uid-1], sigma),Vt)+user_means[uid-1]

    # get the indexes of the ratings sorted in descending order
    indexes_sorted=np.argsort(predicted_ratings)[::-1]

    # get the scores for the sorted indexes
    predicted_ratings_sorted=predicted_ratings[indexes_sorted]

    # get the original movie indexes
    original_indexes=[ratings_matrix.columns[i] for i in indexes_sorted]

    pred_dict=dict(zip(original_indexes,predicted_ratings_sorted))

    actual,pred=[],[]
    for mid in already_rated: # movie has not already been rated
        actual.append(already_rated[mid])
        pred.append(pred_dict[mid])

    return mean_squared_error(actual,pred,squared=False)

In [162]:
validate(5, reviews,U,sigma,Vt,user_means,ratings_matrix)

0.43703330428654824

In [163]:


# Import libraries from Surprise package
from surprise import Reader, Dataset, SVD

# Load Reader library
reader = Reader(rating_scale=(0.5, 5))

# Load ratings dataset with Dataset library
data = Dataset.load_from_df(reviews[['user_id', 'beer_id', 'review_overall']].drop_duplicates(), reader)

In [164]:
from surprise.model_selection.validation import cross_validate


for factors in [10, 25, 50]:
    print(factors)
    # Use the SVD algorithm.
    svd = SVD(n_factors=factors)

    # Compute the RMSE of the SVD algorithm.
    cross_validate(svd, data, measures=['RMSE'],cv=5,verbose=True)
    print("done")

10
Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.5819  0.5829  0.5763  0.5824  0.5883  0.5824  0.0038  
Fit time          1.84    1.32    1.01    1.86    1.01    1.41    0.38    
Test time         0.71    0.32    0.34    0.47    0.30    0.43    0.15    
done
25
Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.5791  0.5811  0.5853  0.5850  0.5848  0.5831  0.0025  
Fit time          1.12    1.14    1.19    1.14    1.14    1.15    0.02    
Test time         0.31    0.30    0.30    0.32    0.31    0.31    0.01    
done
50
Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.5866  0.5864  0.5883  0.5808  0.5840  0.5852  0.0026  
Fit time          1.27    1.33    1.31    1.32    1.32    1.31    0.02    
Test time

In [165]:
# create a training set
trainset = data.build_full_trainset()

svd.fit(trainset)# fit the svd

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x20caf9b7d08>

In [168]:
def recommend_surprise(uid:int,
              reviews:pd.core.frame.DataFrame,
              model,
              rec_num:int
             ):

    #get all the ratings by this user
    my_ratings=reviews[["user_id", "beer_id", "review_overall"]][reviews.user_id==uid]

    # beers df
    beers_df = reviews[["beer_id", "beer_name", "beer_style"]].drop_duplicates()

    #zip the ratings into a dict
    already_rated=dict(zip(my_ratings.beer_id,my_ratings.review_overall))

    pred_dict={}# store predicted ratings

    for index,row in beers_df.iterrows(): # for every movie

        pred_dict[row.beer_id]=model.predict(uid=uid,iid= row.beer_id).est# get the pred for this user

    # sort the movies by predicted ratings
    srt=sorted(pred_dict.items(),key=lambda x:x[1],reverse=True)

    rec_set=set()# set of movie ids to be recommended

    for mid,pred in srt: # for each movie id
        if mid not in already_rated: # movie has not already been rated

            rec_set.add(mid) # add to the set

            if len(rec_set)==rec_num:break

    # make a data frame with only the recommended movies
    rec_df=pd.DataFrame(beers_df[beers_df.beer_id.isin(rec_set)])

    #add the predicted rating as a new column
    rec_df['predicted_rating']=rec_df['beer_id'].map(pred_dict)

    #sort the df by the new col
    rec_df=rec_df.sort_values(['predicted_rating'], ascending=False)

    return rec_df

In [169]:
recommend_surprise(5,
              reviews,
              svd,
              20
             )

Unnamed: 0,beer_id,beer_name,beer_style,predicted_rating
16137,47658,Founders CBS Imperial Stout,American Double / Imperial Stout,4.67758
62924,69569,Cuir (100% Bourbon Barrel Aged),Old Ale,4.595334
132108,65472,Nickel Bag - O'Brien's 17th Anniversary,American Double / Imperial IPA,4.581614
139796,56764,Hunahpu's Imperial Stout - Laird's Apple Brand...,American Double / Imperial Stout,4.534668
135235,18201,Mo' Betta Bretta,American Wild Ale,4.514267
64110,51557,Humulus Lager,American Double / Imperial Pilsner,4.490672
16850,69894,Founders Cashew Mountain Brown,American Brown Ale,4.473522
135260,5933,Frank Double IPA,American Double / Imperial IPA,4.452857
144570,54647,Hunahpu's Imperial Stout - Bourbon Barrel Aged,American Double / Imperial Stout,4.446617
63926,50668,Papier (Rye Whiskey Barrel),Old Ale,4.446371


In [170]:
def validate(uid:int,
              reviews:pd.core.frame.DataFrame,
              model
             ):

    #get all the ratings by this user
    my_ratings=reviews[["user_id", "beer_id", "review_overall"]][reviews.user_id==uid]

    # beers df
    beers_df = reviews[["beer_id", "beer_name", "beer_style"]].drop_duplicates()

    #zip the ratings into a dict
    already_rated=dict(zip(my_ratings.beer_id,my_ratings.review_overall))

    pred_dict={}# store predicted ratings

    for index,row in beers_df.iterrows(): # for every movie

        pred_dict[row.beer_id]=model.predict(uid=uid,iid= row.beer_id).est# get the pred for this user

    actual,pred=[],[]
    for mid in already_rated: # for each movie id
        actual.append(already_rated[mid])
        pred.append(pred_dict[mid])

    return mean_squared_error(actual,pred,squared=False)

In [171]:
validate(5,
          reviews,
          svd
         )

0.3461003991463652

In [172]:
from surprise.prediction_algorithms.knns import KNNBasic

In [173]:
knn=KNNBasic(k=40, sim_options={'user_based':False,'min_support':5})
knn.fit(trainset)# fit the svd

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x20c5a7d8608>

In [174]:
# Compute the RMSE of the KNN algorithm.
cross_validate(knn, data, measures=['RMSE'],cv=5,verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6715  0.6629  0.6670  0.6646  0.6566  0.6645  0.0049  
Fit time          1.83    1.07    1.77    0.58    0.61    1.17    0.54    
Test time         5.34    6.43    5.24    3.08    3.12    4.65    1.33    


{'test_rmse': array([0.67153682, 0.66287777, 0.66698089, 0.6646103 , 0.65656991]),
 'fit_time': (1.8290042877197266,
  1.0729997158050537,
  1.770998239517212,
  0.5776538848876953,
  0.6099984645843506),
 'test_time': (5.343993902206421,
  6.433997869491577,
  5.240999937057495,
  3.0839972496032715,
  3.122999906539917)}

In [175]:
knn=KNNBasic(k=40, sim_options={'user_based':False,'min_support':5})
knn.fit(trainset)# fit the svd

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x20c59874e48>

In [176]:
# Compute the RMSE of the KNN algorithm.
cross_validate(knn, data, measures=['RMSE'],cv=5,verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6655  0.6619  0.6684  0.6662  0.6632  0.6650  0.0023  
Fit time          0.56    0.59    0.59    0.60    0.66    0.60    0.03    
Test time         3.09    3.11    3.11    3.12    3.06    3.10    0.02    


{'test_rmse': array([0.66547246, 0.66191358, 0.66837954, 0.66616631, 0.66323764]),
 'fit_time': (0.5570054054260254,
  0.5889942646026611,
  0.5880053043365479,
  0.5989995002746582,
  0.658001184463501),
 'test_time': (3.092991590499878,
  3.1120002269744873,
  3.110999822616577,
  3.121004343032837,
  3.058999538421631)}

In [177]:
knn2=KNNBasic(k=40, sim_options={'item_based':False,'min_support':5})
knn2.fit(trainset)# fit the svd

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x20c5e18cd08>

In [178]:
# Compute the RMSE of the KNN algorithm.
cross_validate(knn2, data, measures=['RMSE'],cv=5,verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6111  0.6054  0.6044  0.6168  0.6086  0.6093  0.0045  
Fit time          7.28    6.91    7.01    7.49    7.07    7.15    0.21    
Test time         12.87   12.68   12.92   13.39   14.58   13.29   0.69    


{'test_rmse': array([0.61107363, 0.60540779, 0.60435604, 0.61682223, 0.60860252]),
 'fit_time': (7.283001184463501,
  6.912995100021362,
  7.014000177383423,
  7.4890007972717285,
  7.071999788284302),
 'test_time': (12.868999481201172,
  12.677999258041382,
  12.922994136810303,
  13.389997243881226,
  14.582995891571045)}