In [None]:
import os
import time
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import seaborn as sns


from surprise import BaselineOnly, Dataset, Reader
from surprise.model_selection import cross_validate

In [None]:
from IPython.display import Markdown, display

In [None]:
# Light-to-green sequential colormap; no later cell visible in this notebook references it.
cm = sns.light_palette("green", as_cmap=True)

In [None]:
# Load the reduced user-rating table; the first CSV column is used as the index.
userRatings = pd.read_csv("../data/reducedUserRatings.csv", index_col=0) # issues with outliers and usernames already fixed

In [None]:
# Preview the loaded ratings table.
userRatings

In [None]:
def naturalNumber(val):
    """Return True when ``val`` is a whole number, i.e. equal to its rounded value.

    Only integrality is checked; the sign of ``val`` is not inspected.
    """
    nearest_int = round(val)
    return val == nearest_int

## ADDITIONAL EDA

In [None]:
# Flag each rating as whole-valued (True) or fractional (False).
userRatings["isNatural"] = userRatings.Rating.apply(naturalNumber)

In [None]:
# Count of whole-valued versus fractional ratings.
sns.histplot(data=userRatings,x="isNatural",hue="isNatural")

In [None]:
# Snap every rating to an integer.
# NOTE(review): Python's round() sends exact halves to the nearest even integer
# (6.5 -> 6, 7.5 -> 8) -- confirm this is the intended treatment of half-point ratings.
userRatings["Rating"] = userRatings.Rating.apply(round)

In [None]:
# Number of distinct users
len(userRatings["reducedUsername"].unique())

In [None]:
# Number of distinct items (games)
len(userRatings["BGGId"].unique())

In [None]:
# Rating properties

In [None]:
def distplot(figRows,figCols,xSize, ySize, data, features, colors, kde=True, bins=None):
    """Draw a ``figRows`` x ``figCols`` grid of histograms, one feature per subplot.

    Parameters
    ----------
    figRows, figCols : int
        Shape of the subplot grid. ``features`` and ``colors`` are reshaped to
        this grid, so each must contain exactly ``figRows * figCols`` entries.
    xSize, ySize : float
        Figure width and height, forwarded as ``figsize``.
    data : object indexable by feature name (e.g. a DataFrame)
        ``data[feature]`` supplies the values for each histogram.
    features : sequence of str
        Column names to plot, in row-major grid order.
    colors : sequence of str
        One bar colour per feature, in the same order as ``features``.
    kde : bool, default True
        Overlay a kernel density estimate; the histogram is then drawn on a
        density scale, otherwise on a count scale.
    bins : optional
        Forwarded to seaborn when given; seaborn's default binning is used otherwise.

    Returns
    -------
    None
    """
    # squeeze=False always yields a 2-D array of Axes, so the 1x1, 1xN and Nx1
    # grids need no special-case indexing.
    f, axes = plt.subplots(figRows, figCols, figsize=(xSize, ySize), squeeze=False)

    features = np.array(features).reshape(figRows, figCols)
    colors = np.array(colors).reshape(figRows, figCols)

    # sns.distplot is deprecated; sns.histplot is its replacement. The options
    # below reproduce distplot's behaviour: density scale plus KDE with cut=3
    # when kde is on, plain counts otherwise, and black bar edges.
    hist_options = {"kde": kde, "stat": "density" if kde else "count", "edgecolor": "k"}
    if kde:
        hist_options["kde_kws"] = {"cut": 3}
    if bins is not None:
        hist_options["bins"] = bins

    for row in range(figRows):
        for col in range(figCols):
            feature = features[row][col]
            plot = sns.histplot(data[feature], color=colors[row][col], ax=axes[row][col], **hist_options)
            plot.set_xlabel(feature,fontsize=20)

def scatterplot(rowFeature, colFeature, data):
    """Scatter ``colFeature`` (y axis) against ``rowFeature`` (x axis) on a new 10x8 figure.

    Both arguments are column names in ``data``; they are also used as the
    axis labels.
    """
    _, ax = plt.subplots(1, 1, figsize=(10, 8))

    sns.scatterplot(x=rowFeature, y=colFeature, data=data, ax=ax)
    ax.set_xlabel(rowFeature, fontsize=20)
    ax.set_ylabel(colFeature, fontsize=20)

In [None]:
# 16-colour 'Set1' palette; no later cell visible in this notebook references it.
pal = sns.color_palette(palette='Set1', n_colors=16)


In [None]:
# Distribution of the individual ratings
distplot(figRows=1, figCols=1, xSize=10, ySize=7,
         data=userRatings, features=['Rating'], colors=['blue'])

In [None]:
# Ratings grouped per game
groupby_products_Ratings = userRatings.groupby('BGGId')['Rating']

# Number of ratings per game, capped at 30
ratings_products = (
    groupby_products_Ratings.count()
    .clip(upper=30)
    .to_frame(name="Rating_Count")
)

In [None]:
# Number of games that received exactly one rating
groupby_products_Ratings.count().eq(1).sum()

In [None]:
# Share of games that received exactly one rating
groupby_products_Ratings.count().eq(1).sum() / len(userRatings.BGGId.unique())

In [None]:
# Rating-count distribution per product (uncapped)
product_rating_counts = groupby_products_Ratings.count().to_frame(name="Rating_Count")
distplot(1, 1, 10, 7, data=product_rating_counts, features=['Rating_Count'], colors=['green'], kde=False)

In [None]:
# Rating-count distribution per product, using the counts capped at 30
distplot(figRows=1, figCols=1, xSize=10, ySize=7,
         data=ratings_products, features=['Rating_Count'], colors=['green'], kde=False)

In [None]:
# Rating-count distribution per product, counts capped at 10
product_rating_counts_cap10 = (
    groupby_products_Ratings.count()
    .clip(upper=10)
    .to_frame(name="Rating_Count")
)
distplot(1, 1, 10, 7, data=product_rating_counts_cap10, features=['Rating_Count'], colors=['green'], kde=False)

In [None]:
# Ratings grouped per user
groupby_users_Ratings = userRatings.groupby('reducedUsername')['Rating']

# Number of ratings per user, limited to the range [1, 100]
rating_users = (
    groupby_users_Ratings.count()
    .clip(lower=1, upper=100)
    .to_frame(name="Rating_Count")
)

In [None]:
# Number of users who submitted exactly one rating
groupby_users_Ratings.count().eq(1).sum()

In [None]:
# Share of users who submitted exactly one rating
groupby_users_Ratings.count().eq(1).sum() / len(userRatings.reducedUsername.unique())

In [None]:
# Rating-count distribution per user (uncapped)
user_rating_counts = groupby_users_Ratings.count().to_frame(name="Rating_Count")
distplot(1, 1, 10, 7, data=user_rating_counts, features=['Rating_Count'], colors=['orange'], kde=False)

In [None]:
# Rating-count distribution per user, using the counts limited to [1, 100], in 10 bins
distplot(figRows=1, figCols=1, xSize=10, ySize=7,
         data=rating_users, features=['Rating_Count'], colors=['orange'], kde=False, bins=10)

In [None]:
# Rating-count distribution per user, counts capped at 20
user_rating_counts_cap20 = (
    groupby_users_Ratings.count()
    .clip(upper=20)
    .to_frame(name="Rating_Count")
)
distplot(1, 1, 10, 7, data=user_rating_counts_cap20, features=['Rating_Count'], colors=['orange'], kde=False)

In [None]:
# Mean rating per game
ratings = userRatings.groupby('BGGId')['Rating'].mean().to_frame(name="Rating_Mean")

In [None]:
# Distribution of the per-product mean rating, in 50 bins
distplot(figRows=1, figCols=1, xSize=10, ySize=7,
         data=ratings, features=['Rating_Mean'], colors=['brown'], kde=False, bins=50)

In [None]:
# Attach the number of ratings each game received next to its mean rating.
ratings = ratings.assign(Rating_Count=userRatings.groupby('BGGId')['Rating'].count())

In [None]:
# Per-product mean rating (x) against number of ratings (y)
scatterplot(rowFeature='Rating_Mean', colFeature='Rating_Count', data=ratings)

In [None]:
# Distribution of the per-user mean rating.
# Note: `ratings` is rebound here from the per-product table to a per-user table.
ratings = userRatings.groupby('reducedUsername')['Rating'].mean().to_frame(name="Rating_Mean")
distplot(figRows=1, figCols=1, xSize=10, ySize=7,
         data=ratings, features=['Rating_Mean'], colors=['brown'], kde=False, bins=50)

In [None]:
# Per-user mean rating (x) against number of ratings (y)
ratings = ratings.assign(Rating_Count=userRatings.groupby('reducedUsername')['Rating'].count())
scatterplot(rowFeature='Rating_Mean', colFeature='Rating_Count', data=ratings)

# BASIC CF AS IMPLEMENTED IN [SURPRISE](https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html)

In [None]:
from surprise import BaselineOnly, Dataset, Reader, dataset
from surprise.model_selection import cross_validate,RandomizedSearchCV,GridSearchCV, KFold
from surprise import KNNWithMeans,KNNWithZScore,SVD,KNNBaseline,KNNBasic
from surprise import accuracy

from sklearn.pipeline import Pipeline

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Let's keep a subset of the data (30%) for final testing.
# NOTE(review): this is scikit-learn's train_test_split applied to the DataFrame, so
# `trainset` and `testset` are pandas DataFrames, not Surprise Trainset/testset objects.
trainset, testset = train_test_split(userRatings, test_size=.3, random_state=10)

In [None]:
# A reader is still needed, but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 10))

# The columns must correspond to user id, item id and ratings (in that order).
traindata = Dataset.load_from_df(trainset[["reducedUsername", "BGGId", "Rating"]], reader)

In [None]:
# Inspect the dataset's `has_been_split` flag before cross-validating.
traindata.has_been_split

In [None]:
# Since we cannot run a pipeline grid search, fix the cross-validation folds so that
# every model is compared on identical splits.
# shuffle=False already makes the folds deterministic. Surprise's KFold only consults
# random_state when shuffling, so the former random_state=100 had no effect and was
# dropped. The rows of `trainset` were already shuffled by train_test_split.
cv = KFold(n_splits=3, shuffle=False)

In [None]:
# Every KNN variant is paired with every similarity measure, user-based first and
# item-based second; a single SVD model is appended at the end.
knn_algorithms = {
    "KNNBasic": KNNBasic,
    "KNNWithZScore": KNNWithZScore,
    "KNNWithMeans": KNNWithMeans,
    "KNNBaseline": KNNBaseline,
}
similarity_names = ("pearson", "cosine", "MSD", "pearson_baseline")

model_list = {
    f"{algo_name}_{sim_name}_{'user' if user_based else 'item'}": algo_class(
        sim_options={'name': sim_name, 'user_based': user_based}
    )
    for algo_name, algo_class in knn_algorithms.items()
    for sim_name in similarity_names
    for user_based in (True, False)
}
model_list["SVD"] = SVD(n_epochs=50)

In [None]:
# Here the metrics are the average scores over the cross-validation folds.
# dtype=float keeps the columns numeric: an empty frame defaults to object dtype,
# and Styler.bar only draws bars for numeric columns, so object columns render none.
results = pd.DataFrame(index=list(model_list), columns=['test_rmse','test_mae','fit_time','test_time'], dtype=float)

In [None]:
%%time
# Cross-validate every candidate on the shared folds and store the fold-averaged metrics.
for v, algo in model_list.items():
    dictonary = cross_validate(algo, traindata, cv=cv, verbose=True)
    for key, fold_values in dictonary.items():
        results.loc[v, key] = np.mean(fold_values)

In [None]:
# Best models first: lowest RMSE, ties broken by MAE.
ranked_results = results.sort_values(by=['test_rmse', 'test_mae'])
ranked_results.style.bar(color='#d65f5f')

## Gridsearch for tuning some parameters

In [None]:
# Algorithm classes (not instances) to tune, keyed by the name used in param_dict.
model_dict = dict(
    KNNBasic=KNNBasic,
    KNNWithZScore=KNNWithZScore,
    KNNWithMeans=KNNWithMeans,
    SVD=SVD,
    KNNBaseline=KNNBaseline,
)



In [None]:
# Similarity measures tried for every KNN algorithm.
similarities = ["pearson", "cosine", "MSD", "pearson_baseline"]

In [None]:
def _knn_sim_grid():
    """Return a fresh copy of the similarity-option grid shared by all KNN algorithms."""
    # NOTE(review): per the Surprise docs `shrinkage` is only used by the
    # pearson_baseline similarity, so for the other measures the three shrinkage
    # values presumably repeat the same fit -- confirm before a long run.
    return {
        'name': similarities,
        'min_support': [5, 10, 15],
        'user_based': [False, True],
        'shrinkage': [0, 10, 100],
    }


def _knn_grid(**extra):
    """Return the common KNN grid (k plus similarity options) extended by ``extra``."""
    grid = {'k': [101], 'sim_options': _knn_sim_grid()}
    grid.update(extra)
    return grid


# Search space per algorithm; keys match those of model_dict.
param_dict = {
    "KNNBasic": _knn_grid(),
    "KNNWithZScore": _knn_grid(),
    "KNNWithMeans": _knn_grid(),
    # NOTE(review): reg_i/reg_u are ALS options while reg/learning_rate are SGD
    # options; crossing all of them with both methods presumably evaluates many
    # equivalent configurations -- confirm against the Surprise baseline docs.
    "KNNBaseline": _knn_grid(
        bsl_options={
            "method": ["sgd", 'als'],
            'reg_i': [0, 5, 10],
            'reg_u': [0, 5, 10],
            'reg': [0.00005, 0.005, 0.5],
            "learning_rate": [0.00005, 0.005, 0.5],
            'n_epochs': [20],
        }
    ),
    "SVD": {
        "n_epochs": [5, 10],
        "lr_all": [0.002, 0.005],
        "reg_all": [0.4, 0.6],
        "biased": [False, True],
    },
}

In [None]:
def printmd(string, color=None):
    '''Render ``string`` as Markdown in the cell output, to track progress.

    Parameters
    ----------
    string : str
        Markdown text to display.
    color : str, optional
        CSS colour for the text. When omitted the text is displayed without a
        colour wrapper, rather than with an invalid ``color:None`` style.
    '''
    if color is None:
        display(Markdown(string))
        return
    colorstr = "<span style='color:{}'>{}</span>".format(color, string)
    display(Markdown(colorstr))

In [None]:
# Start from an empty frame; it is populated by the grid-search cell below.
gs_results= pd.DataFrame()

In [None]:
### Fix this

In [None]:
%%time
# Run one grid search per algorithm on the shared folds.
# The per-algorithm result frames are collected in a list and concatenated once at
# the end. This rebuilds gs_results from scratch on every run of the cell, so a
# re-run no longer appends duplicate rows, and it avoids concatenating with an
# empty frame on every iteration.
gs_frames = []
for i in model_dict:
    printmd(f"**{i}**", color="blue")
    model = model_dict[i]
    parameters = param_dict[i]

    gs = GridSearchCV(model, parameters, measures=["rmse", "mae"], cv=cv)
    gs.fit(traindata)
    sub = pd.DataFrame.from_dict(gs.cv_results)
    sub["model"] = i  # tag every row with the algorithm it belongs to
    gs_frames.append(sub)
    printmd("**Done**", color="blue")

gs_results = pd.concat(gs_frames)


In [None]:
# Grid-search results of all algorithms; the "model" column names the algorithm.
gs_results

In [None]:
# Next questions to address
# Why does cosine similarity result in zero division for small minimum support like 1?

# Can and should we manually adjust the search space for minimum data based on training data distribution?

In [None]:

####################################################################################################################


#### Let's build a pipeline for hyperparameter tuning and model selection all at once

In [None]:
# from enum import Enum
# class Model(Enum):
#     KNNWithZScore='z standrdized CF'
#     KNNWithMeans='baseline centered CF'
#     all= 'best model'



# model_list = [e for e in Model if e != Model.all]
# model_instances = []
# params = []

# num_folds = 5  # Hard coded
# scoring = ['rmse', 'mae']

# #clf1 = KNNWithZScore(verbose=True)
# #param1 = {}
# #param1['k'] = [50]
# #param1['sim_options'] =  {
# #                                        'name': ['pearson', 'cosine'],
# #                                        'min_support': [1, 5],
# #                                        'user_based': [False, True],
# #                                    }
# #param1['regressor'] = [clf1]
# #model_instances.append(clf1)
# #params.append(param1)

# gs = GridSearchCV(clf1, param1, cv=num_folds, measures=scoring, refit=True,joblib_verbose=2)


# gs.fit(traindata)

# if Model.KNNWithZScore in model_list:
#         clf1 = KNNWithZScore#(verbose=True)
#         param1 = {}
#         param1['regressor__k'] = [50]
#         param1['regressor__sim_options'] =  {
#                                                 'name': ['pearson', 'cosine'],
#                                                 'min_support': [1, 5],
#                                                 'user_based': [False, True],
#                                             },
#         param1['regressor'] = [clf1]
#         model_instances.append(clf1)
#         params.append(param1)


        
# pipeline = Pipeline([('regressor', model_instances[0])])


# gs = GridSearchCV(pipeline, params, cv=num_folds, measures=scoring, refit=True,joblib_verbose=2)

# gs.fit(traindata)


In [None]:
# sim_options = {
#     "name": ["msd", "cosine", "pearson", "pearson_baseline"],
#     #"min_support": [3, 4, 5],
#     "user_based": [False],
# }

In [None]:
# knn_model = KNNWithMeans(k=10,sim_options={'name':'pearson' , 'user_based':True})
# knn_model.fit(traindata)

In [None]:
# accuracy.rmse(test_pred_knn)

In [None]:
# #Evaluation on testset

# test_pred_knn=knn_model.test(testset)

# # compute RMSE
# accuracy.rmse(test_pred_knn)

In [None]:
# test_pred_df = pd.DataFrame(test_pred_knn)

In [None]:
# test_pred_df.sort_values(by=["uid","iid","r_ui"], ascending=False)

In [None]:
# # Use the famous SVD algorithm.
# algo = SVD()

# # Run 5-fold cross-validation and print results.
# cross_validate(algo, trainset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
# sim_options = {
#     "name": ["msd", "cosine", "pearson", "pearson_baseline"],
#     "min_support": [3, 4, 5],
#     "user_based": [True],
# }

In [None]:
# cross_validate(KNNWithZScore(sim_options=sim_options), traindata, cv=2)

In [None]:
# cv_result = cross_validate(KNNWithZScore(sim_options=sim_options), trainset, cv=5)

In [None]:
# from collections import defaultdict

# def get_top_n(predictions, n=10):
#     # First map the predictions to each user.
#     top_n = defaultdict(list)
#     for uid, iid, true_r, est, _ in predictions:
#         top_n[uid].append((iid, est))

#     # Then sort the predictions for each user and retrieve the k highest ones.
#     for uid, user_ratings in top_n.items():
#         user_ratings.sort(key=lambda x: x[1], reverse=True)
#         top_n[uid] = user_ratings[:n]

#     return top_n


In [None]:
# class collab_filtering_based_recommender_model():
#     def __init__(self, model, trainset, testset, data):
#         self.model = model
#         self.trainset = trainset
#         self.testset = testset
#         self.data = data
#         self.pred_test = None
#         self.recommendations = None
#         self.top_n = None
#         self.recommenddf = None

#     def fit_and_predict(self):        
#         printmd('**Fitting the train data...**', color='brown')
#         self.model.fit(self.trainset)       

#         printmd('**Predicting the test data...**', color='brown')
#         self.pred_test = self.model.test(self.testset)        
#         rmse = round(accuracy.rmse(self.pred_test), 3)
#         printmd('**RMSE for the predicted result is ' + str(rmse) + '**', color='brown')   
        
#         self.top_n = get_top_n(self.pred_test)
#         self.recommenddf = pd.DataFrame(columns=['userId', 'productId', 'Rating'])
#         for item in self.top_n:
#             subdf = pd.DataFrame(self.top_n[item], columns=['productId', 'Rating'])
#             subdf['userId'] = item
#             cols = subdf.columns.tolist()
#             cols = cols[-1:] + cols[:-1]
#             subdf = subdf[cols]        
#             self.recommenddf = pd.concat([self.recommenddf, subdf], axis = 0)        
#         return rmse
        
#     def cross_validate(self):
#         printmd('**Cross Validating the data...**', color='brown')
#         cv_result = cross_validate(self.model, self.data, n_jobs=-1)
#         cv_result = round(cv_result['test_rmse'].mean(),3)
#         printmd('**Mean CV RMSE is ' + str(cv_result)  + '**', color='brown')
#         return cv_result

#     def recommend(self, user_id, n=5):
#         printmd('**Recommending top ' + str(n)+ ' products for userid : ' + user_id + ' ...**', color='brown')
        
#         #df = pd.DataFrame(self.top_n[user_id], columns=['productId', 'Rating'])
#         #df['UserId'] = user_id
#         #cols = df.columns.tolist()
#         #cols = cols[-1:] + cols[:-1]
#         #df = df[cols].head(n)
#         df = self.recommenddf[self.recommenddf['userId'] == user_id].head(n)
#         display(df)
#         return df

In [None]:
# from surprise.model_selection import RandomizedSearchCV

# def find_best_model(model, parameters,data):
#     clf = RandomizedSearchCV(model, parameters, n_jobs=-1, measures=['rmse'])
#     clf.fit(data)             
#     print(clf.best_score)
#     print(clf.best_params)
#     print(clf.best_estimator)
#     return clf

In [None]:
# sim_options = {
#     "name": ["msd", "cosine", "pearson", "pearson_baseline"],
#     "min_support": [3, 4, 5],
#     "user_based": [True],
# }
# params = { 'k': range(30,50,1), 'sim_options': sim_options}


In [None]:
# clf = find_best_model(KNNWithZScore, params, trainset)