In [3]:
pip install comet_ml

In [4]:
# Please skip this cell when running the notebook
# import comet_ml at the top of your file
import os

from comet_ml import Experiment

# Create an experiment with your api key.
# The key is read from the COMET_API_KEY environment variable so that the
# credential is never stored in the notebook. The key that used to be
# hardcoded here has been published and should be revoked.
experiment = Experiment(
    api_key=os.environ.get("COMET_API_KEY"),
    project_name="team-es3-unsupervised-learning",
    workspace="engrchyke",
)

In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import json
import re
import squarify


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [6]:
import warnings
warnings.filterwarnings('ignore')

# Data Preprocessing
import random
from time import time

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.linalg import svds


# Models
from surprise import Reader, Dataset
from surprise import SVD,SVDpp, NormalPredictor, BaselineOnly, NMF, SlopeOne, CoClustering
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from surprise.model_selection import cross_validate
from sklearn.metrics.pairwise import cosine_similarity
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise.model_selection import GridSearchCV, cross_validate, train_test_split

# Performance Evaluation
from surprise import accuracy
from sklearn.metrics import mean_squared_error
from surprise.accuracy import rmse


# Display
%matplotlib inline
sns.set(font_scale=1)
sns.set_style("white")
pd.set_option('display.max_columns', 37)


from IPython.display import display_html 
from IPython.core.display import HTML
from collections import defaultdict
from datetime import datetime

        
# Visualisation
import matplotlib.pyplot as plt 
sns.set_style("darkgrid")
import plotly.offline as py
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
py.init_notebook_mode(connected = True)
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


In [7]:
#Reading all the given data
#https://www.kaggle.com/competitions/edsa-movie-recommendation-2022/data?select=genome_scores.csv
# Single place to change when the data lives somewhere else
DATA_DIR = '/kaggle/input/team-es3'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
genome_scores = pd.read_csv(f'{DATA_DIR}/genome_scores.csv')
# genome_tags was previously read from tags.csv, which made it a copy of `tags`.
# NOTE(review): assumes the uploaded dataset keeps the competition's file name
# genome_tags.csv - confirm against the contents of DATA_DIR.
genome_tags = pd.read_csv(f'{DATA_DIR}/genome_tags.csv')
imdb = pd.read_csv(f'{DATA_DIR}/imdb_data.csv')
links = pd.read_csv(f'{DATA_DIR}/links.csv')
movies = pd.read_csv(f'{DATA_DIR}/movies.csv')
tags = pd.read_csv(f'{DATA_DIR}/tags.csv')

In [8]:
# Declaring a list that contains the imported dataframes
df_list = [train, test, genome_scores, genome_tags, imdb, links, movies, tags]
# Create a list of the names of the imported datasets (same order as df_list)
df_names = ['train', 'test', 'genome_scores', 'genome_tags',
            'imdb', 'links', 'movies', 'tags']

# Map every dataset name to [number of observations, number of features]
dfs_dict = {name: [data.shape[0], data.shape[1]]
            for name, data in zip(df_names, df_list)}

# Build the summary table once (it used to be rebuilt on every loop iteration)
# and order the datasets from largest to smallest.
df_prop = pd.DataFrame(dfs_dict, index=['rows', 'columns']).T
df_prop = df_prop.sort_values(by='rows', ascending=False)

#view the final output
df_prop

In [9]:
#A function that displays multiple dataframes in one cell
def data_overview_display(dataframe_list, column_names=None):
    """Show several dataframes side by side in a single output.

    Parameters
    ----------
    dataframe_list : iterable of pandas.DataFrame
        Frames to show from left to right; each one is rendered with
        ``DataFrame.to_html(index=True)``.
    column_names : iterable of str, optional
        Headings placed above the frames, in the same order. When omitted,
        the heading row is empty.

    Returns
    -------
    None
        The assembled HTML is handed to ``display_html(..., raw=True)``.
    """
    inline_table = '<table style="display:inline"'

    headings = ''.join(f'<td style="text-align:center">{name}</td>'
                       for name in (column_names or []))
    # Restyle only the opening <table tag of every nested frame. Replacing the
    # bare word "table" also rewrote the closing </table> tags and any cell
    # text containing "table"; "<table" cannot occur in cell text because
    # to_html escapes "<".
    frames = ''.join(
        '<td style="vertical-align:top"> '
        + df.to_html(index=True).replace('<table', inline_table)
        + '</td>'
        for df in dataframe_list
    )
    html_string = f'{inline_table}><tr>{headings}</tr><tr>{frames}</tr></table>'
    display_html(html_string, raw=True)

In [10]:
# Side-by-side preview (first rows) of the train, test, links and tags datasets
previews = {
    'Train Dataset': train.head(),
    'Test Dataset': test.head(),
    'Links Dataset': links.head(),
    'Tags Dataset': tags.head(),
}
data_overview_display(list(previews.values()), column_names=list(previews.keys()))

In [11]:
# Side-by-side preview (first rows) of the movies, genome_tags and genome_scores datasets
genome_previews = {
    'Movies ': movies.head(),
    'Genome tags': genome_tags.head(),
    'Genome scores': genome_scores.head(),
}
data_overview_display(list(genome_previews.values()),
                      column_names=list(genome_previews.keys()))

## Data Cleaning

In [12]:
# One single-column frame per dataset holding the number of missing values
# found in each of its columns.
train_null = train.isna().sum().to_frame("Null Values")
test_null = test.isna().sum().to_frame("Null Values")
movies_null = movies.isna().sum().to_frame("Null Values")
links_null = links.isna().sum().to_frame("Null Values")
imdb_null = imdb.isna().sum().to_frame("Null Values")
tags_null = tags.isna().sum().to_frame("Null Values")
genome_tags_null = genome_tags.isna().sum().to_frame("Null Values")
genome_scores_null = genome_scores.isna().sum().to_frame("Null Values")

In [13]:
# Null-value counts of every dataset except imdb, shown next to each other
null_overviews = [
    ('Train df', train_null),
    ('Test df', test_null),
    ('Movies df', movies_null),
    ('Links df', links_null),
    ('genome scores df', genome_scores_null),
    ('tags df', tags_null),
    ('genome tags', genome_tags_null),
]
data_overview_display([frame for _label, frame in null_overviews],
                      column_names=[label for label, _frame in null_overviews])

In [14]:
# Null-value counts of the imdb dataset, shown on their own
imdb_heading = ['imdb df']
data_overview_display([imdb_null], column_names=imdb_heading)

**Analysis of imdb_data**

The plot below is a visual representation of the different columns in the imdb dataset with their percentage of missing values.
A large number of movies have no budget, director or title cast. Such high proportions of missing data largely disqualify this particular dataset from our current modelling task.

In [15]:
# Number and percentage of missing values in each imdb column
total = imdb.isnull().sum().sort_values(ascending=False)
percent_1 = imdb.isnull().mean() * 100  # share of missing rows per column
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2],
                         axis=1, keys=['Total', '(%) missing'])
missing_data['(%) missing'].plot(kind='bar')
# The bars are one per column (x axis) and their height is the percentage
# missing (y axis); the two labels used to be swapped.
plt.xlabel('Columns with Missing Values')
plt.ylabel('(%) Missing Values')
plt.title('Percentage of Missing Values per Column')
plt.show()

From the above graph, it is observed that the budget column has the highest proportion of missing values (over 70%), while the movieId column has no missing values; this is expected, since movieId is the unique key column.

### Checking of Duplicates

In [16]:
# Checking for unique users and movieId's in the train dataset
users = len(train.userId.unique())
items = len(train.movieId.unique())
# duplicated() flags every repeated row once, so the sum of the flags is the
# number of duplicated rows. Counting the non-null cells of the duplicated
# rows (as was done before) multiplied that number by the number of columns.
train_duplicates = train.duplicated().sum()
print('There are {} unique users and {}'
      ' unique movies train dataset with {} duplicated entries'.format(users, items, train_duplicates))

# Same check for the test dataset
users1 = len(test.userId.unique())
items1 = len(test.movieId.unique())
test_duplicates = test.duplicated().sum()

print('There are {} unique users and {}'
      ' unique movies test dataset with {} duplicated entries'.format(users1, items1, test_duplicates))

### Normalising and combining the Data

In [17]:
# checking the shape and info of the train dataset
print(train.shape)

# info must be called: print(train.info) only printed the bound method.
# info() writes its report itself and returns None, so it is not wrapped in print.
train.info()

In [18]:
# checking the shape and info of the movies dataset
print(movies.shape)

# info() prints its report itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
movies.info()

In [19]:
# checking the shape and info of the imdb dataset
print(imdb.shape)

# info() prints its report itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
imdb.info()

**Merging of Datasets**

Now that we have a basic understanding of the data we are working with, we thereby merge the sets below for more in depth analysis in the EDA section.

In [20]:
# Attach the movie details to every rating (left join on movieId keeps all ratings)
train_movies_df = train.merge(movies, how='left', on='movieId')

# Attach the imdb metadata to the result, again with a left join on movieId
movies_metadata_df = train_movies_df.merge(imdb, how='left', on='movieId')

movies_metadata_df.head()

In [21]:
# Missing values per column of the merged frame
movies_metadata_df.isna().sum()

In [22]:
# Mean rating of every title, best rated first
movies_ranking = (
    movies_metadata_df
    .groupby('title')[['rating']]
    .mean()
    .sort_values('rating', ascending=False)
)
movies_ranking.head()

# **Exploratory Data Analysis**

In [23]:
print (f'Average rating in the dataset: {np.mean(movies_metadata_df["rating"])}')

# Count plot of how often each rating value was given.
# sns.factorplot was renamed to catplot and later removed from seaborn, and
# recent seaborn versions require x to be passed by keyword.
with sns.axes_style('white'):
    g = sns.catplot(x="rating", data=movies_metadata_df, aspect=2.5, kind='count')
    g.set_ylabels("Total number of ratings")

### Observation
From the graph above, it shows that most movies have a rating of 4.0, followed by 3.0 rating while the least rated movies by individuals were rated 0.5 and 1.5. Also the mean rating was about 3.5, this reveals that the distribution is left skewed which indicates that most movies are rated well by the users.

In [24]:
# Number of (non-null) ratings each title received, aligned on the title index
ratings_by_title = movies_metadata_df['rating'].groupby(movies_metadata_df['title'])
movies_ranking['No_of_ratings'] = ratings_by_title.count()

In [25]:
# Titles ordered by number of ratings, then by mean rating (both descending)
most_rated_movies = movies_ranking.sort_values(by=['No_of_ratings', 'rating'],
                                               ascending=False)
most_rated_movies.head()

This table shows that the best movies are those with high number of ratings as they have been rated highly by large quantities of people.

In [26]:
# Default figure size for the seaborn plots that follow
sns.set(rc={'figure.figsize':(12,9)})

# One point per movie: mean rating against the number of ratings received
movie_ax = sns.scatterplot(data=movies_ranking, x='rating', y='No_of_ratings')
movie_ax.set_title('Number of ratings per average rating per movie')
movie_ax.set_xlabel('Rating')
movie_ax.set_ylabel('Number of ratings')
plt.show()

The above scatterplot shows that there is a strong correlation between the number of ratings a movie has received and its average rating, i.e. movies that have more ratings (views) strongly tend to also have higher average ratings. This supports the previously established notion that users tend to give higher ratings in general. The plot below similarly shows that, even for movies with more than one hundred views (ratings), the average rating stays consistently around 3.5.

In [27]:
# Average rating and number of ratings of every movie in the dataset
rating_stats = train.groupby('movieId')['rating'].agg(['mean', 'count'])
# Keep only movies with 100 or more ratings, as the title states; the filter
# was previously missing, so every movie was plotted.
avg_rating = rating_stats.loc[rating_stats['count'] >= 100, 'mean']

# Plotting the results
plt.figure(figsize=(12,10))
avg_rating.plot(kind='hist')
# Each observation in the histogram is one movie, not one viewer
plt.ylabel('Number of movies (Frequency)')
plt.xlabel('Movie Rating')
plt.title('Average ratings of movies with 100 or more viewers')
plt.show()

This confirms the hypothesis that movies with more views tend to have good average ratings.

In [28]:
# Mean rating and number of ratings per director
director_ratings = movies_metadata_df.groupby('director')['rating']
best_director = director_ratings.mean().sort_values(ascending=False).to_frame()
best_director['No_of_ratings'] = director_ratings.count()
# Ten directors with the most ratings (ties broken by mean rating)
best_director.sort_values(by=['No_of_ratings', 'rating'], ascending=False).head(10)

In [29]:
# Default figure size for the seaborn plots that follow
sns.set(rc={'figure.figsize':(12,9)})

# One point per director: mean rating against the number of ratings received
director_ax = sns.scatterplot(data=best_director, x='rating', y='No_of_ratings')
director_ax.set_title('Number of ratings per average rating per director')
director_ax.set_xlabel('Ratings')
director_ax.set_ylabel('Number of Ratings')
plt.show()

In [30]:
# Storing the years from the titles separately:

# The parentheses are part of the pattern so that a number inside the title
# itself is not mistaken for the release year; only the four digits inside the
# capture group are kept, so no second extraction is needed.
# Raw strings avoid invalid-escape warnings for "\(" and "\d".
movies["year"] = movies["title"].str.extract(r"\((\d{4})\)", expand=False)
# Removing the years from the 'title' column. regex=True is explicit because
# newer pandas versions treat the pattern as a literal string by default.
movies["title"] = movies["title"].str.replace(r"\(\d{4}\)", "", regex=True)
# Getting rid of any leading/trailing whitespace left behind
movies["title"] = movies["title"].str.strip()
# NOTE: this cell is not idempotent - re-running it after the years have been
# removed from the titles overwrites "year" with missing values.

In [31]:
# Removing the character separating the genres for each movie.
# regex=False makes '|' a literal pipe on every pandas version ('|' is the
# alternation operator when the pattern is interpreted as a regex).
movies['genres'] = movies['genres'].str.replace('|', ' ', regex=False)

# Removing the same character for the metadata:
imdb['title_cast'] = imdb['title_cast'].str.replace('|', ' ', regex=False)
imdb['plot_keywords'] = imdb['plot_keywords'].str.replace('|', ' ', regex=False)

In [32]:
# First ten movies after the title/year/genre clean-up
movies.iloc[:10]

In [33]:
# First five rows of the imdb metadata after the separator clean-up
imdb.iloc[:5]

In [34]:
# Lookup table from movie title to movieId (a repeated title keeps its last id)
Mapping_file = {
    title: movie_id
    for title, movie_id in zip(movies['title'].tolist(), movies['movieId'].tolist())
}

In [35]:
metadata = pd.merge(movies, imdb, on='movieId', how='left')
# The number of Movies released per year
num = metadata.groupby('year').count()
plt.figure(figsize=(20,10))
# Count the movieId column (the merge key of `movies`): the 'budget' column
# plotted before only counts movies whose budget is known, which undercounts
# heavily because most budgets are missing.
plt.plot(num.index, num['movieId'])
plt.xlabel("years", size=25)
plt.xticks(rotation='vertical')
plt.ylabel('No. of Movies', size=25)
plt.title('Number of Movies Released By year', size=25)
plt.show()

The graph above highlights an important point to keep in mind: the dataset we are working with does not contain all movies ever released. It is a subset thereof, with properties that differ from those of a hypothetical dataset containing all movies ever released. For example, it is clear that movie production did not drop to almost 0 in 2016, but rather that the dataset does not include many movies after that date.

In [36]:
# How many movies were released in each year
year_corpus = metadata['year'].value_counts()
# Generating the wordcloud: the more movies a year has, the larger it is drawn
year_cloud_builder = WordCloud(background_color='black', height=2000, width=4000)
year_wordcloud = year_cloud_builder.generate_from_frequencies(year_corpus)
plt.figure(figsize=(16, 8))
plt.imshow(year_wordcloud)
plt.axis('off')
plt.show()

The wordcloud above similarly displays the most prevalent release years in this dataset.

In [37]:
# Count plot of the movies released per year, limited to the 50 years with
# the most releases and ordered from most to fewest.
figure = plt.subplots(figsize=(15, 5))
busiest_years = metadata['year'].value_counts().head(50).index
axes = sns.countplot(x=metadata['year'], order=busiest_years, color='blue')
axes.set_title('Total movies released per year', fontsize=19)
plt.xticks(rotation=90)
plt.show()

The above plot shows estimated number of movies produced each year with 2015 having the highest number of movie released and 1978 had the lowest number of movies released that year.

In [38]:
# Release year of every movie each user rated (rows without a year are dropped)
years = pd.merge(train, movies, on='movieId')[['userId','year']].dropna()
years['year'] = years['year'].astype('int64')
# Average release year per user. The result of this groupby used to be
# discarded, so the histogram showed the raw release years instead of the
# per-user averages named on the x axis.
avg_year_per_user = years.groupby('userId')['year'].mean()

# Set plot size
sns.set(rc={'figure.figsize':(12,9)})

plt.figure(figsize=(12,10))
avg_year_per_user.plot(kind='hist')
plt.ylabel('Frequency')
plt.xlabel('Average release year per user')
plt.title('Distribution of release years for movies rated by users')
plt.show()

The above graph shows the ratings per year (release year averaged per user) in this dataset. It illustrates the point that certain users have preferences regarding release year, which may be useful when performing predictive modelling.

Next, to explore some of the highest-rated movies in this dataset, the timestamp column of the train dataset is converted to datetime format and the years are extracted.

In [39]:
# Convert the Unix timestamps (seconds) to datetimes in one vectorised call.
# Unlike a row-wise datetime.fromtimestamp, this does not depend on the
# kernel's local timezone (the result is in UTC) and avoids a Python-level
# call per row.
train['datetime_of_rating'] = pd.to_datetime(train['timestamp'], unit='s')
train['year_rated'] = train['datetime_of_rating'].dt.year
# Working copy without the two time columns that are no longer needed here
train_temp = train.drop(['timestamp', 'datetime_of_rating'], axis = 1)
train_temp.head()

In [40]:
# Group and count ratings by year
year_rated_df = train_temp[['year_rated','rating']].groupby('year_rated').count()

# Set plot size
sns.set(rc={'figure.figsize':(12,9)})

# One point per year; the title was copied from the director plot and did not
# describe this figure.
year_ax = sns.scatterplot(x = 'year_rated', y = 'rating', data = year_rated_df.reset_index())
year_ax.set_title('Number of ratings per year')
plt.xlabel('Year rated')
plt.ylabel('number of ratings')
plt.show()

From the plot above we can see that all ratings were only created between 1995 and 2020, with no clear correlation between year within that range and number of ratings created/collected.

## Rating timestamp exploration
Further analysis is performed to see whether day of the week (of rating creation/collection) influences the rating itself.

In [41]:
# Convert the timestamp values into date time format.
# The timestamps are Unix epoch seconds (they are passed to
# datetime.fromtimestamp earlier in this notebook), so the unit must be 's';
# unit='ms' placed every rating in January 1970 and produced wrong weekdays.
train['timestamp'] = pd.to_datetime(train['timestamp'], unit='s')
train.head()

In [42]:
# Translate the weekday number of every rating (0 = Monday ... 6 = Sunday)
# into its short name
days = {0:'Mon',1:'Tue',2:'Wed',3:'Thur',4:'Fri',5:'Sat',6:'Sun'}
weekday_numbers = train['timestamp'].dt.dayofweek
train['day_of_week'] = [days[number] for number in weekday_numbers]
train.tail()

In [43]:
# Plot of the share of ratings created on each day of the week.
# normalize=True turns the counts into proportions so that the bars match the
# y-axis label (raw counts were plotted before).
train['day_of_week'].value_counts(normalize=True).plot(kind='bar')
plt.title('Ratings per day of the week')
plt.xlabel('Day of the week')
plt.ylabel('Proportion of ratings created/collected')
plt.show()

**Observation**

The graph above shows that on average, movies were rated more often on Sundays and Saturdays.
A possible explanation is that more people are at home/at the cinema watching movies on Saturdays and Sundays.

**Genre and Tag Exploration**

In [44]:
# Join the genre labels of all movies into a single text corpus
metadata['genres'] = metadata['genres'].astype('str')
genre_corpus = ' '.join(metadata['genres'])
# Words to leave out of the cloud
stopword = ['no genres', 'no', 'genres', 'genre', 'listed']
# Generating the wordcloud
genre_cloud_builder = WordCloud(stopwords=stopword, background_color='black',
                                height=2000, width=4000)
genre_wordcloud = genre_cloud_builder.generate(genre_corpus)
plt.figure(figsize=(16, 8))
plt.imshow(genre_wordcloud)
plt.axis('off')
plt.show()

We can see that majority of the movies in the dataset are Comedy, Drama and Sci-Fi, and Romance, closely followed by Action, Adventure, Crime, and Thriller.

In [45]:
# Violin plot of the runtime column of the merged movie metadata
runtime_ax = sns.violinplot(data=metadata, x='runtime')
runtime_ax.set_title('Distribution of Movie Duration')
plt.show()

In [46]:
# Mean and standard deviation of the runtime column
runtime_mean = metadata['runtime'].mean()
runtime_std = metadata['runtime'].std()
print('runtime mean: ', runtime_mean)
print('runtime standard deviation: ', runtime_std)

Most of the runtimes are centered around the mean as shown by the violin plot above and the variation between the length of the movies is quite small, as shown by the standard deviation. There are also a few outliers. Also, realistically speaking, runtime is not really important in determining if a person will watch a movie because if they really do not like watching long movies, they can simply watch it in intervals.



## **Recommender Choice**

The two types of filtering recommender systems are content-based and collaborative. The image below is a summary of the type of features that each system could use to make recommendations.

Next is a summary of how each system creates recommendations, how it generates predictions, and its drawbacks; then we finally delve into the modelling.

![image.png](attachment:4bf52474-d32f-4812-ad62-bf671bfbd26f.png)

**Content Based:**

Content Based movie recommender systems suggest items based on similarity to a movie that they already positively rated/interacted with. In the context of movies, such a system could use features such as director, cast, genre or description of a movie to make recommendations.

Though content-based filtering doesn't need any data about users to make recommendations, a drawback is it tends to return on average items in a similar category with little variation across the recommendations.

# MODELLING

In [47]:
# Define sample size of 500 000 taken from train dataset
# The samples are taken without replacement; the seed makes the sample (and
# therefore every score below) reproducible across runs.
train_sample = train.sample(n = 500000, replace = False, random_state = 42)
# Keep exactly the three columns Surprise expects, in the order
# user, item, rating. Selecting them (instead of dropping the helper columns
# added during the EDA) also works when those EDA cells were not run.
data = train_sample[['userId', 'movieId', 'rating']]
# Define the Reader object by specifying the rating scale in the dataset
reader = Reader(rating_scale=(data['rating'].min(), data['rating'].max()))
# Load the dataset from the pandas dataframe
data2 = Dataset.load_from_df(data, reader)

#Splitting dataset into train and validation sets
train_set, val_set = train_test_split(data2, test_size=0.15, random_state=42)

## **Collaborative Filtering:**
Collaborative filtering movie recommender systems are said to be more widely used and make recommendations to users based on the similarity between users, in this case it would be the recommendation of a movie or the likely prediction a user could rate a movie based on the ratings of similar users.

Though they are widely used in the industry, one of the drawbacks of collaborative filtering is the so called cold start problem when it comes to new users, as the system does not have sufficient data on the user to make accurate predictions.

Unlike content-based methods, collaborative algorithms return more varied results, enabling the business to introduce users to new types of content, hence increasing viewership and profit. It is for this reason that we use collaborative filtering as our main approach. The tested models are all implemented with the scikit-surprise (`surprise`) package. The algorithms are tested below and their results are summarised in the table in the Comet section.

Similarity Measure

![image.png](attachment:589088f3-defc-4aa3-8f91-c517e6b77da2.png)

**The KNNBasic is a basic nearest-neighbor based collaborative filtering algorithm derived from the basic nearest-neighbor approach.**



In [51]:
# Modeling
from surprise import model_selection
# 5-fold cross-validation; the seed keeps the folds identical between runs
kSplit = model_selection.split.KFold(n_splits=5, shuffle=True, random_state=42)

sim_options = {'name': 'cosine',
               'user_based': False  # compute  similarities between items
               }

collabKNN = KNNBasic(k=40, sim_options=sim_options)

rmseKNN = []
rmseSlope = []

for trainset, testset in kSplit.split(data2):  # iterate through the folds
    collabKNN.fit(trainset)
    predictionsKNN = collabKNN.test(testset)
    # get root means squared error
    rmseKNN.append(rmse(predictionsKNN, verbose=True))

# Dictionary for the data to log for KNN basic model
params = {'k': 40, 'sim_options': {'name': 'cosine', 'user_based': False},
          'model_type': 'KNN_basic'}
# Report the mean RMSE over all folds rather than the RMSE of the last fold only
metrics = {'RMSE': float(np.mean(rmseKNN))}

# Log the parameters and results for the KNN basic model.
# Results go through log_metrics so they are recorded as metrics in Comet,
# not as hyper-parameters.
experiment.log_parameters(params)
experiment.log_metrics(metrics)

In [53]:
# Modeling
# 5-fold cross-validation; the seed keeps the folds identical between runs
kSplit = model_selection.split.KFold(n_splits=5, shuffle=True, random_state=42)
coClus = CoClustering(n_cltr_u=4, n_cltr_i=4, n_epochs=25)
rmseCo = []
for trainset, testset in kSplit.split(data2):  # iterate through the folds
    coClus.fit(trainset)
    predictionsCoClus = coClus.test(testset)
    # get root means squared error
    rmseCo.append(rmse(predictionsCoClus, verbose=True))

# Dictionary for the data to log for CoClustering model
# (the epochs key was misspelled 'n_ephocs')
params = {'user_cluster': 4, 'item_cluster': 4,
          'n_epochs': 25, 'model_name': 'CoClustering'}
# Report the mean RMSE over all folds rather than the RMSE of the last fold only
metrics = {'RMSE': float(np.mean(rmseCo))}

# Log the parameters and results for the CoClustering model.
# Results go through log_metrics so they are recorded as metrics in Comet.
experiment.log_parameters(params)
experiment.log_metrics(metrics)

In [None]:
# Modeling
# 5-fold cross-validation; the seed keeps the folds identical between runs
kSplit = model_selection.split.KFold(n_splits=5, shuffle=True, random_state=42)
BLO = BaselineOnly()
rmseBLO = []
# iterate through the folds.
for trainset, testset in kSplit.split(data2):
    BLO.fit(trainset)
    BLOPreds = BLO.test(testset)
    # get root means squared error
    rmseBLO.append(rmse(BLOPreds, verbose=True))

# Dictionary for the data to log for BLO model.
# This used to be assigned to a misspelled name ("paramns"), so the parameters
# of the previously run model were logged for this one.
params = {'model_name': 'BaseLineOnly'}
# Report the mean RMSE over all folds rather than the RMSE of the last fold only
metrics = {'RMSE': float(np.mean(rmseBLO))}

# Log the parameters and results for the BLO model.
# Results go through log_metrics so they are recorded as metrics in Comet.
experiment.log_parameters(params)
experiment.log_metrics(metrics)

In [None]:
# Modeling
# Surprise's train_test_split and cross_validate need the Surprise Dataset
# (data2); the pandas dataframe `data` was passed before, which fails.
train_set, val_set = train_test_split(data2, test_size=0.15, random_state=42)
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
bl_algo = BaselineOnly(bsl_options=bsl_options)
# 5-fold cross-validation scores, kept for inspection
bl_cv_results = cross_validate(bl_algo, data2, measures=['RMSE'], cv=5, verbose=False)
bl_predictions = bl_algo.fit(train_set).test(val_set)
# RMSE on the validation split (computed once and reused for logging)
bl_rmse = accuracy.rmse(bl_predictions)

# Dictionary for the data to log for baseline_tuned model
params = {'model_name': 'Baseline_tuned'}
metrics = {'RMSE': bl_rmse}

# Log the parameters and results for the baseline_tuned model.
# Results go through log_metrics so they are recorded as metrics in Comet.
experiment.log_parameters(params)
experiment.log_metrics(metrics)

In [None]:
# Modeling
# 5-fold cross-validation; the seed keeps the folds identical between runs
kSplit = model_selection.split.KFold(n_splits=5, shuffle=True, random_state=42)
KNNMeans = KNNWithMeans(sim_options=sim_options)
rmseKNNMeans = []
for trainset, testset in kSplit.split(data2):  # iterate through the folds
    KNNMeans.fit(trainset)
    KNNMeansPreds = KNNMeans.test(testset)
    # Get root means squared error
    rmseKNNMeans.append(rmse(KNNMeansPreds, verbose=True))

# Dictionary for the data to log for KNNMeans model
params = {'model_name': 'KNNWithMeans'}
# Report the mean RMSE over all folds rather than the RMSE of the last fold only
metrics = {'RMSE': float(np.mean(rmseKNNMeans))}

# Log the parameters and results for the KNNMeans model.
# Results go through log_metrics so they are recorded as metrics in Comet.
experiment.log_parameters(params)
experiment.log_metrics(metrics)

In [None]:
# Modeling
# 5-fold cross-validation; the seed keeps the folds identical between runs
kSplit = model_selection.split.KFold(n_splits=5, shuffle=True, random_state=42)
KNNZsco = KNNWithZScore(sim_options=sim_options)
rmseKNNZscore = []
for trainset, testset in kSplit.split(data2):  # iterate through the folds.
    KNNZsco.fit(trainset)
    KNNZscorePreds = KNNZsco.test(testset)
    # Get root means squared error
    rmseKNNZscore.append(rmse(KNNZscorePreds, verbose=True))

# Dictionary for the data to log for KNNWithZScore model
params = {'model_name': 'KNNWithZscore'}
# Report the mean RMSE over all folds rather than the RMSE of the last fold only
metrics = {'RMSE': float(np.mean(rmseKNNZscore))}

# Log the parameters and results for the KNNWithZscore model.
# Results go through log_metrics so they are recorded as metrics in Comet.
experiment.log_parameters(params)
experiment.log_metrics(metrics)

In [None]:
# Modeling
# 5-fold cross-validation; the seed keeps the folds identical between runs
kSplit = model_selection.split.KFold(n_splits=5, shuffle=True, random_state=42)
KNNBaseL = KNNBaseline(sim_options=sim_options)
rmseKNNBaseL = []
for trainset, testset in kSplit.split(data2):  # iterate through the folds.
    KNNBaseL.fit(trainset)
    KNNBaseLPreds = KNNBaseL.test(testset)
    # Get root means squared error
    rmseKNNBaseL.append(rmse(KNNBaseLPreds, verbose=True))

# Dictionary for the data to log for KNNBaseLine model
params = {'model_name': 'KNNBaseLine'}
# Report the mean RMSE over all folds rather than the RMSE of the last fold only
metrics = {'RMSE': float(np.mean(rmseKNNBaseL))}

# Log the parameters and results for the KNNBaseLine model.
# Results go through log_metrics so they are recorded as metrics in Comet.
experiment.log_parameters(params)
experiment.log_metrics(metrics)

In [None]:
# Modelling
# 5-fold cross-validation; the seed keeps the folds identical between runs
kSplit = model_selection.split.KFold(n_splits=5, shuffle=True, random_state=42)
# The instance gets its own name: "NMF = NMF()" replaced the imported class
# with an instance, so the cell failed when it was run a second time.
nmf_algo = NMF()
rmseNMF = []
for trainset, testset in kSplit.split(data2): #iterate through the folds.
    nmf_algo.fit(trainset)
    NMFPreds = nmf_algo.test(testset)
    rmseNMF.append(rmse(NMFPreds,verbose=True))#get root means squared error


# Dictionary for the data to log for the NMF model
params = {'model_name': 'NMF'}
# Report the mean RMSE over all folds rather than the RMSE of the last fold only
metrics = {'RMSE': float(np.mean(rmseNMF))}

# Log the parameters and results for the NMF model.
# Results go through log_metrics so they are recorded as metrics in Comet.
experiment.log_parameters(params)
experiment.log_metrics(metrics)

In [None]:
# Define the SVD algorithm class
svd_algo = SVD()
# Fitting the model on the train_set
svd_algo.fit(train_set)
# Predicting on the validation set
svd_predictions = svd_algo.test(val_set)
# Calculating the RMSE of the predictions (once; the value is reused for logging)
svd_rmse = accuracy.rmse(svd_predictions)

# Dictionary for the data to log for SVD model
params = {'model_name': 'SVD'}
metrics = {'RMSE': svd_rmse}

# Log the parameters and results for the SVD model.
# Results go through log_metrics so they are recorded as metrics in Comet.
experiment.log_parameters(params)
experiment.log_metrics(metrics)

In [None]:
rating_scale = Reader(rating_scale=(0.5, 5))
# Surprise's load_from_df needs exactly three columns in the order
# user, item, rating. Dropping only 'timestamp' left the helper columns added
# during the EDA in the frame, which makes the loader fail.
train_df = Dataset.load_from_df(train[['userId', 'movieId', 'rating']], rating_scale)

# Training and validation set split for hypertuning
train_set, val_set = train_test_split(train_df,
                                      test_size=0.008,
                                      random_state=42)

# Modelling of the SVD hypertuning
svd_algo_hyper = SVD(n_factors=160,
                     lr_all=0.0085,
                     reg_all=0.02,
                     n_epochs=20,
                     init_std_dev=0.01)
svd_algo_hyper.fit(train_set)

# Predicting on the validation set
svd_hyper_predictions = svd_algo_hyper.test(val_set)

# RMSE on the validation split (computed once and reused for logging)
svd_hyper_rmse = accuracy.rmse(svd_hyper_predictions)

# Dictionary for the data to log for the SVD tuned model
params = {'model_name': 'SVD_Tuned'}
metrics = {'RMSE': svd_hyper_rmse}

# Log the parameters and results for the SVD tuned model.
# Results go through log_metrics so they are recorded as metrics in Comet.
experiment.log_parameters(params)
experiment.log_metrics(metrics)
# End the experiment for the SVD tuned experiment
experiment.end()

In [None]:
# Summary of the comet experiments: RMSE recorded for each model
model_rmse = {
    'NMF': 1.16,
    'KNN Basic': 1.04,
    'BaseLineOnly': 1.00,
    'CoClustering': 1.12,
    'SVD': 0.92,
    'SVD_tuned': 0.91,
    'BaselineTuned': 0.92,
    'KNNwithZscore': 1.03,
    'KNNBaseLine': 0.98,
    'KNNWithMeans': 1.04,
    'SVDNewTuned': 0.78,
}
models = list(model_rmse.keys())
rmse_scores = list(model_rmse.values())
# One row per model with its score in the 'RMSE' column
comet_scores = pd.DataFrame.from_dict(model_rmse, orient='index', columns=['RMSE'])
# Best (lowest) RMSE first
comet_scores.sort_values(by='RMSE')

In [None]:
# Predict a rating for every (userId, movieId) pair of the test set.
# Iterating over the two columns directly avoids iterrows(), which builds a
# Series for every row and is very slow on a frame of this size.
pred_svd_hyper = [
    svd_algo_hyper.predict(user_id, movie_id)
    for user_id, movie_id in zip(test['userId'], test['movieId'])
]

# Converting the predictions to a dataframe
test_pred_svd_hyper = pd.DataFrame(pred_svd_hyper)

In [None]:
#renaming the fields of the prediction dataframe and keeping only the
#user, movie and estimated rating columns.
# Done without inplace=True and by selecting the wanted columns (instead of
# dropping 'r_ui' and 'details'), so the cell can be re-run without raising.
test_pred_svd_hyper = (
    test_pred_svd_hyper
    .rename(columns={'uid': 'userId', 'iid': 'movieId', 'est': 'rating'})
    .loc[:, ['userId', 'movieId', 'rating']]
)
test_pred_svd_hyper.head()

In [None]:
# Concatenating each userId and movieId to a single Id column for submission
# ("<userId>_<movieId>"). The former .str.zfill(1) calls were removed: padding
# to a width of 1 never changes a non-empty id string.
user_part = test_pred_svd_hyper['userId'].astype(str)
movie_part = test_pred_svd_hyper['movieId'].astype(str)
test_pred_svd_hyper['Id'] = user_part + '_' + movie_part

In [None]:
# Keep only the two columns used for the submission: the combined Id and the
# predicted rating
submission_columns = ['Id', 'rating']
svd_hyper_predictions = test_pred_svd_hyper[submission_columns]
svd_hyper_predictions.head()

In [None]:
# Persist the tuned SVD model to disk.
# The former first line, pickle.dump(model, open(save_path, 'wb')), used two
# names that are never defined in this notebook and left the file handle
# open; the with-block below is the single, complete save step.
model_save_path = "team2_unsupervised_recommender.pkl"
with open(model_save_path, 'wb') as file:
    pickle.dump(svd_algo_hyper, file)