### TODO
- Everything
- Drop users that have rated very few movies -> How to decide the threshold?
- Decide on the best approach to collaborative filtering (model-based or memory-based)
- Build the recommender system (can it be as simple as kNN?)

### DONE
- File loading and creation of the DataFrame

In [None]:
# Imports
import os
import pandas as pd
import numpy as np
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt

from surprise import Reader, Dataset, SVD, KNNWithMeans
from surprise.model_selection import cross_validate, GridSearchCV


# Not being used yet.
# NOTE: the GridSearchCV imported from sklearn below shadows the one imported from surprise above.
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

from scipy.sparse import csr_matrix

from scipy.spatial import Voronoi, voronoi_plot_2d
from imblearn.over_sampling import SMOTE

sns.set_theme(style="darkgrid")

In [None]:
# Read Movie Titles file
def readMovieTitle(file_path):
    """Parse the movie titles file into a DataFrame.

    Each non-blank line is expected to look like ``<id>,<year>,<title>``.
    Only the first two commas are treated as separators, so titles that
    themselves contain commas are kept intact.

    Args:
        file_path: Path to the titles file (decoded as ISO-8859-1).

    Returns:
        A pandas DataFrame with the columns ``Movie_Id``, ``Release_Year``
        and ``Title``; every value is the raw string read from the file.

    Raises:
        ValueError: If a non-blank line has fewer than three comma-separated
            fields.
    """
    data_dict = {'Movie_Id' : [], 'Release_Year' : [], 'Title' : []}
    # The context manager closes the file even when a malformed line raises.
    with open(file_path, "r", encoding='ISO-8859-1') as data_file:
        for line in data_file:
            if not line.strip():
                # Tolerate blank lines (e.g. an empty line at the end of the file).
                continue
            movie_id, year, title = line.split(',', 2)
            data_dict['Movie_Id'].append(movie_id)
            data_dict['Release_Year'].append(year)
            data_dict['Title'].append(title.rstrip('\n'))

    return pd.DataFrame(data_dict)

In [None]:
# The dataset is very large, so `flag` can be set to load only the first `rows` lines of a file.
# The first 100k lines take about 0.2s to load; the whole dataset takes almost 2 minutes.
def readFile(file_path, rows = 100000, flag = False):
    """Parse a ratings file into a DataFrame.

    The file mixes two kinds of lines: movie headers of the form
    ``<movie id>:`` and rating lines of the form
    ``<customer id>,<rating>,<date>``. Every rating line is attributed to the
    most recent movie header above it.

    Args:
        file_path: Path to the ratings file.
        rows: Maximum number of file lines to read when ``flag`` is true.
            Header lines and blank lines count towards this limit.
        flag: When true, stop after ``rows`` lines; otherwise read everything.

    Returns:
        A pandas DataFrame with the columns ``Cust_Id``, ``Movie_Id``,
        ``Rating`` and ``Date``. ``Movie_Id`` holds ints; the other columns
        hold the strings read from the file.

    Raises:
        ValueError: If a rating line appears before any movie header, if a
            header is not an integer, or if a rating line does not have
            exactly three comma-separated fields.
    """
    data_dict = {'Cust_Id' : [], 'Movie_Id' : [], 'Rating' : [], 'Date' : []}
    movie_id = None
    # The context manager closes the file even when a malformed line raises.
    with open(file_path, "r") as data_file:
        for count, line in enumerate(data_file, start=1):
            if flag and (count > rows):
                break

            record = line.strip()
            if not record:
                # Tolerate blank lines (e.g. an empty line at the end of the file).
                continue

            if record.endswith(':'):
                # Drop only the trailing ':'; slicing a fixed number of
                # characters breaks when the line has no trailing newline.
                movie_id = int(record[:-1])
                continue

            if movie_id is None:
                raise ValueError(
                    'Rating line {} in {} appears before any movie header'.format(count, file_path)
                )

            customer_id, rating, date = record.split(',')
            data_dict['Cust_Id'].append(customer_id)
            data_dict['Movie_Id'].append(movie_id)
            data_dict['Rating'].append(rating)
            data_dict['Date'].append(date)

    return pd.DataFrame(data_dict)

In [None]:
# Parse the titles file and cast the Movie_Id column to int in one step
titles_path = 'data/movie_titles.csv'
df_title = readMovieTitle(titles_path).astype({'Movie_Id': int})
df_title.head(10)

In [None]:
# Show the title entry whose Movie_Id is 1
is_first_movie = df_title['Movie_Id'] == 1
df_title[is_first_movie]

In [None]:
# Loading the user data
# Each frame holds: User ID, Movie ID, Rating and Date
flag_limit = True
rating_files = (
    'data/combined_data_1.txt',
    'data/combined_data_2.txt',
    'data/combined_data_3.txt',
    'data/combined_data_4.txt',
)
# Read every file with the same row-limit flag and cast Rating to float
df1, df2, df3, df4 = [
    readFile(path, flag = flag_limit).astype({'Rating': float})
    for path in rating_files
]

In [None]:
# Creation of the DataFrame with all the data.
# All four partial frames are concatenated (df1 included -- it used to be
# overwritten and silently dropped). ignore_index=True renumbers rows 0..n-1.
df = pd.concat([df1, df2, df3, df4], ignore_index=True)

# Too much data (about 14gb in total), so the partial frames are released as
# soon as they have been copied into df, before the merge allocates more.
del df1, df2, df3, df4

# Complete DataFrame with the movie titles (not strictly necessary).
# Movie_Id is the only column the two frames share, so it is named explicitly
# as the join key.
df = df.merge(df_title, how='left', on='Movie_Id')
df = df.loc[:, ['Cust_Id', 'Movie_Id', 'Title', 'Release_Year', 'Rating', 'Date']]

In [None]:
# Preview the first ten rows of the combined ratings frame
df.iloc[:10]

In [None]:
# Count how many times each rating value was given
ratings_df = df.groupby('Rating')['Rating'].agg(['count'])

# Number of unique movies
movie_count = df['Movie_Id'].nunique()

# Number of unique customers
cust_count = df['Cust_Id'].nunique()

# Number of ratings
rating_count = df['Cust_Id'].count()

# One horizontal bar per rating value, in index order (bar positions 0..n-1)
ax = ratings_df.plot(kind = 'barh', legend = False, figsize = (15,5))

plt.title(f'Total pool: {movie_count} Movies, {cust_count:,} customers, {rating_count:,} ratings given', fontsize=18)
plt.axis('off')

# Label every bar with its rating value and its share of all ratings.
# Iterating over the rows that actually exist avoids positional integer
# indexing on a Series and keeps working when a sample contains fewer than
# five distinct rating values.
total_ratings = ratings_df['count'].sum()
for bar_pos, (rating, n_ratings) in enumerate(ratings_df['count'].items()):
    ax.text(
        n_ratings / 4,
        bar_pos,
        'Rating {:.0f}: {:.0f}%'.format(rating, n_ratings * 100 / total_ratings),
        color = 'white',
        weight = 'bold',
    )

In [None]:
# Number of ratings given by each customer
ratings_by_customer = df.groupby('Cust_Id')['Rating']
n_df = ratings_by_customer.agg(['count'])

# Largest number of ratings given by a single customer
most_reviews = n_df['count'].max()
print(most_reviews)

# Histogram of ratings-per-customer, drawn with log=True
n_df.hist(log=True)


In [None]:
f = ['count','mean']


def _rating_summary(key):
    """Summarise the ratings in df grouped by the column `key`.

    Returns a tuple (summary, benchmark, drop_index):
      summary    -- count and mean of Rating per group, with an int index
      benchmark  -- 70th percentile of the per-group counts, rounded
      drop_index -- index of the groups whose count is below the benchmark
    """
    summary = df.groupby(key)['Rating'].agg(f)
    summary.index = summary.index.map(int)
    benchmark = round(summary['count'].quantile(0.7), 0)
    below_benchmark = summary['count'] < benchmark
    return summary, benchmark, summary[below_benchmark].index


df_movie_summary, movie_benchmark, drop_movie_list = _rating_summary('Movie_Id')

print('Movie minimum times of review: {}'.format(movie_benchmark))

df_cust_summary, cust_benchmark, drop_cust_list = _rating_summary('Cust_Id')

print('Customer minimum times of review: {}'.format(cust_benchmark))

In [None]:
# reader = Reader(rating_scale=(1, 5))
# svd = SVD()

# data = Dataset.load_from_df(df[['Cust_Id', 'Movie_Id', 'Rating']][:], reader)
# #data.split(n_folds=3)

# cross_validate(svd, data, measures=['RMSE', 'MAE'])

In [None]:
# Favorite movies (rating of 4 or more) of one chosen user.
# df['Cust_Id'].mode() can be used to pick the users with the most reviews.
from_user = df['Cust_Id'] == '305344'
rated_high = df['Rating'] >= 4
df.loc[from_user & rated_high]

In [None]:
# Creating the dataset inside Surprise
# By default Surprise accepts only these 3 columns, in this order \/
rating_columns = ['Cust_Id', 'Movie_Id', 'Rating']

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[rating_columns], reader)

# Trainset built from every rating in data (nothing is held out)
trainingSet = data.build_full_trainset()

In [None]:
# Using SVD for recommendation:
# fit the model on the full trainset, estimate the score the user would give
# to every movie in df_title, and print the ten highest estimates.

# df['Cust_Id'] holds the customer ids as strings, so the raw user id given to
# predict() must be a string too. With the int 305344 Surprise does not find
# the user in the trainset and the estimates are not personalised.
target_user = '305344'

svd = SVD()
svd.fit(trainingSet)

user_305344 = df_title.copy()
user_305344 = user_305344.reset_index()

# Movie ids are ints in both df and df_title, so they are passed through as-is
user_305344['Estimate_Score'] = user_305344['Movie_Id'].apply(lambda x: svd.predict(target_user, x).est)

user_305344 = user_305344.drop('Movie_Id', axis = 1)

user_305344 = user_305344.sort_values('Estimate_Score', ascending=False)
print(user_305344[['Release_Year','Title', 'Estimate_Score']].head(10))

In [None]:
# Using kNN with Means
parameters = {
    'name' : 'msd',
    'user_based' : False, # False -> similarities are computed between items (movies), not between users
    'min_support' : 3
}

# NOTE(review): param_grid is not consumed anywhere in this cell. Surprise's
# GridSearchCV would expect the key 'sim_options' (plural) with a list of
# candidates per option -- confirm before using it for a search.
param_grid = {'sim_option' : parameters}

algo = KNNWithMeans(sim_options=parameters)

algo.fit(trainingSet)

# User ID + Movie ID
# df['Cust_Id'] holds the customer ids as strings, so the raw user id must be
# passed as a string; an int is treated as an unknown user.
prediction = algo.predict('305344', 9235)
print(prediction.est)

In [None]:
# Grid Search to find the best parameters for SVD

# The notebook imports GridSearchCV twice (from surprise, then from sklearn),
# so the bare name refers to scikit-learn's class, which does not accept
# `measures`. Import Surprise's class under an explicit alias instead.
from surprise.model_selection import GridSearchCV as SurpriseGridSearchCV

param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6]
}
# Every parameter combination is evaluated with 3-fold cross-validation
gs = SurpriseGridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)

gs.fit(data)

# Best RMSE and the parameter combination that produced it
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

In [None]:
# Applying the best parameters to predict the score of the same movie and user predicted by kNN
algo = gs.best_estimator["rmse"]
# The estimator returned by the grid search is refit here on the full trainset
algo.fit(data.build_full_trainset())

# df['Cust_Id'] holds the customer ids as strings, so the raw user id must be
# passed as a string; an int is treated as an unknown user.
prediction = algo.predict('305344', 9235)
print(prediction.est)