# Library

In [1]:
import os
import shutil
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.metrics import mean_squared_error
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
def display_full(x, columns=False):
    """Render a DataFrame with the notebook's ``display`` without row truncation.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to render; ``display.max_rows`` is raised to ``len(x)`` for the call.
    columns : bool, default False
        When True, ``display.max_columns`` is also raised to ``len(x.columns)``.

    Notes
    -----
    After the call (whether or not rendering succeeded) the notebook-wide
    limits are set to ``max_rows=500``, ``max_columns=500`` and ``width=1000``.
    """
    pd.set_option('display.max_rows', len(x))
    if columns:
        pd.set_option('display.max_columns', len(x.columns))

    try:
        display(x)
    finally:
        # Always put the notebook-wide limits back, even if rendering raises;
        # otherwise the temporary "show everything" limits would leak into
        # every later cell.
        pd.set_option('display.max_rows', 500)
        pd.set_option('display.max_columns', 500)
        pd.set_option('display.width', 1000)

# Import Data

In [3]:
# Colab-only step: mount Google Drive at /content/drive so the dataset archive
# stored there can be read by the cells below.
from google.colab import drive
# force_remount=True presumably redoes the mount even if one already exists — TODO confirm
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# Unpack the MovieLens archive from Drive and load its four CSV tables.
archive_path = '/content/drive/MyDrive/Colab Dataset/movielens-latest.zip'
extract_dir = '/content/movie_list/'
shutil.unpack_archive(archive_path, extract_dir)

# Single source of truth for the dataset folder; every CSV path is built from it.
main_path = '/content/movie_list/ml-latest'

movie_data = pd.read_csv(os.path.join(main_path, 'movies.csv'))
display(movie_data.head())
link_data = pd.read_csv(os.path.join(main_path, 'links.csv'))
display(link_data.head())
rating_data = pd.read_csv(os.path.join(main_path, 'ratings.csv'))
display(rating_data.head())
tags_data = pd.read_csv(os.path.join(main_path, 'tags.csv'))
display(tags_data.head())


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


Unnamed: 0,userId,movieId,tag,timestamp
0,14,110,epic,1443148538
1,14,110,Medieval,1443148532
2,14,260,sci-fi,1442169410
3,14,260,space action,1442169421
4,14,318,imdb top 250,1442615195


# 1.Collaborative Filtering with Matrix Factorization (from Scratch)

## Data Wrangling

Collaborative filtering suffers from the user cold-start problem: the model may not be able to provide a decent recommendation list for users who have given only a small number of ratings, because it lacks information about their preferences.


In this stage, we select users who have rated at least 2000 movies and movies that have been rated by at least 1000 users (this helps reduce the table size, because I have limited resources to process a massive table).

In [5]:
# Keep only "heavy" users: those with at least n_interacted ratings.
n_interacted = 2000
user_movie_data_temp = pd.pivot_table(rating_data, index=['userId'], values='movieId', aggfunc='count')
selected_user_ids = user_movie_data_temp[user_movie_data_temp.movieId >= n_interacted].index
print('number of userIds: ', str(len(selected_user_ids)))

# Keep only popular movies: those rated by at least n_rated users.
n_rated = 1000
get_rated_movie = pd.pivot_table(rating_data, index=['movieId'], values='userId', aggfunc='count')
selected_movie_ids = get_rated_movie[get_rated_movie.userId >= n_rated].index

print('numbser of movieIds: ', str(len(selected_movie_ids)))

keep_rows = rating_data['userId'].isin(selected_user_ids) & rating_data['movieId'].isin(selected_movie_ids)
# .copy() makes the filtered frame independent of rating_data, so the column
# assignment below is a plain write rather than a write into a slice.
filtered_rating_data = rating_data[keep_rows].copy()
# Prefix movie ids with 'm_' so they become string labels (vectorised, no row-wise apply).
filtered_rating_data['movieId'] = 'm_' + filtered_rating_data['movieId'].astype(str)

print('raw data shape.  : ',str(filtered_rating_data.shape))


number of userIds:  424
numbser of movieIds:  3931
raw data shape.  :  (736327, 4)


In [6]:
# Drop every column except the (user, movie, rating) triple used for modelling.
filtered_rating_data = filtered_rating_data[['userId','movieId','rating']]
# Bare last expression: shows the trimmed frame as the cell output.
filtered_rating_data

Unnamed: 0,userId,movieId,rating
128418,1272,m_1,3.5
128419,1272,m_7,2.0
128420,1272,m_11,4.0
128421,1272,m_16,2.5
128422,1272,m_21,3.0
...,...,...,...
27735125,283000,m_177615,4.0
27735126,283000,m_177765,4.0
27735140,283000,m_179819,4.0
27735141,283000,m_180031,3.5


## train test split

In [7]:
# Bind sklearn's splitter to an explicit alias for this cell. A later cell
# imports surprise.model_selection.train_test_split under the same bare name,
# so relying on `train_test_split` here would break if this cell is re-run
# after that import.
from sklearn.model_selection import train_test_split as sklearn_train_test_split

# 80/20 split, stratified by user so every user appears in both parts.
train_df, test_df = sklearn_train_test_split(
    filtered_rating_data,
    stratify=filtered_rating_data['userId'],
    test_size=0.2,
    random_state=42,
)

print('train_df size:{}'.format(len(train_df)))
print('test_df size:{}'.format(len(test_df)))

train_df size:589061
test_df size:147266


While the information we require is present, it is not presented in a way that is beneficial for humans to comprehend. However, I have created a table that presents the same data in a format that is easier for humans to understand.

In [8]:
# Reshape the long (userId, movieId, rating) training rows into a wide
# user x movie table. Pairs with no rating are filled with 0.0; the
# matrix_factorization function below only trains on entries > 0.
user_movie_data_train = train_df.pivot(index='userId', columns='movieId', values='rating').fillna(0.0)
# Bare last expression: shows the pivoted table as the cell output.
user_movie_data_train

movieId,m_1,m_10,m_100,m_100163,m_1003,m_100383,m_1004,m_100498,m_1005,m_1006,...,m_98961,m_99007,m_991,m_99112,m_99114,m_99149,m_994,m_996,m_99813,m_999
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1272,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.5,0.0,3.5,0.0,0.0,0.0
2025,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,3.5,0.0,2.5,4.0,3.0,0.0,0.0,0.0,0.0
2150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0
2294,0.0,0.0,4.0,1.5,0.0,3.5,0.0,2.0,0.0,0.0,...,2.0,2.5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0
2329,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,3.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280585,5.0,3.5,3.5,0.0,0.0,0.0,0.0,0.0,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
280868,0.0,2.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,0.0,4.0,0.0,3.5,0.0,0.0,0.0
281631,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
281790,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Concept of Matrix Factorization



In [9]:
def matrix_factorization(R, K, steps=5, alpha=0.002, beta=0.02, seed=None, tol=0.001):
    '''
    Factorize a rating matrix into user and item latent factors with SGD.

    R: rating matrix (2-D, users x items); only entries > 0 are treated as observed
    P: |U| * K (User features matrix)
    Q: |D| * K (Item features matrix)
    K: latent features
    steps: iterations (epochs over all observed entries)
    alpha: learning rate
    beta: regularization parameter
    seed: optional int; when given, P and Q are initialised from a private
          RandomState(seed) so the result is reproducible. When None (default)
          the global numpy random state is used.
    tol: training stops early once the regularized squared error falls below this

    Returns (P, Q) with shapes (|U|, K) and (|D|, K); R is approximated by P @ Q.T.
    Raises ValueError if R is not 2-dimensional.
    '''
    R = np.asarray(R, dtype=float)
    if R.ndim != 2:
        raise ValueError('R must be a 2-D rating matrix, got ndim={}'.format(R.ndim))
    n_users, n_items = R.shape

    if seed is None:
        P = np.random.rand(n_users, K)
        Q = np.random.rand(n_items, K)
    else:
        rng = np.random.RandomState(seed)
        P = rng.rand(n_users, K)
        Q = rng.rand(n_items, K)
    Q = Q.T

    # Locate the observed ratings once, in row-major order, instead of
    # re-scanning every (mostly empty) cell of R on every epoch.
    obs_rows, obs_cols = np.nonzero(R > 0)
    observed = list(zip(obs_rows.tolist(), obs_cols.tolist()))

    for step in range(steps):
        print('Processing epoch {}'.format(step))

        for i, j in observed:
            eij = R[i, j] - np.dot(P[i, :], Q[:, j])
            # All K factors are updated at once. As in the per-factor loop this
            # replaces, the item update deliberately uses the already-updated
            # user factors.
            p_new = P[i, :] + alpha * (2 * eij * Q[:, j] - beta * P[i, :])
            Q[:, j] = Q[:, j] + alpha * (2 * eij * p_new - beta * Q[:, j])
            P[i, :] = p_new

        # Regularized squared error over the observed entries only.
        residual = R[obs_rows, obs_cols] - np.sum(P[obs_rows, :] * Q[:, obs_cols].T, axis=1)
        e = np.sum(residual ** 2) + (beta / 2) * (
            np.sum(P[obs_rows, :] ** 2) + np.sum(Q[:, obs_cols] ** 2)
        )
        if e < tol:
            break

    return P, Q.T

In [10]:
# Seed numpy's global generator first: matrix_factorization draws its initial
# factors from np.random, so without a seed every run gives different results.
np.random.seed(42)

# Dense users x movies array of training ratings (0.0 = not rated).
R = np.array(user_movie_data_train)
# nP: user factors, nQ: movie factors, both with K=10 latent features.
nP, nQ = matrix_factorization(R, K=10)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4


Transform the reconstructed prediction matrix back into a pandas DataFrame.

In [11]:
# Rebuild the full users x movies score matrix from the two factor matrices.
pred_R = nP @ nQ.T

# Wrap the reconstructed matrix in a DataFrame with the same cross-tabular
# layout (user rows, movie columns) as the training pivot table.
user_movie_pred = pd.DataFrame(
    data=pred_R,
    index=list(user_movie_data_train.index),
    columns=user_movie_data_train.columns,
)
print(user_movie_pred.shape)
user_movie_pred.head(10)

(424, 3931)


movieId,m_1,m_10,m_100,m_100163,m_1003,m_100383,m_1004,m_100498,m_1005,m_1006,...,m_98961,m_99007,m_991,m_99112,m_99114,m_99149,m_994,m_996,m_99813,m_999
1272,3.735673,3.02786,2.763605,2.455705,2.652861,3.181598,2.157176,2.13472,1.969594,2.648396,...,3.591778,2.960264,3.080945,3.253201,3.740445,2.992435,3.641021,2.64759,3.226346,2.921317
2025,4.607745,3.740133,3.480333,3.144934,3.179747,3.985038,2.518903,2.759624,2.321163,3.074867,...,4.100505,3.658158,3.812278,3.853485,4.487234,3.639213,4.333279,3.189702,4.049339,3.506791
2150,4.272514,3.410622,3.133937,2.872539,3.07255,3.753063,2.498425,2.547928,2.147251,2.982076,...,3.975518,3.422284,3.547729,3.681591,4.23923,3.403992,4.1347,3.113936,3.826688,3.404749
2294,3.48319,2.843854,2.607246,2.364914,2.315838,2.986017,1.951723,2.092785,1.849122,2.392773,...,3.251155,2.744832,2.944677,2.909869,3.441271,2.822187,3.386765,2.381784,2.981357,2.724866
2329,4.446563,3.573345,3.297245,2.978194,3.068511,3.868279,2.492232,2.667031,2.225207,3.070688,...,4.058573,3.550056,3.716832,3.777455,4.32772,3.535663,4.252082,3.102598,3.956291,3.466916
2551,3.987597,3.024001,2.973447,2.679168,2.527373,3.265841,1.861138,2.109318,1.703336,2.405896,...,3.170439,2.857409,3.2407,2.961413,3.687116,3.068296,3.582,2.551238,3.189968,2.579945
2629,4.499855,3.592571,3.311423,2.984118,3.144853,3.856156,2.532812,2.640916,2.261542,3.050234,...,4.134764,3.609865,3.725442,3.786054,4.423049,3.60241,4.299265,3.182454,3.929446,3.469604
4027,4.671176,3.684927,3.369455,3.141599,3.311902,4.029169,2.695433,2.715683,2.223083,3.269857,...,4.353793,3.663054,3.88092,3.916321,4.577518,3.706308,4.548788,3.334933,4.103748,3.733081
4605,3.29008,2.710631,2.461078,2.335966,2.488688,3.031681,2.057953,2.066739,1.698239,2.296406,...,3.050886,2.633356,2.724002,2.895468,3.318901,2.585429,3.207811,2.529777,3.047218,2.763222
4796,3.406018,2.798793,2.543738,2.265337,2.420186,2.912582,1.912666,2.083387,1.744077,2.316433,...,3.155505,2.859107,2.809161,2.921463,3.330606,2.709641,3.223412,2.388351,3.061513,2.701911


In [12]:
# User factor matrix: one row of latent features per userId
# (row labels taken from the training pivot table's index).
Pu = pd.DataFrame(nP, index=list(user_movie_data_train.index))
# Movie factor matrix: one row of latent features per movieId
# (row labels taken from the training pivot table's columns).
Qu = pd.DataFrame(nQ, index=user_movie_data_train.columns)

# These factor matrices can be used on their own to predict ratings for the test dataset

In [13]:
def predict_rating(data, user_factors=None, item_factors=None):
    """Predict one rating as the dot product of a user's and a movie's factors.

    Parameters
    ----------
    data : row-like object with ``userId`` and ``movieId`` attributes
        Typically one row passed by ``DataFrame.apply(..., axis=1)``.
    user_factors : pandas.DataFrame, optional
        Latent factors indexed by user id. Defaults to the notebook-level ``Pu``.
    item_factors : pandas.DataFrame, optional
        Latent factors indexed by movie id. Defaults to the notebook-level ``Qu``.

    Returns
    -------
    float
        The predicted rating, or ``np.nan`` (after printing a notice) when the
        user or movie has no row in the factor matrices.
    """
    if user_factors is None:
        user_factors = Pu
    if item_factors is None:
        item_factors = Qu

    try:
        pred_rating = np.dot(user_factors.loc[data.userId], item_factors.loc[data.movieId].T)
    except KeyError:
        # Only a missing label is an expected condition (user/movie absent from
        # training); any other error is a real bug and is allowed to propagate.
        pred_rating = np.nan
        print('Unknown user: {} or movieId: {}'.format(data.userId,data.movieId))
    return pred_rating

In [14]:
# Score every held-out (user, movie) row with predict_rating, one row at a time;
# rows whose user or movie is unknown get NaN.
test_df['pred_rating'] = test_df.apply(predict_rating, axis=1)

In [15]:
# Turn numeric user ids into 'user_<id>' labels. The guard makes the cell
# idempotent: re-running it leaves already-prefixed labels untouched instead
# of producing 'user_user_<id>'.
test_df['userId'] = test_df['userId'].apply(
    lambda x: x if str(x).startswith('user_') else 'user_' + str(x)
)
test_df

Unnamed: 0,userId,movieId,rating,pred_rating
25761256,user_263149,m_3176,3.5,4.100837
22315943,user_228128,m_7419,2.5,3.716400
24342373,user_248881,m_89774,1.5,3.090492
18711521,user_190879,m_143355,3.0,3.291458
12150140,user_124492,m_2325,4.0,2.867495
...,...,...,...,...
13061955,user_133546,m_77561,4.0,3.180395
13866589,user_141955,m_3104,3.0,3.083342
26766434,user_273271,m_3396,3.5,3.565933
7491494,user_77157,m_3548,3.0,2.854872


# Evaluation

In [16]:
# Root-mean-squared error on the held-out ratings. The square root is taken
# explicitly rather than via mean_squared_error(..., squared=False), because
# that keyword is deprecated and absent from newer scikit-learn releases.
rmse_test = float(np.sqrt(mean_squared_error(test_df['rating'], test_df['pred_rating'])))
rmse_test

0.8286958971486692

In [17]:
def precision_recall_at_k(predictions, k, threshold):
    """Compute precision@k and recall@k per user and pooled over all users.

    ``predictions`` is a DataFrame whose rows unpack, in order, into
    (user id, movie id, true rating, predicted rating). An item is "relevant"
    when its true rating is >= ``threshold`` and "recommended" when it is among
    the user's k highest predictions and its predicted rating is >= ``threshold``.

    Returns ``(precisions, recalls, global_precision, global_recall)`` where the
    first two are dicts keyed by user id and the last two are computed from the
    counts summed across users. A ratio with a zero denominator is reported as 0.
    """
    # Collect (predicted, true) pairs for each user, preserving row order.
    pairs_by_user = defaultdict(list)
    for row in predictions.values:
        user_id, _movie_id, actual, estimated = row
        pairs_by_user[user_id].append((estimated, actual))

    precisions = {}
    recalls = {}
    total_relevant = 0
    total_recommended = 0
    total_hits = 0

    for user_id, pairs in pairs_by_user.items():
        # Highest predicted rating first; ties keep their original order.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        relevant = sum((actual >= threshold) for (_, actual) in ranked)
        recommended = sum((estimated >= threshold) for (estimated, _) in top_k)
        hits = sum(
            ((actual >= threshold) and (estimated >= threshold))
            for (estimated, actual) in top_k
        )

        if recommended != 0:
            precisions[user_id] = hits / recommended
        else:
            precisions[user_id] = 0

        if relevant != 0:
            recalls[user_id] = hits / relevant
        else:
            recalls[user_id] = 0

        total_relevant += relevant
        total_recommended += recommended
        total_hits += hits

    # Pooled metrics: ratios of the summed counts, not averages of per-user ratios.
    if total_recommended != 0:
        global_precisions = total_hits / total_recommended
    else:
        global_precisions = 0

    if total_relevant != 0:
        global_recalls = total_hits / total_relevant
    else:
        global_recalls = 0

    return precisions, recalls, global_precisions, global_recalls
    

In [18]:
# Evaluate each user's 3 highest-predicted test items; a rating (true or
# predicted) of at least 4 is the cut-off used for "relevant"/"recommended".
k = 3
threshold = 4
individual_precision, individual_recall, global_precision, global_recall =  precision_recall_at_k(test_df, k, threshold)

In [19]:
# Per-user precision@k and recall@k dictionaries (one entry per user),
# followed by the two pooled scalar metrics.
print(individual_precision)
print(individual_recall)
print(global_precision)
print(global_recall)

{'user_263149': 1.0, 'user_228128': 1.0, 'user_248881': 0, 'user_190879': 0, 'user_124492': 1.0, 'user_134596': 1.0, 'user_87694': 1.0, 'user_243324': 1.0, 'user_201646': 0, 'user_281790': 0, 'user_141955': 0, 'user_169639': 0.6666666666666666, 'user_253059': 1.0, 'user_2629': 1.0, 'user_51515': 1.0, 'user_71277': 0.6666666666666666, 'user_191063': 0, 'user_111908': 0, 'user_60950': 1.0, 'user_102042': 1.0, 'user_19924': 1.0, 'user_5176': 1.0, 'user_19679': 1.0, 'user_176323': 1.0, 'user_191444': 0.5, 'user_33854': 1.0, 'user_146760': 0, 'user_27983': 0.3333333333333333, 'user_133398': 1.0, 'user_67910': 1.0, 'user_245960': 1.0, 'user_205765': 0.6666666666666666, 'user_199011': 0.6666666666666666, 'user_18490': 1.0, 'user_61055': 0, 'user_26786': 0, 'user_51571': 1.0, 'user_263685': 1.0, 'user_158194': 1.0, 'user_248694': 0, 'user_44565': 1.0, 'user_137577': 0.3333333333333333, 'user_21001': 1.0, 'user_232485': 0, 'user_123100': 1.0, 'user_93731': 1.0, 'user_181291': 1.0, 'user_274332'

# 2.Collaborative Filtering using Surprise Package

In [20]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/772.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3096322 sha256=801645eac71d695f86c36847c7210c37275b932783a6f76f52b40a55ee68d4cb
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [21]:
from surprise import Dataset, Reader, SVD
# NOTE(review): this rebinds the bare name `train_test_split`, which the first
# import cell bound to sklearn.model_selection.train_test_split. From here on
# the name refers to Surprise's splitter, so code that uses the bare name
# expecting sklearn's version will get the wrong function if re-run after this cell.
from surprise.model_selection import cross_validate, GridSearchCV, train_test_split

In [22]:
# Use the same filtered data set as above; it is fed into the Surprise package below.
filtered_rating_data

Unnamed: 0,userId,movieId,rating
128418,1272,m_1,3.5
128419,1272,m_7,2.0
128420,1272,m_11,4.0
128421,1272,m_16,2.5
128422,1272,m_21,3.0
...,...,...,...
27735125,283000,m_177615,4.0
27735126,283000,m_177765,4.0
27735140,283000,m_179819,4.0
27735141,283000,m_180031,3.5


In [None]:
# KFold is imported here so the cross-validation folds can be seeded below.
from surprise.model_selection import KFold

# Ratings in this data set lie on a 0.5-5 scale.
reader = Reader(rating_scale=(0.5,5))
data = Dataset.load_from_df(filtered_rating_data[['userId','movieId','rating']], reader)

# Fixed seeds make the hold-out split, the SVD initialisation and the CV folds
# reproducible across Restart & Run All.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

# We'll use the famous SVD (one of matrix factorization) algorithm.
algo = SVD(random_state=42)

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Separately estimate RMSE/MAE with 5-fold cross-validation over the full data
# set. This does not score `predictions` above; cross_validate fits `algo`
# again on each fold.
evaluation = cross_validate(
    algo,
    data,
    measures=['RMSE','MAE'],
    cv=KFold(n_splits=5, random_state=42),
    verbose=True,
)


In [None]:
def get_top_n(predictions, n=5):
    """Return each user's ``n`` highest-estimated items.

    ``predictions`` is an iterable of 5-tuples
    ``(user id, item id, true rating, estimated rating, details)``.
    The result is a ``defaultdict(list)`` mapping every user id to a list of
    ``(item id, estimated rating)`` pairs, best estimate first, at most ``n`` long.
    """
    ranked_by_user = defaultdict(list)

    # Bucket the (item, estimate) pairs under their user.
    for prediction in predictions:
        user_id, item_id, _true_rating, estimate, _details = prediction
        ranked_by_user[user_id].append((item_id, estimate))

    # Order each bucket by estimate (descending, ties keep input order) and
    # keep only the first n entries.
    for user_id in ranked_by_user:
        ordered = sorted(ranked_by_user[user_id], key=lambda pair: pair[1], reverse=True)
        ranked_by_user[user_id] = ordered[:n]

    return ranked_by_user
    

In [26]:
# Keep the 10 highest-estimated test-set movies for every user.
top_n = get_top_n(predictions, 10)

# Print one line per user: the user id followed by the recommended movie ids
# (the estimated ratings are dropped).
for uid, user_ratings in top_n.items():
    print(uid, [mid for (mid, _) in user_ratings])

137652 ['m_2762', 'm_3147', 'm_76251', 'm_2329', 'm_174053', 'm_3793', 'm_480', 'm_50', 'm_4720', 'm_457']
59314 ['m_904', 'm_318', 'm_1201', 'm_1193', 'm_527', 'm_1178', 'm_55820', 'm_1206', 'm_1198', 'm_1219']
224634 ['m_912', 'm_318', 'm_337', 'm_953', 'm_4993', 'm_858', 'm_2918', 'm_930', 'm_1247', 'm_904']
83579 ['m_1270', 'm_260', 'm_356', 'm_1240', 'm_1291', 'm_4226', 'm_2194', 'm_3527', 'm_1272', 'm_457']
102042 ['m_50', 'm_593', 'm_1265', 'm_4226', 'm_899', 'm_1270', 'm_4993', 'm_1203', 'm_1201', 'm_2762']
133399 ['m_1213', 'm_318', 'm_1222', 'm_1387', 'm_1200', 'm_55820', 'm_16', 'm_80463', 'm_1234', 'm_6']
78982 ['m_5952', 'm_1221', 'm_50', 'm_318', 'm_858', 'm_541', 'm_1704', 'm_7361', 'm_6016', 'm_1214']
105104 ['m_1228', 'm_908', 'm_1214', 'm_1097', 'm_1387', 'm_61240', 'm_364', 'm_3435', 'm_6377', 'm_1304']
187986 ['m_858', 'm_318', 'm_1250', 'm_55820', 'm_44555', 'm_6', 'm_1193', 'm_48780', 'm_64614', 'm_58559']
275841 ['m_858', 'm_5445', 'm_590', 'm_81845', 'm_4995', '