# Library

In [None]:
import os
import shutil
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.metrics import mean_squared_error
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
def display_full(x, columns=False):
    '''
    Render a DataFrame in the notebook without truncating its rows.

    x: object to show; its length is used as the temporary row limit
    columns: when True, the column limit is also raised to len(x.columns)

    After the call (whether or not rendering succeeded) the display options
    are set to the notebook-wide values: 500 rows, 500 columns, width 1000.
    '''
    try:
        pd.set_option('display.max_rows', len(x))
        if columns:
            pd.set_option('display.max_columns', len(x.columns))

        display(x)
    finally:
        # Runs even when display() raises, so a failed render cannot leave the
        # session stuck with the temporary per-frame limits.
        pd.set_option('display.max_rows', 500)
        pd.set_option('display.max_columns', 500)
        pd.set_option('display.width', 1000)

# Import Data

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the dataset
# archive stored there can be read; force_remount=True is passed so the mount is
# redone even if a previous one exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Unpack the MovieLens archive from Drive into the local Colab disk.
shutil.unpack_archive('/content/drive/MyDrive/Colab Dataset/movielens-latest.zip', '/content/movie_list/')
main_path = '/content/movie_list/ml-latest'

# Every CSV lives under main_path, so build each path from it instead of
# repeating the directory literal.
movie_data = pd.read_csv(os.path.join(main_path, 'movies.csv'))
display(movie_data.head())
link_data = pd.read_csv(os.path.join(main_path, 'links.csv'))
display(link_data.head())
rating_data = pd.read_csv(os.path.join(main_path, 'ratings.csv'))
display(rating_data.head())
tags_data = pd.read_csv(os.path.join(main_path, 'tags.csv'))
display(tags_data.head())


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


Unnamed: 0,userId,movieId,tag,timestamp
0,14,110,epic,1443148538
1,14,110,Medieval,1443148532
2,14,260,sci-fi,1442169410
3,14,260,space action,1442169421
4,14,318,imdb top 250,1442615195


# 1.Collaborative Filtering with Matrix Factorization (from Scratch)

## Data Wrangling

Collaborative filtering suffers from the user cold-start problem: the model may not be able to produce a decent recommendation list for users who have given only a small number of ratings, because it lacks information about their preferences.


In this stage, we keep only the users who have rated at least 2000 movies and the movies that have been rated by at least 1000 users (this helps to reduce the table size, because I have limited resources to compute a massive table).

In [None]:
# Keep only heavy users: those with at least n_interacted ratings.
n_interacted = 2000
user_movie_data_temp = pd.pivot_table(rating_data, index=['userId'], values='movieId', aggfunc='count')
selected_user_ids = user_movie_data_temp[user_movie_data_temp.movieId >= n_interacted].index
print('number of userIds: ', str(len(selected_user_ids)))

# Keep only popular movies: those rated by at least n_rated users.
n_rated = 1000
get_rated_movie = pd.pivot_table(rating_data, index=['movieId'], values='userId', aggfunc='count')
selected_movie_ids = get_rated_movie[get_rated_movie.userId >= n_rated].index

print('numbser of movieIds: ', str(len(selected_movie_ids)))

# .copy() makes the filtered frame independent of rating_data, so the column
# rewrite below is a plain assignment rather than a write into a slice.
keep_rows = rating_data['userId'].isin(selected_user_ids) & rating_data['movieId'].isin(selected_movie_ids)
filtered_rating_data = rating_data.loc[keep_rows].copy()

# Prefix movie ids with 'm_' (vectorized) so they are strings, not integers.
filtered_rating_data['movieId'] = 'm_' + filtered_rating_data['movieId'].astype(str)

print('raw data shape.  : ', str(filtered_rating_data.shape))


number of userIds:  424
numbser of movieIds:  3931
raw data shape.  :  (736327, 4)


In [None]:
# Narrow the frame to the three columns used from here on, then show it.
filtered_rating_data = filtered_rating_data.loc[:, ['userId', 'movieId', 'rating']]
filtered_rating_data

Unnamed: 0,userId,movieId,rating
128418,1272,m_1,3.5
128419,1272,m_7,2.0
128420,1272,m_11,4.0
128421,1272,m_16,2.5
128422,1272,m_21,3.0
...,...,...,...
27735125,283000,m_177615,4.0
27735126,283000,m_177765,4.0
27735140,283000,m_179819,4.0
27735141,283000,m_180031,3.5


## train test split

In [None]:
# Hold out 20% of the ratings for testing. The split is stratified on userId
# and uses a fixed random_state so it is repeatable.
train_df, test_df = train_test_split(
    filtered_rating_data,
    test_size=0.2,
    random_state=42,
    stratify=filtered_rating_data['userId'],
)

print(f'train_df size:{len(train_df)}')
print(f'test_df size:{len(test_df)}')

train_df size:589061
test_df size:147266


While the information we require is present, it is not presented in a way that is beneficial for humans to comprehend. However, I have created a table that presents the same data in a format that is easier for humans to understand.

In [None]:
# Reshape the long (userId, movieId, rating) training rows into a wide
# user x movie table; pairs with no training rating are filled with 0.0.
user_movie_data_train = (
    train_df
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0.0)
)
user_movie_data_train

movieId,m_1,m_10,m_100,m_100163,m_1003,m_100383,m_1004,m_100498,m_1005,m_1006,...,m_98961,m_99007,m_991,m_99112,m_99114,m_99149,m_994,m_996,m_99813,m_999
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1272,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.5,0.0,3.5,0.0,0.0,0.0
2025,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,3.5,0.0,2.5,4.0,3.0,0.0,0.0,0.0,0.0
2150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0
2294,0.0,0.0,4.0,1.5,0.0,3.5,0.0,2.0,0.0,0.0,...,2.0,2.5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0
2329,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,3.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280585,5.0,3.5,3.5,0.0,0.0,0.0,0.0,0.0,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
280868,0.0,2.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,0.0,4.0,0.0,3.5,0.0,0.0,0.0
281631,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
281790,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Concept of Matrix Factorization



In [None]:
def matrix_factorization(R, K, steps=5, alpha=0.002, beta=0.02, seed=None):
    '''
    Factorize a rating matrix into user and item factors with SGD.

    R: rating matrix (2-D, users x items); entries <= 0 are treated as
       "not rated" and never contribute to an update or to the loss
    K: latent features
    steps: iterations (epochs over the observed ratings)
    alpha: learning rate
    beta: regularization parameter
    seed: optional seed for the random initialisation; when None the factors
          are drawn from NumPy's global random state, as before

    Returns (P, Q):
    P: |U| * K (User features matrix)
    Q: |D| * K (Item features matrix)
    so that np.dot(P, Q.T) approximates R on the observed entries.

    Raises ValueError if R is not 2-dimensional.
    '''
    R = np.asarray(R, dtype=float)
    if R.ndim != 2:
        raise ValueError('R must be a 2-D rating matrix')
    n_users, n_items = R.shape

    if seed is None:
        P = np.random.rand(n_users, K)
        Q = np.random.rand(n_items, K)
    else:
        rng = np.random.default_rng(seed)
        P = rng.random((n_users, K))
        Q = rng.random((n_items, K))

    # Visit only the observed ratings. np.nonzero walks the matrix in
    # row-major order, i.e. the same order as a nested i/j loop.
    rows, cols = np.nonzero(R > 0)
    observed = R[rows, cols]

    for step in range(steps):
        print('Processing epoch {}'.format(step))

        for i, j, r_ij in zip(rows, cols, observed):
            # Snapshot the user factors so that both updates below are taken
            # from the same point (a simultaneous gradient step); Q[j] is only
            # modified after P[i] has been updated with its old value.
            p_i = P[i].copy()
            q_j = Q[j]
            eij = r_ij - np.dot(p_i, q_j)
            P[i] += alpha * (2 * eij * q_j - beta * p_i)
            Q[j] += alpha * (2 * eij * p_i - beta * q_j)

        # Regularized squared error over the observed ratings; the penalty
        # term is added once per observed (user, item) pair.
        residual = observed - np.einsum('ik,ik->i', P[rows], Q[cols])
        e = np.sum(residual ** 2) + (beta / 2) * (np.sum(P[rows] ** 2) + np.sum(Q[cols] ** 2))

        # Stop early once the loss is negligible.
        if e < 0.001:
            break

    return P, Q

In [None]:
# The factors start from random values drawn from NumPy's global generator;
# seeding it here makes the learned factors the same on every fresh run.
np.random.seed(42)

R = user_movie_data_train.to_numpy()
nP, nQ = matrix_factorization(R, K=10)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4


Transforming prediction to reconstructed matrix back into a Pandas dataframe.

In [None]:
# Multiply the user and movie factor matrices to get a predicted rating for
# every (user, movie) pair.
pred_R = nP @ nQ.T

# Put the reconstructed matrix back into a pandas DataFrame in cross-tabular
# format, labelled like the training table.
user_movie_pred = pd.DataFrame(
    data=pred_R,
    index=list(user_movie_data_train.index),
    columns=user_movie_data_train.columns,
)
print(user_movie_pred.shape)
user_movie_pred.head(10)

(424, 3931)


movieId,m_1,m_10,m_100,m_100163,m_1003,m_100383,m_1004,m_100498,m_1005,m_1006,...,m_98961,m_99007,m_991,m_99112,m_99114,m_99149,m_994,m_996,m_99813,m_999
1272,3.704933,3.007068,2.750135,2.472991,2.652726,3.341733,2.234315,2.249339,1.826829,2.631809,...,3.437851,2.952539,3.083602,2.989692,3.699034,2.813051,3.577055,2.597896,3.391457,2.91503
2025,4.421106,3.539121,3.326415,3.084211,3.068702,4.069096,2.521009,2.683477,2.347356,3.186719,...,4.23472,3.552699,3.709807,3.884674,4.422847,3.659528,4.371806,3.224635,4.056423,3.55272
2150,4.285594,3.363209,3.145889,2.924917,3.063465,3.76249,2.501781,2.653686,2.129555,2.982331,...,3.888541,3.442264,3.614412,3.683015,4.20659,3.469269,4.156432,3.066423,3.86705,3.399714
2294,3.565833,2.9859,2.536842,2.393682,2.427902,3.004471,2.026742,2.213747,1.832468,2.522394,...,3.137199,2.875125,2.878284,2.914477,3.559463,2.751124,3.379228,2.55903,3.285609,2.839151
2329,4.347439,3.46921,3.228835,2.934374,3.085764,4.004204,2.533478,2.650342,2.244649,3.065708,...,4.14755,3.436063,3.738733,3.764968,4.376345,3.549846,4.377773,3.111934,3.990541,3.405643
2551,3.510894,2.608615,2.692949,2.498177,2.673976,3.478835,2.163087,2.137162,1.690185,2.535213,...,3.595106,2.75839,3.235368,3.12209,3.495837,2.900139,3.516665,2.48867,3.195247,2.786888
2629,4.463067,3.566304,3.284157,3.090476,3.101952,3.935165,2.557749,2.647811,2.237492,3.07696,...,4.098727,3.615451,3.782802,3.799067,4.381156,3.662086,4.212529,3.236345,4.002277,3.552976
4027,4.706546,3.687777,3.414177,3.147321,3.327239,4.141282,2.768419,2.834908,2.298139,3.234826,...,4.27882,3.725603,3.999447,3.846524,4.605923,3.666762,4.444017,3.299592,4.22077,3.677154
4605,3.212018,2.450162,2.442023,2.252122,2.325002,3.209729,1.944808,2.080196,1.659087,2.498929,...,3.243016,2.527302,2.840763,2.862035,3.223775,2.583719,3.322567,2.313214,2.997358,2.660311
4796,3.362401,2.689624,2.450053,2.217999,2.458253,3.006244,1.998367,2.080015,1.684336,2.332789,...,3.143747,2.621553,2.862709,2.789539,3.401112,2.596261,3.343984,2.327013,3.110509,2.563302


In [None]:
# User Matrix
Pu = pd.DataFrame(nP, index=list(user_movie_data_train.index))
# Movie Matrix
Qu = pd.DataFrame(nQ, index=user_movie_data_train.columns)

# These matrices can be used independently to predict the test dataset

In [None]:
def predict_rating(data, user_factors=None, item_factors=None):
    '''
    Predict one rating as the dot product of a user's and a movie's factors.

    data: a row-like object exposing .userId and .movieId
    user_factors: DataFrame of user factors indexed by userId
                  (defaults to the notebook-level Pu)
    item_factors: DataFrame of movie factors indexed by movieId
                  (defaults to the notebook-level Qu)

    Returns the predicted rating, or NaN (after printing a message) when the
    user or the movie has no row in the factor tables.
    '''
    if user_factors is None:
        user_factors = Pu
    if item_factors is None:
        item_factors = Qu

    try:
        pred_rating = np.dot(user_factors.loc[data.userId], item_factors.loc[data.movieId])
    except KeyError:
        # Only a missing label means "unknown user/movie"; any other error is
        # a real problem and is allowed to propagate.
        pred_rating = np.nan
        print('Unknown user: {} or movieId: {}'.format(data.userId, data.movieId))
    return pred_rating

In [None]:
# Score every held-out rating. `assign` returns a new frame instead of writing
# a column into the object that train_test_split handed back.
test_df = test_df.assign(pred_rating=test_df.apply(predict_rating, axis=1))

In [None]:
# Show the held-out ratings next to the pred_rating column added above.
test_df

Unnamed: 0,userId,movieId,rating,pred_rating
25761256,263149,m_3176,3.5,4.080173
22315943,228128,m_7419,2.5,3.637218
24342373,248881,m_89774,1.5,3.041938
18711521,190879,m_143355,3.0,3.290767
12150140,124492,m_2325,4.0,2.913533
...,...,...,...,...
13061955,133546,m_77561,4.0,3.036337
13866589,141955,m_3104,3.0,3.147288
26766434,273271,m_3396,3.5,3.527973
7491494,77157,m_3548,3.0,2.870988


# Evaluation

In [None]:
# predict_rating yields NaN for users/movies without learned factors; those
# rows cannot be scored, so they are left out of the metric.
scored_df = test_df.dropna(subset=['pred_rating'])

# RMSE is taken as the square root of the MSE rather than via `squared=False`,
# which newer scikit-learn releases no longer accept.
rmse_test = float(np.sqrt(mean_squared_error(scored_df['rating'], scored_df['pred_rating'])))
rmse_test

0.8298063482981628

# 2.Collaborative Filtering using Surprise Package

In [None]:
!pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp39-cp39-linux_x86_64.whl size=3195800 sha256=d2069901510a7ff7b201b5e43ec6b9b9e4fd059339680a6e4c9942842a004950
  Stored in directory: /root/.cache/pip/wheels/c6/3a/46/9b17b3512bdf283c6cb84f59929cdd5199d4e754d596d22784
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [None]:
# NOTE: this import rebinds `train_test_split`, which the first cell imported
# from sklearn.model_selection. Once this cell has run, the name refers to
# Surprise's splitter, so the earlier pandas split cell can only be re-run
# after the first imports cell has been run again.
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate, GridSearchCV, train_test_split

In [None]:
# Reuse the same filtered ratings as above and feed them to the Surprise package.
filtered_rating_data

Unnamed: 0,userId,movieId,rating
128418,1272,m_1,3.5
128419,1272,m_7,2.0
128420,1272,m_11,4.0
128421,1272,m_16,2.5
128422,1272,m_21,3.0
...,...,...,...
27735125,283000,m_177615,4.0
27735126,283000,m_177765,4.0
27735140,283000,m_179819,4.0
27735141,283000,m_180031,3.5


In [None]:
# Surprise's splitter is imported under its own name here, because the bare
# name `train_test_split` is also the sklearn function used earlier in this
# notebook.
from surprise.model_selection import KFold
from surprise.model_selection import train_test_split as surprise_train_test_split

# Ratings are declared to lie on a 0.5-5 scale.
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(filtered_rating_data[['userId', 'movieId', 'rating']], reader)

# Fixed seeds make the hold-out split, the SVD initialisation and the
# cross-validation folds repeatable from run to run.
trainset, testset = surprise_train_test_split(data, test_size=0.25, random_state=42)

# We'll use the famous SVD (one of matrix factorization) algorithm.
algo = SVD(random_state=42)

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Report RMSE and MAE from 5-fold cross-validation of the same algorithm on
# the full dataset.
evaluation = cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=42),
    verbose=True,
)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7402  0.7408  0.7407  0.7421  0.7427  0.7413  0.0009  
MAE (testset)     0.5589  0.5589  0.5598  0.5590  0.5609  0.5595  0.0008  
Fit time          10.78   11.61   11.08   11.03   11.15   11.13   0.27    
Test time         1.61    2.12    1.57    1.76    1.73    1.76    0.19    


In [None]:
def get_top_n(predictions, n=5):
    '''
    Group predictions by user and keep each user's n best-estimated items.

    predictions: iterable of 5-tuples (uid, iid, true_r, est, details)
    n: how many items to keep per user

    Returns a defaultdict mapping uid -> list of (iid, est) pairs, ordered by
    est from highest to lowest.
    '''
    per_user = defaultdict(list)

    # Bucket every (item, estimate) pair under the user it belongs to.
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        per_user[uid].append((iid, est))

    # Rank each bucket by estimate (descending) and cut it down to n entries.
    for uid in per_user:
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]

    return per_user
    

In [None]:
# For each user, keep the ten test-set movies with the highest estimated
# rating and print their ids.
top_n = get_top_n(predictions, 10)

for uid, ranked in top_n.items():
    movie_ids = [movie_id for movie_id, _score in ranked]
    print(uid, movie_ids)

231832 ['m_912', 'm_260', 'm_1198', 'm_110', 'm_60069', 'm_1172', 'm_919', 'm_2324', 'm_1204', 'm_3095']
263860 ['m_2529', 'm_2467', 'm_109487', 'm_260', 'm_1333', 'm_1527', 'm_2360', 'm_30749', 'm_356', 'm_914']
220709 ['m_912', 'm_541', 'm_1193', 'm_1387', 'm_1219', 'm_2028', 'm_1276', 'm_750', 'm_3089', 'm_1200']
29647 ['m_7361', 'm_318', 'm_1947', 'm_2571', 'm_1958', 'm_104374', 'm_3097', 'm_7153', 'm_80463', 'm_5995']
281790 ['m_527', 'm_589', 'm_1961', 'm_480', 'm_508', 'm_1246', 'm_3147', 'm_1625', 'm_67255', 'm_858']
61614 ['m_1259', 'm_7669', 'm_1307', 'm_527', 'm_2028', 'm_318', 'm_1012', 'm_2797', 'm_150', 'm_1029']
191063 ['m_1222', 'm_593', 'm_2858', 'm_293', 'm_1732', 'm_1214', 'm_541', 'm_2571', 'm_1213', 'm_1196']
93512 ['m_356', 'm_2571', 'm_593', 'm_3147', 'm_457', 'm_1196', 'm_915', 'm_3510', 'm_628', 'm_858']
208336 ['m_919', 'm_912', 'm_1203', 'm_4322', 'm_904', 'm_916', 'm_1214', 'm_4327', 'm_923', 'm_3200']
21001 ['m_6874', 'm_778', 'm_3949', 'm_318', 'm_714', 'm