# Matrix Factorization Model

* Preprocessing of Dataset
* Create Model
* Model Tuning
* Create Final Model and Prediction

# Import Necessary Libraries

In [17]:
!pip install surprise

import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", 500)
from surprise import Reader, SVD, Dataset, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate



# Import Dataset

In [18]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/datasets/movie_dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,2457839,356,Forrest Gump (1994),Comedy|Drama|Romance|War,4.0,4.0,1996-08-24 09:28:42
1,2457840,356,Forrest Gump (1994),Comedy|Drama|Romance|War,7.0,4.0,2002-01-16 19:02:55
2,2457841,356,Forrest Gump (1994),Comedy|Drama|Romance|War,8.0,5.0,1996-06-05 13:44:19
3,2457842,356,Forrest Gump (1994),Comedy|Drama|Romance|War,9.0,4.0,2001-07-01 20:26:38
4,2457843,356,Forrest Gump (1994),Comedy|Drama|Romance|War,10.0,3.0,1999-11-25 02:32:02


# Preprocessing of Dataset

In [19]:
user_movie_df = df.pivot_table(index=["userId"], columns=["title"], values=["rating"])

In [20]:
user_movie_df.head()

Unnamed: 0_level_0,rating,rating,rating,rating
title,Blade Runner (1982),Cries and Whispers (Viskningar och rop) (1972),Forrest Gump (1994),The Dark Knight (2011)
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1.0,4.0,,,
2.0,5.0,,,
3.0,5.0,,,
4.0,,,4.0,
7.0,,,4.0,


In [21]:
reader = Reader(rating_scale=(1,5))

In [22]:
data = Dataset.load_from_df(df[["userId", "movieId", "rating"]], reader)

# Create Model

In [23]:
train_data, test_data = train_test_split(data, test_size=0.3)

In [24]:
svd_model = SVD()

In [25]:
svd_model.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f8808017100>

In [26]:
predictions = svd_model.test(test_data)

In [27]:
accuracy.rmse(predictions)

RMSE: 0.9374


0.937442715614268

In [28]:
df["movieId"].value_counts()

356       66172
541       30526
4422        644
130219        1
Name: movieId, dtype: int64

In [29]:
df[df["userId"]==10]

Unnamed: 0.1,Unnamed: 0,movieId,title,genres,userId,rating,timestamp
4,2457843,356,Forrest Gump (1994),Comedy|Drama|Romance|War,10.0,3.0,1999-11-25 02:32:02


In [30]:
svd_model.predict(uid=10, iid=356, verbose=True)

user: 10         item: 356        r_ui = None   est = 4.06   {'was_impossible': False}


Prediction(uid=10, iid=356, r_ui=None, est=4.061583942472283, details={'was_impossible': False})

# Model Tuning

In [31]:
param_grid = {"n_epochs": [5, 10, 15, 20, 25, 30],
              "lr_all": [0.002, 0.005, 0.007, 0.008, 0.009]}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3, n_jobs=-1, joblib_verbose=True)
gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   47.2s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  2.0min finished


In [32]:
gs.best_params

{'rmse': {'n_epochs': 10, 'lr_all': 0.002},
 'mae': {'n_epochs': 20, 'lr_all': 0.002}}

In [33]:
gs.best_score

{'rmse': 0.9320234204447709, 'mae': 0.7205138566504311}

# Create Final Model and Prediction

In [34]:
svd_model_final = SVD(**gs.best_params["rmse"])
data = data.build_full_trainset()
svd_model_final.fit(data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f880808ceb0>

In [35]:
df[df["userId"]==10]

Unnamed: 0.1,Unnamed: 0,movieId,title,genres,userId,rating,timestamp
4,2457843,356,Forrest Gump (1994),Comedy|Drama|Romance|War,10.0,3.0,1999-11-25 02:32:02


In [36]:
svd_model_final.predict(uid=10, iid=356, verbose=True)

user: 10         item: 356        r_ui = None   est = 4.04   {'was_impossible': False}


Prediction(uid=10, iid=356, r_ui=None, est=4.044928608651132, details={'was_impossible': False})