In [1]:
import multiprocessing as mp
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from funk_svd.dataset import fetch_ml20m_ratings
from funk_svd.utils import timer
from funk_svd import SVD

## Import data from MovieLens 20M dataset

[MovieLens 20M Dataset Research Paper]("http://files.grouplens.org/papers/harper-tiis2015.pdf")

In [2]:
%%time

df = fetch_ml20m_ratings()
print()

Downloading data...
Unzipping data...

CPU times: user 55.6 s, sys: 5.71 s, total: 1min 1s
Wall time: 1min 55s


In [3]:
df.head()

Unnamed: 0,u_id,i_id,rating,timestamp
0,28507,1176,4.0,1995-01-09 12:46:44
1,131160,1079,3.0,1995-01-09 12:46:49
2,131160,47,5.0,1995-01-09 12:46:49
3,131160,21,3.0,1995-01-09 12:46:49
4,85252,45,3.0,1996-01-29 01:00:00


In [4]:
df.tail()

Unnamed: 0,u_id,i_id,rating,timestamp
20000258,53930,118706,3.5,2015-03-31 08:00:51
20000259,16978,2093,3.5,2015-03-31 08:03:17
20000260,89081,55232,3.5,2015-03-31 08:11:26
20000261,89081,52458,4.0,2015-03-31 08:11:28
20000262,87586,7151,3.5,2015-03-31 08:40:02


## Perform a train/val/test split

There is 138,493 different users in the MovieLens20m dataset, each of them having rated at least 20 movies. Let's sample the 4 last ratings per user and randomly split them between validation and test sets. 

To do so, we need to query our DataFrame for each user and then select their 4 last ratings. With so much users it's naturally quite expensive... hopefully it's possible to parallelize it as iterations are independant, allowing us to save some time (especially if you have good computing ressources). I'm using an Intel Core i5-7300U CPU (only 2 physical cores) on a 16GB laptop so I won't be able to save that much :)

<img src="https://www.dlapiper.com/~/media/images/insights/publications/2015/warning.jpg?la=en&hash=6F2E30889FD9E0B11016A1712E6E583575717C54" width="23" align="left">

&nbsp; If you want to run this notebook with **Windows**, you won't be able to use `multiprocessing.Pool` because it's lacking `fork()`. For simplicity you can also do it sequentially without loosing so much time compared to my dual core CPU.

In [5]:
@timer(text="")
def compute_val_test_mask(users, df, i, n_process, n_rate=4):
    val_test_mask = []
    
    for j in range(i, len(users), n_process):
        u_id = users[j]
        u_subset = df[df["u_id"] == u_id].copy()
        val_test_mask += u_subset.iloc[-n_rate:].index.tolist()
        
    print("Process {} done in".format(i), end=" ")
    return val_test_mask

In [6]:
users = df["u_id"].unique()

n_process = 12
pool = mp.Pool(processes=n_process)

results = [
    pool.apply_async(compute_val_test_mask,
                     args=(users, df, i, n_process))
    for i in range(n_process)
]

results = [p.get() for p in results]
val_test_mask = [item for sublist in results for item in sublist]

Process 1 done in 24 min and 44 sec
Process 0 done in 25 min and 1 sec
Process 3 done in 24 min and 56 sec
Process 2 done in 25 min and 6 sec
Process 4 done in 25 min and 4 sec
Process 5 done in 25 min and 4 sec
Process 7 done in 25 min and 3 sec
Process 10 done in 24 min and 48 sec
Process 6 done in 25 min and 8 sec
Process 8 done in 25 min and 1 sec
Process 9 done in 24 min and 56 sec
Process 11 done in 24 min and 45 sec


In [7]:
train = df.drop(val_test_mask)
val = df.loc[val_test_mask].sample(frac=0.5, random_state=7)
test = df.loc[val_test_mask].drop(val.index.tolist())

## Modelization

Let's fit our model.

In [8]:
svd = SVD(learning_rate=0.001, regularization=0.005, n_epochs=100,
          n_factors=15, min_rating=1, max_rating=5)

svd.fit(X=train, X_val=val, early_stopping=True, shuffle=False)

Preprocessing data...

Epoch 1/100  | val_loss: 0.98 - val_rmse: 0.99 - val_mae: 0.78 - took 1.4 sec
Epoch 2/100  | val_loss: 0.95 - val_rmse: 0.97 - val_mae: 0.76 - took 0.9 sec
Epoch 3/100  | val_loss: 0.93 - val_rmse: 0.97 - val_mae: 0.76 - took 0.9 sec
Epoch 4/100  | val_loss: 0.92 - val_rmse: 0.96 - val_mae: 0.75 - took 0.9 sec
Epoch 5/100  | val_loss: 0.91 - val_rmse: 0.96 - val_mae: 0.75 - took 0.9 sec
Epoch 6/100  | val_loss: 0.91 - val_rmse: 0.95 - val_mae: 0.74 - took 0.9 sec
Epoch 7/100  | val_loss: 0.90 - val_rmse: 0.95 - val_mae: 0.74 - took 0.9 sec
Epoch 8/100  | val_loss: 0.90 - val_rmse: 0.95 - val_mae: 0.74 - took 0.9 sec
Epoch 9/100  | val_loss: 0.89 - val_rmse: 0.94 - val_mae: 0.73 - took 0.9 sec
Epoch 10/100 | val_loss: 0.89 - val_rmse: 0.94 - val_mae: 0.73 - took 0.9 sec
Epoch 11/100 | val_loss: 0.88 - val_rmse: 0.94 - val_mae: 0.73 - took 0.9 sec
Epoch 12/100 | val_loss: 0.88 - val_rmse: 0.94 - val_mae: 0.73 - took 0.9 sec
Epoch 13/100 | val_loss: 0.88 - val_rmse:

<funk_svd.svd.SVD at 0x7f68be06dd30>

Predict test set and compute results.

In [9]:
%%time

pred = svd.predict(test)

rmse = np.sqrt(mean_squared_error(test["rating"], pred))
mae = mean_absolute_error(test["rating"], pred)

print("Test RMSE: {:.2f}".format(rmse))
print("Test MAE:  {:.2f}".format(mae))
print()

Test RMSE: 0.88
Test MAE:  0.68

CPU times: user 750 ms, sys: 10.5 ms, total: 761 ms
Wall time: 751 ms


## Comparison with Surprise library

In [10]:
from surprise import Dataset
from surprise import Reader
from surprise import SVD

Format data according Surprise way.

In [11]:
%%time

reader = Reader(rating_scale=(1, 5))

trainset = Dataset.load_from_df(train[["u_id", "i_id", "rating"]],
                               reader=reader).build_full_trainset()

testset = Dataset.load_from_df(test[["u_id", "i_id", "rating"]], reader=reader)
testset = testset.construct_testset(testset.raw_ratings)

CPU times: user 35.3 s, sys: 2.47 s, total: 37.8 s
Wall time: 37.3 s


Fit the model with the same parameters.

In [12]:
%%time

svd = SVD(lr_all=.001, reg_all=0.005, n_epochs=46, n_factors=15, verbose=True)
svd.fit(trainset)
print()

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45

CPU times: user 11min 13s, sys: 650 ms, total: 11min 13s
Wall time: 11min 13s


Predict test set and compute results.

In [13]:
%%time

pred = svd.test(testset)
y_true = [p.r_ui for p in pred]
y_hat = [p.est for p in pred]

rmse = np.sqrt(mean_squared_error(y_true, y_hat))
mae = mean_absolute_error(y_true, y_hat)

print("Test RMSE: {:.2f}".format(rmse))
print("Test MAE:  {:.2f}".format(mae))
print()

Test RMSE: 0.88
Test MAE:  0.68

CPU times: user 1.92 s, sys: 290 ms, total: 2.21 s
Wall time: 1.79 s


Accuracy performance is naturally equivalent, difference stands in the computation time, `Numba` allowing us to run more than 10 times faster than with cython.

| Movielens 20M | RMSE   | MAE    | Time          |
|:--------------|:------:|:------:|--------------:|
| Surprise      |  0.88  |  0.68  | 11 min 13 sec |
| Funk-svd      |  0.88  |  0.68  |        48 sec |