In [1]:
import multiprocessing as mp
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from funk_svd.dataset import fetch_ml_ratings
from funk_svd.utils import timer
from funk_svd import SVD

## Import data from MovieLens 20M dataset

[MovieLens 20M Dataset Research Paper](http://files.grouplens.org/papers/harper-tiis2015.pdf)

In [4]:
%%time

df = fetch_ml_ratings(variant='20m')
print ()

data_dir_path is  C:\Users\jfdol\funk_svd_data
Reading ratings from  C:\Users\jfdol\funk_svd_data\ml-100k\u.data
Wall time: 875 ms


In [3]:
# Peek at the first five rows of the ratings frame.
df.head(n=5)

Unnamed: 0,u_id,i_id,rating,timestamp
0,259,255,4.0,1997-09-20 00:05:10
1,259,286,4.0,1997-09-20 00:05:27
2,259,298,4.0,1997-09-20 00:05:54
3,259,185,4.0,1997-09-20 00:06:21
4,259,173,4.0,1997-09-20 00:07:23


In [5]:
# Peek at the last five rows of the ratings frame.
df.tail(n=5)

Unnamed: 0,u_id,i_id,rating,timestamp
99994,729,328,3.0,1998-04-22 20:10:38
99995,729,333,4.0,1998-04-22 20:10:38
99996,729,313,3.0,1998-04-22 20:10:38
99997,729,748,4.0,1998-04-22 20:10:38
99998,729,689,4.0,1998-04-22 20:10:38


## Perform a train/val/test split

There are 138,493 different users in the MovieLens 20M dataset, each of them having rated at least 20 movies. Let's sample the last 4 ratings of each user and randomly split them between the validation and test sets.

To do so, we need to query our DataFrame for each user and then select their last 4 ratings. With so many users this is naturally quite expensive... fortunately, it can be parallelized because the iterations are independent, which saves some time (especially if you have good computing resources). I'm using an Intel Core i5-7300U CPU (only 2 physical cores) on a 16GB laptop, so I won't be able to save that much :)

<img src="https://www.dlapiper.com/~/media/images/insights/publications/2015/warning.jpg?la=en&hash=6F2E30889FD9E0B11016A1712E6E583575717C54" width="23" align="left">

&nbsp; If you want to run this notebook on **Windows**, you won't be able to use `multiprocessing.Pool` because it's lacking `fork()`. For simplicity you can also do it sequentially without losing much time compared to my dual-core CPU.

In [6]:
@timer(text="")
def compute_val_test_mask(users, df, i, n_process, n_rate=4):
    """Collect the index labels of the last rows of every user handled by worker ``i``.

    The users are split round-robin between workers: worker ``i`` handles
    ``users[i]``, ``users[i + n_process]``, ``users[i + 2 * n_process]``, ...

    Parameters
    ----------
    users : sequence
        User ids, indexable by position and supporting ``len()``.
    df : pandas.DataFrame
        Ratings frame with a ``"u_id"`` column.
    i : int
        Rank of this worker; also the position of the first user it handles.
    n_process : int
        Total number of workers, i.e. the stride between handled users.
    n_rate : int, default 4
        Number of trailing rows to keep per user.

    Returns
    -------
    list
        Index labels of ``df`` for the last ``n_rate`` rows of each handled
        user, in the order the users are visited.

    Notes
    -----
    "Last" means last in ``df``'s current row order, so this presumably relies
    on ``df`` being sorted chronologically -- TODO confirm against the loader.
    The trailing ``print`` ends with a space instead of a newline, presumably
    so that the ``timer`` decorator can append the elapsed time to it.
    """
    val_test_mask = []

    for j in range(i, len(users), n_process):
        # Boolean indexing already produces a new frame and it is only read
        # here, so no additional ``.copy()`` is needed.
        u_subset = df[df["u_id"] == users[j]]
        val_test_mask.extend(u_subset.index[-n_rate:].tolist())

    print("Process {} done in".format(i), end=" ")
    return val_test_mask

In [None]:
users = df["u_id"].unique()

# Number of worker processes. It also fixes the order in which the per-user
# indices are concatenated below, so changing it changes the val/test split.
n_process = 12

# Use the pool as a context manager so the worker processes are always
# released, even if a task raises or the cell is executed again.
with mp.Pool(processes=n_process) as pool:
    async_results = [
        pool.apply_async(compute_val_test_mask,
                         args=(users, df, i, n_process))
        for i in range(n_process)
    ]

    # Block until every worker has returned its list of index labels.
    results = [task.get() for task in async_results]

    # Shut the workers down gracefully before the context manager terminates
    # the pool on exit.
    pool.close()
    pool.join()

# Flatten the per-worker lists, worker 0 first.
val_test_mask = [item for sublist in results for item in sublist]

In [None]:
# Everything that was not held out becomes the training set.
train = df.drop(index=val_test_mask)

# Half of the held-out rows (seeded for reproducibility) form the validation
# set; the remaining held-out rows form the test set.
val = df.loc[val_test_mask].sample(frac=0.5, random_state=7)
test = df.loc[val_test_mask].drop(index=val.index)

## Modeling

Let's fit our model.

In [None]:
# Hyper-parameters of the funk-svd model.
svd = SVD(
    learning_rate=0.001,
    regularization=0.005,
    n_epochs=100,
    n_factors=15,
    min_rating=1,
    max_rating=5,
)

# Fit on the training split; the validation split is passed along with
# early stopping enabled and shuffling disabled.
svd.fit(X=train, X_val=val, early_stopping=True, shuffle=False)

Predict test set and compute results.

In [None]:
%%time

pred = svd.predict(test)

rmse = np.sqrt(mean_squared_error(test["rating"], pred))
mae = mean_absolute_error(test["rating"], pred)

print("Test RMSE: {:.2f}".format(rmse))
print("Test MAE:  {:.2f}".format(mae))
print()

## Comparison with Surprise library

In [None]:
from surprise import Dataset
from surprise import Reader
from surprise import SVD

Format the data the way Surprise expects.

In [None]:
%%time

reader = Reader(rating_scale=(1, 5))

trainset = Dataset.load_from_df(train[["u_id", "i_id", "rating"]],
                               reader=reader).build_full_trainset()

testset = Dataset.load_from_df(test[["u_id", "i_id", "rating"]], reader=reader)
testset = testset.construct_testset(testset.raw_ratings)

Fit the model with the same parameters.

In [None]:
%%time

svd = SVD(lr_all=.001, reg_all=0.005, n_epochs=46, n_factors=15, verbose=True)
svd.fit(trainset)
print()

Predict test set and compute results.

In [None]:
%%time

pred = svd.test(testset)
y_true = [p.r_ui for p in pred]
y_hat = [p.est for p in pred]

rmse = np.sqrt(mean_squared_error(y_true, y_hat))
mae = mean_absolute_error(y_true, y_hat)

print("Test RMSE: {:.2f}".format(rmse))
print("Test MAE:  {:.2f}".format(mae))
print()

Accuracy is naturally equivalent; the difference lies in the computation time, with `Numba` allowing us to run more than 10 times faster than with Cython.

| Movielens 20M | RMSE   | MAE    | Time          |
|:--------------|:------:|:------:|--------------:|
| Surprise      |  0.88  |  0.68  | 11 min 13 sec |
| Funk-svd      |  0.88  |  0.68  |        48 sec |