In [1]:
import numpy as np
import pandas as pd
from urllib.request import urlretrieve
import zipfile
import scipy.sparse as sps
import time as time
import csv
import sklearn

from pcteam.utils import *
from pcteam.nmf import *
from pcteam.pca import *

In [2]:
# Load Dataset
# https://colab.research.google.com/github/google/eng-edu/blob/main/ml/recommendation-systems/recommendation-systems.ipynb
#
# Downloads the MovieLens archive into the working directory (only when no
# valid zip file is already present) and unpacks it next to the notebook.

print("Downloading MovieLens Dataset...")

dataset_name = "ml-latest-small"  # Available variants: "ml-latest-small", "ml-25m"
if not zipfile.is_zipfile(f"{dataset_name}.zip"):
    urlretrieve(f"https://files.grouplens.org/datasets/movielens/{dataset_name}.zip", f"{dataset_name}.zip")

# Use a context manager so the archive's file handle is closed after extraction.
with zipfile.ZipFile(f"{dataset_name}.zip", "r") as dataset_archive:
    dataset_archive.extractall()

print("Done")

Downloading MovieLens Dataset...
Done


In [3]:
# Read the ratings table, overriding the CSV header with our own column
# names, and discard the timestamp column.
ratings = (
    pd.read_csv(
        f"{dataset_name}/ratings.csv",
        header=0,
        names=['user_id', 'movie_id', 'rating', 'timestamp'],
        encoding='UTF-8',
    )
    .drop(columns=['timestamp'])
)

# Add index columns derived from the raw ids.
# NOTE(review): index_mapping comes from pcteam.utils; presumably it maps raw
# ids to contiguous 0-based indices - confirm. Movie ids are sorted before the
# mapping is built, user ids are passed in file order.
ratings = ratings.assign(
    movie_idx=ratings['movie_id'].map(index_mapping(ratings['movie_id'].sort_values())),
    user_idx=ratings['user_id'].map(index_mapping(ratings['user_id'])),
)

In [4]:
# Sanity check: print column names, dtypes and non-null counts of `ratings`.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    100836 non-null  int64  
 1   movie_id   100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   movie_idx  100836 non-null  int64  
 4   user_idx   100836 non-null  int64  
dtypes: float64(1), int64(4)
memory usage: 3.8 MB


In [5]:
# Hyper-parameters.
d = 20        # second argument of init_wh; W is (n_users, d) and H is (d, n_movies)
steps = 100   # passed as max_iter to the factorisation routines
delta = 1e-6  # NOTE(review): not used in the cells shown here - presumably a convergence tolerance

df = ratings[['user_idx', 'movie_idx', 'rating']]
# NOTE(review): split_test comes from pcteam.utils; presumably it holds out
# ratings per 'user_idx' group - confirm against the helper.
train, test = split_test(df, 'user_idx', seed=42)

# Fix the matrix shape from the FULL ratings frame. Without an explicit shape,
# csr_matrix infers the dimensions from the largest index present in each
# split, so M and M_test could end up with different shapes whenever the
# highest user/movie index lands in only one of them.
full_shape = (int(df['user_idx'].max()) + 1, int(df['movie_idx'].max()) + 1)

M = sps.csr_matrix(
    (train['rating'].to_numpy(), (train['user_idx'].to_numpy(), train['movie_idx'].to_numpy())),
    shape=full_shape,
)
M_test = sps.csr_matrix(
    (test['rating'].to_numpy(), (test['user_idx'].to_numpy(), test['movie_idx'].to_numpy())),
    shape=full_shape,
)
# NOTE(review): init=42 is presumably a random seed for the initial factors - confirm.
W, H = init_wh(M, d, init=42)

# Independent copies of the training matrix and the initial factors, so
# separate runs do not share (or mutate) the same objects.
M1, W1, H1 = M.copy(), W.copy(), H.copy()
M2, W2, H2 = M.copy(), W.copy(), H.copy()
M3, W3, H3 = M.copy(), W.copy(), H.copy()
M4, W4, H4 = M.copy(), W.copy(), H.copy()
M5, W5, H5 = M.copy(), W.copy(), H.copy()


In [6]:
# Report the approximate size (as computed by approx_size) of the training
# matrix and both factors, one value per line, followed by their shapes.
print(*(approx_size(matrix) for matrix in (M, W, H)), sep="\n")
print(*(matrix.shape for matrix in (M, W, H)), sep="\n")

1.205156
0.0976
1.55584
(610, 9724)
(610, 20)
(20, 9724)


In [7]:
# Run the power factorisation on the training matrix, time it, and evaluate
# the resulting factors on the held-out test matrix.
#
# NOTE: W and H are overwritten with the result, so re-running this cell
# passes the already-factorised matrices back into powerfac instead of the
# output of init_wh. Re-run the initialisation cell first for a clean run.
print("------- POWER FACTORISATION -------")
# perf_counter is monotonic and high-resolution, unlike the wall clock
# returned by time.time(), which can jump if the system clock is adjusted.
start = time.perf_counter()

W, H = powerfac(M, W, H, max_iter=steps, verbose=1)

end = time.perf_counter()
print(f"Time in s: {end - start}")


err = rmse_error_md(M_test, W, H)
print(f"RMSE: {err}")
# The values come from density(), so the label says "Density" (the previous
# "Sparsity" label contradicted the numbers it printed).
print(f"Density M: {density(M):.2%}, W: {density(W):.2%}, H: {density(H):.2%}")

------- POWER FACTORISATION -------
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
Time in s: 454.31353783607483
RMSE: 3.095952048872313
Sparsity M: 1.69%, W: 100.00%, H: 99.84%
