In [1]:
import numpy as np
import pandas as pd
from urllib.request import urlretrieve
import zipfile
import scipy.sparse as sps
import time as time
import csv
import sklearn

from pcteam.utils import *
from pcteam.nmf import *

In [2]:
# Load Dataset
# https://colab.research.google.com/github/google/eng-edu/blob/main/ml/recommendation-systems/recommendation-systems.ipynb

print("Downloading MovieLens Dataset...")

# MovieLens archive to use; values noted by the author: "ml-latest-small", "ml-25m".
dataset_name = "ml-25m"
archive_path = f"{dataset_name}.zip"

# Fetch the archive only when no valid ZIP file is already present locally.
if not zipfile.is_zipfile(archive_path):
    urlretrieve(f"https://files.grouplens.org/datasets/movielens/{dataset_name}.zip", archive_path)

# Open the archive in a context manager so its file handle is closed
# as soon as extraction finishes (the bare ZipFile(...) call leaked it).
with zipfile.ZipFile(archive_path, "r") as archive:
    archive.extractall()

print("Done")

Downloading MovieLens Dataset...
Done


In [3]:
# Read the ratings table, replacing the CSV header with our own column names
# and discarding the timestamp column.
ratings = pd.read_csv(
    f"{dataset_name}/ratings.csv",
    header=0,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    encoding='UTF-8',
).drop(columns=['timestamp'])

# Translate raw ids into matrix indices with index_mapping
# (presumably contiguous 0-based positions -- confirm in pcteam.utils).
movie_index = index_mapping(ratings['movie_id'].sort_values())
user_index = index_mapping(ratings['user_id'])
ratings['movie_idx'] = ratings['movie_id'].map(movie_index)
ratings['user_idx'] = ratings['user_id'].map(user_index)

In [4]:
# Summarize the columns, dtypes and memory usage of the ratings frame.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 5 columns):
 #   Column     Dtype  
---  ------     -----  
 0   user_id    int64  
 1   movie_id   int64  
 2   rating     float64
 3   movie_idx  int64  
 4   user_idx   int64  
dtypes: float64(1), int64(4)
memory usage: 953.7 MB


In [5]:
# Hyper-parameters shared by the factorization cells below.
d = 50        # factorization rank (second dimension of W, first dimension of H)
steps = 200   # iteration budget, passed as max_iter
delta = 1e-6  # passed as `delta` to the pcteam solvers

df = ratings[['user_idx', 'movie_idx', 'rating']]
train, test = split_test(df, 'user_idx', seed=42)

# Pin both matrices to the full index range. Without an explicit shape SciPy
# sizes each matrix from the largest index it happens to contain, so the train
# and test matrices could end up with different dimensions whenever one split
# misses the highest user or movie index.
shape = (int(df['user_idx'].max()) + 1, int(df['movie_idx'].max()) + 1)

M = sps.csr_matrix(
    (train['rating'].to_numpy(), (train['user_idx'].to_numpy(), train['movie_idx'].to_numpy())),
    shape=shape,
)
M_test = sps.csr_matrix(
    (test['rating'].to_numpy(), (test['user_idx'].to_numpy(), test['movie_idx'].to_numpy())),
    shape=shape,
)
W, H = init_wh(M, d, init=42)

# One private copy of the data and initial factors per solver, so every method
# starts from the same initialization even if a solver updates its arguments in place.
M1, W1, H1 = M.copy(), W.copy(), H.copy()
M2, W2, H2 = M.copy(), W.copy(), H.copy()
M3, W3, H3 = M.copy(), W.copy(), H.copy()
M4, W4, H4 = M.copy(), W.copy(), H.copy()
M5, W5, H5 = M.copy(), W.copy(), H.copy()


In [6]:
# Report the approximate size of the ratings matrix and of both factors,
# followed by their shapes (same order as before: M, W, H).
for matrix in (M, W, H):
    print(approx_size(matrix))
for matrix in (M, W, H):
    print(matrix.shape)

298.700816
65.0164
23.6188
(162541, 59047)
(162541, 50)
(50, 59047)


In [7]:
import sklearn.decomposition
print("------- SKLEARN NMF -------")
start = time.time()

# Multiplicative-update NMF that starts from the shared initial factors W and H
# instead of scikit-learn's own initialization.
# Note: W and H are rebound to the fitted factors by this cell.
model = sklearn.decomposition.NMF(
    n_components=d,
    init='custom',
    solver='mu',
    max_iter=steps,
    alpha_H=0.0,
)
W = model.fit_transform(M, W=W, H=H)
H = model.components_

end = time.time()
print(f"Time in s: {end - start}")

# Score the fitted factors against the test matrix and report matrix densities.
err = rmse_error_md(M_test, W, H)
print(f"RMSE: {err}")
print(f"Density M: {density(M):.2%}, W: {density(W):.2%}, H: {density(H):.2%}")

------- SKLEARN NMF -------
Time in s: 38.87878918647766
RMSE: 3.012996460172509
Density M: 0.26%, W: 100.00%, H: 99.73%


In [8]:
import sklearn.decomposition
print("------- SKLEARN NMF 2 -------")
start = time.time()

# scikit-learn NMF with its default initialization and solver; only the rank is set.
# The seed is fixed (42, like the split and init seeds used earlier) so that any
# randomness in scikit-learn's default initialization is reproducible across runs.
# Note: W and H are rebound to this model's factors by this cell.
model2 = sklearn.decomposition.NMF(d, random_state=42)
W = model2.fit_transform(X=M)
H = model2.components_

end = time.time()
print(f"Time in s: {end - start}")


# Score the fitted factors against the test matrix and report matrix densities.
err = rmse_error_md(M_test, W, H)
print(f"RMSE: {err}")
print(f"Density M: {density(M):.2%}, W: {density(W):.2%}, H: {density(H):.2%}")

------- SKLEARN NMF 2 -------




Time in s: 292.87944197654724
RMSE: 3.004252669607644
Density M: 0.26%, W: 36.61%, H: 9.78%


In [9]:
print("------- PCTEAM NMF -------")
start = time.time()

# pcteam NMF with the mu_w / mu_h update rules, run on its own copies (M1, W1, H1).
# The third returned value is not used.
W1, H1, _ = NMF(
    M1, W1, H1, mu_w, mu_h,
    delta=delta, max_iter=steps, err_func='rmse',
)

end = time.time()
print(f"Time in s: {end - start}")

# Score the fitted factors against the test matrix and report matrix densities.
err = rmse_error_md(M_test, W1, H1)
print(f"RMSE: {err}")
print(f"Density M: {density(M1):.2%}, W: {density(W1):.2%}, H: {density(H1):.2%}")

------- PCTEAM NMF -------
Time in s: 187.00484204292297
RMSE: 2.9829228385083173
Density M: 0.26%, W: 100.00%, H: 100.00%


In [10]:
print("------- ACCELERATED NMF -------")
start = time.time()

# Accelerated variant (NMF_A) with the same update rules, run on its own copies
# (M2, W2, H2). The third returned value is not used.
W2, H2, _ = NMF_A(
    M2, W2, H2, mu_w, mu_h,
    delta=delta, max_iter=steps, err_func='rmse',
    alpha=1, epsilon=0.1,
)

end = time.time()
print(f"Time in s: {end - start}")

# Score the fitted factors against the test matrix and report matrix densities.
err = rmse_error_md(M_test, W2, H2)
print(f"RMSE: {err}")
print(f"Density M: {density(M2):.2%}, W: {density(W2):.2%}, H: {density(H2):.2%}")

------- ACCELERATED NMF -------
Time in s: 234.15478706359863
RMSE: 2.9904389735704626
Density M: 0.26%, W: 100.00%, H: 100.00%


In [11]:
print("------- PCTEAM ONMF 1.0 -------")
start = time.time()

# pcteam ONMF with mu=1.0, run on its own copies (M3, W3, H3).
# The third returned value is not used.
W3, H3, _ = ONMF(
    M3, W3, H3,
    mu=1.0, delta=delta, max_iter=steps, err_func='rmse',
)

end = time.time()
print(f"Time in s: {end - start}")

# Score the fitted factors against the test matrix and report matrix densities.
err = rmse_error_md(M_test, W3, H3)
print(f"RMSE: {err}")
print(f"Density M: {density(M3):.2%}, W: {density(W3):.2%}, H: {density(H3):.2%}")

------- PCTEAM ONMF 1.0 -------
Time in s: 194.02230405807495
RMSE: 2.9822670222773446
Density M: 0.26%, W: 100.00%, H: 100.00%


In [12]:
print("------- PCTEAM ONMF 0.0 -------")
start = time.time()

# pcteam ONMF with mu=0.0, run on its own copies (M4, W4, H4).
# The third returned value is not used.
W4, H4, _ = ONMF(
    M4, W4, H4,
    mu=0.0, delta=delta, max_iter=steps, err_func='rmse',
)

end = time.time()
print(f"Time in s: {end - start}")

# Score the fitted factors against the test matrix and report matrix densities.
err = rmse_error_md(M_test, W4, H4)
print(f"RMSE: {err}")
print(f"Density M: {density(M4):.2%}, W: {density(W4):.2%}, H: {density(H4):.2%}")

------- PCTEAM ONMF 0.0 -------
Time in s: 193.267431974411
RMSE: 2.9829228385083173
Density M: 0.26%, W: 100.00%, H: 100.00%


In [13]:
import sklearn.decomposition
print("------- SKLEARN T-SVD -------")
start = time.time()

# Rank-d truncated SVD as a baseline. TruncatedSVD's default algorithm is a
# randomized SVD, so the seed is fixed (42, like the other seeds in this
# notebook) to make the timing/RMSE comparison reproducible across runs.
# Note: `model` is rebound here, and W5/H5 are replaced by the SVD factors.
model = sklearn.decomposition.TruncatedSVD(d, random_state=42)
W5 = model.fit_transform(X=M5)
H5 = model.components_

end = time.time()
print(f"Time in s: {end - start}")


# Score the fitted factors against the test matrix and report matrix densities.
err = rmse_error_md(M_test, W5, H5)
print(f"RMSE: {err}")
print(f"Density M: {density(M5):.2%}, W: {density(W5):.2%}, H: {density(H5):.2%}")

------- SKLEARN T-SVD -------
Time in s: 7.764592885971069
RMSE: 2.947887578326157
Density M: 0.26%, W: 100.00%, H: 99.97%


In [14]:
# For each H factor, print frob_error2 between H @ H.T and the d x d identity,
# in the same order as before: H, H1, H2, H3, H4.
for factor in (H, H1, H2, H3, H4):
    print(frob_error2(factor @ factor.T, np.eye(d)))


22052.153498742267
23865645.60961169
28950279.387705643
7.069732396215471
23865645.60961169


In [15]:
#with open('pcteam_nmf.csv', 'w') as f:
#    for k, v in log.items():
#        f.write(f"{k}, {v}\n")

#with open('pcteam_nmf.npy', 'wb') as f:
#    np.save(f, W)
#    np.save(f, H)

#with open('sklearn_nmf.npy', 'wb') as f:
#    np.save(f, W_c)
#    np.save(f, H_c)

In [16]:
#import cProfile
#import re
#cProfile.run("NMF(M, W, H, w_update=mu_w, h_update=mu_h, delta=1e-16, steps=5, err_func='rmse', log=log, verbose=0)")
