In [1]:
import os

import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from scipy.sparse import lil_matrix,csr_matrix, save_npz, load_npz
import keras.backend as K
from keras.models import Model
from keras.layers import Input, Dropout, Dense
from keras.regularizers import l2
from keras.optimizers import SGD

2022-11-30 20:01:46.265032: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-30 20:01:46.641215: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-11-30 20:01:47.607558: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64
2022-11-30 20:01:47.607619: W tensorflow/compiler/xla/stream_exec

In [2]:
# Generate the folders for the project.
output_path = 'output'
input_path = 'input'

# exist_ok=True keeps the cell safe to re-run and removes the
# check-then-create race of a separate os.path.exists() test. It also raises
# if one of the names is already taken by a regular file instead of skipping it.
for folder in (output_path, input_path):
    os.makedirs(folder, exist_ok=True)

In [3]:
# Download the dataset if not existing
# (wget -nc skips the download when ml-20m.zip is already in the working directory).
!wget -nc http://files.grouplens.org/datasets/movielens/ml-20m.zip
# Extract the archive; unzip -n never overwrites files that were already extracted.
!unzip -n ml-20m.zip

--2022-11-30 20:01:48--  http://files.grouplens.org/datasets/movielens/ml-20m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 198702078 (189M) [application/zip]
Saving to: ‘ml-20m.zip’


2022-11-30 20:02:07 (10,2 MB/s) - ‘ml-20m.zip’ saved [198702078/198702078]

Archive:  ml-20m.zip
   creating: ml-20m/
  inflating: ml-20m/genome-scores.csv  
  inflating: ml-20m/genome-tags.csv  
  inflating: ml-20m/links.csv        
  inflating: ml-20m/movies.csv       
  inflating: ml-20m/ratings.csv      
  inflating: ml-20m/README.txt       
  inflating: ml-20m/tags.csv         


In [4]:
# Preprocessing step
preprocessed_path = os.path.join(input_path, 'ratings_preprocessed.csv')

try:
    # Reuse the cached preprocessed file when it exists.
    ratings_df = pd.read_csv(preprocessed_path)
except FileNotFoundError:
    # Only a missing cache triggers regeneration; any other failure
    # (corrupt file, interrupt, ...) is surfaced instead of being swallowed.

    # Read the original file
    ratings_df = pd.read_csv(os.path.join('ml-20m', 'ratings.csv'))
    # Make the userId start from zero
    ratings_df['userId'] = ratings_df['userId'] - 1

    # Create a mapping for movieId since they are not sequential
    unique_movie_ids = set(ratings_df['movieId'].values)
    movie2idx = {movie_id: i for i, movie_id in enumerate(unique_movie_ids)}
    # Column-wise lookup instead of a Python-level apply over every row
    ratings_df['movie_idx'] = ratings_df['movieId'].map(movie2idx)

    # No need the timestamp of the rating
    ratings_df = ratings_df.drop(columns=['timestamp'])

    # Save the new ratings_df. index=False keeps the cached file free of an
    # extra unnamed index column, so a reload has the same columns as the
    # freshly built frame.
    ratings_df.to_csv(preprocessed_path, index=False)

In [5]:
# Rating-matrix dimensions: largest userId + 1 rows, largest movieId + 1 columns.
# NOTE(review): M is derived from the raw, non-sequential movieId although the
# preprocessing step builds a dense movie_idx column; switching to movie_idx
# would also require make_sparse_matrix to index columns by movie_idx.
N, M = (ratings_df[column].max() + 1 for column in ('userId', 'movieId'))

N, M

(138493, 131263)

In [6]:
def make_sparse_matrix(N, M, df, user_col='userId', item_col='movieId', rating_col='rating'):
    """Build an N x M sparse rating matrix from a long-format ratings frame.

    The matrix is assembled in one vectorized step instead of assigning one
    cell per DataFrame row.

    Parameters
    ----------
    N, M : int
        Number of rows and columns of the resulting matrix.
    df : pandas.DataFrame
        One rating per row; must contain the three columns named below.
    user_col, item_col : str
        Columns holding the row and column index of each rating. Values are
        truncated to integers and must lie in [0, N) and [0, M).
    rating_col : str
        Column holding the value stored at (row, column).

    Returns
    -------
    scipy.sparse.lil_matrix
        float64 matrix of shape (N, M). When a (row, column) pair occurs more
        than once, the last occurrence in ``df`` wins; zero ratings are not
        stored.

    Raises
    ------
    IndexError
        If any row or column index falls outside the matrix.
    """
    rows = df[user_col].to_numpy().astype(np.int64)
    cols = df[item_col].to_numpy().astype(np.int64)
    vals = df[rating_col].to_numpy().astype(np.float64)

    if rows.size:
        if rows.min() < 0 or rows.max() >= N or cols.min() < 0 or cols.max() >= M:
            raise IndexError('rating index outside the (N, M) matrix bounds')

    # The csr constructor sums duplicate coordinates; keep only the last
    # occurrence so a later rating overwrites an earlier one, as cell-by-cell
    # assignment would.
    keep = ~pd.DataFrame({'row': rows, 'col': cols}).duplicated(keep='last').to_numpy()

    A = csr_matrix((vals[keep], (rows[keep], cols[keep])), shape=(N, M))
    # Explicit zeros would otherwise be stored as entries.
    A.eliminate_zeros()

    # Callers receive a lil_matrix, as before.
    return A.tolil()

In [7]:
# Create the sparse matrix for train and test
# lil better for adding new values, csr better for saving

# Fixed seed: the 80/20 split, and therefore the .npz files written below,
# are identical on every rerun instead of silently changing.
split_seed = 42
df = shuffle(ratings_df, random_state=split_seed)

cut_off = int(0.8 * len(df.index))
df_train = df.iloc[:cut_off]
df_test = df.iloc[cut_off:]

A_train = make_sparse_matrix(N, M, df_train)
A_train = A_train.tocsr()
# Boolean mask of the stored entries greater than zero
# (assumes every real rating is positive -- TODO confirm).
mask_train = (A_train > 0)
save_npz(os.path.join(input_path, 'A_train.npz'), A_train)

A_test = make_sparse_matrix(N, M, df_test)
A_test = A_test.tocsr()
mask_test = (A_test > 0)
save_npz(os.path.join(input_path, 'A_test.npz'), A_test)

In [8]:
batch_size = 128  # matrix rows (users) per batch; also used for steps_per_epoch in model.fit
epochs = 20  # number of epochs passed to model.fit
regularization = 1e-4  # factor given to l2() as kernel_regularizer of both Dense layers

In [9]:
# Sum of all training entries divided by the number of entries flagged in
# mask_train (A_train > 0). The batch generators subtract mu at the masked
# positions to centre the ratings.
mu = A_train.sum() / mask_train.sum()
print(mu)

3.525553664608152


In [10]:
def mse_loss(y_true, y_pred):
    """Mean squared error restricted to the non-zero entries of ``y_true``.

    Entries of ``y_true`` equal to 0 are treated as unobserved and contribute
    neither to the squared error nor to the count it is averaged over.

    Parameters
    ----------
    y_true : tensor
        Target values; zeros mark entries to ignore.
    y_pred : tensor
        Predictions with the same shape as ``y_true``.

    Returns
    -------
    Scalar tensor: sum of squared errors over the non-zero targets divided by
    their count, or 0 when the batch contains no non-zero target.
    """
    mask = K.cast(K.not_equal(y_true, 0), dtype='float32')
    diff = y_pred - y_true
    sqdiff = diff * diff * mask
    # K.sum without an axis already reduces over every dimension.
    sse = K.sum(sqdiff)
    n = K.sum(mask)

    # n is a count, so clamping it to 1 only changes the n == 0 case, where
    # the unguarded 0 / 0 would produce NaN.
    return sse / K.maximum(n, 1.0)

def generator(A, M):
    """Yield centred training batches forever as ``(input, target)`` pairs.

    Each pass shuffles the rows of ``A`` and ``M`` together, then walks
    through them in slices of the module-level ``batch_size``. Input and
    target are the same dense array.

    Parameters
    ----------
    A : sparse matrix
        Rating matrix; rows are sliced into batches and densified.
    M : sparse matrix
        Mask with the same shape as ``A`` marking the observed entries.

    Yields
    ------
    (a, a) : tuple of numpy.ndarray
        Dense batch with the module-level mean ``mu`` subtracted at the
        masked positions.

    Raises
    ------
    ValueError
        On first iteration, if ``A`` has no rows.
    """
    n_rows = A.shape[0]
    if n_rows == 0:
        # Without this the loop below would spin forever without yielding.
        raise ValueError('generator() needs a matrix with at least one row')

    while True:
        A, M = shuffle(A, M)
        # Ceiling division: no empty trailing batch when the number of rows
        # is an exact multiple of batch_size.
        n_batches = -(-n_rows // batch_size)
        for i in range(n_batches):
            lower = i * batch_size
            upper = min(lower + batch_size, n_rows)
            a = A[lower:upper].toarray()
            m = M[lower:upper].toarray()
            # Centre only the masked entries; everything else stays exactly 0,
            # which is what mse_loss uses to ignore unobserved cells.
            a = a - mu * m

            yield a, a

def generator_test(A, M, A_test, M_test):
    """Yield validation batches forever as ``(train_input, test_target)`` pairs.

    All four matrices are shuffled with the same permutation, so row k of the
    input batch and row k of the target batch always belong to the same
    original row. Shuffling only the training matrices would pair each input
    row with the held-out ratings of a different row.

    Parameters
    ----------
    A, M : sparse matrix
        Training ratings and their mask; used as model input.
    A_test, M_test : sparse matrix
        Held-out ratings and their mask, same number of rows as ``A``; used
        as targets.

    Yields
    ------
    (a, at) : tuple of numpy.ndarray
        Dense input and target batches, each with the module-level mean
        ``mu`` subtracted at its own masked positions.

    Raises
    ------
    ValueError
        On first iteration, if ``A`` has no rows.
    """
    n_rows = A.shape[0]
    if n_rows == 0:
        # Without this the loop below would spin forever without yielding.
        raise ValueError('generator_test() needs a matrix with at least one row')

    while True:
        # One consistent permutation keeps inputs and targets row-aligned.
        A, M, A_test, M_test = shuffle(A, M, A_test, M_test)
        # Ceiling division: no empty trailing batch when the number of rows
        # is an exact multiple of batch_size.
        n_batches = -(-n_rows // batch_size)
        for i in range(n_batches):
            lower = i * batch_size
            upper = min(lower + batch_size, n_rows)
            a = A[lower:upper].toarray()
            m = M[lower:upper].toarray()
            at = A_test[lower:upper].toarray()
            mt = M_test[lower:upper].toarray()
            # Centre only the masked entries so unobserved cells stay exactly 0.
            a = a - mu * m
            at = at - mu * mt

            yield a, at

In [11]:
# Network: dropout on the M-wide input, one 700-unit tanh layer, then a linear
# layer back to M outputs. Both Dense kernels carry the same L2 penalty.
inputs = Input(shape=(M,))
hidden = Dropout(0.7)(inputs)
hidden = Dense(
    700,
    activation='tanh',
    kernel_regularizer=l2(regularization),
)(hidden)
outputs = Dense(M, kernel_regularizer=l2(regularization))(hidden)

model = Model(inputs, outputs)
# mse_loss is also tracked as a metric, reported separately from the loss.
model.compile(loss=mse_loss, optimizer='adam', metrics=[mse_loss])

2022-11-30 20:05:40.861903: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-30 20:05:40.946867: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-30 20:05:40.948865: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-30 20:05:40.952330: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operati

In [None]:
history = model.fit(
    generator(A_train.copy(), mask_train.copy()),
    validation_data=generator_test(A_train.copy(), mask_train.copy(), A_test.copy(), mask_test.copy()),
    epochs=epochs,
    steps_per_epoch=A_train.shape[0] // batch_size + 1,
    # The validation generator never terminates; without an explicit step
    # count Keras would keep drawing validation batches forever after the
    # first epoch.
    validation_steps=A_test.shape[0] // batch_size + 1,
)

Epoch 1/20


2022-11-30 20:05:44.172142: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2022-11-30 20:05:44.301844: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x7fd29c027300 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2022-11-30 20:05:44.301856: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 3080, Compute Capability 8.6
2022-11-30 20:05:44.327210: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.


   1/1082 [..............................] - ETA: 43:57 - loss: 1.4986 - mse_loss: 1.2201

2022-11-30 20:05:44.636681: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




In [None]:
import matplotlib.pyplot as plt

# 'loss' is the compiled loss as reported by Keras (it includes the L2 kernel
# penalties); one value per epoch for the training and validation generators.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train loss')
ax.plot(history.history['val_loss'], label='test loss')
ax.set(xlabel='epoch', ylabel='loss', title='Training vs. test loss')
ax.legend()
plt.show()

In [None]:
# 'mse_loss' is the masked MSE tracked as a metric, one value per epoch.
fig, ax = plt.subplots()
ax.plot(history.history['mse_loss'], label='train mse')
ax.plot(history.history['val_mse_loss'], label='test mse')
ax.set(xlabel='epoch', ylabel='masked MSE', title='Training vs. test MSE')
ax.legend()
plt.show()