In [1]:
import os
from collections import Counter

import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from scipy.sparse import lil_matrix,csr_matrix, save_npz, load_npz
import keras.backend as K
from keras.models import Model
from keras.layers import Input, Dropout, Dense
from keras.regularizers import l2
from keras.optimizers import SGD
import matplotlib.pyplot as plt

In [2]:
# Create the project folders (a no-op for folders that already exist)
output_path = 'output'
input_path = 'input'

for folder in (output_path, input_path):
    # exist_ok=True replaces the separate exists() check, so there is no
    # window between checking for the folder and creating it.
    os.makedirs(folder, exist_ok=True)

In [3]:
# Download the dataset if not existing
# !wget -nc http://files.grouplens.org/datasets/movielens/ml-20m.zip
# !unzip -n ml-20m.zip

In [4]:
# Preprocessing step
preprocessed_csv = os.path.join(input_path, 'ratings_preprocessed.csv')
try:
    # Reuse the preprocessed file when a previous run already wrote it
    ratings_df = pd.read_csv(preprocessed_csv)
except FileNotFoundError:
    # Only a missing cache file triggers regeneration; any other failure
    # (parse error, interrupt, ...) surfaces instead of being swallowed.
    # Read the original file
    ratings_df = pd.read_csv(os.path.join('ml-20m', 'ratings.csv'))
    # Make the userId start from zero
    ratings_df['userId'] = ratings_df['userId'] - 1

    # Create a mapping for movieId since they are not sequential.
    # Sorting the ids makes the assigned indices reproducible across runs.
    unique_movie_ids = sorted(ratings_df['movieId'].unique())
    movie2idx = {movie_id: idx for idx, movie_id in enumerate(unique_movie_ids)}
    # Vectorised dictionary lookup instead of one Python call per row
    ratings_df['movie_idx'] = ratings_df['movieId'].map(movie2idx)

    # The timestamp of the rating is not needed
    ratings_df = ratings_df.drop(columns=['timestamp'])

    # Save the new ratings_df so later runs can skip this branch
    ratings_df.to_csv(preprocessed_csv, index=False)

In [5]:
# Set the subset data from the generic dataset to adapt the training to the hardware size
# n = 10000
# m = 2000

# user_id_count = Counter(ratings_df['userId'])
# movie_id_count = Counter(ratings_df['movieId'])

# user_ids = [user for user, count in user_id_count.most_common(n)]
# movie_ids = [movie for movie, count in movie_id_count.most_common(m)]

# ratings_lite_df = ratings_df[ratings_df['userId'].isin(user_ids) & 
#                              ratings_df['movieId'].isin(movie_ids)].reset_index().copy()

# Generate a new mapping adapted to the number of users to adapt the size of the sparse matrix
# new_user_id_map = {old: i for i, old in enumerate(user_ids)}
# new_movie_id_map = {old: j for j, old in enumerate(movie_ids)}

# ratings_lite_df.loc[:, 'userId'] = ratings_lite_df.apply(lambda row: new_user_id_map[row['userId']], axis=1)
# ratings_lite_df.loc[:, 'movieId'] = ratings_lite_df.apply(lambda row: new_movie_id_map[row['movieId']], axis=1)

In [7]:
# Work on every rating: take an independent (deep) copy of the preprocessed frame
ratings_lite_df = ratings_df.copy(deep=True)

In [8]:
# Matrix dimensions: one row per user id and one column per movie id,
# each sized as (largest id + 1).
N = ratings_lite_df['userId'].max() + 1
# NOTE(review): M is derived from the raw 'movieId' column, so the width is the
# largest movie id + 1 rather than the number of distinct movies. The contiguous
# 'movie_idx' column built in the preprocessing cell is not used here -- confirm
# whether that is intended before changing it, because make_sparse_matrix also
# indexes columns by 'movieId'.
M = ratings_lite_df['movieId'].max() + 1

# Display both dimensions as the cell output
N, M

(138493, 131263)

In [9]:
def make_sparse_matrix(N, M, df):
    """Build an N x M sparse rating matrix from a ratings frame.

    Parameters
    ----------
    N, M : int
        Number of rows and columns of the resulting matrix.
    df : pandas.DataFrame
        Must provide the columns 'userId' (row index), 'movieId' (column
        index) and 'rating' (stored value). Indices are truncated to
        integers and must lie in [0, N) and [0, M) respectively.

    Returns
    -------
    scipy.sparse.lil_matrix
        float64 matrix with A[userId, movieId] = rating. When a
        (userId, movieId) pair occurs more than once, the last occurrence
        wins, matching repeated item assignment.
    """
    rows = df['userId'].to_numpy().astype('int64')
    cols = df['movieId'].to_numpy().astype('int64')
    vals = df['rating'].to_numpy().astype('float64')

    # The (data, (row, col)) constructor sums duplicated coordinates, so keep
    # only the last occurrence of each pair to preserve overwrite semantics.
    coords = pd.DataFrame({'row': rows, 'col': cols})
    keep = ~coords.duplicated(keep='last').to_numpy()

    # Build everything in one vectorised call instead of assigning one
    # element per DataFrame row from Python.
    A = csr_matrix((vals[keep], (rows[keep], cols[keep])), shape=(N, M))
    # A rating of 0 means "no entry", so do not store it explicitly
    A.eliminate_zeros()

    # Callers receive a LIL matrix, as before
    return A.tolil()

In [10]:
# Create the sparse matrices for train and test
# lil is better for adding new values, csr is better for saving
# A fixed seed keeps the 80/20 split (and the saved .npz files) identical
# from one run to the next.
df = shuffle(ratings_lite_df, random_state=42)

# First 80% of the shuffled ratings go to train, the remainder to test
cut_off = int(0.8 * len(df.index))
df_train = df.iloc[:cut_off]
df_test = df.iloc[cut_off:]

A_train = make_sparse_matrix(N, M, df_train)
A_train = A_train.tocsr()
# Boolean mask of the positions that hold a (positive) rating
mask_train = (A_train > 0)
save_npz(os.path.join(input_path, 'A_train.npz'), A_train)

A_test = make_sparse_matrix(N, M, df_test)
A_test = A_test.tocsr()
mask_test = (A_test > 0)
save_npz(os.path.join(input_path, 'A_test.npz'), A_test)

In [11]:
# Number of user rows per batch; read by the batch generators and by the
# steps-per-epoch arithmetic passed to model.fit
batch_size = 128
# Number of epochs requested from model.fit
epochs = 20
# Strength of the L2 kernel penalty applied to both Dense layers
regularization = 1e-4

In [12]:
# Global mean rating = sum of the training ratings / number of training ratings
rating_total = A_train.sum()
rating_count = mask_train.sum()
mu = rating_total / rating_count
print(mu)

3.525713787506539


In [13]:
def mse_loss(y_true, y_pred):
    """Mean squared error computed over the observed ratings only.

    Entries of ``y_true`` equal to 0 are treated as missing: they are
    excluded from both the squared-error sum and the count used for the mean.

    Returns the masked mean squared error as a scalar tensor. A batch with
    no observed entry yields 0 instead of the NaN produced by 0 / 0.
    """
    # 1.0 where a rating is present, 0.0 where it is missing
    mask = K.cast(K.not_equal(y_true, 0), dtype='float32')
    diff = y_pred - y_true
    sqdiff = diff * diff * mask
    # K.sum without an axis already reduces over every dimension
    sse = K.sum(sqdiff)
    n = K.sum(mask)

    # n is a whole-number count, so clamping at 1 only changes the n == 0 case
    return sse / K.maximum(n, 1.0)

def generator_train(A, M):
    """Yield (input, target) training batches forever.

    Parameters
    ----------
    A : sparse matrix
        Ratings, one row per user.
    M : sparse matrix
        Mask with the same shape as ``A`` marking the rated positions.

    Yields
    ------
    (a, a) : tuple of dense arrays
        The same centred batch as input and as target. Rated positions hold
        ``rating - mu``; unrated positions stay 0.

    Reads the module-level ``batch_size`` and ``mu``.

    Raises
    ------
    ValueError
        If ``A`` has no rows, since no batch could ever be produced.
    """
    n_rows = A.shape[0]
    if n_rows == 0:
        raise ValueError('generator_train needs at least one row')

    while True:
        # Reshuffle the rows (ratings and mask together) on every pass
        A, M = shuffle(A, M)
        # Stepping by batch_size never produces an empty trailing batch, even
        # when n_rows is an exact multiple of batch_size
        for start in range(0, n_rows, batch_size):
            stop = min(start + batch_size, n_rows)
            # Densify only the current slice of rows
            a = A[start:stop].toarray()
            m = M[start:stop].toarray()
            # Subtract the global average rating at the rated positions only
            # (work with deviations from mu)
            a = a - mu * m

            yield a, a

def generator_test(A, M, A_test, M_test):
    """Yield (input, target) validation batches forever.

    Parameters
    ----------
    A, M : sparse matrices
        Training ratings and their mask, used as the model input.
    A_test, M_test : sparse matrices
        Test ratings and their mask, used as the target. Row k must describe
        the same user as row k of ``A``.

    Yields
    ------
    (a, at) : tuple of dense arrays
        Centred training rows and the centred test rows of the same users.

    Reads the module-level ``batch_size`` and ``mu``.

    Raises
    ------
    ValueError
        If ``A`` has no rows, since no batch could ever be produced.
    """
    n_rows = A.shape[0]
    if n_rows == 0:
        raise ValueError('generator_test needs at least one row')

    while True:
        # Shuffle all four matrices with the same permutation. Shuffling only
        # the training pair would feed one user's ratings as input while
        # scoring against a different user's test ratings.
        A, M, A_test, M_test = shuffle(A, M, A_test, M_test)
        # Stepping by batch_size never produces an empty trailing batch, even
        # when n_rows is an exact multiple of batch_size
        for start in range(0, n_rows, batch_size):
            stop = min(start + batch_size, n_rows)
            # Densify only the current slice of rows
            a = A[start:stop].toarray()
            m = M[start:stop].toarray()
            at = A_test[start:stop].toarray()
            mt = M_test[start:stop].toarray()
            # Subtract the global average rating at the rated positions only
            # (work with deviations from mu)
            a = a - mu * m
            at = at - mu * mt

            yield a, at

In [14]:
# Autoencoder over one user's row of M ratings: dropout on the input,
# a 700-unit tanh hidden layer, then a linear layer back to M outputs.
i = Input(shape=(M,))
noisy_input = Dropout(0.7)(i)
hidden = Dense(700, activation='tanh', kernel_regularizer=l2(regularization))(noisy_input)
x = Dense(M, kernel_regularizer=l2(regularization))(hidden)

model = Model(inputs=i, outputs=x)
# The masked MSE serves both as the training loss and as a reported metric
model.compile(optimizer='adam', loss=mse_loss, metrics=[mse_loss])

In [None]:
# Batches drawn per epoch from each generator
train_steps = A_train.shape[0] // batch_size + 1
val_steps = A_test.shape[0] // batch_size + 1

# Each generator receives its own copies of the matrices
train_batches = generator_train(A_train.copy(), mask_train.copy())
val_batches = generator_test(A_train.copy(), mask_train.copy(), A_test.copy(), mask_test.copy())

history = model.fit(
    train_batches,
    epochs=epochs,
    steps_per_epoch=train_steps,
    validation_data=val_batches,
    validation_steps=val_steps,
)

Epoch 1/20
Epoch 2/20
  62/1082 [>.............................] - ETA: 3:30 - loss: 0.8099 - mse_loss: 0.6233

In [None]:
# Training vs. validation loss per epoch
for key, label in (('loss', 'train loss'), ('val_loss', 'test loss')):
    plt.plot(history.history[key], label=label)
plt.legend()
plt.show()

In [None]:
# Training vs. validation masked MSE metric per epoch
for key, label in (('mse_loss', 'train mse'), ('val_mse_loss', 'test mse')):
    plt.plot(history.history[key], label=label)
plt.legend()
plt.show()