In [None]:
# Install the Kaggle command-line client (-q keeps pip's output quiet)
! pip install -q kaggle

# Open Colab's upload widget. The next cell copies a file named kaggle.json
# (the Kaggle API token) from the working directory, so upload that file here.
from google.colab import files
files.upload()

In [None]:
# Move the Kaggle API Token in the correct folder, test it works
# -p: do not fail when ~/.kaggle already exists (e.g. when this cell is re-run)
! mkdir -p ~/.kaggle
! cp kaggle.json ~/.kaggle/
# Make the token readable/writable by the owner only
! chmod 600 ~/.kaggle/kaggle.json
# Smoke test: list some datasets through the authenticated CLI
! kaggle datasets list

ref                                                              title                                                size  lastUpdated          downloadCount  voteCount  usabilityRating  
---------------------------------------------------------------  --------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
ahsan81/hotel-reservations-classification-dataset                Hotel Reservations Dataset                          480KB  2023-01-04 12:50:31           7858        276  1.0              
googleai/musiccaps                                               MusicCaps                                           793KB  2023-01-25 09:25:48           1236        136  0.9411765        
themrityunjaypathak/most-subscribed-1000-youtube-channels        Most Subscribed 1000 Youtube Channels                28KB  2023-01-21 14:42:05           1678         56  1.0              
senapatirajesh/netflix-tv-shows-and-movies             

In [None]:
# Download the zipped Letterboxd ratings dataset from Kaggle
! kaggle datasets download samlearner/letterboxd-movie-ratings-data

Downloading letterboxd-movie-ratings-data.zip to /content
 91% 171M/188M [00:02<00:00, 92.6MB/s]
100% 188M/188M [00:02<00:00, 85.1MB/s]


In [None]:
# Unzip the data into ./dataset
# -o: overwrite existing files without asking, so re-running this cell does not
# stop at unzip's interactive "replace ...?" prompt
! unzip -o letterboxd-movie-ratings-data.zip -d dataset

Archive:  letterboxd-movie-ratings-data.zip
  inflating: dataset/movie_data.csv  
  inflating: dataset/ratings_export.csv  
  inflating: dataset/users_export.csv  


In [None]:
# Load the dataset into Pandas dataframes
# pandas is imported here because this cell runs before the notebook's
# "Import necessary libraries" cell; on a fresh kernel `pd` would otherwise be
# undefined and this cell would raise NameError.
import pandas as pd

# lineterminator="\n" makes the parser split rows on "\n" only
movie_data = pd.read_csv("dataset/movie_data.csv", lineterminator="\n")
ratings_data = pd.read_csv("dataset/ratings_export.csv", lineterminator="\n")
user_data = pd.read_csv("dataset/users_export.csv", lineterminator="\n")

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [None]:
import pickle

def get_and_save_or_load_maps(d):
  """Attach the movie-id and user-id lookup tables to ``d``.

  First tries to read ``d["movie_to_index"]`` and ``d["user_to_index"]`` from
  the pickle cache under ``./processed``. If that fails, both maps are rebuilt
  by enumerating ``d["all_movies"]`` / ``d["all_users"]`` (id -> position) and
  written back to the cache, creating the cache directory if necessary.

  Args:
    d: dict that is updated in place. It must contain "all_movies" and
      "all_users" whenever the cache cannot be read.

  Returns:
    The same dict ``d``.
  """
  import os  # local import: only needed to create the cache directory

  mm = "./processed/movie_to_index.pkl"
  uu = "./processed/user_to_index.pkl"
  try:
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # were produced by this notebook.
    with open(mm, 'rb') as f:
      d["movie_to_index"] = pickle.load(f)
    with open(uu, 'rb') as f:
      d["user_to_index"] = pickle.load(f)
  except Exception:
    # Any ordinary load failure (missing or unreadable cache) triggers a
    # rebuild. Unlike a bare ``except`` this lets KeyboardInterrupt and
    # SystemExit propagate.
    print("COULD NOT LOAD MAPS")
    d["movie_to_index"] = {m: i for i, m in enumerate(d["all_movies"])}
    d["user_to_index"] = {u: i for i, u in enumerate(d["all_users"])}
    # Without this, the writes below fail on a fresh runtime with no ./processed
    os.makedirs(os.path.dirname(mm), exist_ok=True)
    with open(mm, 'wb') as f:
      pickle.dump(d["movie_to_index"], f)
    with open(uu, 'wb') as f:
      pickle.dump(d["user_to_index"], f)
  return d

In [None]:
def get_and_save_or_load_movies(d):
  """Attach the array of distinct movie ids to ``d`` as ``d["all_movies"]``.

  The array is read from ``./processed/all_movies.pkl`` when possible.
  Otherwise it is computed from the module-level ``ratings_data`` dataframe
  (unique values of its ``movie_id`` column) and written to that file,
  creating the cache directory if necessary.

  Args:
    d: dict that is updated in place.

  Returns:
    The same dict ``d``.
  """
  import os  # local import: only needed to create the cache directory

  mm = "./processed/all_movies.pkl"
  try:
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # were produced by this notebook.
    with open(mm, 'rb') as f:
      d["all_movies"] = pickle.load(f)
  except Exception:
    # Ordinary load failures fall back to recomputation; KeyboardInterrupt and
    # SystemExit are no longer swallowed as they were by the bare ``except``.
    print("COULD NOT LOAD MOVIES")
    d["all_movies"] = ratings_data.movie_id.unique()
    # Without this, the write below fails on a fresh runtime with no ./processed
    os.makedirs(os.path.dirname(mm), exist_ok=True)
    with open(mm, 'wb') as f:
      pickle.dump(d["all_movies"], f)
  return d


In [None]:
from sklearn.model_selection import train_test_split

def get_and_save_or_load_users(d, train_ratio, test_ratio):
  """Attach the user ids and their train/test split to ``d``.

  Reads ``d["all_users"]``, ``d["train_users"]`` and ``d["test_users"]`` from
  the pickle cache under ``./processed`` when possible. Otherwise the distinct
  ``user_id`` values of the module-level ``ratings_data`` dataframe are split
  with ``train_test_split`` and all three results are written to the cache,
  creating the cache directory if necessary. The split is not seeded, so it is
  only stable for as long as the cache files are kept.

  Args:
    d: dict that is updated in place.
    train_ratio: forwarded to ``train_test_split`` as ``train_size``.
    test_ratio: forwarded to ``train_test_split`` as ``test_size``.

  Returns:
    The same dict ``d``.
  """
  import os  # local import: only needed to create the cache directory

  uu = "./processed/all_users.pkl"
  tt1 = "./processed/train_users.pkl"
  tt2 = "./processed/test_users.pkl"
  try:
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # were produced by this notebook.
    with open(uu, 'rb') as f:
      d["all_users"] = pickle.load(f)
    with open(tt1, 'rb') as f:
      d["train_users"] = pickle.load(f)
    with open(tt2, 'rb') as f:
      d["test_users"] = pickle.load(f)
  except Exception:
    # If any of the three files cannot be read, all three are regenerated so
    # the split stays consistent with ``all_users``. KeyboardInterrupt and
    # SystemExit are no longer swallowed as they were by the bare ``except``.
    print("COULD NOT LOAD USERS")
    d["all_users"] = ratings_data.user_id.unique()
    d["train_users"], d["test_users"] = train_test_split(d["all_users"], train_size=train_ratio, test_size=test_ratio)
    # Without this, the writes below fail on a fresh runtime with no ./processed
    os.makedirs(os.path.dirname(uu), exist_ok=True)
    with open(uu, 'wb') as f:
      pickle.dump(d["all_users"], f)
    with open(tt1, 'wb') as f:
      pickle.dump(d["train_users"], f)
    with open(tt2, 'wb') as f:
      pickle.dump(d["test_users"], f)
  return d


In [None]:
from scipy.sparse import coo_matrix

def get_and_save_or_load_matrix(d, test_or_train):
  """Attach the sparse user x movie rating matrix for one split to ``d``.

  The matrix is stored under ``d[test_or_train + "_matrix"]``. It is read from
  ``./processed/<test_or_train>_matrix.pkl`` when possible. Otherwise it is
  built from the module-level ``ratings_data`` dataframe, keeping only rows
  whose ``user_id`` is in ``d[test_or_train + "_users"]``, and then cached
  (the cache directory is created if necessary).

  The matrix always has shape ``(len(all_users), len(all_movies))``, so rows
  of users outside the split are simply empty. It is returned in CSR format.

  Args:
    d: dict that is updated in place. When the cache cannot be read it must
      contain "all_users", "all_movies", "user_to_index", "movie_to_index"
      and ``test_or_train + "_users"``.
    test_or_train: split name used to build the dict keys and the file name
      (the notebook passes "test" and "train").

  Returns:
    The same dict ``d``.
  """
  import os  # local import: only needed to create the cache directory

  s = test_or_train + "_matrix"
  tt = "./processed/" + s + ".pkl"
  try:
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # were produced by this notebook.
    with open(tt, 'rb') as f:
      d[s] = pickle.load(f)
  except Exception:
    # Ordinary load failures fall back to recomputation; KeyboardInterrupt and
    # SystemExit are no longer swallowed as they were by the bare ``except``.
    print("COULD NOT LOAD MATRIX: " + test_or_train)
    shape = (len(d["all_users"]), len(d["all_movies"]))
    df = ratings_data[ratings_data["user_id"].isin(d[test_or_train + "_users"])]
    # Translate ids to row/column positions
    row = df['user_id'].map(d['user_to_index']).values
    col = df['movie_id'].map(d['movie_to_index']).values
    data = df['rating_val'].values
    # Assemble in COO (coordinate) form, then convert to CSR
    d[s] = coo_matrix((data, (row, col)), shape=shape).tocsr()
    # Without this, the write below fails on a fresh runtime with no ./processed
    os.makedirs(os.path.dirname(tt), exist_ok=True)
    with open(tt, 'wb') as f:
      pickle.dump(d[s], f)
  return d


In [None]:
def get_and_save_or_load_sample(train_ratio=0.8, test_ratio=0.2):
  """Assemble the full dataset dictionary, one cached stage after another.

  Stages run in dependency order: movie ids, user ids with their train/test
  split, the id -> index maps, and finally the "test" and "train" rating
  matrices.

  Args:
    train_ratio: passed on to ``get_and_save_or_load_users``.
    test_ratio: passed on to ``get_and_save_or_load_users``.

  Returns:
    The dict populated by all of the stages.
  """
  sample = {}
  sample = get_and_save_or_load_movies(sample)
  sample = get_and_save_or_load_users(sample, train_ratio, test_ratio)
  sample = get_and_save_or_load_maps(sample)
  split_names = ["test", "train"]
  for split_name in split_names:
    sample = get_and_save_or_load_matrix(sample, split_name)
  return sample

In [None]:
# processed.zip (an archive of the pickled artifacts used by the cache helpers)
# was added to the runtime manually; extract it into the current directory.
! unzip processed.zip -d .

Archive:  processed.zip
   creating: ./content/processed/
  inflating: ./content/processed/test_users.pkl  
  inflating: ./content/processed/user_to_index.pkl  
  inflating: ./content/processed/all_users.pkl  
  inflating: ./content/processed/train_matrix.pkl  
  inflating: ./content/processed/train_users.pkl  
  inflating: ./content/processed/movie_to_index.pkl  
  inflating: ./content/processed/test_matrix.pkl  
  inflating: ./content/processed/all_movies.pkl  


In [None]:
# The archive unpacks to ./content/processed; move its files to /content/processed,
# which is where the cache helpers look when the working directory is /content.
# -p: do not fail when /content/processed already exists (e.g. on re-run)
! mkdir -p /content/processed
! mv /content/content/processed/* /content/processed/

In [None]:
# Clean up: remove the leftover extraction folder and the archive itself
! rm -rf /content/content
! rm -rf /content/processed.zip

In [None]:
def subtract_column(sparse_matrix, column):
    """Subtract a per-row value from every stored entry of a sparse matrix.

    Only stored entries are touched; positions that are not stored stay
    empty. Entries that become 0 after the subtraction remain stored.

    The input matrix is left unmodified. Row positions are derived from the
    CSR ``indptr`` array rather than from ``nonzero()``, so matrices that
    store explicit zeros are handled correctly.

    Args:
        sparse_matrix: SciPy sparse matrix with ``n_rows`` rows.
        column: array-like with ``n_rows`` values in any shape (it is
            flattened); element ``i`` is subtracted from the stored entries
            of row ``i``.

    Returns:
        A new sparse matrix in the same format as ``sparse_matrix``.
    """
    column = np.asarray(column).ravel()
    # copy=True guarantees the caller's matrix is never modified
    result = sparse_matrix.tocsr(copy=True)
    # Row index of every stored entry, in the same order as ``result.data``
    stored_rows = np.repeat(np.arange(result.shape[0]), np.diff(result.indptr))
    result.data = result.data - column[stored_rows]
    return result.asformat(sparse_matrix.format)

def demean_matrix(mat):
    """Centre every row of a CSR matrix on the mean of its stored entries.

    The mean of a row is the sum of its stored values divided by the number
    of stored values. Rows without stored entries get a mean of 0 instead of
    triggering a 0/0 division (NaN plus a RuntimeWarning); they have nothing
    to subtract from anyway.

    Args:
        mat: SciPy CSR matrix (``indptr`` is used to count entries per row).

    Returns:
        The result of ``subtract_column(mat, means)`` with one mean per row.
    """
    sums = np.asarray(mat.sum(axis=1)).ravel()
    # Number of stored entries per row
    counts = np.diff(mat.indptr)
    averages = np.divide(
        sums,
        counts,
        out=np.zeros(len(counts), dtype=float),
        where=counts > 0,  # skip empty rows; they keep the 0 from ``out``
    )
    return subtract_column(mat, averages.reshape(-1, 1))

# Build the dataset dictionary (each stage is loaded from ./processed when cached)
d = get_and_save_or_load_sample()
# asfptype() upcasts the rating matrices to a floating-point dtype before the
# row means are subtracted; rows are users, so this centres each user's ratings.
d["train_matrix_demeaned"] = demean_matrix(d["train_matrix"].asfptype())
d["test_matrix_demeaned"] = demean_matrix(d["test_matrix"].asfptype())
# Display the stored values of the demeaned training matrix as a quick check
d["train_matrix_demeaned"].data

  averages = sums / counts


array([ 0.56193742,  0.56193742,  0.56193742, ...,  2.78800631,
       -0.21199369, -1.21199369])

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import keras.backend as K

# Width of each movie embedding vector and of the "weighted_average" model input
LATENT_DIMENSION = 100
# Rate passed to each Dropout layer of the model
DROPOUT_RATE = 0.5
# Number of units in each hidden Dense layer
HIDDEN_LAYER_SIZE = 128

def custom_final_activation(x):
    """Output activation: the Keras backend hard sigmoid of ``x``, times 10."""
    gate = K.hard_sigmoid(x)
    return gate * 10

def build_model(num_movies):
    """Assemble the two-input rating-prediction network.

    Inputs:
        * "single_movie": one movie index per sample, looked up in the
          embedding table.
        * "weighted_average": a LATENT_DIMENSION-wide vector per sample.

    The embedded movie and the vector are concatenated, batch-normalised and
    passed through two SELU Dense + Dropout stages to a single output unit
    that uses ``custom_final_activation``.

    Args:
        num_movies: number of rows of the embedding table.

    Returns:
        A ``(model, embedding_layer)`` tuple. The embedding layer is returned
        separately so that it can also be called outside the model.
    """
    movie_in = layers.Input(shape=(1,), name="single_movie")
    profile_in = layers.Input(shape=(LATENT_DIMENSION,), name="weighted_average")

    embedding_layer = layers.Embedding(num_movies, LATENT_DIMENSION, name="movie_embedding_op")

    embedded_movie = embedding_layer(movie_in)
    # Reshape the embedded movie to a flat LATENT_DIMENSION vector per sample
    flatten_embedding = layers.Reshape((LATENT_DIMENSION,), name="single_movie_embedding_reshaped")
    movie_vector = flatten_embedding(embedded_movie)

    hidden = layers.Concatenate()([profile_in, movie_vector])
    hidden = layers.BatchNormalization()(hidden)
    # Two identical hidden stages: Dense(selu) followed by Dropout
    for _ in range(2):
        hidden = layers.Dense(HIDDEN_LAYER_SIZE, activation="selu")(hidden)
        hidden = layers.Dropout(DROPOUT_RATE)(hidden)
    predicted_rating = layers.Dense(1, activation=custom_final_activation, name="predicted_rating")(hidden)

    network = tf.keras.Model(inputs=[movie_in, profile_in], outputs=predicted_rating)
    return network, embedding_layer

# One embedding row per column (movie) of the training matrix
model, movie_embedding_op = build_model(d["train_matrix"].shape[1])
# Train with the Adam optimizer on mean squared error
model.compile(optimizer="adam", loss="mean_squared_error")

In [None]:
import numpy as np

def extract_training_sample():
    """Draw one random (user, movie) training example from the global ``d``.

    A user row is picked at random from the rows of all non-zero ratings (so
    users with more ratings are picked more often), then one of that user's
    rated movies is picked as the prediction target.

    Returns:
        A ``(features, target)`` pair where ``features`` is the list
        ``[target movie index (1x1), all movie indices rated by the user (1xk),
        the user's demeaned ratings for those movies (1xk)]`` and ``target`` is
        a length-1 array holding the raw rating of the target movie.
    """
    ratings = d["train_matrix"]
    demeaned = d["train_matrix_demeaned"]

    # Row index of every non-zero rating; sampling from it weights users by activity
    rated_rows = ratings.nonzero()[0]
    user = np.random.choice(rated_rows)

    # Column indices of the movies this user rated
    user_movies = ratings[user].nonzero()[1]
    target_movie = np.random.choice(user_movies)
    n_rated = len(user_movies)

    features = [
        np.array([target_movie]).reshape(-1, 1),
        user_movies.reshape(-1, n_rated),
        demeaned[user, user_movies].reshape(-1, n_rated).toarray(),
    ]
    target = np.array([ratings[user, target_movie]])

    return features, target

# Smoke test: draw one sample and print its structure
print(extract_training_sample())

([array([[10198]], dtype=int32), array([[    20,     25,     27, ..., 274868, 280831, 282677]], dtype=int32), array([[-3.87184343,  1.12815657, -5.87184343, ...,  2.12815657,
         2.12815657,  0.12815657]])], array([8]))


In [None]:
%load_ext cython

In [None]:
%%cython

import numpy as np
import scipy.sparse as sparse
cimport numpy as np

# Look up, for one batch position, the values stored in one row of DM for a
# list of column indices.
#
# Parameters:
#   DM                    object exposing .indptr / .indices / .data, which are
#                         read here as a CSR-style row layout (presumably the
#                         demeaned rating matrix -- TODO confirm with caller)
#   multiple_movie_input  object array; element i is indexed as a sequence of
#                         column indices
#   i                     batch position to process
#   num_columns           how many entries of multiple_movie_input[i] to read
#   random_user_indices   int array; element i is the row of DM to read
#
# Returns a 1-D double array of length num_columns. Entry j is the value stored
# at (row, multiple_movie_input[i][j]), or 0 when the lookup fails.
def cython_inner(DM, np.ndarray[object, ndim=1] multiple_movie_input, int i, int num_columns, np.ndarray[int, ndim=1] random_user_indices):
    cdef int j
    cdef np.ndarray[double, ndim=1] arr = np.zeros((num_columns,))
    cdef int row_index = random_user_indices[i]
    # Slice of the stored entries that belong to this row
    cdef int start = DM.indptr[row_index]
    cdef int end = DM.indptr[row_index+1]
    cdef np.ndarray[int, ndim=1] col_indices = DM.indices[start:end]
    cdef np.ndarray[double, ndim=1] data = DM.data[start:end]
    
    for j in range(num_columns):
        col = multiple_movie_input[i][j]
        try:
            # First position of `col` among this row's stored columns;
            # indexing [0][0] raises when the column is not stored in the row
            index = np.where(col_indices == col)[0][0]
            arr[j] = data[index]
        except:
            # NOTE(review): this bare except treats every failure as
            # "column not stored" and also hides unrelated errors
            arr[j] = 0
    return arr

# Run cython_inner for every batch position and collect the results.
#
# Parameters:
#   DM, multiple_movie_input, random_user_indices  forwarded to cython_inner
#   num_rows     number of batch positions to process
#   row_lengths  element i is passed to cython_inner as num_columns for row i
#
# Returns an object array of length num_rows whose element i is the 1-D double
# array produced by cython_inner for batch position i.
def cython_demeaned_rating_matrix(DM, np.ndarray[object, ndim=1] multiple_movie_input, 
                                  np.ndarray[int, ndim=1] random_user_indices, 
                                  int num_rows, np.ndarray[long, ndim=1] row_lengths):
    cdef int i
    # Object dtype because each element is itself an array returned by cython_inner
    cdef np.ndarray[object, ndim=1] demeaned_rating_matrix = np.empty((num_rows,), dtype='object')
    for i in range(num_rows):
        demeaned_rating_matrix[i] = cython_inner(DM, multiple_movie_input, i, row_lengths[i], random_user_indices)
    
    return demeaned_rating_matrix

In [None]:
import tensorflow as tf
import numpy as np

def extract_training_batch(batch_size=100):
    """Build one random training batch for ``model`` from the global ``d``.

    ``batch_size`` (user, movie) pairs are drawn with replacement from the
    non-zero ratings of ``d["train_matrix"]``. For each pair the batch holds:

    * the index of the movie whose rating is to be predicted, and
    * a "user profile" vector: the sum of the embeddings (from the global
      ``movie_embedding_op``) of all movies stored in that user's row, each
      weighted by the user's demeaned rating, L2-normalised per sample.

    Differences from the previous implementation:

    * Normalisation is per sample (``axis=1``). Without an axis the whole
      batch was normalised jointly, so a sample's input depended on which
      other samples happened to be in the batch.
    * Padding positions are masked out explicitly. Previously the entire
      demeaned matrix was copied and its column 0 zeroed on every call, which
      was slow and also discarded genuine ratings of movie index 0.
    * Only the sampled ratings are converted to float, and only the sampled
      rows are sliced, instead of processing the whole matrix per batch.

    Args:
        batch_size: number of (user, movie) pairs in the batch.

    Returns:
        ``(X_train, Y_train)`` where ``X_train`` is the list
        ``[single_movie_input (batch, 1), profile (batch, embedding width)]``
        and ``Y_train`` is a ``(batch, 1)`` tensor of ratings, all float32.
    """
    M = d["train_matrix"]
    DM = d["train_matrix_demeaned"]

    # Sample (user, movie) pairs uniformly over the non-zero ratings.
    # An int argument to np.random.choice samples from arange(n) without
    # first materialising the whole range as an array.
    user_indices, movie_indices = M.nonzero()
    index_indices = np.random.choice(len(user_indices), size=batch_size)
    random_user_indices = user_indices[index_indices]
    random_movie_indices = movie_indices[index_indices]

    single_movie_input = np.array(random_movie_indices).reshape(batch_size, 1)
    # M[rows, cols] with two index arrays returns a 1 x batch result
    ratings_to_predict = np.asarray(
        M[random_user_indices, random_movie_indices], dtype=np.float32
    ).reshape(batch_size, 1)

    # Column indices stored in each sampled user's CSR row
    rated_by_user = [M.indices[M.indptr[u]:M.indptr[u + 1]] for u in random_user_indices]
    row_lengths = np.array([len(cols) for cols in rated_by_user])
    max_len = int(row_lengths.max())

    # Right-pad every row with movie index 0 up to the longest row, and
    # remember which positions are real so the padding can be ignored.
    multiple_movie_input = np.zeros((batch_size, max_len), dtype=M.indices.dtype)
    for i, cols in enumerate(rated_by_user):
        multiple_movie_input[i, :len(cols)] = cols
    is_real_entry = np.arange(max_len) < row_lengths[:, None]

    # Demeaned rating for every (user, movie) position; padding gets weight 0
    repeated_random_user_indices = np.repeat(random_user_indices[:, None], max_len, axis=1)
    demeaned_rating_matrix = DM[repeated_random_user_indices, multiple_movie_input].toarray()
    demeaned_rating_matrix = np.where(is_real_entry, demeaned_rating_matrix, 0.0)

    single_movie_input = tf.convert_to_tensor(single_movie_input, dtype=tf.float32)
    multiple_movie_input = tf.convert_to_tensor(multiple_movie_input, dtype=tf.float32)
    demeaned_rating_matrix = tf.convert_to_tensor(demeaned_rating_matrix, dtype=tf.float32)

    # Rating-weighted sum of the embeddings of each user's movies
    movie_embeddings = movie_embedding_op(multiple_movie_input)
    weights = tf.expand_dims(demeaned_rating_matrix, axis=-1)
    weighted_average_input = tf.reduce_sum(tf.multiply(movie_embeddings, weights), axis=1)
    # axis=1: give every sample's profile unit length independently
    normalized_weighted_average_input = tf.nn.l2_normalize(weighted_average_input, axis=1)

    X_train = [single_movie_input, normalized_weighted_average_input]
    Y_train = tf.convert_to_tensor(ratings_to_predict, dtype=tf.float32)
    return X_train, Y_train

# Smoke test: build one batch and display it
extract_training_batch()

([<tf.Tensor: shape=(100, 1), dtype=float32, numpy=
  array([[  5669.],
         [  3816.],
         [  3141.],
         [   593.],
         [ 47417.],
         [   337.],
         [  1177.],
         [  1685.],
         [   722.],
         [   923.],
         [  8904.],
         [ 17520.],
         [ 52081.],
         [ 24714.],
         [  3964.],
         [  8845.],
         [ 19767.],
         [  6527.],
         [ 32796.],
         [  6911.],
         [ 28741.],
         [ 59311.],
         [  1743.],
         [   455.],
         [  5434.],
         [ 22426.],
         [ 16519.],
         [  1761.],
         [ 10392.],
         [  7156.],
         [  9185.],
         [  3582.],
         [ 39008.],
         [  7606.],
         [  8120.],
         [255112.],
         [  8215.],
         [ 29863.],
         [ 16653.],
         [  4464.],
         [190248.],
         [  3594.],
         [  2296.],
         [  3998.],
         [  4111.],
         [  5859.],
         [  2572.],
        

In [None]:
# Print the layers, output shapes and parameter counts of the current model
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 multiple_movies (InputLayer)   [(None, None)]       0           []                               
                                                                                                  
 ratings (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 movie_embedding_op (Embedding)  multiple            28607100    ['single_movie[0][0]',           
                                                                  'multiple_movies[0][0]']        
                                                                                                  
 single_movie (InputLayer)      [(None, 1)]          0           []                           

In [None]:
# Fit on one freshly sampled batch.
# This cell used to call extract_training_data(), which is not defined in this
# notebook, and passed three inputs to a model that build_model() creates with
# two. extract_training_batch() returns the matching [single_movie,
# weighted_average] input list together with the target ratings.
X_train, Y_train = extract_training_batch()
history = model.fit(
    x=X_train,
    y=Y_train
)



In [None]:
import numpy as np
import tensorflow as tf
from keras.utils import Sequence

# Number of passes over the generator requested from model.fit
NUM_EPOCHS = 10

class DataGenerator(Sequence):
    """Keras ``Sequence`` that serves freshly sampled random training batches.

    Every ``__getitem__`` call draws a new random batch with
    ``extract_training_batch``; the requested index is ignored, so batches are
    not tied to positions and differ between epochs.

    Args:
        batch_size: number of samples per batch.
        **kwargs: forwarded to the Keras base class initialiser.
    """

    def __init__(self, batch_size=100, **kwargs):
        # Initialise the Keras base class; newer Keras versions warn when a
        # subclass skips this call.
        super().__init__(**kwargs)
        self.num_movies = d["train_matrix"].shape[1]
        self.batch_size = batch_size

    def __len__(self):
        # Batches per epoch: the movie count divided by the batch size.
        # Floor division stays exact for integers, unlike int(a / b).
        return self.num_movies // self.batch_size

    def __getitem__(self, idx):
        # idx is intentionally unused: every batch is an independent random draw
        return extract_training_batch(self.batch_size)

# Train from the random-batch generator (default batch size) for NUM_EPOCHS epochs
training_generator = DataGenerator()
history = model.fit(training_generator, epochs=NUM_EPOCHS, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

In [None]:
import tensorflow as tf
# Show the name of the GPU device TensorFlow can see (here '/device:GPU:0')
tf.test.gpu_device_name()

'/device:GPU:0'