In [9]:
! chmod 600 /home/giuven/.kaggle/kaggle.json

In [7]:
! pip install -q kaggle

In [10]:
! kaggle datasets list

ref                                                              title                                                size  lastUpdated          downloadCount  voteCount  usabilityRating  
---------------------------------------------------------------  --------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
ahsan81/hotel-reservations-classification-dataset                Hotel Reservations Dataset                          480KB  2023-01-04 12:50:31           8002        280  1.0              
googleai/musiccaps                                               MusicCaps                                           793KB  2023-01-25 09:25:48           1302        143  0.9411765        
themrityunjaypathak/most-subscribed-1000-youtube-channels        Most Subscribed 1000 Youtube Channels                28KB  2023-01-21 14:42:05           1763         57  1.0              
senapatirajesh/netflix-tv-shows-and-movies             

In [11]:
! kaggle datasets download samlearner/letterboxd-movie-ratings-data

Downloading letterboxd-movie-ratings-data.zip to /home/giuven
100%|█████████████████████████████████████████| 188M/188M [04:58<00:00, 636kB/s]
100%|█████████████████████████████████████████| 188M/188M [04:58<00:00, 659kB/s]


In [13]:
! mkdir /home/giuven/Desktop/MovieRecommenderSystem/local_notebooks/dataset
! unzip letterboxd-movie-ratings-data.zip -d /home/giuven/Desktop/MovieRecommenderSystem/local_notebooks/dataset

Archive:  letterboxd-movie-ratings-data.zip
  inflating: /home/giuven/Desktop/MovieRecommenderSystem/local_notebooks/dataset/movie_data.csv  
  inflating: /home/giuven/Desktop/MovieRecommenderSystem/local_notebooks/dataset/ratings_export.csv  
  inflating: /home/giuven/Desktop/MovieRecommenderSystem/local_notebooks/dataset/users_export.csv  


In [20]:
! pip install -q numpy
! pip install -q pandas

NOTEBOOK_FOLDER = "/home/giuven/Desktop/MovieRecommenderSystem/local_notebooks/"

import numpy as np
import pandas as pd

# Load the dataset into a Pandas dataframe
movie_data = pd.read_csv(NOTEBOOK_FOLDER + "dataset/movie_data.csv", lineterminator="\n")
ratings_data = pd.read_csv(NOTEBOOK_FOLDER + "dataset/ratings_export.csv", lineterminator="\n")
user_data = pd.read_csv(NOTEBOOK_FOLDER + "dataset/users_export.csv", lineterminator="\n")

In [None]:
! mkdir /home/giuven/Desktop/MovieRecommenderSystem/local_notebooks/processed


In [35]:
import pickle
from scipy.sparse import coo_matrix
from sklearn.model_selection import train_test_split

def get_and_save_or_load_maps(d):
    mm = NOTEBOOK_FOLDER + "processed/movie_to_index.pkl"
    uu = NOTEBOOK_FOLDER + "processed/user_to_index.pkl"
    try:
        with open(mm, 'rb') as f: d["movie_to_index"] = pickle.load(f)
        with open(uu, 'rb') as f: d["user_to_index"] = pickle.load(f)
    except:
        print("COULD NOT LOAD MAPS")
        d["movie_to_index"] = {m: i for i, m in enumerate(d["all_movies"])}
        d["user_to_index"] = {u: i for i, u in enumerate(d["all_users"])}
        with open(mm, 'wb') as f: pickle.dump(d["movie_to_index"], f)
        with open(uu, 'wb') as f: pickle.dump(d["user_to_index"], f)
    return d

def get_and_save_or_load_movies(d):
    mm = NOTEBOOK_FOLDER + "processed/all_movies.pkl"
    try:
        with open(mm, 'rb') as f: d["all_movies"] = pickle.load(f)
    except:
        print("COULD NOT LOAD MOVIES")
        d["all_movies"] = ratings_data.movie_id.unique()
        with open(mm, 'wb') as f: pickle.dump(d["all_movies"], f)
    return d

def get_and_save_or_load_users(d, train_ratio, test_ratio):
    uu = NOTEBOOK_FOLDER + "processed/all_users.pkl"
    tt1 = NOTEBOOK_FOLDER + "processed/train_users.pkl"
    tt2 = NOTEBOOK_FOLDER + "processed/test_users.pkl"
    try:
        with open(uu, 'rb') as f: d["all_users"] = pickle.load(f)
        with open(tt1, 'rb') as f: d["train_users"] = pickle.load(f)
        with open(tt2, 'rb') as f: d["test_users"] = pickle.load(f)
    except:
        print("COULD NOT LOAD USERS")
        d["all_users"] = ratings_data.user_id.unique()
        d["train_users"], d["test_users"] = train_test_split(d["all_users"], train_size=train_ratio, test_size=test_ratio)
        with open(uu, 'wb') as f: pickle.dump(d["all_users"], f)
        with open(tt1, 'wb') as f: pickle.dump(d["train_users"], f)
        with open(tt2, 'wb') as f: pickle.dump(d["test_users"], f)
    return d

def get_and_save_or_load_matrix(d, test_or_train):
    s = test_or_train + "_matrix"
    tt = NOTEBOOK_FOLDER + "processed/" + s + ".pkl"
    try:
        with open(tt, 'rb') as f: d[s] = pickle.load(f)
    except:
        print("COULD NOT LOAD MATRIX: " + test_or_train)
        shape = (len(d["all_users"]), len(d["all_movies"]))
        df = ratings_data[ratings_data["user_id"].isin(d[test_or_train + "_users"])]
        row = df['user_id'].map(d['user_to_index']).values
        col = df['movie_id'].map(d['movie_to_index']).values
        data = df['rating_val'].values
        d[s] = coo_matrix((data, (row, col)), shape=shape)
        d[s] = d[s].tocsr()
        with open(tt, 'wb') as f: pickle.dump(d[s], f)
    return d


def get_and_save_or_load_sample(train_ratio=0.8, test_ratio=0.2):
    d = dict()
    d = get_and_save_or_load_movies(d)
    d = get_and_save_or_load_users(d, train_ratio, test_ratio)
    d = get_and_save_or_load_maps(d)
    for s in ("test", "train"):
        d = get_and_save_or_load_matrix(d, s)
    return d

In [36]:
def subtract_column(sparse_matrix, column):
    column = column.flatten()
    nonzero_rows, nonzero_cols = sparse_matrix.nonzero()
    nonzero_values = sparse_matrix.data
    nonzero_values -= column[nonzero_rows]
    new_sparse_matrix = sparse_matrix.copy()
    new_sparse_matrix.data[:] = nonzero_values
    return new_sparse_matrix

def demean_matrix(mat):
    sums = mat.sum(axis=1).A1
    counts = np.diff(mat.indptr)
    averages = sums / counts
    averages = averages.reshape(-1, 1)
    return subtract_column(mat, averages)

d = get_and_save_or_load_sample()
d["train_matrix_demeaned"] = demean_matrix(d["train_matrix"].asfptype())
d["test_matrix_demeaned"] = demean_matrix(d["test_matrix"].asfptype())
d["train_matrix_demeaned"].data

COULD NOT LOAD MOVIES
COULD NOT LOAD USERS
COULD NOT LOAD MAPS
COULD NOT LOAD MATRIX: test
COULD NOT LOAD MATRIX: train


  averages = sums / counts


array([-1.0555973 ,  2.9444027 , -2.0555973 , ...,  2.78800631,
       -0.21199369, -1.21199369])

In [41]:
! pip install -q scikit-learn
! pip install -q matplotlib
! pip install -q scipy
! pip install -q tensorflow
! pip install -q keras

In [39]:
tf.config.list_physical_devices('GPU')

[]

In [38]:
import tensorflow as tf
from tensorflow.keras import layers
import keras.backend as K

LATENT_DIMENSION = 100
DROPOUT_RATE = 0.5
HIDDEN_LAYER_SIZE = 128

def custom_final_activation(x):
    return K.hard_sigmoid(x) * 10

def build_model(num_movies):
    single_movie_input = layers.Input(shape=(1,), name="single_movie")
    weighted_average_input = layers.Input(shape=(LATENT_DIMENSION,), name="weighted_average")

    movie_embedding_op = layers.Embedding(num_movies, LATENT_DIMENSION, name="movie_embedding_op")
    
    single_movie_embedding = movie_embedding_op(single_movie_input)
    single_movie_embedding_reshaped = layers.Reshape((LATENT_DIMENSION,), name="single_movie_embedding_reshaped")(single_movie_embedding)

    x = layers.Concatenate()([weighted_average_input, single_movie_embedding_reshaped])
    x = layers.BatchNormalization()(x)
    x = layers.Dense(HIDDEN_LAYER_SIZE, activation="selu")(x)
    x = layers.Dropout(DROPOUT_RATE)(x)
    x = layers.Dense(HIDDEN_LAYER_SIZE, activation="selu")(x)
    x = layers.Dropout(DROPOUT_RATE)(x)
    x = layers.Dense(1, activation=custom_final_activation, name="predicted_rating")(x)

    return tf.keras.Model(inputs=[single_movie_input, weighted_average_input], outputs=x), movie_embedding_op


with tf.device('/device:GPU:0'):
  model, movie_embedding_op = build_model(d["train_matrix"].shape[1])
  model.compile(optimizer="adam", loss="mean_squared_error")

2023-02-04 09:59:49.824469: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 114428400 exceeds 10% of free system memory.
2023-02-04 09:59:49.852806: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 114428400 exceeds 10% of free system memory.
2023-02-04 09:59:49.863890: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 114428400 exceeds 10% of free system memory.


In [37]:
import tensorflow as tf
import numpy as np
from keras.utils import Sequence

def extract_batch(mode, batch_size=100):
    if mode in ("training, train"):
        M = d["train_matrix"]
        DM = d["train_matrix_demeaned"].copy()
    else:
        M = d["test_matrix"]
        DM = d["test_matrix_demeaned"].copy()
    DM[:, 0] = DM[:, 0] - DM[:, 0]
    user_indices, movie_indices = M.nonzero()
    index_indices = np.random.choice(range(len(user_indices)), size=batch_size)
    random_user_indices = user_indices[index_indices]
    random_movie_indices = movie_indices[index_indices]
    nonzero_indices_by_row = np.array(np.split(M.indices, M.indptr)[1:-1], dtype='object')
    single_movie_input = np.array(random_movie_indices).reshape(batch_size, 1)
    multiple_movie_input = nonzero_indices_by_row[random_user_indices]
    ratings_to_predict = np.array([M.asfptype()[random_user_indices, random_movie_indices]]).reshape(batch_size, 1)

    def pad_array(array, max_len, padding_value=0.0):
        padded = np.pad(array, (0, max_len - len(array)), mode='constant', constant_values=padding_value)
        return padded

    def pad_multiple_arrays(multiple_arrays, padding_value=0.0):
        max_len = max([len(x) for x in multiple_arrays])
        padded = [pad_array(x, max_len, padding_value) for x in multiple_arrays]
        return np.array(padded)

    multiple_movie_input = pad_multiple_arrays(multiple_movie_input, padding_value=0)
    num_rows = len(multiple_movie_input)
    num_columns = len(multiple_movie_input[0])
    repeated_random_user_indices = np.array([[random_user_indices[i]] * num_columns for i in range(num_rows)])
    demeaned_rating_matrix = DM[repeated_random_user_indices, multiple_movie_input].toarray()
    single_movie_input = tf.convert_to_tensor(single_movie_input, dtype=tf.float32)
    multiple_movie_input = tf.convert_to_tensor(multiple_movie_input, dtype=tf.float32)
    demeaned_rating_matrix = tf.convert_to_tensor(demeaned_rating_matrix, dtype=tf.float32)
    movie_embeddings = movie_embedding_op(multiple_movie_input)
    weights = tf.expand_dims(demeaned_rating_matrix, axis=-1)
    weighted_average_input = tf.reduce_sum(tf.multiply(movie_embeddings, weights), axis=1)
    normalized_weighted_average_input = tf.nn.l2_normalize(weighted_average_input)
    X = [single_movie_input, normalized_weighted_average_input]
    Y = ratings_to_predict.astype('float32')
    Y = tf.convert_to_tensor(Y, dtype=tf.float32)
    if mode not in ("prediction",): return X, Y
    else: return X, Y, ratings_to_predict, random_user_names, random_movie_names


class TrainingDataGenerator(Sequence):
    def __init__(self, batch_size=100):
        self.num_samples = d["train_matrix"].shape[1]
        self.batch_size = batch_size
        
    def __len__(self):
        return int(self.num_samples / self.batch_size)
    
    def __getitem__(self, idx):
        return extract_batch("training", self.batch_size)


class EvaluationDataGenerator(Sequence):
    def __init__(self, num_samples, batch_size=100):
        self.num_samples = num_samples
        self.batch_size = batch_size
        
    def __len__(self):
        return int(self.num_samples / self.batch_size)
    
    def __getitem__(self, idx):
        return extract_batch("evaluation", self.batch_size)


extract_batch("training")

2023-02-04 09:59:41.804880: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-02-04 09:59:41.804900: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-02-04 09:59:41.804915: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (giuven-OMEN-Laptop-15-ek0xxx): /proc/driver/nvidia/version does not exist
2023-02-04 09:59:41.805312: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


NameError: name 'movie_embedding_op' is not defined