Basic workflow for the CIL Project 1
- Modules built on pytorch_lightning for easier interface and logging capabilities
- Important functionality logged to Wandb (See [this](https://wandb.ai/site/articles/pytorch-lightning-with-weights-biases) article for more information)
- NeuMF mostly implemented except for the $\alpha$ weighting in the paper
- Probably overfits because the models map everything to the same entry ->
    - todo: add weight initialization
    - todo: check gradients (maybe batch norm or LeakyRELU or parametric RELU would work better)
- Todo: Check the validation in the sample code and find an effective way to incorporate it into pytorch lightning
- GPU: Comment out the `gpus=` argument in the trainer definition if no GPU is available

In [None]:
! pip install pytorch_lightning;

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import math



import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset

import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
import wandb

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Weights & Biases API key from Kaggle secrets.
# Add your WANDB_API_KEY to Kaggle secrets first by going to Add-ons -> Secrets.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
WANDB_API_KEY = user_secrets.get_secret("WANDB_API_KEY") 
# On a local computer, add the following line to your bashrc or to the top of
# your notebook instead, then read the key from the environment:
# export WANDB_API_KEY="WANDB_API_KEY"
# WANDB_API_KEY = os.environ['WANDB_API_KEY']

In [None]:
# this uses API key added to kaggle secrets
! wandb login $WANDB_API_KEY

In [None]:
# Prefer CUDA when PyTorch reports it as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print('Using device:', device)

In [None]:
# Dimensions (users x movies) used for the dense ratings matrix.
number_of_users = 10000
number_of_movies = 1000

# Load the labelled ratings and the sample submission file from the Kaggle input folder.
data_pd = pd.read_csv('/kaggle/input/cil-collaborative-filtering-2022/data_train.csv')
test_pd = pd.read_csv('/kaggle/input/cil-collaborative-filtering-2022/sampleSubmission.csv')

# Preview both frames, then print their shapes and the ratio of their row counts.
for frame in (data_pd, test_pd):
    print(frame.head(5))
print()
print('Shape', data_pd.shape, test_pd.shape, data_pd.shape[0]/test_pd.shape[0])

In [None]:
from sklearn.model_selection import train_test_split
# Split the labelled dataset into a training part and a held-out part.

# Fraction of the labelled rows that goes into the training split.
train_size = 0.9

# NOTE: this rebinds `test_pd`; from here on it is the held-out part of
# `data_pd`, no longer the sample submission frame loaded earlier.
# The fixed random_state makes the split reproducible.
train_pd, test_pd = train_test_split(data_pd, train_size=train_size, random_state=42)

In [None]:
def extract_users_items_predictions(data_pd):
    """Split a ratings frame into zero-based user/movie indices and ratings.

    Args:
        data_pd: DataFrame with an ``Id`` column of the form ``r<user>_c<movie>``
            (one-based numbers) and a ``Prediction`` column.

    Returns:
        Tuple ``(users, movies, predictions)``: two 1-D integer arrays holding
        the ids from ``Id`` minus one, and the values of ``Prediction``.
    """
    # Raw string: '\d' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    indices = data_pd.Id.str.extract(r'r(\d+)_c(\d+)').values.astype(int) - 1
    # Column slicing keeps the result 1-D even for a single-row frame,
    # where np.squeeze would collapse it to a 0-d array.
    users, movies = indices[:, 0], indices[:, 1]
    predictions = data_pd.Prediction.values
    return users, movies, predictions

train_users, train_movies, train_predictions = extract_users_items_predictions(train_pd)

# Also create the full matrix of observed values; unobserved entries are
# filled with the mean training rating.
data = np.full((number_of_users, number_of_movies), np.mean(train_pd.Prediction.values))
mask = np.zeros((number_of_users, number_of_movies)) # 0 -> unobserved value, 1->observed value

# The extracted indices are already zero-based (the extractor subtracts 1),
# so they index the matrices directly. Subtracting 1 a second time would shift
# every rating by one row/column and wrap index 0 around to the last entry.
data[train_users, train_movies] = train_predictions
mask[train_users, train_movies] = 1

In [None]:
# Show the shape of the dense ratings matrix as this cell's output.
data.shape

In [None]:
from sklearn.metrics import mean_squared_error


def rmse(x, y):
    """Return the square root of sklearn's mean squared error between x and y."""
    return math.sqrt(mean_squared_error(x, y))


# Indices and ratings of the held-out split, used for scoring below.
test_users, test_movies, test_predictions = extract_users_items_predictions(test_pd)

# Score predictions against the true ratings (the held-out ratings by default).
def get_score(predictions, target_values=test_predictions):
    """Return the RMSE between ``predictions`` and ``target_values``."""
    score = rmse(predictions, target_values)
    return score

def extract_prediction_from_full_matrix(reconstructed_matrix, users=test_users, movies=test_movies):
    """Read the entries of a full m x n matrix for the given user/movie pairs.

    Args:
        reconstructed_matrix: matrix indexable as ``matrix[user][movie]``.
        users: user indices to look up (defaults to the held-out users).
        movies: movie indices to look up (defaults to the held-out movies);
            must have the same length as ``users``.

    Returns:
        1-D float array with one entry per (user, movie) pair, in input order.
    """
    assert(len(users) == len(movies)), "users-movies combinations specified should have equal length"
    # Size the output by the pairs actually requested. Using the global
    # test set length here breaks (or zero-pads) calls with custom pairs.
    predictions = np.zeros(len(users))

    for i, (user, movie) in enumerate(zip(users, movies)):
        predictions[i] = reconstructed_matrix[user][movie]

    return predictions

## Neural CF
Define models

In [None]:
class BaseNCF(pl.LightningModule):
    """Base class for the NCF models in this notebook.

    Defines the shared training, validation and prediction steps and the Adam
    optimizer setup. Subclasses must implement ``forward``.

    Args:
        number_of_users: number of rows of the user embedding table.
        number_of_movies: number of rows of the movie embedding table.
        embedding_size: width of each embedding vector.
        lr: learning rate passed to the optimizer.
        b1: first Adam beta coefficient.
        b2: second Adam beta coefficient.
        momentum: stored as a hyperparameter; not used by this class.
        loss_func: callable ``(predictions, target) -> loss``. ``None``
            (default) selects ``torch.nn.functional.mse_loss``. The default is
            resolved inside ``__init__`` because a default that names a
            function defined in a later notebook cell raises ``NameError``
            when this class is defined.
        sublayer_name: prefix used for the logged metric names
            (``<sublayer_name>/train_loss`` and ``<sublayer_name>/valid_loss``).
    """
    def __init__(self, number_of_users, number_of_movies, embedding_size,
                 lr: float = 0.001,
                 b1: float = 0.5,
                 b2: float = 0.999,
                 momentum: float = 0.9,
                 loss_func=None,
                 sublayer_name=''):
        super().__init__()
        # Must be called directly from __init__ so Lightning can collect the
        # constructor arguments into self.hparams.
        self.save_hyperparameters()
        self.embedding_layer_users = nn.Embedding(number_of_users, embedding_size)
        self.embedding_layer_movies = nn.Embedding(number_of_movies, embedding_size)
        self.loss_func = F.mse_loss if loss_func is None else loss_func

        # NOTE(review): subclasses re-create or ignore these layers; they are
        # kept so that the module's parameter names stay unchanged.
        self.feed_forward = nn.Sequential(
            nn.Linear(in_features=2 * embedding_size, out_features=64),
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=16),
            nn.ReLU(),
            nn.Linear(in_features=16, out_features=1), # maybe predict per category?
            nn.ReLU()
        )

    def forward(self, users, movies):
        # Raising a plain string is itself a TypeError; raise a real exception.
        raise NotImplementedError('Must be implemented by subclasses')

    def _shared_step(self, batch, stage):
        """Compute and log the loss of one batch under ``<sublayer_name>/<stage>_loss``."""
        users, movies, ratings = batch
        predictions = self(users, movies)
        # Match dtype so integer ratings work with every loss function.
        ratings = ratings.to(predictions.dtype)
        # Some models return (batch, 1) while ratings are (batch,). Without
        # this reshape the subtraction inside the loss broadcasts to a
        # (batch, batch) matrix and the loss compares every prediction with
        # every rating.
        predictions = predictions.reshape(ratings.shape)
        loss = self.loss_func(predictions, ratings)
        self.log(f'{self.hparams.sublayer_name}/{stage}_loss', loss)
        return loss

    def training_step(self, batch, batch_idx):
        return self._shared_step(batch, 'train')

    def validation_step(self, batch, batch_idx):
        return self._shared_step(batch, 'valid')

    def configure_optimizers(self):
        """Return Adam with weight decay plus a cosine warm-restart LR schedule."""
        lr = self.hparams.lr
        b1 = self.hparams.b1
        b2 = self.hparams.b2
        weight_decay = 5*1e-4
        opt = torch.optim.Adam(self.parameters(), lr=lr, betas=(b1, b2), weight_decay= weight_decay)
        # Alternatives tried earlier: CosineAnnealingLR, ReduceLROnPlateau, StepLR.
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(opt,T_0=1, T_mult = 2,
                                                             eta_min=1e-7)
        return {"optimizer": opt,
                "lr_scheduler": {
                    "scheduler": scheduler,
                    # Use the metric name that validation_step actually logs.
                    "monitor": f"{self.hparams.sublayer_name}/valid_loss"
                },
               }

    def predict_step(self, batch, batch_idx):
        users, movies = batch
        return self(users, movies)

    def name(self):
#         todo: name model for saving purposes
        return f"BaseNCF_{'Adam'}"

class Vanilla_NCF(BaseNCF):
    """NCF variant that feeds concatenated user/movie embeddings through an MLP.

    Args:
        number_of_users: number of rows of the user embedding table.
        number_of_movies: number of rows of the movie embedding table.
        embedding_size: width of each embedding vector.
        sublayer_name: prefix for the logged metric names.
        *args, **kwargs: forwarded to ``BaseNCF`` (e.g. ``lr``, ``loss_func``).
    """
    def __init__(self, number_of_users, number_of_movies, embedding_size,
                 sublayer_name='vanilla_ncf',
                 *args, **kwargs):
        # The base constructor requires the three size arguments; calling it
        # with only *args/**kwargs raises TypeError on construction.
        super().__init__(number_of_users, number_of_movies, embedding_size,
                         *args, sublayer_name=sublayer_name, **kwargs)
        self.embedding_layer_users = nn.Embedding(number_of_users, embedding_size)
        self.embedding_layer_movies = nn.Embedding(number_of_movies, embedding_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(in_features=2 * embedding_size, out_features=64),
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=16),
            nn.ReLU(),
            nn.Linear(in_features=16, out_features=1), # maybe predict per category?
            nn.ReLU()
        )

    def forward(self, users, movies):
        """Return one prediction per (user, movie) pair, shape ``(batch,)``."""
        users_embedding = self.embedding_layer_users(users)
        movies_embedding = self.embedding_layer_movies(movies)
        concat = torch.cat([users_embedding, movies_embedding], dim=1)
        # Drop only the trailing feature dimension; a bare squeeze would also
        # drop the batch dimension when the batch holds a single pair.
        return self.feed_forward(concat).squeeze(-1)

    def name(self):
        return f"Vanilla_NCF_{'Adam'}"
    

In [None]:
class GMF(BaseNCF):
    """Generalized matrix factorization: a linear layer over the element-wise
    product of the user and movie embeddings.

    Args:
        number_of_users: number of rows of the user embedding table.
        number_of_movies: number of rows of the movie embedding table.
        embedding_size: width of each embedding vector.
        sublayer_name: prefix for the logged metric names.
        *args, **kwargs: forwarded to ``BaseNCF`` (e.g. ``lr``, ``loss_func``).
    """
    def __init__(self, number_of_users, number_of_movies, embedding_size,
                 sublayer_name='gmf',
                 *args, **kwargs):
        # Forward the extra arguments; dropping them silently ignored a
        # caller-supplied loss_func.
        super(GMF, self).__init__(number_of_users, number_of_movies, embedding_size,
                                  *args, sublayer_name=sublayer_name, **kwargs)
        self.embedding_layer_users = nn.Embedding(number_of_users, embedding_size)
        self.embedding_layer_movies = nn.Embedding(number_of_movies, embedding_size)
        self.output_layer = nn.Linear(in_features=embedding_size, out_features=1)

    def forward(self, users, movies):
        """Return predictions with a trailing feature dimension of size 1."""
        users_embedding = self.embedding_layer_users(users)
        movies_embedding = self.embedding_layer_movies(movies)
        # The trailing dimension is kept on purpose: NeuMF concatenates this
        # output with the MLP output along dim=1.
        return self.output_layer(users_embedding * movies_embedding)

In [None]:
class MLP(BaseNCF):
    """MLP branch: concatenated user/movie embeddings passed through a
    four-layer ReLU network and a final linear output layer.

    Args:
        number_of_users: number of rows of the user embedding table.
        number_of_movies: number of rows of the movie embedding table.
        embedding_size: width of each embedding vector.
        layer_sizes: currently unused; the layer widths below are hard-coded.
        sublayer_name: prefix for the logged metric names.
        *args, **kwargs: forwarded to ``BaseNCF`` (e.g. ``lr``, ``loss_func``).
    """
    def __init__(self, number_of_users, number_of_movies, embedding_size,
                 layer_sizes=[64, 32, 16, 8], # may be used to define feed_forward layer in a more sophisticated way
                 sublayer_name='mlp',
                 *args, **kwargs):
        # Forward the extra arguments; dropping them silently ignored a
        # caller-supplied loss_func.
        super(MLP, self).__init__(number_of_users, number_of_movies, embedding_size,
                                  *args, sublayer_name=sublayer_name, **kwargs)
        self.embedding_layer_users = nn.Embedding(number_of_users, embedding_size)
        self.embedding_layer_movies = nn.Embedding(number_of_movies, embedding_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(in_features=2 * embedding_size, out_features=64),
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=32),
            nn.ReLU(),
            nn.Linear(in_features=32, out_features=16), # maybe predict per category?
            nn.ReLU(),
            nn.Linear(in_features=16, out_features=8),
            nn.ReLU()
        )
        self.output_layer = nn.Linear(8, 1)

    def forward(self, users, movies):
        """Return predictions with a trailing feature dimension of size 1."""
        users_embedding = self.embedding_layer_users(users)
        movies_embedding = self.embedding_layer_movies(movies)
        concat = torch.cat([users_embedding, movies_embedding], dim=1)
        # No squeeze here: the hidden activations are (batch, 8), and squeezing
        # would drop the batch dimension for a single-pair batch, which breaks
        # the dim=1 concatenation in NeuMF.
        hidden_layers = self.feed_forward(concat)
        return self.output_layer(hidden_layers)

    # The paper uses VanillaSGD at this layer as they start the GMF and MLP layers pretrained
    # NOTE(review): that description reads like the combined NeuMF model rather
    # than this MLP branch; confirm which class should carry this override.
    def configure_optimizers(self):
        lr = self.hparams.lr
        opt = torch.optim.SGD(self.parameters(), lr=lr)
        return {"optimizer": opt}
    

In [None]:
class NeuMF(BaseNCF):
    """Combines pretrained GMF and MLP models through a final linear layer.

    Args:
        number_of_users: number of rows of the user embedding table.
        number_of_movies: number of rows of the movie embedding table.
        embedding_size: width of each embedding vector.
        lr: learning rate passed to the optimizer.
        alpha: trade-off parameter between the two models (not used yet).
        gmf_pretrained_ckpt: path of the GMF checkpoint to load.
        mlp_pretrained_ckpt: path of the MLP checkpoint to load.
        sublayer_name: prefix for the logged metric names.
        *args, **kwargs: forwarded to ``BaseNCF`` after ``lr``
            (e.g. ``b1``, ``b2``, ``loss_func``).
    """
    def __init__(self, number_of_users, number_of_movies, embedding_size,
                 lr: float = 0.001,
                 alpha = 0.5, # trade-off parameter between two models
                 gmf_pretrained_ckpt = '',
                 mlp_pretrained_ckpt = '',
                 sublayer_name='neumf',
                 *args, **kwargs):
        # Forward lr and the extra arguments; dropping them silently ignored a
        # caller-supplied loss_func.
        super().__init__(number_of_users, number_of_movies, embedding_size, lr,
                         *args, sublayer_name=sublayer_name, **kwargs)

        # NOTE(review): `.mode` is a plain attribute assignment; it does not
        # switch the sub-modules between train and eval mode.
        self.gmf = GMF.load_from_checkpoint(gmf_pretrained_ckpt); self.gmf.mode = 'train'
        self.mlp = MLP.load_from_checkpoint(mlp_pretrained_ckpt); self.mlp.mode = 'train'
        self.output_layer = nn.Linear(in_features=2, out_features=1)

    def forward(self, users, movies):
        """Return predictions with a trailing feature dimension of size 1."""
        # Force one column per sub-model so the concatenation along dim=1 also
        # works when a sub-model returns a flattened or single-pair output.
        gmf = self.gmf(users, movies).reshape(-1, 1)
        mlp = self.mlp(users, movies).reshape(-1, 1)
        concat = torch.cat([gmf, mlp], dim=1)
#         todo: implement \alpha later
        return self.output_layer(concat)

    def name(self):
        # NOTE(review): the name says VanillaSGD, but this class inherits the
        # Adam setup from BaseNCF.configure_optimizers.
        return f"Full_NCF_{'VanillaSGD'}"
    
    
    

# Pretraining & Setup
Pretrain GMF and MLP layers

NOTE: The sample code reconstructs the full matrix and computes a different metric at the end of each epoch. We do not do that currently; we only compute the validation loss per batch.

In [None]:
# Parameters
batch_size = 1024  # batch size of the training and submission DataLoaders
valid_batch_size = 1024  # batch size of the validation DataLoader
num_epochs = 25  # passed to every Trainer as max_epochs
show_validation_score_every_epochs = 1  # NOTE(review): not referenced in the visible cells
embedding_size = 16  # embedding width passed to every model constructor
learning_rate = 1e-3  # NOTE(review): not passed to any model in the visible cells; the models use their own lr default
pretrain = False  # when False, the GMF and MLP trainer.fit calls are skipped

In [None]:
def mse_loss(predictions, target):
    """Return the mean squared error between ``predictions`` and ``target``.

    Args:
        predictions: tensor of model outputs.
        target: tensor of true values.

    Returns:
        Scalar tensor holding the mean of the squared differences.
    """
    # A (batch, 1) prediction subtracted from a (batch,) target broadcasts to a
    # (batch, batch) matrix, which compares every prediction with every target.
    # When both hold the same number of elements, align the shapes first.
    if predictions.shape != target.shape and predictions.numel() == target.numel():
        predictions = predictions.reshape(target.shape)
    return torch.mean((predictions - target) ** 2)

# Build Dataloaders
train_users_torch = torch.tensor(train_users, device=device)
train_movies_torch = torch.tensor(train_movies, device=device)
# Store ratings as float32 so they match the dtype of the model outputs.
train_predictions_torch = torch.tensor(train_predictions, dtype=torch.float32, device=device)

# Reshuffle the training pairs every epoch; the validation loader keeps its order.
train_dataloader = DataLoader(
    TensorDataset(train_users_torch, train_movies_torch, train_predictions_torch),
    batch_size=batch_size,
    shuffle=True)

test_users_torch = torch.tensor(test_users, device=device)
test_movies_torch = torch.tensor(test_movies, device=device)
test_predictions_torch = torch.tensor(test_predictions, dtype=torch.float32, device=device)

test_dataloader = DataLoader(
    TensorDataset(test_users_torch, test_movies_torch, test_predictions_torch),
    batch_size=valid_batch_size)

wandb_logger = WandbLogger(project='cil-project',
                           log_model='all',
                           )
lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='step')
# saves a file like: my/path/sample-mnist-epoch=02-val_loss=0.32.ckpt
# NOTE(review): the models log "<sublayer_name>/valid_loss", so this monitor
# name matches no logged metric; the later cells define per-model callbacks.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor="valid_loss",
    save_top_k=2,
    mode="min",
)
# Request a GPU only when one is present, so the cell also runs on CPU-only
# machines without manual edits.
gpu_available = torch.cuda.is_available()
trainer = pl.Trainer(callbacks=[checkpoint_callback, lr_monitor],
                     gpus=[0] if gpu_available else None,
                     auto_select_gpus=gpu_available,
                     max_epochs=num_epochs,
                     auto_lr_find=True,
                     logger=wandb_logger,
                     log_every_n_steps=5,
                    )

## VanillaNCF
Uncomment to train VanillaNCF as presented in the sample notebook

In [None]:
# as in the sample code
# ncf = Vanilla_NCF(number_of_users, number_of_movies, embedding_size,
#          loss_func=mse_loss).to(device)
# # uncomment to log gradients
# # wandb_logger.watch(ncf)
# wandb_logger = WandbLogger(project='cil-project',
#                            log_model='all', 
#                            name=f'vanilla_ncf_{time.time():.0f}'
#                            )
# checkpoint_callback = pl.callbacks.ModelCheckpoint(
#     monitor="vanilla_ncf/valid_loss",
#     save_top_k=2,
#     mode="min",
# )
# trainer = pl.Trainer(callbacks=[checkpoint_callback, lr_monitor], 
# #                      gpus=list(range(torch.cuda.device_count())), 
#                      gpus=[0], auto_select_gpus=True,
#                      max_epochs=num_epochs, 
#                      auto_lr_find=True,
#                      logger=wandb_logger,
#                      log_every_n_steps=5,
#                     )

In [None]:
import time

## Define & Pretrain GMF

In [None]:
# Build the GMF model together with its own logger, checkpoint callback and trainer.
gmf = GMF(number_of_users, number_of_movies, embedding_size,
         loss_func=mse_loss).to(device)

# The timestamp suffix gives each run a distinct name.
wandb_logger = WandbLogger(project='cil-project',
                           log_model='all',
                           name=f'gmf_{time.time():.0f}'
                           )
# Keep the two checkpoints with the lowest GMF validation loss.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor="gmf/valid_loss",
    save_top_k=2,
    mode="min",
)
# Request a GPU only when one is present, so the cell also runs on CPU-only
# machines without manual edits.
gpu_available = torch.cuda.is_available()
trainer = pl.Trainer(callbacks=[checkpoint_callback, lr_monitor],
                     gpus=[0] if gpu_available else None,
                     auto_select_gpus=gpu_available,
                     max_epochs=num_epochs,
                     auto_lr_find=True,
                     logger=wandb_logger,
                     log_every_n_steps=5,
                    )

In [None]:
# Fit the GMF model only when pretraining is switched on.
if not pretrain:
    print('GMF pretraining skipped')
else:
    print('Starting pretraining GMF module')
    trainer.fit(gmf, train_dataloaders=train_dataloader, val_dataloaders=test_dataloader)

## Define & Pretrain MLP

In [None]:
# Build the MLP model together with its own logger, checkpoint callback and trainer.
mlp = MLP(number_of_users, number_of_movies, embedding_size,
         loss_func=mse_loss).to(device)

# Close any W&B run that is still active before a new logger is created.
wandb.finish()
wandb_logger = WandbLogger(project='cil-project',
                           log_model='all',
                           name=f'mlp_{time.time():.0f}'
                           )
# Keep the two checkpoints with the lowest MLP validation loss.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor="mlp/valid_loss",
    save_top_k=2,
    mode="min",
)

# Request a GPU only when one is present, so the cell also runs on CPU-only
# machines without manual edits.
gpu_available = torch.cuda.is_available()
trainer = pl.Trainer(callbacks=[checkpoint_callback, lr_monitor],
                     gpus=[0] if gpu_available else None,
                     auto_select_gpus=gpu_available,
                     max_epochs=num_epochs,
                     auto_lr_find=True,
                     logger=wandb_logger,
                     log_every_n_steps=5,
                    )

In [None]:
# Fit the MLP model only when pretraining is switched on.
if not pretrain:
    print('MLP pretraining passed.')
else:
    print('Starting MLP pretraining.')
    trainer.fit(mlp, train_dataloaders=train_dataloader, val_dataloaders=test_dataloader)

In [None]:
# Uncomment when running for the first time in a scheduler
# break

# NeuMF Layer
Download pretrained weights and initialize NeuMF layer and train on that

In [None]:
# Close any W&B run that is still active, then start a new run in the project.
wandb.finish()
run = wandb.init(project='cil-project')
# Declare the pretrained GMF and MLP model artifacts (pinned to specific
# versions) as inputs of this run.
gmf_artifact = run.use_artifact('gsaltintas/cil-project/model-19m8lcmr:v8', type='model')
mlp_artifact = run.use_artifact('gsaltintas/cil-project/model-3t58trbi:v24', type='model')
# Download both artifacts; the returned values are used below as the
# directories that contain 'model.ckpt'.
gmf_artifact_dir = gmf_artifact.download()
mlp_artifact_dir = mlp_artifact.download()


In [None]:
# Inspect the downloaded GMF artifact. Only the last expression (the absolute
# checkpoint path) is shown as the cell output; the listing result is discarded.
os.listdir(gmf_artifact_dir)
os.path.abspath(os.path.join(gmf_artifact_dir, 'model.ckpt'))

In [None]:

# Build NeuMF on top of the downloaded GMF and MLP checkpoints.
model = NeuMF(number_of_users, number_of_movies, embedding_size,
              gmf_pretrained_ckpt=os.path.abspath(os.path.join(gmf_artifact_dir, 'model.ckpt')),
              mlp_pretrained_ckpt=os.path.abspath(os.path.join(mlp_artifact_dir, 'model.ckpt')),
             loss_func=mse_loss).to(device)

# `run` is a W&B run object, not a run name, so it is handed to the logger as
# the experiment to reuse instead of being passed as `name`.
wandb_logger = WandbLogger(project='cil-project',
                           log_model='all',
                           experiment=run
                           )
# Keep the two checkpoints with the lowest NeuMF validation loss.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor="neumf/valid_loss",
    save_top_k=2,
    mode="min",
)
# Request a GPU only when one is present, so the cell also runs on CPU-only
# machines without manual edits.
gpu_available = torch.cuda.is_available()
trainer = pl.Trainer(callbacks=[checkpoint_callback, lr_monitor],
                     gpus=[0] if gpu_available else None,
                     auto_select_gpus=gpu_available,
                     max_epochs=num_epochs,
                     auto_lr_find=True,
                     logger=wandb_logger,
                     log_every_n_steps=5,
                    )

In [None]:
# Train NeuMF on the training loader; the held-out loader is used for validation.
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=test_dataloader)

# Submit
## Submission Predictions

In [None]:
# Load the sample submission; its Id column lists the (user, movie) pairs to predict.
submission_pd = pd.read_csv('/kaggle/input/cil-collaborative-filtering-2022/sampleSubmission.csv')
# The placeholder ratings read from the file are not used for prediction.
submission_users, submission_movies, submission_ratings = extract_users_items_predictions(submission_pd)
submission_users_torch = torch.tensor(submission_users, device=device)
submission_movies_torch = torch.tensor(submission_movies, device=device)
# create submission dataloader (no shuffling, so batches follow the row order of submission_pd)
submit_loader = DataLoader(TensorDataset(submission_users_torch, submission_movies_torch),
                          batch_size=batch_size)

In [None]:
# Predict the submission entries with the trainer; the result is iterated
# batch by batch in the next cell.
submission_predictions = trainer.predict(model, submit_loader)

# Submit
Transfer predictions for each batch to the data frame (can use model(batch) then trainer.predict for memory efficiency) and then write it to csv.

In [None]:
# Copy the predicted batches, in order, into the second column of submission_pd.
# A running offset is used instead of `batch index * batch_size`, so the rows
# stay aligned even if a batch is not exactly `batch_size` long.
start_ind = 0
for pred_batch in submission_predictions:
    # Flatten so that (batch, 1)-shaped model outputs fit a single column.
    batch_values = pred_batch.detach().cpu().numpy().reshape(-1)
    end_ind = start_ind + batch_values.shape[0]
    submission_pd.iloc[start_ind:end_ind, 1] = batch_values
    start_ind = end_ind

In [None]:
# Show the first rows and the summary statistics of the filled-in submission.
# head must be called; the bare attribute only displays the bound method.
submission_pd.head(), submission_pd.describe()

In [None]:
# Write the submission without the DataFrame row index; by default to_csv adds
# the index as an extra unnamed first column, which the sample submission
# (Id, Prediction) does not have.
submission_pd.to_csv('submission.csv', encoding='utf-8', index=False)
# just in case upload submission to wandb
wandb.save('submission.csv')