### Links to the datasets
- MovieLens20M: https://www.kaggle.com/ntnhan54/movielens20m
- MillionSong: https://www.kaggle.com/danh99/millionsong

In [None]:

from torch import nn, optim
from torch.nn import functional as F
import torch
from torch import nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import numpy as np
import  pandas as pd
import os
from scipy import sparse
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook as tqdm

import seaborn as sn
sn.set()


import sys
import warnings; 
warnings.simplefilter('ignore')

In [None]:
# Use the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
# Directory that holds train.csv / val.csv / test.csv.
DATA_DIR = '../input/movielens20m'
# Checkpoint written whenever the validation metric improves (the "best" model).
path = 'movielens-VAE.pt'
# Checkpoint written once after the training loop, whatever its score.
train_path = 'movielens-VAE-training.pt'
# CSV file that receives one row of metrics per epoch.
metrics_path = 'history_MovieLens_VAE.csv'

In [None]:
# Load the three interaction tables (train / validation / test) from DATA_DIR.
train_data, val_data, test_data = (
    pd.read_csv(f'{DATA_DIR}/{split}.csv') for split in ('train', 'val', 'test')
)


In [None]:
# Number of distinct item ids in the training table; used below as the item
# dimension of every interaction matrix and of the model.
nItems = train_data['sid'].nunique()
nItems

In [None]:
def _to_interaction_matrix(frame):
    """Build a (users x items) float64 CSR matrix with a 1 for each (uid, sid) row.

    The row count is the number of distinct ``uid`` values in ``frame`` and the
    column count is the global ``nItems``.
    """
    rows, cols = frame.uid.values, frame.sid.values
    return sparse.csr_matrix(
        (np.ones_like(frame.uid), (rows, cols)),
        dtype='float64',
        shape=(frame.uid.nunique(), nItems),
    )


# Replace each DataFrame by its sparse interaction matrix.
train_data = _to_interaction_matrix(train_data)
val_data = _to_interaction_matrix(val_data)
test_data = _to_interaction_matrix(test_data)

In [None]:
class netflixDataset(torch.utils.data.Dataset):
    """Row-wise view over a sparse user x item interaction matrix.

    In training mode (``eval=False``) an item is ``{'data': row}`` where ``row``
    is the dense row with shape ``(1, n_items)``.

    In evaluation mode (``eval=True``) a random fraction ``prop`` of the
    entries equal to 1 is held out on every access:
    ``'data'`` is the 1-D row with the held-out entries zeroed and
    ``'ground_truth'`` is a 1-D 0/1 vector marking exactly the held-out entries.
    Both tensors are float64.
    """

    def __init__(self, scr_matrix, eval = False,prop=0.2):
        self.scr_matrix = scr_matrix
        self.eval = eval
        self.prop = prop

    def __len__(self):
        # One sample per matrix row.
        return self.scr_matrix.shape[0]

    def __getitem__(self, idx):
        dense_row = self.scr_matrix[idx, :].toarray()

        if not self.eval:
            # Training: hand back the full row, keeping its leading axis of size 1.
            return {'data': torch.tensor(dense_row, dtype=torch.float64)}

        interactions = dense_row[0]
        n_held_out = int(interactions.sum() * self.prop)
        positives = np.where(interactions == 1)[0]

        # keep_mask is 1 for entries fed to the model and 0 for held-out ones.
        keep_mask = np.ones_like(interactions)
        held_out = np.random.choice(positives, size=n_held_out, replace=False)
        keep_mask[held_out] = 0

        return {
            'data': torch.tensor(interactions * keep_mask, dtype=torch.float64),
            'ground_truth': torch.tensor(np.logical_not(keep_mask), dtype=torch.float64),
        }

In [None]:
class VAE(nn.Module):
    """Variational autoencoder over item-interaction vectors.

    Args:
        n_Items: size of the input/output item vector.
        hidden: width of the single hidden layer in encoder and decoder.
        dimz: dimensionality of the latent code.
        p: stored on the instance only; no dropout layer is applied in this
            implementation.

    ``forward`` returns ``(logit, mu, logvar)`` where ``logit`` has one
    unnormalised score per item.
    """

    def __init__(self,n_Items, hidden=600, dimz= 200, p=0.5):
        super().__init__()

        self.n_Items = n_Items
        self.dimz = dimz
        self.hidden = hidden
        self.p = p
        # Default sampling mode, so reparameterize() is usable before the first
        # forward() call (previously this attribute only existed after forward()).
        self.Mode = 'train'

        # Encoder: outputs mu and logvar concatenated along the feature axis.
        # The layer positions inside the Sequential are kept unchanged so that
        # existing state dicts still load.
        self.inference = nn.Sequential(
            nn.Linear(self.n_Items,self.hidden),
            nn.Tanh(),
            nn.Linear(self.hidden,2*self.dimz)
        )
        # Decoder: maps a latent code back to one logit per item.
        self.generative = nn.Sequential(
            nn.Linear(self.dimz,self.hidden),
            nn.Tanh(),
            nn.Linear(self.hidden,self.n_Items),
        )

    def reparameterize(self, mu, logvar):
        """Sample z = mu + std * eps in 'train' mode; return mu otherwise."""
        if self.Mode != 'train':
            # Deterministic code. Returning mu directly (instead of adding
            # std * eps * 0) avoids NaN when std overflows to inf.
            return mu
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return mu + std*eps

    def forward(self, x,Mode='train'):
        """Encode ``x``, draw (or pick) a latent code, and decode it.

        Args:
            x: 2-D batch of interaction vectors; each row is L2-normalised
                before encoding.
            Mode: 'train' samples the latent code; any other value uses the
                posterior mean.

        Returns:
            Tuple ``(logit, mu, logvar)``.
        """
        self.Mode = Mode
        x = F.normalize(x, p=2, dim=1)
        distribution = self.inference(x)

        # First dimz columns are the mean, the remaining dimz the log-variance.
        mu, logvar = distribution[:, :self.dimz], distribution[:, self.dimz:]
        z = self.reparameterize(mu, logvar)
        logit = self.generative(z)

        return logit, mu, logvar

In [None]:
def loss_function(recon_x, x, mu, logvar,beta):
    """Return the beta-weighted VAE objective: multinomial CE + beta * KL.

    Args:
        recon_x: decoder logits, one row per sample.
        x: target interaction vectors with the same shape as ``recon_x``.
        mu: latent means.
        logvar: latent log-variances.
        beta: weight applied to the KL term.

    Returns:
        Scalar tensor ``CE + beta * KLD``, both terms averaged over the batch.
    """
    # Negative multinomial log-likelihood of the observed entries.
    CE = -torch.mean(torch.sum(F.log_softmax(recon_x, 1) * x, -1))
    # KL divergence between N(mu, exp(logvar)) and the standard normal.
    KLD = -0.5 * torch.mean(torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1))

    # The original returned the undefined name `LL` here (NameError).
    return CE + beta*KLD

In [None]:
def NDCG_at_k(labels, scores, k = 100):
    """Mean NDCG@k over the rows of a batch.

    Args:
        labels: 2-D 0/1 relevance tensor (rows = users, columns = items).
        scores: 2-D tensor of predicted scores with the same shape.
        k: cut-off rank; values larger than the number of items are clipped.

    Returns:
        Scalar tensor with the NDCG averaged over the rows that have at least
        one relevant item. Rows without any relevant item are skipped (their
        ideal DCG is 0, which would otherwise turn the whole mean into NaN);
        if no row qualifies the result is 0.
    """
    device = scores.device
    # Cannot rank more items than exist.
    top_k = min(k, scores.shape[1])

    ranked = torch.argsort(scores, 1, descending=True)[:, :top_k]
    pred_labels = torch.gather(labels, 1, ranked).to(device)

    # Positional discounts. Natural log is used; the base cancels in DCG / IDCG.
    discounts = (1. / torch.log(torch.arange(2, 2 + top_k).float())).to(device)
    dcg = (discounts * pred_labels).sum(dim=1)

    # ideal_at[n] is the DCG of a perfect ranking with n relevant items.
    ideal_at = torch.cat([discounts.new_zeros(1), discounts.cumsum(0)])
    n_relevant = labels.sum(1).long().clamp(max=top_k).to(device)
    idcg = ideal_at[n_relevant]

    has_relevant = idcg > 0
    if not bool(has_relevant.any()):
        return dcg.new_zeros(())
    return (dcg[has_relevant] / idcg[has_relevant]).mean()

In [None]:
def Recall_at_k(labels, scores, k = 20):
    """Mean Recall@k over the rows of a batch.

    Per row the recall is ``hits in top-k / min(k, number of relevant items)``.

    Args:
        labels: 2-D 0/1 relevance tensor (rows = users, columns = items).
        scores: 2-D tensor of predicted scores with the same shape.
        k: cut-off rank.

    Returns:
        Scalar tensor with the recall averaged over the rows that have at
        least one relevant item. Rows without any relevant item are skipped
        (0/0 would otherwise make the mean NaN); if no row qualifies the
        result is 0.
    """
    device = scores.device
    ranked = torch.argsort(scores, 1, descending=True)[:, :k]
    hits = torch.gather(labels, 1, ranked).to(device).sum(1)

    # Normalise by the best achievable number of hits within k positions.
    denominator = labels.sum(1).clamp(max=k).to(device)

    has_relevant = denominator > 0
    if not bool(has_relevant.any()):
        return hits.new_zeros(())
    return (hits[has_relevant] / denominator[has_relevant]).mean()

In [None]:
@torch.no_grad()
def evaluate(model, val_loader, ndcg_k = (100,), recall_k = (20, 50)):
    '''Evaluate model at Recall and NDCG metrics.

    Args:
        model: network called as ``model(X, Mode='eval')`` and returning a
            3-tuple whose first element holds the item scores.
        val_loader: iterable of batches with keys ``'data'`` (model input) and
            ``'ground_truth'`` (held-out relevance labels).
        ndcg_k: cut-offs for NDCG.
        recall_k: cut-offs for Recall.

    Returns:
        Dict mapping ``'ndcg@k'`` then ``'recall@k'`` (in that insertion
        order) to the metric averaged over all batches, each batch weighted by
        its number of rows so that a smaller final batch does not skew the
        result. Values are NaN if the loader yields no batch.
    '''
    names = [f'ndcg@{k}' for k in ndcg_k] + [f'recall@{k}' for k in recall_k]
    totals = {name: 0.0 for name in names}
    n_rows = 0

    for data in val_loader:
        X = data['data'].float().to(device).squeeze(1)
        ground_truth = data['ground_truth'].squeeze(1).to(device)

        pred, _, _ = model(X, Mode='eval')
        # Items already present in the input must never be recommended again.
        pred[X != 0] = -np.inf

        batch_rows = X.shape[0]
        n_rows += batch_rows
        for k in ndcg_k:
            totals[f'ndcg@{k}'] += NDCG_at_k(ground_truth, pred, k).item() * batch_rows
        for k in recall_k:
            totals[f'recall@{k}'] += Recall_at_k(ground_truth, pred, k).item() * batch_rows

    if n_rows == 0:
        return {name: float('nan') for name in names}
    return {name: totals[name] / n_rows for name in names}

## New model

In [None]:
# Declare Model
model = VAE(nItems).to(device)
n_Epochs = 200

# KL-Annealing for new training: the KL weight starts at 0 and grows by
# `anneal_steps` after every optimisation step until it reaches `anneal_cap`.
anneal = 0
anneal_cap = 1
anneal_steps = 1.0/200_000

# prepare Data
train_ds = netflixDataset(train_data)
# The training loop iterates over `train_dl`, which was never created before;
# shuffle so every epoch sees the users in a different order.
train_dl = DataLoader(train_ds, batch_size=500, shuffle=True)

# Validation uses the hold-out mode of the dataset (eval=True).
val_ds = netflixDataset(val_data,eval=True)
val_dl = DataLoader(val_ds, batch_size=500)


optimizer = optim.AdamW(model.parameters(), lr=1e-3,weight_decay=0.01)
# Best validation score seen so far; any real score beats -inf.
cur_metric = -np.inf

In [None]:
# Train for n_Epochs, validate after every epoch, keep the checkpoint with the
# best validation NDCG@100, and append one CSV row of metrics per epoch.
with open(metrics_path, 'a') as f:

    pbar = tqdm(range(n_Epochs),total = n_Epochs)
    for epoch in pbar:
        train_loss =  []
        # train phase
        model.train()
        train_phase = tqdm(train_dl,total = len(train_dl))
        for data in train_phase:
            x = data['data'].float().to(device)
            x = x.squeeze(1)
            optimizer.zero_grad()

            recon_x, mu, logvar = model(x)

            # Multinomial reconstruction term and KL term of the VAE objective.
            CE = -torch.mean(torch.sum(F.log_softmax(recon_x, 1) * x, -1))
            KLD = -0.5 * torch.mean(torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1))

            loss =  CE + anneal * KLD
            # Increase the KL weight only after it was used for this step.
            anneal = min(anneal+anneal_steps,anneal_cap)
            loss.backward()

            optimizer.step()
            train_loss.append(loss.item())
            train_phase.set_postfix({'loss': train_loss[-1]})

        # Eval phases
        model.eval()
        metrics = evaluate(model, val_dl)
        metrics['train_loss'] = np.mean(train_loss)

        # Select the monitored metric by name instead of by dict position, so
        # the choice does not depend on the key order produced by evaluate().
        ndcg = metrics['ndcg@100']
        if ndcg > cur_metric:
            cur_metric = ndcg
            torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': metrics['train_loss'],
                    'evaluate': metrics,
                    'beta': anneal
                    }, path)
        pbar.set_postfix(metrics)
        # write metrics to file
        s = ['{:.3f}'.format(v) for v in metrics.values()]
        f.write(','.join(s) + '\n')
        # Flush per epoch so the history survives an interrupted run.
        f.flush()

In [None]:
# Persist the model as it is after the last epoch, together with the optimizer
# state, the latest metrics and the current KL weight.
torch.save(
    dict(
        epoch=n_Epochs,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        loss=metrics['train_loss'],
        evaluate=metrics,
        beta=anneal,
    ),
    train_path,
)

In [None]:
# Read the per-epoch history back; the file has no header row, so the column
# names are taken from the keys of the latest metrics dict.
df = pd.read_csv(
    metrics_path,
    header=None,
    names=list(metrics),
    index_col=False,
)
df

In [None]:
# Training loss per logged epoch.
fig, ax = plt.subplots(figsize=(12, 3))
ax.plot(df['train_loss'], '-')
ax.set_ylabel("loss")
ax.set_xlabel("Epochs")
pass

In [None]:
# Validation NDCG@100 per logged epoch.
fig, ax = plt.subplots(figsize=(12, 3))
ax.plot(df['ndcg@100'], '-')
ax.set_ylabel("ndcg@100")
ax.set_xlabel("Epochs")
pass

In [None]:
# Validation Recall@20 per logged epoch (points marked with 'x').
fig, ax = plt.subplots(figsize=(12, 3))
ax.plot(df['recall@20'], 'x-')
ax.set_ylabel("recall@20")
ax.set_xlabel("Epochs")
pass

In [None]:
# Validation Recall@50 per logged epoch.
fig, ax = plt.subplots(figsize=(12, 3))
ax.plot(df['recall@50'], '-')
ax.set_ylabel("recall@50")
ax.set_xlabel("Epochs")
pass

In [None]:
# Score the model, in the state the training loop left it, on the test split.
model.eval()
test_ds = netflixDataset(test_data, eval=True)
test_dl = DataLoader(
    test_ds,
    batch_size=512,
    shuffle=False,
    pin_memory=True,
)
evaluate(model, test_dl)

In [None]:
# best model
model1 = VAE(nItems).to(device)
checkpoint = torch.load(path)

model1.load_state_dict(checkpoint['model_state_dict'])

In [None]:
# Score the restored best checkpoint on the test split.
model1.eval()
test_ds = netflixDataset(test_data, eval=True)
test_dl = DataLoader(test_ds, batch_size=500)
evaluate(model1, test_dl)