In [None]:
!pip install httplib2==0.15.0

from torch import nn, optim
from torch.nn import functional as F
import torch
from torch import nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import numpy as np
import  pandas as pd
import os
from scipy import sparse
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm_notebook as tqdm

import seaborn as sn
sn.set()

import sys
import warnings; 
warnings.simplefilter('ignore')

Collecting httplib2==0.15.0
[?25l  Downloading https://files.pythonhosted.org/packages/be/83/5e006e25403871ffbbf587c7aa4650158c947d46e89f2d50dcaf018464de/httplib2-0.15.0-py3-none-any.whl (94kB)
[K     |███▌                            | 10kB 20.4MB/s eta 0:00:01[K     |███████                         | 20kB 18.8MB/s eta 0:00:01[K     |██████████▍                     | 30kB 14.9MB/s eta 0:00:01[K     |█████████████▉                  | 40kB 14.2MB/s eta 0:00:01[K     |█████████████████▎              | 51kB 9.2MB/s eta 0:00:01[K     |████████████████████▊           | 61kB 9.3MB/s eta 0:00:01[K     |████████████████████████▏       | 71kB 10.5MB/s eta 0:00:01[K     |███████████████████████████▋    | 81kB 11.2MB/s eta 0:00:01[K     |███████████████████████████████ | 92kB 10.2MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 6.7MB/s 
[?25hInstalling collected packages: httplib2
  Found existing installation: httplib2 0.17.4
    Uninstalling httplib2-0.17.4:


In [None]:
# Google Drive file IDs for the three data splits (train / validation / test).
# The `ml_20m` prefix presumably refers to MovieLens-20M -- confirm with the data owner.
ml_20m_train, ml_20m_val, ml_20m_test = (
    '1-3RvqzVCj9dI7Np1qwg7EAlt3RY0eYtJ',
    '1-5Y4TomaQyhXjTcOJHpQEIDl2gbo2cC3',
    '1-8F6ychbApdNtVJQp20ecz1sXEVVDr4I',
)

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Authenticate the Colab user, then hand the resulting application-default
# credentials to PyDrive. The statements are order-dependent: `drive` is only
# usable once `gauth.credentials` has been populated.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the handle used below to download the CSVs and upload checkpoints.
drive = GoogleDrive(gauth)

In [None]:
# Fetch each split from Google Drive into the working directory.
# `downloaded` ends up bound to the last (test) file handle, as before.
for drive_id, local_name in (
    (ml_20m_train, 'train.csv'),
    (ml_20m_val, 'val.csv'),
    (ml_20m_test, 'test.csv'),
):
    downloaded = drive.CreateFile({'id': drive_id})
    downloaded.GetContentFile(local_name)



   

In [None]:
# Drive ID used as the parent of the checkpoint files uploaded by the training loop.
fid = '1ywXP4zdEU_r59HMIlEVhTlBGFijCHVK6'
# NOTE(review): `http` is not referenced by any other cell visible in this
# notebook -- confirm whether it is still needed.
http = drive.auth.Get_Http_Object()

In [None]:
# Load the three downloaded splits as DataFrames (same files, same order).
train_data, val_data, test_data = map(
    pd.read_csv, ('train.csv', 'val.csv', 'test.csv')
)

In [None]:
# Number of distinct item ids in the training split; used below as the column
# count of every interaction matrix and as the model's input/output width.
# NOTE(review): this presumably assumes `sid` values are contiguous 0..nItems-1
# and that val/test contain no item id outside that range -- confirm.
nItems = train_data.sid.nunique()

In [None]:
# Display the item count as a sanity check.
nItems

19209

In [None]:
def _to_interaction_matrix(frame, n_cols):
    """Turn a (uid, sid) interaction table into a user x item CSR matrix.

    Each row of `frame` contributes a 1 at position (uid, sid). The matrix has
    `frame.uid.nunique()` rows and `n_cols` columns, stored as float64.

    NOTE(review): the row count is the number of distinct uids, so this
    presumably relies on uids in each split being 0..n_users-1 -- confirm.
    """
    user_idx = frame.uid.values
    item_idx = frame.sid.values
    return sparse.csr_matrix(
        (np.ones_like(frame.uid), (user_idx, item_idx)),
        dtype='float64',
        shape=(frame.uid.nunique(), n_cols),
    )


# The DataFrames are replaced by their sparse counterparts under the same
# names, so this cell cannot be re-run without re-reading the CSVs first.
train_data = _to_interaction_matrix(train_data, nItems)
val_data = _to_interaction_matrix(val_data, nItems)
test_data = _to_interaction_matrix(test_data, nItems)

In [None]:
class netflixDataset(torch.utils.data.Dataset):
    """Row-wise view over a sparse user x item interaction matrix.

    Args:
        scr_matrix: 2-D sparse matrix supporting ``[idx, :]`` slicing and
            ``.toarray()``; one row per user.
        eval: when True, each item also carries a randomly held-out part of
            the user's interactions as ``'ground_truth'``.
        prop: fraction of a user's interactions held out in eval mode.
    """

    def __init__(self, scr_matrix, eval = False,prop=0.2):
        self.scr_matrix = scr_matrix
        self.eval = eval
        self.prop = prop

    def __getitem__(self, idx):
        """Return a dict of float64 tensors for user `idx`.

        Training mode: ``{'data': (1, n_items)}`` -- the dense row, 2-D.
        Eval mode: ``{'data': (n_items,), 'ground_truth': (n_items,)}`` where
        ``data`` is the row with the held-out entries zeroed and
        ``ground_truth`` is 1 exactly at the held-out positions.
        """
        dense_row = self.scr_matrix[idx, :].toarray()

        if not self.eval:
            return {'data': torch.tensor(dense_row, dtype=torch.float64)}

        interactions = dense_row[0]
        n_held_out = int(interactions.sum() * self.prop)
        positives = np.where(interactions == 1)[0]

        # Draws from NumPy's global RNG, so the split changes on every access
        # unless the caller seeds np.random.
        held_out = np.random.choice(positives, size=n_held_out, replace=False)

        keep_mask = np.ones_like(interactions)
        keep_mask[held_out] = 0

        return {
            'data': torch.tensor(interactions * keep_mask, dtype=torch.float64),
            'ground_truth': torch.tensor(np.logical_not(keep_mask), dtype=torch.float64),
        }

    def __len__(self):
        """Number of users (rows of the matrix)."""
        n_users = self.scr_matrix.shape[0]
        return n_users





In [None]:
class VAE(nn.Module):
    """Variational autoencoder over item-interaction vectors.

    The encoder (`inference`) maps an L2-normalised input of width `n_Items`
    to `2 * dimz` values, read as the mean and log-variance of a diagonal
    Gaussian. The decoder (`generative`) maps a latent code of width `dimz`
    back to `n_Items` unnormalised logits.

    Args:
        n_Items: width of the input and output vectors.
        hidden: width of the single hidden layer in encoder and decoder.
        dimz: dimensionality of the latent code.
        p: dropout probability applied to the (normalised) input.
    """

    def __init__(self,n_Items, hidden=600, dimz= 200, p=0.5):
        super(VAE, self).__init__()

        self.n_Items = n_Items
        self.dimz = dimz
        self.hidden = hidden
        self.p = p
        # Defined up front so reparameterize() is usable before forward().
        self.Mode = 'train'

        self.inference = nn.Sequential(
            nn.Dropout(self.p),
            nn.Linear(self.n_Items, self.hidden),
            nn.Tanh(),
            nn.Linear(self.hidden, 2 * self.dimz),
        )
        self.generative = nn.Sequential(
            nn.Linear(self.dimz, self.hidden),
            nn.ReLU(),
            nn.Linear(self.hidden, self.n_Items),
        )

    def reparameterize(self, mu, logvar):
        """Return the latent code for the current `self.Mode`.

        In ``'train'`` mode a sample ``mu + std * eps`` is drawn (the
        reparameterisation trick). In any other mode the posterior mean is
        returned so that evaluation is deterministic. The previous code had
        this condition inverted: noise was disabled while training and
        injected while evaluating.
        """
        if self.Mode != 'train':
            return mu
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + std * eps

    def forward(self, x,Mode='train'):
        """Encode `x`, pick a latent code, and decode it.

        Args:
            x: 2-D tensor ``(batch, n_Items)``.
            Mode: ``'train'`` to sample the latent code; any other value
                (e.g. ``'eval'``) uses the posterior mean.

        Returns:
            Tuple ``(logit, mu, logvar)`` with shapes ``(batch, n_Items)``,
            ``(batch, dimz)`` and ``(batch, dimz)``.
        """
        self.Mode = Mode
        x = F.normalize(x, p=2, dim=1)
        distribution = self.inference(x)

        # First half of the encoder output is the mean, second half the log-variance.
        mu, logvar = distribution[:, :self.dimz], distribution[:, self.dimz:]
        z = self.reparameterize(mu, logvar)
        logit = self.generative(z)

        return logit, mu, logvar


       



In [None]:
def loss_function(recon_x, x, mu, logvar,N_i,beta):
    """Negative ELBO with a multinomial likelihood and a beta-weighted KL term.

    Both terms are averaged over the batch so that `beta` has the same
    meaning for every batch size. The KL term used to be summed over the
    whole batch while the likelihood was averaged, which made the effective
    KL weight grow with the batch size and disagree with the formula used in
    the training loop.

    Args:
        recon_x: ``(batch, n_items)`` unnormalised logits.
        x: ``(batch, n_items)`` interaction targets.
        mu: ``(batch, dimz)`` posterior means.
        logvar: ``(batch, dimz)`` posterior log-variances.
        N_i: unused; kept so existing call sites keep working.
        beta: weight of the KL term.

    Returns:
        Scalar tensor ``LL + beta * KLD``.
    """
    # Multinomial negative log-likelihood, summed over items and averaged over the batch.
    LL = -torch.mean(torch.sum(F.log_softmax(recon_x, -1) * x, -1))

    # KL(q(z|x) || N(0, I)); see Appendix B of Kingma & Welling,
    # "Auto-Encoding Variational Bayes", https://arxiv.org/abs/1312.6114
    # -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2), per sample, then batch mean.
    KLD = -0.5 * torch.mean(torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1))

    return LL + beta*KLD

In [None]:

def NDCG_at_k(labels, scores, k = 100):
    """Mean NDCG@k over the users in a batch.

    Args:
        labels: ``(batch, n_items)`` binary relevance (1 = held-out positive).
        scores: ``(batch, n_items)`` predicted scores; higher ranks first.
        k: cut-off. Clamped to ``n_items`` so a small catalogue does not
            produce a shape mismatch.

    Returns:
        0-dim tensor on ``scores.device``. Users without any relevant item
        are skipped, since their ideal DCG is 0 and 0/0 would turn the whole
        mean into NaN. If no user has a relevant item the result is 0.

    The discount uses the natural logarithm; the log base cancels in DCG/IDCG.
    """
    device = scores.device
    labels = labels.to(device)
    if not labels.is_floating_point():
        labels = labels.float()

    k = min(k, scores.shape[1])
    top_k = torch.argsort(scores, 1, descending=True)[:, :k]
    hits = torch.gather(labels, 1, top_k)

    # Position discounts 1/log(rank + 1) for ranks 1..k, built on the target device.
    discounts = 1.0 / torch.log(torch.arange(2, 2 + k, device=device, dtype=labels.dtype))
    dcg = (hits * discounts).sum(dim=1)

    # Ideal DCG: all of a user's relevant items (at most k) ranked first.
    n_relevant = labels.sum(dim=1).long().clamp(max=k)
    has_relevant = n_relevant > 0
    if not has_relevant.any():
        return torch.zeros((), device=device, dtype=dcg.dtype)

    ideal_prefix = torch.cumsum(discounts, dim=0)
    idcg = ideal_prefix[n_relevant[has_relevant] - 1]

    return (dcg[has_relevant] / idcg).mean()

In [None]:
# Declare Model
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = VAE(nItems,hidden=1200,dimz=400).to(device)
n_Epochs = 200


# KL-Annealing: the KL weight starts at `anneal`, is increased by
# `anneal_steps` after every optimisation step and is capped at `anneal_cap`.
anneal = 0
anneal_cap = 0.2
anneal_steps = 1.0/200_000

# prepare Data
train_ds = netflixDataset(train_data)
# Shuffle the training users every epoch; without it each epoch replays the
# exact same batches in the same order.
train_dl = DataLoader(train_ds, batch_size=512, shuffle=True)

# Validation order does not matter for the metric, so it stays unshuffled.
val_ds = netflixDataset(val_data,eval=True)
val_dl = DataLoader(val_ds, batch_size=1024)


optimizer = optim.AdamW(model.parameters(), lr=1e-3,weight_decay=0.01)

# Local file the best checkpoint is written to.
path = 'model51.pt'

In [None]:
# Per-epoch history of mean training loss and mean validation NDCG, plus the
# best NDCG seen so far (used to decide when to checkpoint).
total_loss, total_ndcgs = [], []
cur_metric = float('-inf')

In [None]:
# Train for `n_Epochs` epochs. After each epoch the validation split is scored
# with NDCG_at_k, and the model is checkpointed (locally and to Drive) whenever
# the epoch-mean NDCG beats the best value seen so far.
pbar = tqdm(range(n_Epochs), total=n_Epochs)
for epoch in pbar:
    NDCGs = []
    train_loss = []
    metrics = {}

    # ---- training pass ----
    model.train()
    train_phase = tqdm(enumerate(train_dl), total=len(train_dl), leave=False)
    for batch_idx, data in train_phase:
        x = data['data'].float().to(device)
        x = x.squeeze(1)  # training rows are (1, n_items); drop the extra axis
        optimizer.zero_grad()

        recon_x, mu, logvar = model(x)

        # Multinomial negative log-likelihood and KL term, both batch means.
        CE = -torch.mean(torch.sum(F.log_softmax(recon_x, 1) * x, -1))
        KLD = -0.5 * torch.mean(torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1))
        loss = CE + anneal * KLD

        # Linear KL annealing, one increment per optimisation step.
        anneal = min(anneal + anneal_steps, anneal_cap)
        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())
        metrics['loss'] = loss.item()
        train_phase.set_postfix(metrics)

    # ---- validation pass ----
    model.eval()
    # Loop-invariant: the epoch's mean training loss, shown next to the NDCG.
    metrics['loss'] = np.mean(train_loss)
    eval_phase = tqdm(enumerate(val_dl), total=len(val_dl), leave=False)

    # No gradients are needed for scoring; skipping the autograd graph saves
    # memory and time on every validation batch.
    with torch.no_grad():
        for batch_idx, data in eval_phase:
            X = data['data'].float().to(device)
            X = X.squeeze(1)
            ground_truth = data['ground_truth'].squeeze(1).to(device)

            pred, _, _ = model(X, Mode='eval')

            # Never recommend items the user has already interacted with.
            pred[X != 0] = -np.inf
            ndcg = NDCG_at_k(ground_truth, pred)

            metrics['ndcg'] = ndcg.item()
            NDCGs.append(metrics['ndcg'])
            eval_phase.set_postfix(metrics)

    metrics['ndcg'] = np.mean(NDCGs)
    total_loss.append(metrics['loss'])
    total_ndcgs.append(metrics['ndcg'])

    # Keep only the best model according to validation NDCG.
    if total_ndcgs[-1] > cur_metric:
        cur_metric = total_ndcgs[-1]
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': total_loss[-1],
            'ndcg': total_ndcgs[-1],
            'beta': anneal
        }, path)
        checkpoint = drive.CreateFile({"parents": [{"kind": "drive#fileLink", "id": fid}]})
        checkpoint.SetContentFile(path)
        checkpoint.Upload()

    pbar.set_postfix(metrics)

In [None]:
# Mean training loss per epoch.
fig, ax = plt.subplots(figsize=(12, 3))
ax.plot(total_loss)
ax.set_ylabel("loss")
ax.set_xlabel("Epochs")
pass

In [None]:
# Mean validation NDCG per epoch. The y-axis used to be labelled "loss",
# which mislabelled the curve; NDCG_at_k is called with its default k=100.
plt.figure(figsize=(12, 3))
plt.plot(total_ndcgs)
plt.ylabel("NDCG@100")
plt.xlabel("Epochs")
pass

In [None]:

# Rebuild the network with the same layer sizes that were used for training
# (hidden=1200, dimz=400). With the constructor defaults (600 / 200) the saved
# weights do not fit and load_state_dict raises a size-mismatch error.
# The model is moved to `device` before the optimizer is created so that the
# restored optimizer state lives on the same device as the parameters.
model = VAE(nItems, hidden=1200, dimz=400).to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3,weight_decay=0.01)

# map_location lets a checkpoint written on GPU be restored on a CPU-only runtime.
checkpoint = torch.load(path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']
anneal = checkpoint['beta']
ndcg = checkpoint['ndcg']
# Last expression: show the restored architecture, as the cell did before.
model

In [None]:
# Score the restored model on the test split: for every user a random part of
# the interactions is held out by the dataset and must be ranked in the top k.
test_ds = netflixDataset(test_data,eval=True)
test_dl = DataLoader(test_ds,batch_size=1024)
model.eval()
metrics = {}
eval_phase = tqdm(enumerate(test_dl), total=len(test_dl), leave=False)
NDCGs = []

# Inference only: skip building the autograd graph.
with torch.no_grad():
    for batch_idx, data in eval_phase:
        X = data['data'].float().to(device)
        X = X.squeeze(1)
        ground_truth = data['ground_truth'].squeeze(1).to(device)

        pred, _, _ = model(X, Mode='eval')

        # Never recommend items the user has already interacted with.
        pred[X != 0] = -np.inf
        ndcg = NDCG_at_k(ground_truth, pred)

        metrics['ndcg'] = ndcg.item()
        NDCGs.append(metrics['ndcg'])
        eval_phase.set_postfix(metrics)

# Mean of the per-batch NDCG values.
print(np.mean(NDCGs))

In [None]:
# Show the epoch, training loss and KL weight stored in the restored checkpoint.
epoch,loss,anneal