In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, models, datasets
from sklearn.feature_extraction.text import CountVectorizer
from PIL import Image

import numpy as np
import pandas
import os
import shutil
import logging
import time

In [None]:
# Root directory of the dataset; every data file read or written by this
# notebook is addressed relative to it.
base_path = './movielens1m'

# MovieLens 1M Dataset

In [None]:
# Locations of the three rating splits inside the dataset directory.
train_csv, test_csv, valid_csv = (
    os.path.join(base_path, split_file)
    for split_file in ('train.dat', 'test.dat', 'valid.dat')
)

In [None]:
def _load_ratings(path):
    """Read one tab-separated split file into a frame with user/item/rating columns."""
    return pandas.read_csv(path,
                           sep='\t',
                           names=['user', 'item', 'rating'],
                           index_col=False)


train_dataframe = _load_ratings(train_csv)
valid_dataframe = _load_ratings(valid_csv)
test_dataframe = _load_ratings(test_csv)

# Rating statistics are computed on the training split only.
rating_mean = train_dataframe['rating'].mean()
rating_std = train_dataframe['rating'].std()

In [None]:
# Show the mean and standard deviation of the training-split ratings.
rating_mean, rating_std

In [None]:
class Movielens1m_org_dataset(Dataset):
    """Map-style dataset over a table of (user, item, rating) rows.

    ``rating_data`` must support ``len()`` and positional ``.iloc`` access,
    and each row must unpack into exactly three values, read in the order
    user, item, rating (column names are not consulted).
    """

    def __init__(self, rating_data):
        self.rating_data = rating_data

    def __len__(self):
        return len(self.rating_data)

    def __getitem__(self, idx):
        """Return row ``idx`` as ``{'user': int, 'item': int, 'rating': float}``."""
        # Tensor indices are converted to plain Python values before indexing.
        position = idx.tolist() if torch.is_tensor(idx) else idx
        user_id, item_id, score = self.rating_data.iloc[position]
        return {'user': int(user_id), 'item': int(item_id), 'rating': float(score)}

In [None]:
movielens1m_train = Movielens1m_org_dataset(train_dataframe)
movielens1m_valid = Movielens1m_org_dataset(valid_dataframe)

batch_size = 32

# Training batches are reshuffled every epoch.
svd_train_loader = DataLoader(movielens1m_train,
                              batch_size=batch_size, shuffle=True,
                              num_workers=4)
# Validation keeps a fixed order: the validation loss is an average of
# per-batch losses, so shuffling (with a partial last batch) would make the
# number used for checkpoint selection vary from run to run.
svd_valid_loader = DataLoader(movielens1m_valid,
                              batch_size=batch_size, shuffle=False,
                              num_workers=4)

# SVD model

In [None]:
class SVD(nn.Module):
    """Biased matrix-factorisation rating model.

    The prediction for a (user, item) pair is::

        dot(user_embedding, item_embedding) + user_bias + item_bias + rating_mean

    Args:
        n_users: Number of rows in the user embedding and user bias tables.
        n_items: Number of rows in the item embedding and item bias tables.
        rating_mean: Constant offset added to every prediction.
        embedding_size: Dimension of the user and item latent vectors.
    """

    def __init__(self, n_users, n_items, rating_mean, embedding_size=32):
        super(SVD, self).__init__()
        self.user_embedding = nn.Embedding(num_embeddings=n_users,
                                           embedding_dim=embedding_size)
        self.item_embedding = nn.Embedding(num_embeddings=n_items,
                                           embedding_dim=embedding_size)
        self.user_bias = nn.Embedding(num_embeddings=n_users,
                                      embedding_dim=1)
        self.item_bias = nn.Embedding(num_embeddings=n_items,
                                      embedding_dim=1)
        self.rating_mean = rating_mean

        # Kept as a public attribute for existing callers. The forward pass
        # does not read it; lookups follow the device of the embedding weights.
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def _score(self, batch):
        """Shared scoring path for ``forward`` and ``forward_train``.

        Returns a ``(preds, user_embedding, item_embedding)`` tuple.
        """
        # Move the indices to wherever the parameters live instead of calling
        # .cuda() unconditionally, so the model also runs on CPU-only machines.
        target_device = self.user_embedding.weight.device
        users = batch['user'].to(target_device)
        items = batch['item'].to(target_device)

        item_embedding = self.item_embedding(items)
        item_bias = self.item_bias(items)

        user_embedding = self.user_embedding(users)
        user_bias = self.user_bias(users)

        # The bias tables have embedding_dim=1; squeeze(1) drops that axis so
        # the biases line up with the per-sample dot products.
        preds = (torch.sum(item_embedding * user_embedding, dim=1)
                 + user_bias.squeeze(1)
                 + item_bias.squeeze(1)
                 + self.rating_mean)
        return preds, user_embedding, item_embedding

    def forward_train(self, inputs):
        """Score a batch and also return the looked-up latent vectors.

        Args:
            inputs: Mapping with index tensors under ``'user'`` and ``'item'``.

        Returns:
            Dict with keys ``'preds'``, ``'user_embedding'`` and
            ``'item_embedding'``.
        """
        preds, user_embedding, item_embedding = self._score(inputs)
        return_vals = {'preds': preds, 'user_embedding': user_embedding,
                       'item_embedding': item_embedding}
        return return_vals

    def forward(self, x):
        """Return only the predicted ratings for the batch ``x``.

        Args:
            x: Mapping with index tensors under ``'user'`` and ``'item'``.
        """
        preds, _, _ = self._score(x)
        return preds

def tensor_norm(tensor):
    """Return the sum of the squared entries of ``tensor`` (its squared L2 norm)."""
    return tensor.pow(2).sum()

def eval_model(loader, device, model=None, loss_fn=None):
    """Average the per-batch loss of a model over ``loader``.

    Args:
        loader: Iterable of batches; each batch is a mapping that the model
            accepts and that holds the targets under ``'rating'``.
        device: Device the targets are moved to before the loss is computed.
        model: Module to evaluate. Defaults to the notebook-level ``svd_model``.
        loss_fn: Callable ``(preds, targets) -> loss``. Defaults to the
            notebook-level ``rating_loss``.

    Returns:
        The mean of the per-batch losses. Every batch carries the same weight,
        including a smaller final batch.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if model is None:
        model = svd_model
    if loss_fn is None:
        loss_fn = rating_loss

    # Remember the current mode so a mid-training call does not silently leave
    # the network in eval mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            mse_test_loss = 0.0
            for inputs_svd in loader:
                svd_preds = model(inputs_svd)
                targets = inputs_svd['rating'].to(device=device, dtype=torch.float32)
                mse_test_loss += loss_fn(svd_preds, targets)
            mse_test_loss = mse_test_loss / len(loader)
    finally:
        model.train(was_training)
    return mse_test_loss

def predict_svd(model, loader, device):
    """Run ``model`` over every batch of ``loader`` and concatenate the outputs.

    Args:
        model: Module called once per batch. Plain modules and
            ``nn.DataParallel`` wrappers are both accepted.
        loader: Iterable of batches in the format the model expects.
        device: Unused; kept so existing call sites stay valid.

    Returns:
        A single tensor holding the predictions of all batches in loader order.
    """
    # eval() on a wrapper propagates to the wrapped module, so there is no need
    # to reach into ``.module``.
    model.eval()
    preds = []
    with torch.no_grad():
        for batch_idx, inputs_svd in enumerate(loader):
            preds.append(model(inputs_svd))
            # Progress report every 100 batches.
            if batch_idx % 100 == 0:
                print(f'finished {batch_idx} batches')
    return torch.cat(preds)

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# NOTE(review): the table sizes are hard-coded. Every user/item id fed to the
# model must be strictly smaller than these numbers. Confirm the ids in the
# split files are 0-based.
svd_model = nn.DataParallel(
    SVD(n_users=6040, n_items=3952, rating_mean=rating_mean, embedding_size=32).to(device)
)


# Training

In [None]:
# Route INFO-level (and above) log records to a file next to the notebook.
logging.basicConfig(filename='./CKE_traning_svd.log' ,level=logging.INFO)

In [None]:
# Objective: mean squared error on the ratings plus an L2 penalty whose
# weight is l2_lambda.
rating_loss = nn.MSELoss()
l2_lambda = 0.1

# Adam over all parameters of the (wrapped) model.
svd_optimizer = optim.Adam(params=svd_model.parameters(), lr=0.001)

In [None]:
# Checkpoints are written to ./models below; create the directory up front so
# the first torch.save does not fail on a fresh checkout.
os.makedirs('./models', exist_ok=True)

best_loss = np.inf
for epoch in range(0, 21):
    # The validation pass at the end of an epoch may switch the network to
    # eval mode, so training mode is re-entered at the start of every epoch.
    svd_model.train()
    rating_training_loss = 0.0
    start = time.time()
    for inputs_svd in svd_train_loader:

        # ============SVD============
        svd_optimizer.zero_grad()

        svd_outputs = svd_model.module.forward_train(inputs_svd)

        # L2 penalty on BOTH latent factors of the batch (user and item).
        regularization_loss = (tensor_norm(svd_outputs['user_embedding']) +
                               tensor_norm(svd_outputs['item_embedding']))

        targets = inputs_svd['rating'].type(torch.FloatTensor).to(device)
        rating_mse = rating_loss(svd_outputs['preds'], targets)
        loss = rating_mse + l2_lambda*regularization_loss

        loss.backward()
        svd_optimizer.step()

        # .item() yields a plain float, so the running total does not keep
        # autograd history alive across the whole epoch.
        rating_training_loss += rating_mse.item()

    rating_training_loss = rating_training_loss/len(svd_train_loader)

    mse_test_loss = eval_model(svd_valid_loader, device)
    end_time = time.time()-start

    epoch_summary = f'Epoch {epoch} | Time {end_time:2f} | Train mse Loss {rating_training_loss:4f} | Test mse loss {mse_test_loss:4f}'
    print(epoch_summary)
    logging.info(epoch_summary)

    # Only epochs that improve the validation loss are checkpointed.
    if mse_test_loss < best_loss:
        best_loss = mse_test_loss
        state = svd_model.module.state_dict()
        torch.save(state, f'./models/svd_{epoch}.ckp.pth')


# Test SVD

## Test MSE

In [None]:
# NOTE(review): the epoch number is hard-coded. Training only writes a
# checkpoint for epochs that improved the validation loss, so confirm that
# svd_19 exists and is the checkpoint you want to evaluate.
# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU can also be loaded on a CPU-only machine.
state = torch.load('./models/svd_19.ckp.pth', map_location=device)
svd_model.module.load_state_dict(state)

In [None]:
# Wrap the held-out rating split here so this cell runs on a fresh kernel;
# a loader named svd_test_loader is only created further down, for the
# ranking evaluation.
movielens1m_test = Movielens1m_org_dataset(test_dataframe)
svd_test_mse_loader = DataLoader(movielens1m_test,
                                 batch_size=batch_size, shuffle=False,
                                 num_workers=4)
eval_model(svd_test_mse_loader, device)

## Test Recall@K and MAP@K

In [None]:
# Table of (user, item) pairs to score for the ranking metrics; its 'user',
# 'item' and 'True_val' columns are copied into the results file below.
test_data = pandas.read_csv(os.path.join(base_path, 'test_for_recall.csv'))

In [None]:
# The dataset wrapper unpacks each row positionally into user / item / rating,
# so the third column is exposed under the 'rating' key whatever its name.
# NOTE(review): this assumes test_for_recall.csv has exactly three columns
# (user, item, True_val) - confirm.
test_dataset = Movielens1m_org_dataset(test_data)

In [None]:
# Inference only, so a larger batch is used. The order must stay fixed
# (shuffle=False) because the predictions are later paired row-by-row with
# test_data.
batch_size = 256
svd_test_loader = DataLoader(dataset=test_dataset,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=1)

In [None]:
# Score every row served by svd_test_loader with the trained model.
preds = predict_svd(svd_model, svd_test_loader, device)

In [None]:
# Ground-truth columns side by side with the model scores, one row per
# (user, item) pair of test_data.
predicted_scores = preds.detach().cpu().numpy()
complete_csv = test_data[['user', 'item', 'True_val']].assign(Preds=predicted_scores)

In [None]:
# to_csv does not create missing parent directories, so make sure the results
# folder exists before writing.
results_dir = os.path.join(base_path, 'results')
os.makedirs(results_dir, exist_ok=True)
complete_csv.to_csv(os.path.join(results_dir, 'Svd_final.csv'), index=False)

### The @K evaluation itself is implemented in the `calc recall@k and MAP@k_v1.1` notebook