In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, models, datasets
from sklearn.feature_extraction.text import CountVectorizer
from PIL import Image

from tqdm import tqdm
import numpy as np
import pandas
import os
import shutil
import logging
import time
import random

In [None]:
# Root directory holding every data file this notebook reads.
base_path = './movielens1m'

In [None]:
# Item metadata CSV and the CSV that stores one BERT embedding per item.
items_extra = os.path.join(base_path, 'items.csv')
bert_emb_path = os.path.join(base_path, 'items_with_BERT_embeddings.csv')

# Poster image directory and the CSV mapping item ids to knowledge-graph ids.
posters_path = os.path.join(base_path,  'posters', 'data')
kg_id_map_path = os.path.join(base_path, 'item_id_2_kg_id.csv')

In [None]:
# Build the {item_id: kg_id} lookup used by the rating dataset.
kg_id_map_df = pandas.read_csv(kg_id_map_path)
movielensid_to_kg_id = kg_id_map_df.set_index('item_id')['kg_id'].to_dict()

In [None]:
# Poster preprocessing: resize to 224x224, convert to a tensor, then normalise
# each RGB channel with the mean/std given below.
# NOTE(review): these look like the usual ImageNet statistics expected by
# torchvision's pretrained backbones — confirm.
img_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])


# BERT embedding CSV

In [None]:
# Load the per-item BERT embeddings (read from the 'embedding' column below).
bert_csv = pandas.read_csv(bert_emb_path)

In [None]:
# Parse every stored embedding into a (1, dim) row vector.
# NOTE(review): np.fromstring is called without `sep`, i.e. in its deprecated
# binary mode — confirm the on-disk format before switching to another parser.
embd = [np.fromstring(raw_embedding).reshape(1, -1)
        for raw_embedding in bert_csv['embedding'].values]

In [None]:
# Stack the per-item row vectors into one (n_items, dim) matrix.
bert_embedding = np.concatenate(embd, axis=0)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Fit a per-feature min-max scaler on the full embedding matrix.
scaler = MinMaxScaler().fit(bert_embedding)

In [None]:
# Scaled embeddings; this is the matrix handed to Movielens1m_dataset as `bert_embedding`.
bert_embd_scaled = scaler.transform(bert_embedding)

# KG 

## KG dataset

In [None]:
# Tab-separated knowledge-graph triple files, one per split.
kg_train_csv = os.path.join(base_path, 'kg_train.dat')
kg_test_csv = os.path.join(base_path, 'kg_test.dat')
kg_valid_csv = os.path.join(base_path, 'kg_valid.dat')

In [None]:
import random


class KG_dataset(Dataset):
    """Knowledge-graph triple dataset that pairs each triple with a corrupted one.

    Every item is a dict holding the positive ``(head, tail, relation)`` triple
    read from the file plus a negative triple in which either the head or the
    tail (chosen with equal probability) is replaced by a random entity id.
    """

    def __init__(self, kg_csv, entity_total):
        """
        Args:
            kg_csv (string): Path to a tab-separated, header-less file whose
                columns are read as (head, tail, relation).
            entity_total (int): Exclusive upper bound of the entity ids drawn
                when corrupting a triple (ids come from ``range(entity_total)``).
        """
        self.entity_total = entity_total
        self.kg_data = pandas.read_csv(kg_csv,
                                       sep='\t',
                                       header=None, names=['head', 'tail', 'relation'])

    def __len__(self):
        """Number of positive triples in the file."""
        return len(self.kg_data)

    def __getitem__(self, idx):
        """Return the positive triple at ``idx`` and a freshly sampled negative.

        Returns:
            dict: int-valued keys ``pos_head``, ``pos_tail``, ``pos_relation``,
            ``neg_head``, ``neg_tail`` and ``neg_relation``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        head, tail, relation = (int(value) for value in self.kg_data.iloc[idx])
        pos_triple = (head, tail, relation)

        # Corrupt the head or the tail with equal probability; the relation is kept.
        if random.random() < 0.5:
            neg_triple = self.corrupt_head_filter(pos_triple, self.entity_total)
        else:
            neg_triple = self.corrupt_tail_filter(pos_triple, self.entity_total)
        neg_head, neg_tail, neg_relation = (int(value) for value in neg_triple)

        return {'pos_head': head, 'pos_tail': tail, 'pos_relation': relation,
                'neg_head': neg_head, 'neg_tail': neg_tail, 'neg_relation': neg_relation}

    @staticmethod
    def _sample_other_entity(original, entity_total):
        """Draw an id uniformly from ``range(entity_total)`` that differs from ``original``.

        When no alternative exists (``entity_total <= 1``) or ``original`` lies
        outside the sampling range, a plain uniform draw is returned instead.
        """
        original = int(original)
        if entity_total <= 1 or not 0 <= original < entity_total:
            return random.randrange(entity_total)
        # Sample from the range with `original` removed, then shift the draws
        # at or above it by one so that `original` itself can never come back.
        candidate = random.randrange(entity_total - 1)
        return candidate + 1 if candidate >= original else candidate

    def corrupt_head_filter(self, triple, entityTotal):
        """Return ``triple`` with its head replaced by a different random entity id."""
        newHead = self._sample_other_entity(triple[0], entityTotal)
        return (newHead, triple[1], triple[2])

    def corrupt_tail_filter(self, triple, entityTotal, tailDicts=None):
        """Return ``triple`` with its tail replaced by a different random entity id.

        ``tailDicts`` is accepted for call compatibility and is not used.
        """
        newTail = self._sample_other_entity(triple[1], entityTotal)
        return (triple[0], newTail, triple[2])


In [None]:
# Largest entity id and largest relation id seen in any KG split.
# NOTE: despite the names, both values are maxima of the ids, not cardinalities.

kg_columns = ['head', 'tail', 'relation']
kg_train_data = pandas.read_csv(kg_train_csv, sep='\t', header=None, names=kg_columns)
kg_test_data = pandas.read_csv(kg_test_csv, sep='\t', header=None, names=kg_columns)
kg_valid_data = pandas.read_csv(kg_valid_csv, sep='\t', header=None, names=kg_columns)
kg_splits = (kg_train_data, kg_test_data, kg_valid_data)

# All heads first, then all tails, each in train/test/valid order.
entities_count = pandas.concat(
    [split[column] for column in ('head', 'tail') for split in kg_splits],
    axis=0,
).unique().max()

relations_count = pandas.concat(
    [split['relation'] for split in kg_splits],
    axis=0,
).unique().max()

entities_count, relations_count

In [None]:
# One triple dataset per split; entities_count is passed as `entity_total`,
# i.e. the bound used when sampling corrupted heads/tails.
kg_train_dataset = KG_dataset(kg_train_csv, entities_count)
kg_test_dataset = KG_dataset(kg_test_csv, entities_count)
kg_valid_dataset = KG_dataset(kg_valid_csv, entities_count)

In [None]:
# Batch loaders for the KG splits. Each loader wraps the dataset of the same
# name (the validation and test datasets were previously swapped).
kg_train_loader = torch.utils.data.DataLoader(kg_train_dataset,
                                             batch_size=512, shuffle=True,
                                             num_workers=0)

kg_valid_loader = torch.utils.data.DataLoader(kg_valid_dataset,
                                             batch_size=512, shuffle=True,
                                             num_workers=0)

kg_test_loader = torch.utils.data.DataLoader(kg_test_dataset,
                                            batch_size=512, shuffle=True,
                                            num_workers=0)

## KG Model

In [None]:
import torch.autograd as autograd


class KG_net(nn.Module):
    """Knowledge-graph embedding model with a per-relation projection matrix.

    Head and tail embeddings are multiplied by the projection matrix of the
    triple's relation; the score is the L1 or squared-L2 size of
    ``proj(h) + r - proj(t)``.
    """

    def __init__(self,
                L1_flag,
                embedding_size,
                entity_total,
                relation_total,
                device
                ):
        """
        Args:
            L1_flag (bool): Score with summed absolute values when true,
                summed squares otherwise.
            embedding_size (int): Width of entity and relation embeddings.
            entity_total (int): The entity table gets ``entity_total + 1`` rows.
            relation_total (int): The relation and projection tables get
                ``relation_total + 1`` rows.
            device (torch.device): Device the tables and ``forward`` inputs are moved to.
        """
        super(KG_net, self).__init__()
        self.L1_flag = L1_flag
        self.embedding_size = embedding_size
        self.ent_total = entity_total + 1
        self.rel_total = relation_total + 1
        self.device = device

        self.ent_embeddings = nn.Embedding(self.ent_total, self.embedding_size)
        self.rel_embeddings = nn.Embedding(self.rel_total, self.embedding_size)
        # One flattened (embedding_size x embedding_size) projection matrix per relation.
        self.proj_embeddings = nn.Embedding(self.rel_total, self.embedding_size * self.embedding_size)

        self.ent_embeddings = self.ent_embeddings.to(self.device)
        self.rel_embeddings = self.rel_embeddings.to(self.device)
        self.proj_embeddings = self.proj_embeddings.to(self.device)

    def forward(self, x):
        """Score a batch of triples.

        Args:
            x: ``(heads, tails, relations)`` index tensors of equal length.

        Returns:
            Tensor with one score per triple.
        """
        h, t, r = x

        h = h.to(self.device)
        t = t.to(self.device)
        r = r.to(self.device)

        h_e = self.ent_embeddings(h)
        t_e = self.ent_embeddings(t)
        r_e = self.rel_embeddings(r)
        proj_e = self.proj_embeddings(r)

        proj_h_e = self.projection_transR_pytorch(h_e, proj_e)
        proj_t_e = self.projection_transR_pytorch(t_e, proj_e)

        residual = proj_h_e + r_e - proj_t_e
        if self.L1_flag:
            return torch.sum(torch.abs(residual), 1)
        return torch.sum(residual ** 2, 1)

    def projection_transR_pytorch(self, original, proj_matrix):
        """Apply each row's flattened projection matrix to the matching embedding.

        Args:
            original: ``(batch, ent_dim)`` embeddings.
            proj_matrix: ``(batch, rel_dim * ent_dim)`` flattened matrices.

        Returns:
            ``(batch, rel_dim)`` projected embeddings.
        """
        ent_embedding_size = original.shape[1]
        rel_embedding_size = proj_matrix.shape[1] // ent_embedding_size
        matrices = proj_matrix.view(-1, rel_embedding_size, ent_embedding_size)
        # Batched matrix-vector product: (B, r, d) @ (B, d, 1) -> (B, r, 1) -> (B, r).
        return torch.matmul(matrices, original.unsqueeze(-1)).squeeze(-1)



USE_CUDA = torch.cuda.is_available()


def to_gpu(var):
    """Return ``var.cuda()`` when CUDA was available at definition time, else ``var`` itself."""
    return var.cuda() if USE_CUDA else var

class marginLoss(nn.Module):
    """Summed hinge loss ``sum(max(pos - neg + margin, 0))`` over a batch of scores."""

    def __init__(self):
        super(marginLoss, self).__init__()

    def forward(self, pos, neg, margin):
        """
        Args:
            pos: Scores of the positive triples.
            neg: Scores of the corrupted triples, same shape as ``pos``.
            margin: Margin added to ``pos - neg`` before clipping at zero.

        Returns:
            Scalar tensor with the summed loss.
        """
        # clamp keeps the computation on the inputs' own device and dtype, so no
        # separately allocated zero tensor (and no device guess) is needed.
        return torch.sum(torch.clamp(pos - neg + margin, min=0))
    
    
def normLoss(embeddings, dim=1):
    """Penalise embeddings whose squared L2 norm along ``dim`` exceeds 1.

    Returns the sum over all rows of ``max(||e||^2 - 1, 0)``.
    """
    norm = torch.sum(embeddings ** 2, dim=dim, keepdim=True)
    # clamp works on whatever device/dtype `embeddings` lives on.
    return torch.sum(torch.clamp(norm - 1.0, min=0))


In [None]:
# Use the first CUDA device when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# The first positional argument is L1_flag: True selects the summed-absolute-value score.
kg_net = KG_net(True, embedding_size=32, entity_total=entities_count,
               relation_total=relations_count, device=device)
kg_net = kg_net.to(device)

## Training KG

In [None]:
from torch.autograd import Variable as V
import torch.optim as optim

# model params
# NOTE(review): relation_total and embedding_size are not referenced by the
# visible training cells (kg_net was already built with literal values) — confirm.
relation_total = relations_count + 1
embedding_size = 32

# optimizer
kg_learning_rate = 0.001

kg_optimizer = optim.Adagrad(kg_net.parameters(), lr=kg_learning_rate,
                             weight_decay=1e-3)

# loss
# `margin` is the hinge margin handed to margin_loss in the training loop.
margin = 1.0
# NOTE(review): KG_LAMBDA is not used by the visible training loop — confirm.
KG_LAMBDA = 1.0
margin_loss = marginLoss()

In [None]:
# Make sure the checkpoint directory exists before the first save (epoch 0).
os.makedirs('./models', exist_ok=True)

kg_net.train()
for epoch in range(200):
    kg_training_running_loss = 0.0
    start = time.time()
    for inputs_kg in kg_train_loader:

        # =============Preprocessing===========
        pos_head, pos_tail, pos_relation = inputs_kg["pos_head"], inputs_kg["pos_tail"], inputs_kg["pos_relation"]
        neg_head, neg_tail, neg_relation = inputs_kg["neg_head"], inputs_kg["neg_tail"], inputs_kg["neg_relation"]

        # ============KG============
        kg_optimizer.zero_grad()
        kg_outputs_poss = kg_net((pos_head, pos_tail, pos_relation))
        kg_outputs_neg = kg_net((neg_head, neg_tail, neg_relation))

        kg_margin = margin_loss(kg_outputs_poss, kg_outputs_neg, margin)
        # Norm penalty on every embedding touched by this batch.
        ent_embeddings = kg_net.ent_embeddings(torch.cat([pos_head, pos_tail, neg_head, neg_tail]).to(device))
        rel_embeddings = kg_net.rel_embeddings(torch.cat([pos_relation, neg_relation]).to(device))

        kg_loss = kg_margin + normLoss(ent_embeddings) + normLoss(rel_embeddings)
        kg_loss.backward()
        kg_optimizer.step()
        # .item() detaches the value so the running total does not keep every
        # batch's autograd graph alive.
        kg_training_running_loss += kg_loss.item()

    kg_training_running_loss = kg_training_running_loss/len(kg_train_loader)
    end_time =  time.time()-start
    print(f'Epoch {epoch} | Time {end_time:2f} | KG Loss {kg_training_running_loss:2f}')
    if epoch % 10 == 0:
        state = kg_net.state_dict()
        torch.save(state, f'./models/kg_net.ckp.pth')


In [None]:
# Persist the final KG weights; create the target directory if it is missing.
os.makedirs('./models', exist_ok=True)
state = kg_net.state_dict()
torch.save(state, './models/kg_net.ckp.pth')

# Movielens1m Dataset

In [None]:
class Movielens1m_dataset(Dataset):
    """Rating dataset that attaches poster image, text embedding and KG entity id to each row.

    Each sample is a dict with keys ``user``, ``item``, ``rating``, ``image``,
    ``text`` and ``entity``.
    """

    def __init__(self, rating_data, items_extra_csv, bert_embedding, img_dir,
                 img_size, entities_count, img_transform=None):
        """
        Args:
            rating_data (DataFrame): Rows are unpacked positionally as
                (user, item, rating).
            items_extra_csv (string): Path to the item CSV; its ``img_name``
                column is indexed by item id.
            bert_embedding (array-like): Per-item text embeddings, indexed by item id.
            img_dir (string): Directory with all the poster images.
            img_size (tuple): Shape of the all-zero tensor used when an item
                has no poster.
            entities_count (int): Entity index assigned to items that are
                missing from ``movielensid_to_kg_id``.
            img_transform (callable, optional): Transform applied to each
                loaded poster.
        """
        self.rating_data = rating_data
        self.items_extras = pandas.read_csv(items_extra_csv)
        self.items_bert = bert_embedding
        self.img_list = list(self.items_extras['img_name'])
        self.img_size = img_size
        self.img_dir = img_dir
        self.img_transform = img_transform

        # NOTE(review): entities_count is computed elsewhere as the largest KG
        # id, so this "unknown" index may coincide with a real entity — confirm.
        self.unkwon_entity_idx = entities_count

    def __len__(self):
        """Number of rating rows."""
        return len(self.rating_data)

    def __get_item_extra__(self, item_idx):
        """Return ``(image, text, entity)`` for one item id."""
        # Rows with a float column come back as floats; lists and the KG map
        # need a plain int key.
        item_idx = int(item_idx)

        img_name = self.img_list[item_idx]
        if img_name not in ['<fialed>', '<failed>']:
            img_path = os.path.join(self.img_dir, img_name)
            # The context manager releases the file handle once the pixels are copied.
            with Image.open(img_path) as img_file:
                image = img_file.convert('RGB')
            if self.img_transform is not None:
                image = self.img_transform(image)
        else:
            # Missing poster marker: substitute an all-zero image.
            image = torch.zeros(*self.img_size)

        text = torch.Tensor(self.items_bert[item_idx])

        entity = int(movielensid_to_kg_id.get(item_idx, self.unkwon_entity_idx))

        return image, text.view(-1), entity

    def __getitem__(self, idx):
        """Return the sample dict for rating row ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        user, item, rating = self.rating_data.iloc[idx]
        user = int(user)
        item = int(item)
        image, text, entity = self.__get_item_extra__(item)

        sample = {'user': user, 'item': item, 'rating': rating,
                  'image': image, 'text': text, 'entity': entity}

        return sample


In [None]:
# Tab-separated (user, item, rating) files, one per split.
train_csv = os.path.join(base_path, 'train.dat')
test_csv = os.path.join(base_path, 'test.dat')
valid_csv = os.path.join(base_path, 'valid.dat')

In [None]:
# Load the three rating splits with identical parsing options.
rating_columns = ['user', 'item', 'rating']
train_dataframe = pandas.read_csv(train_csv, sep='\t', names=rating_columns, index_col=False)
valid_dataframe = pandas.read_csv(valid_csv, sep='\t', names=rating_columns, index_col=False)
test_dataframe = pandas.read_csv(test_csv, sep='\t', names=rating_columns, index_col=False)

# Mean training rating; later passed to the model as `rating_mean`.
rating_mean = train_dataframe['rating'].mean()

In [None]:
# Everything except the rating frame is shared by the three splits.
cke_dataset_kwargs = dict(
    bert_embedding=bert_embd_scaled,
    items_extra_csv=items_extra,
    img_dir=posters_path,
    img_size=(3, 224, 224),
    entities_count=entities_count,
    img_transform=img_transform,
)

train_dataset = Movielens1m_dataset(rating_data=train_dataframe, **cke_dataset_kwargs)

valid_dataset = Movielens1m_dataset(rating_data=valid_dataframe, **cke_dataset_kwargs)

test_dataset = Movielens1m_dataset(rating_data=test_dataframe, **cke_dataset_kwargs)

In [None]:
batch_size = 256

# The three loaders share batch size, shuffling and worker count.
cke_loader_kwargs = dict(batch_size=batch_size, shuffle=True, num_workers=4)

cke_train_loader = torch.utils.data.DataLoader(train_dataset, **cke_loader_kwargs)

cke_valid_loader = torch.utils.data.DataLoader(valid_dataset, **cke_loader_kwargs)

cke_test_loader = torch.utils.data.DataLoader(test_dataset, **cke_loader_kwargs)

# CKE SOTA Model

In [None]:
class Improved_cke(nn.Module):
    """Rating predictor that fuses text, KG, image and id features of an item.

    The four item feature vectors are concatenated, passed through two linear
    layers and combined with the user embedding by a dot product; user bias,
    item bias and the global rating mean are added to the result.
    """

    def __init__(self, n_users, n_items, kg_net, rating_mean, embedding_size=32,
                 text_dim=1560, img_dim=512):
        """
        Args:
            n_users (int): Number of rows in the user tables.
            n_items (int): Number of rows in the item tables.
            kg_net (nn.Module): Module exposing an ``ent_embeddings`` table.
            rating_mean (float): Constant added to every prediction.
            embedding_size (int): Width of the user/item embeddings.
            text_dim (int): Width of ``inputs['text']``.
            img_dim (int): Width of the flattened image backbone output.
        """
        super(Improved_cke, self).__init__()
        # ResNet-18 without its final layer, used as the image feature extractor.
        self.img_model = nn.Sequential(*list(models.resnet18(pretrained=True).children())[:-1])
        self.kg_net = kg_net

        self.user_embedding = nn.Embedding(num_embeddings=n_users,
                                           embedding_dim=embedding_size)
        self.item_embedding = nn.Embedding(num_embeddings=n_items,
                                           embedding_dim=embedding_size)

        self.user_bias = nn.Embedding(num_embeddings=n_users,
                                      embedding_dim=1)
        self.item_bias = nn.Embedding(num_embeddings=n_items,
                                      embedding_dim=1)
        self.rating_mean = rating_mean

        # Input width follows the actual feature sizes (img + text + kg + item id);
        # with the defaults this is the original 512+1560+32+32.
        kg_dim = kg_net.ent_embeddings.embedding_dim
        self.linear1 = nn.Linear(img_dim + text_dim + kg_dim + embedding_size, 512)
        # The output must match the user embedding width for the dot product.
        self.linear2 = nn.Linear(512, embedding_size)

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def forward(self, inputs):
        """
        Args:
            inputs (dict): Batch with tensor entries ``user``, ``item``,
                ``entity``, ``text`` and ``image``.

        Returns:
            dict: ``preds`` (one rating per row), ``user_embedding`` and
            ``item_embedding`` (the item-id embedding).
        """
        # Follow the module's own parameters so the model runs on CPU, on a
        # single GPU, and inside each DataParallel replica.
        device = self.user_embedding.weight.device
        batch_size = inputs['entity'].size(0)

        users = inputs['user'].to(device)
        items = inputs['item'].to(device)

        kg_embedding = self.kg_net.ent_embeddings(inputs['entity'].to(device))
        text_embeddding = inputs['text'].to(device)
        img_embedding = self.img_model(inputs['image'].to(device)).view(batch_size, -1)
        item_id_embedding = self.item_embedding(items)

        user_bias = self.user_bias(users)
        item_bias = self.item_bias(items)

        item_embedding = torch.cat(
            [text_embeddding, kg_embedding, img_embedding, item_id_embedding], dim=1)
        item_embedding = self.linear1(item_embedding)
        item_embedding = self.linear2(item_embedding)

        user_embedding = self.user_embedding(users)

        preds = (torch.sum(item_embedding * user_embedding, dim=1)
                 + user_bias.squeeze(1) + item_bias.squeeze(1) + self.rating_mean)

        return {'preds': preds, 'user_embedding': user_embedding,
                'item_embedding': item_id_embedding}
    

def tensor_norm(tensor):
    """Return the sum of squared entries of ``tensor`` (no square root is taken)."""
    squared = tensor.pow(2)
    return squared.sum()

def eval_cke(loader, device):
    """Compute the mean squared rating error of the global ``cke`` model over ``loader``.

    The model is switched to eval mode for the pass and its previous
    train/eval mode is restored afterwards, so calling this inside a training
    loop does not leave the model in eval mode.

    Args:
        loader: Iterable of batches (dicts containing at least ``rating``).
        device: Device the rating targets are moved to.

    Returns:
        0-dim tensor with the per-sample MSE; every rating has equal weight
        even when the last batch is smaller.

    Raises:
        ValueError: If ``loader`` yields no ratings.
    """
    was_training = cke.module.training
    cke.module.eval()
    squared_error = 0.0
    n_ratings = 0
    try:
        with torch.no_grad():
            for cke_test_inputs in loader:
                # The model returns a dict; only the 'preds' entry is the rating.
                cke_test_preds = cke(cke_test_inputs)['preds']
                target = cke_test_inputs['rating'].type(torch.FloatTensor).to(device)
                squared_error += torch.sum((cke_test_preds - target) ** 2).item()
                n_ratings += target.numel()
    finally:
        cke.module.train(was_training)
    if n_ratings == 0:
        raise ValueError('eval_cke received a loader without any ratings')
    return torch.tensor(squared_error / n_ratings)

In [None]:
# Build the rating model on top of the trained KG network.
# NOTE(review): n_users / n_items are hard-coded; they presumably match the
# id ranges in the rating files — confirm.
cke = Improved_cke(n_users=6040, n_items=3952,
                kg_net= kg_net,  rating_mean=rating_mean, embedding_size=32)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [None]:
# Move the model to the target device, then wrap it in DataParallel; from here
# on the underlying model is reached through `cke.module`.
cke = cke.to(device)
cke = nn.DataParallel(cke)


# Training

## Embedding conf

In [None]:
from torch.autograd import Variable as V
import torch.optim as optim

# Freeze the KG network and the image backbone. requires_grad_ is a method:
# assigning False to it (as before) froze nothing and shadowed the method.
cke.module.kg_net.requires_grad_(False)
cke.module.img_model.requires_grad_(False)

## CKE model conf

In [None]:
# NOTE(review): noise_lambda, text_loss and img_loss are not used by the
# visible training loop — confirm whether they are still needed.
noise_lambda = 0.001
text_loss = nn.MSELoss()
img_loss = nn.MSELoss()
# Loss on the predicted ratings.
rating_loss = nn.MSELoss()
# Weight of the embedding regularisation term in the training loop.
l2_lambda = 1e-1
cke_learning_rate = 0.001
cke_optimizer = optim.Adam(cke.parameters(),lr=cke_learning_rate)

In [None]:
# Override the learning rate of the optimizer's first parameter group in place.
cke_optimizer.param_groups[0]['lr']=0.0001

## Training Loop

In [None]:
# Write INFO-level training progress to a log file next to the notebook.
logging.basicConfig(filename='./Improved_cke_traning_mse_bias.log' ,level=logging.INFO)

In [None]:
# Make sure the checkpoint directory exists before the first save.
os.makedirs('./models', exist_ok=True)

cke.train()
# NOTE(review): the epoch counter starts at 85, which looks like a resumed
# run — confirm the matching checkpoint is loaded before this cell runs.
for epoch in range(85, 1000):
    cke_training_running_loss = 0.0
    rating_training_loss = 0.0
    start = time.time()
    for inputs_cke in tqdm(cke_train_loader):
        # ============CKE============
        cke_optimizer.zero_grad()

        cke_outputs = cke(inputs_cke)

        regularization_loss = (tensor_norm(cke_outputs['user_embedding']) +
                               tensor_norm(cke_outputs['item_embedding']))

        rating_mse = rating_loss(cke_outputs['preds'], inputs_cke['rating'].type(torch.FloatTensor).to(device))

        cke_loss = rating_mse + l2_lambda*regularization_loss

        cke_loss.backward()
        cke_optimizer.step()

        # .item() detaches the values so the running totals do not keep every
        # batch's autograd graph alive.
        cke_training_running_loss += cke_loss.item()
        rating_training_loss += rating_mse.item()

    cke_training_running_loss = cke_training_running_loss/len(cke_train_loader)
    rating_training_loss = rating_training_loss/len(cke_train_loader)

    if epoch % 5 == 0:
        mse_test_loss = eval_cke(cke_valid_loader, device)
        # Evaluation may leave the model in eval mode; resume training mode.
        cke.train()
        end_time =  time.time()-start
        message = f'Epoch {epoch} | Time {end_time:2f} | CKE Loss {cke_training_running_loss:2f} | Train mse loss {rating_training_loss:2f} | Test mse loss {mse_test_loss:4f}'
    else:
        end_time =  time.time()-start
        message = f'Epoch {epoch} | Time {end_time:2f} | CKE Loss {cke_training_running_loss:2f} | Train mse loss {rating_training_loss:4f}'
    print(message)
    logging.info(message)

    if epoch % 5 == 0:
        state = cke.module.state_dict()
        torch.save(state, f'./models/imporved_cke_mse_bias_{epoch}_001.ckp.pth')
state = cke.module.state_dict()
torch.save(state, './models/imporved_cke_mse_bias_1000.ckp.pth')

# Testing

In [None]:
# map_location lets a checkpoint saved on GPU be restored on whatever `device` is in use.
cke.module.load_state_dict(torch.load('./models/imporved_cke_mse_bias_85_001.ckp.pth',
                                      map_location=device))

## MSE Evaluation

In [None]:
# Rating MSE of the loaded checkpoint on the test split.
eval_cke(cke_test_loader, device)

## Recall and MAP evaluation

In [None]:
# Candidate rows for the ranking evaluation; 'user', 'item' and 'True_val'
# columns are read from this frame further down.
test_data = pandas.read_csv(os.path.join(base_path, 'test_for_recall.csv'))

In [None]:
# Rebinds test_dataset to the ranking-evaluation rows (it previously wrapped test.dat).
# NOTE(review): the dataset unpacks each row as exactly three values — confirm
# test_for_recall.csv has three columns.
test_dataset = Movielens1m_dataset(rating_data=test_data,
                                    bert_embedding=bert_embd_scaled,
                                    items_extra_csv=items_extra,
                                    img_dir=posters_path,
                                    img_size=(3, 224, 224),
                                    entities_count=entities_count, img_transform=img_transform)

In [None]:
batch_size = 256

# shuffle=False keeps predictions in the same row order as test_data, which
# the later cells rely on when they put both side by side.
cke_test_loader = torch.utils.data.DataLoader(test_dataset,
                                             batch_size=batch_size, shuffle=False,
                                             num_workers=1)

In [None]:
# Pull one batch for inspection; the built-in next() is the Python 3 way to
# advance an iterator (iterators are not guaranteed to expose a .next() method).
test_sample = next(iter(cke_test_loader))

In [None]:
# Predict a rating for every row of the ranking-evaluation loader, in order.
preds = []
cke.module.eval()
with torch.no_grad():
    # Iterate the test loader (not the training loader) so the predictions
    # line up with test_data.
    for batch_idx, cke_test_inputs in enumerate(cke_test_loader):
        # The model returns a dict; keep only the rating predictions, on the CPU.
        cke_test_preds = cke(cke_test_inputs)['preds']
        preds.append(cke_test_preds.detach().cpu())
        if batch_idx % 100 == 0:
            print(f'Finished {batch_idx} test epoches')
    preds = torch.cat(preds)

In [None]:
# Persist the raw predictions as a single-column CSV.
test_with_preds_data = pandas.DataFrame({'preds':preds.detach().cpu().numpy()})
test_with_preds_data.to_csv(os.path.join(base_path, 'test_for_recall_with_preds.csv'), index=False)

In [None]:
# `preds` is either one concatenated tensor or a list of per-batch output
# dicts, depending on how the inference cell produced it; accept both.
preds_raring = [preds] if torch.is_tensor(preds) else [x['preds'] for x in preds]

In [None]:
# Join the per-batch prediction tensors into one flat tensor.
preds_rating = torch.cat(preds_raring)

In [None]:
# Put predictions next to the ground truth; this relies on preds_rating being
# in the same row order as test_data.
complete_csv = pandas.DataFrame({'user': test_data['user'],
                                 'item': test_data['item'],   
                                 'True_val': test_data['True_val'],
                                  'Preds': preds_rating.detach().cpu().numpy()})

In [None]:
# Final table written out for the separate @k evaluation notebook.
complete_csv.to_csv(os.path.join(base_path, 'Improved_final.csv'), index=False)

### The @k evaluation itself is implemented in the `calc recall@k and MAP@k_v1.1` notebook