In [51]:
import torch
import pytorch_lightning as pl 
import pandas as pd
import numpy as np
from dataset import extract_users_movies_ratings_lists
import torch.nn as nn
# Root folder that holds one sub-folder of prediction CSVs per base model.
BASE_MODEL_FOLDER = '../results_ensemble'
# Base models whose predictions are blended. Each name is used both as the
# sub-folder name and as the CSV filename prefix when loading predictions, and
# the length of this list sets the width of the coefficient network in Model.
MODEL_LIST = [
    #'AE_SWA',
    'AE_SWA_ensemble_mean',
    'ALS',
    'NCF_dist_exp_2_embeddings_SWA',
    'NCF_dist_exp_SWA',
    'SVDpp_ensemble_gaussian'
]



In [52]:
class DatasetEnsembleResult(torch.utils.data.Dataset):
    """
    Index-aligned view over base-model predictions.

    Item ``i`` is ``(user, movie, ratings_models[i])`` and, when ground-truth
    ``ratings`` were supplied, additionally ``ratings[i]`` as a fourth element.
    All inputs are indexed with the same position, so they are expected to be
    equally long; the dataset length is taken from ``users``.
    """

    def __init__(self, users, movies, ratings_models, ratings=None) -> None:
        super().__init__()
        self.users, self.movies = users, movies
        self.ratings_models = ratings_models
        # None means "no targets available" (inference-time dataset).
        self.ratings = ratings

    def __getitem__(self, index):
        sample = (self.users[index], self.movies[index], self.ratings_models[index])
        if self.ratings is not None:
            # Labelled data: append the target as the last element.
            sample += (self.ratings[index],)
        return sample

    def __len__(self):
        return len(self.users)

In [53]:
def get_dataset_train(split):
    """Build the shuffled training DataLoader for one cross-validation split.

    For every name in MODEL_LIST the model's validation predictions for
    ``split`` are read from BASE_MODEL_FOLDER and stacked as one column per
    model. The targets come from the matching ``partition_{split}_val.csv``.

    NOTE(review): the user/movie ids are taken from the last model's CSV and
    the targets from the partition file, so all files are assumed to list the
    same rows in the same order -- confirm.

    Args:
        split: identifier of the fold, interpolated into the file names.

    Returns:
        A ``torch.utils.data.DataLoader`` (batch size 32, shuffled) yielding
        ``(user, movie, models_ratings, rating)`` batches.
    """
    users, movies = None, None
    per_model_ratings = []
    for m in MODEL_LIST:
        users, movies, predicted = extract_users_movies_ratings_lists(
            pd.read_csv(f'{BASE_MODEL_FOLDER}/{m}/{m}_split_{split}_val_results.csv')
        )
        per_model_ratings.append(predicted)

    # Ground-truth ratings for the same validation partition.
    targets = extract_users_movies_ratings_lists(
        pd.read_csv(f'../data_val_train_kfold/partition_{split}_val.csv')
    )[2]

    stacked = np.column_stack(per_model_ratings)
    dataset = DatasetEnsembleResult(users, movies, stacked, targets)
    return torch.utils.data.DataLoader(dataset, batch_size=32, drop_last=False, shuffle=True)

def get_dataset_test():
    """Build the DataLoader over the base models' final predictions.

    For every name in MODEL_LIST the ``{name}_final_results.csv`` file is read
    from BASE_MODEL_FOLDER and the predictions are stacked as one column per
    model. No targets are attached, so batches are
    ``(user, movie, models_ratings)``.

    NOTE(review): user/movie ids are taken from the last model's CSV; all
    files are assumed to list the same rows in the same order -- confirm.

    Returns:
        A ``torch.utils.data.DataLoader`` (batch size 32) that iterates the
        rows in file order.
    """
    users = None
    movies = None
    models_ratings = []
    for m in MODEL_LIST:
        df = pd.read_csv(f'{BASE_MODEL_FOLDER}/{m}/{m}_final_results.csv')
        users, movies, ratings = extract_users_movies_ratings_lists(df)
        models_ratings.append(ratings)

    d_test = DatasetEnsembleResult(users, movies, np.column_stack(models_ratings))
    # Shuffling must stay off for inference: predictions are concatenated in
    # iteration order, so a shuffled loader returns them in a different random
    # order on every pass and they no longer line up with the input rows (or
    # with predictions from another pass over the same loader).
    test_dataloader = torch.utils.data.DataLoader(d_test, batch_size=32, drop_last=False, shuffle=False)
    return test_dataloader

In [54]:

# Built once at notebook level; run() reads this global for its predict pass.
test_dataloader = get_dataset_test()

In [55]:
# Row counts of the user and movie embedding tables created in Model.__init__.
# NOTE(review): presumably the number of distinct users/movies in the data set,
# with ids being 0-based indices below these values -- confirm.
number_of_users, number_of_movies = (10000, 1000)

class Model(pl.LightningModule):
    """Blend base-model ratings with weights predicted per (user, movie) pair.

    User and movie embeddings are concatenated and fed through a small MLP
    that emits one non-negative score per entry of MODEL_LIST. The scores are
    normalised to sum to one and used as mixing weights for the base models'
    ratings.
    """

    def __init__(self, emb_size=3):
        super().__init__()
        n_models = len(MODEL_LIST)

        self.users_embeddings = nn.Embedding(number_of_users, embedding_dim=emb_size)
        self.movies_embeddings = nn.Embedding(number_of_movies, embedding_dim=emb_size)

        # Input is the concatenated [user, movie] embedding; the final ReLU
        # keeps every raw weight >= 0 before normalisation.
        self.ncf = nn.Sequential(
            nn.Linear(2 * emb_size, n_models),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(n_models, n_models),
            nn.ReLU(),
        )

    def forward(self, users, movies, models_ratings):
        """Return the weighted sum of ``models_ratings`` for each row."""
        pair_features = torch.cat(
            [self.users_embeddings(users), self.movies_embeddings(movies)], dim=1
        )

        # The small epsilon keeps the row sum strictly positive even when the
        # ReLU zeroes every score, so the division below is always defined.
        weights = self.ncf(pair_features) + 1e-6
        weights = weights / weights.sum(dim=1, keepdim=True)

        return torch.mul(models_ratings, weights).sum(dim=1)

    def training_step(self, batch, batch_idx):
        """Compute and log the batch RMSE between the blend and the target."""
        users, movies, models_ratings, y = batch
        residual = self.forward(users, movies, models_ratings) - y

        loss = residual.pow(2).mean().sqrt()
        self.log('train_loss', loss, on_epoch=True, on_step=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Adam over all parameters with a learning rate of 1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

    def predict_step(self, batch, batch_idx):
        """Blend an unlabelled ``(users, movies, models_ratings)`` batch."""
        users, movies, models_ratings = batch
        return self.forward(users, movies, models_ratings)



In [56]:
def run(split):
    """Train a fresh Model on one split and predict the test set.

    The model is fitted for two epochs on the loader returned by
    ``get_dataset_train(split)`` and then applied to the notebook-level
    ``test_dataloader``.

    Args:
        split: fold identifier forwarded to ``get_dataset_train``.

    Returns:
        A single tensor holding the per-batch predictions concatenated in the
        order the batches were produced.
    """
    loader = get_dataset_train(split)
    ensemble = Model()

    fitter = pl.Trainer(max_epochs=2)
    fitter.fit(ensemble, train_dataloaders=loader)

    batch_outputs = fitter.predict(ensemble, dataloaders=test_dataloader)
    return torch.concat(batch_outputs)

In [None]:
# Train one ensemble per split (0 through 4) and keep each run's test-set
# predictions so they can be averaged afterwards.
acc = []
for i in range(0,5):
    yhat = run(i)
    acc.append(yhat)

In [62]:
# One column per split run; the row-wise mean is the final blended prediction.
# This is only meaningful if every run returned its predictions in the same
# row order.
res = np.column_stack(acc).mean(axis=1)
res.shape

(1176952,)

In [63]:
# Hand the averaged predictions to the project helper under the given file name.
# NOTE(review): presumably writes a submission CSV -- see dataset.save_predictions.
# NOTE(review): this import would normally live in the top import cell.
from dataset import save_predictions
save_predictions('NCF_ensemble_attention.csv', res)