In [1]:
import torch
import numpy as np
from torch import nn
from torch.utils.data import DataLoader, Dataset, Subset
import transformers
import json, pickle
from models import *
from tqdm import tqdm
from utils import mapk
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Name of the evaluation split; it selects the input files here and the results file name later.
prefix = "combined_test_unseen"
user_path = f"processed_datas/{prefix}.json"
user_embed_path = f"processed_datas/{prefix}_embeddings.pkl"
course_embed_path = "processed_datas/train_courses_embeddings.pkl"  # not read by the visible cells

# User records, stored as JSON.
with open(user_path, mode="r") as f:
    users = json.loads(f.read())

# Per-user embeddings, stored as a pickle.
# NOTE(review): pickle.load can execute arbitrary code; only point this at trusted files.
with open(user_embed_path, mode="rb") as f:
    user_embeds = pickle.load(f)

In [3]:
class UserCourseDataset(Dataset):
    """Pairs each user's precomputed embedding with a multi-hot label vector.

    Args:
        users: Indexable collection of user records. Each record's
            ``"subgroups"`` entry is a list of integer positions to set to 1
            in the label vector (the list may be empty).
        user_embeds: Iterable of numpy arrays, one per user, aligned with
            ``users`` by position. Each is converted with ``torch.from_numpy``.
        total_courses: Length of the label vector.
    """

    def __init__(self, users, user_embeds, total_courses=92):
        self.users = users
        self.user_embeds = [torch.from_numpy(user_embed) for user_embed in user_embeds]
        self.total_courses = total_courses

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        """Return ``(user_embed, labels)`` for the user at position ``idx``."""
        # Force an integer index dtype: torch.tensor([]) would otherwise be
        # float32, which scatter_ does not accept as an index tensor.
        subgroup_idx = torch.tensor(self.users[idx]["subgroups"], dtype=torch.long)
        labels = torch.zeros(self.total_courses)
        labels.scatter_(0, subgroup_idx, 1.0)
        return self.user_embeds[idx], labels

def collate_fn(batch):
    """Zero-pad variable-length user embeddings and stack them into a batch.

    Args:
        batch: Sequence of ``(user_embed, labels)`` pairs, where ``user_embed``
            is a 2-D tensor (rows x embed_dim) and ``labels`` is a tensor whose
            shape is the same for every item.

    Returns:
        ``(user_embeds, labels)``. Each embedding gets trailing zero rows up to
        the row count of the longest item, then all are stacked along a new
        leading dimension; the labels are stacked the same way.
    """
    embeds = [item[0] for item in batch]
    longest = max([e.shape[0] for e in embeds])
    width = embeds[0].shape[1]

    padded = []
    for e in embeds:
        # Zero rows needed to bring this item up to the longest one (may be 0 rows).
        filler = torch.zeros(longest - e.shape[0], width)
        padded.append(torch.cat([e, filler], dim=0))

    user_embeds = torch.stack(padded)
    labels = torch.stack([item[1] for item in batch])
    return user_embeds, labels

In [4]:
# Wrap the loaded user records and embeddings; total_courses keeps its default.
dataset = UserCourseDataset(users=users, user_embeds=user_embeds)

In [5]:
# shuffle stays off: the inference loop stores predictions by batch position and
# the results file pairs row i of the predictions with users[i].
dataloader = DataLoader(
    dataset,
    batch_size=32,
    collate_fn=collate_fn,
    shuffle=False,
)

# No weights are loaded here; the inference cell loads a checkpoint before predicting.
model = ValModelv0(
    in_size=512,
    nhead=8,
    num_tokens=92,
).cuda()

In [6]:
# One score per saved checkpoint; position i corresponds to ckpts/val_modelv0_{i}.pth.
top10_scores = [
    0.3180500737698619,
    0.3131914555527006,
    0.31731365500919856,
    0.31211073845310017,
    0.31571356170372056,
    0.3167765206503458,
    0.31362011125222683,
    0.31175490433947184,
    0.31295463092443715,
    0.31206028410984915,
]

# Checkpoint indices ordered from the highest score to the lowest.
ranking = np.flip(np.argsort(top10_scores))

# Number of top-ranked checkpoints to run at inference time.
k_results = 1

In [7]:
# Run the selected checkpoints over the whole dataset, average their raw outputs,
# and report MAP@50 against the ground-truth subgroup indices.
model.eval()
total_samples = len(dataset)

selected = ranking[:k_results]
if len(selected) == 0:
    raise ValueError("k_results must select at least one checkpoint")

# Running sum of raw (pre-sigmoid) model outputs across checkpoints. Previously
# each checkpoint overwrote this array, so k_results > 1 silently scored only
# the last (lowest-ranked) selected checkpoint instead of an ensemble.
preds = np.zeros((total_samples, 92))
labels = []

with torch.no_grad():
    for pass_idx, m in enumerate(selected):
        model.load_state_dict(torch.load(f"ckpts/val_modelv0_{m}.pth"))
        # Track the write position explicitly instead of assuming a batch size
        # of 32, so changing the DataLoader's batch_size cannot misplace rows.
        offset = 0
        for user_embed, label in tqdm(dataloader):
            pred = model(user_embed.cuda()).cpu().numpy()
            preds[offset:offset + len(pred)] += pred
            offset += len(pred)

            # Ground truth does not depend on the checkpoint: collect it once,
            # on the CPU, as a list of positive indices per user.
            if pass_idx == 0:
                labels.extend(list(np.where(row == 1)[0]) for row in label.numpy())

# Average over checkpoints (a no-op when a single checkpoint is selected).
preds = preds / len(selected)
preds = torch.sigmoid(torch.tensor(preds)).numpy()

# Top-50 label indices per user, highest score first.
pred_labels = np.argsort(-preds, axis=1)[:, :50]
print(mapk(labels, pred_labels, 50))

100%|██████████| 347/347 [00:05<00:00, 60.77it/s]


0.312826914707269


In [8]:
# Write one CSV row per user: the user id, a comma, then the predicted label
# indices in ranked order, each followed by a single space.
os.makedirs("results", exist_ok=True)
with open(f"results/{prefix}_results.csv", "w") as f:
    f.write("user_id,subgroup\n")
    for row, top_labels in enumerate(pred_labels):
        user_id = users[row]['id']
        ranked = "".join(f"{p} " for p in top_labels)
        f.write(f"{user_id},{ranked}\n")