In [10]:
import torch
import numpy as np
from torch import nn
from torch.utils.data import DataLoader, Dataset, Subset
import transformers
import json, pickle
from models import *
from tqdm import tqdm
from utils import mapk
import os

In [11]:
# Evaluation split; every per-split artefact path below is derived from it.
prefix = "test_unseen"
user_path = f"processed_datas/{prefix}.json"
user_embed_path = f"processed_datas/{prefix}_embeddings.pkl"
# Not referenced by the cells visible in this notebook.
course_embed_path = "processed_datas/train_courses_embeddings.pkl"

# User records: later cells read users[i]["labels"] and users[i]["uid"].
with open(user_path, "r") as user_file:
    users = json.load(user_file)

# Per-user embeddings: later converted one by one with torch.from_numpy.
# NOTE(review): pickle executes code on load -- only use files from a trusted source.
with open(user_embed_path, "rb") as embed_file:
    user_embeds = pickle.load(embed_file)

In [12]:
class UserCourseDataset(Dataset):
    """Pairs each user's embedding with a multi-hot vector of course labels.

    Args:
        users: sequence of dict-like records; ``users[i]["labels"]`` is the list
            of course indices for user ``i`` (may be empty).
        user_embeds: iterable of NumPy arrays, one per user, in the same order
            as ``users``.
        total_courses: length of the multi-hot label vector.
    """

    def __init__(self, users, user_embeds, total_courses=732):
        self.users = users
        # Convert once up front so __getitem__ stays cheap.
        self.user_embeds = [torch.from_numpy(user_embed) for user_embed in user_embeds]
        self.total_courses = total_courses

    def __len__(self):
        """Number of users in the dataset."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return ``(user_embed, labels)`` for the user at position ``idx``.

        ``labels`` is a float tensor of length ``total_courses`` holding 1 at
        every index listed in ``users[idx]["labels"]`` and 0 elsewhere.
        """
        # Explicit int64: torch.tensor([]) would otherwise infer float32, which
        # is not a valid index dtype for scatter_.
        course_ids = torch.tensor(self.users[idx]["labels"], dtype=torch.long)
        labels = torch.zeros(self.total_courses)
        labels.scatter_(0, course_ids, 1)
        return self.user_embeds[idx], labels

def collate_fn(batch):
    """Collate ``(user_embed, labels)`` pairs into two batched tensors.

    Args:
        batch: list of items where ``item[0]`` is an embedding tensor whose
            first dimension is the sequence length (trailing dimensions must
            match across the batch) and ``item[1]`` is a label tensor of a
            fixed shape.

    Returns:
        ``(user_embeds, labels)``: ``user_embeds`` has shape
        ``(batch, max_len, ...)`` with shorter sequences zero-padded at the
        end; ``labels`` is the label tensors stacked along a new first axis.
    """
    # pad_sequence allocates the padding with the dtype/device of the inputs,
    # unlike a hand-built torch.zeros(...) which is always float32 on the CPU.
    user_embeds = torch.nn.utils.rnn.pad_sequence(
        [item[0] for item in batch], batch_first=True
    )
    labels = torch.stack([item[1] for item in batch])
    return user_embeds, labels

In [13]:
# Wrap the loaded user records and their embeddings; total_courses keeps its default.
dataset = UserCourseDataset(users=users, user_embeds=user_embeds)

In [14]:
# Use the GPU when there is one, but fall back to the CPU instead of crashing on .cuda().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# shuffle=False keeps prediction rows in the same order as `users`; the cell that
# writes the results file indexes users[i] by prediction row.
dataloader = DataLoader(dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

model = ValModelv0(nhead=8).to(device)
# map_location lets a checkpoint saved from a GPU run load on a CPU-only machine.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
model.load_state_dict(torch.load("val_modelv0.pth", map_location=device))

<All keys matched successfully>

In [15]:
model.eval()
# Run on whichever device the model already lives on instead of hard-coding CUDA.
device = next(model.parameters()).device

preds = []
labels = []
with torch.no_grad():
    for user_embed, label in tqdm(dataloader):
        pred = model(user_embed.to(device))
        preds.append(pred)
        # Turn each multi-hot row into the list of indices that are set to 1.
        # Labels are only consumed on the host, so they are never moved to the GPU.
        label_rows = label.numpy()
        labels.extend(list(np.where(row == 1)[0]) for row in label_rows)

    # Model outputs are passed through a sigmoid, then moved to NumPy for ranking.
    preds = torch.sigmoid(torch.cat(preds, dim=0))
    preds = preds.cpu().numpy()

# Indices of the 50 highest-scoring courses per user, best first.
pred_labels = np.argsort(-preds, axis=1)[:, :50]
print(mapk(labels, pred_labels, 50))

100%|██████████| 347/347 [00:05<00:00, 57.91it/s]


0.0


In [16]:
# Reload utils so edits made to utils.py after the kernel first imported it are picked up.
from importlib import reload
import utils
reload(utils)
from utils import knn

# Index list handed to knn as its fourth argument.
# Longer alternative kept from an earlier experiment:
# mutual_discard = [0, 1, 2, 3, 4, 22, 677, 41, 681, 689, 439, 697, 699, 62, 705, 708, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731]
mutual_discard = [0, 1, 2, 3]

# Open via `with` so the file handle is closed deterministically.
# NOTE(review): pickle executes code on load -- only use files from a trusted source.
with open("processed_datas/compressed_courses_embeddings.pkl", "rb") as feats_file:
    feats = np.asarray(pickle.load(feats_file))

# Prepend four all-zero rows, taking the width from the data rather than assuming 512.
# Presumably these rows stand in for indices 0-3 (the ones in mutual_discard) -- TODO confirm.
feats = np.concatenate([np.zeros((4, feats.shape[1])), feats], axis=0)

# The meaning of 30 and 60 is defined by utils.knn, which is not visible here.
new_preds = knn(preds, feats, 30, mutual_discard, 60)
pred_labels = np.argsort(-new_preds, axis=1)[:, :50]
print(mapk(labels, pred_labels, 50))

0.0


In [17]:
# Read the tokenizer file and map each added token's "content" to its "id".
with open("course_tokenizer.json", 'r') as tokenizer_file:
    tokenizer_spec = json.load(tokenizer_file)

course2int = {
    entry["content"]: entry["id"]
    for entry in tokenizer_spec["added_tokens"]
}

# Reverse lookup: id -> token content.
int2course = {token_id: content for content, token_id in course2int.items()}

In [18]:
# Write one row per user: "<uid>,<course ids separated by single spaces>".
os.makedirs("results", exist_ok=True)
with open(f"results/{prefix}_results.csv", "w") as f:
    f.write("user_id,course_id\n")
    for i, pred in enumerate(pred_labels):
        # Join instead of appending "id " per item, so rows carry no trailing space.
        course_ids = " ".join(str(int2course[p]) for p in pred)
        f.write(f"{users[i]['uid']},{course_ids}\n")