# Demo baseline

### document embedding decoder
1. demo utils
2. demo loss
3. demo evaluation

In [1]:
import os
import sys
from collections import defaultdict
import numpy as np 
import pandas as pd
import json

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torch.optim as optim
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

sys.path.append('../')
from utils.eval import retrieval_normalized_dcg_all, retrieval_precision_all, semantic_precision_all, precision_recall_f1_all
from utils.loss import *
from utils.data_loader import load_document
from utils.toolbox import preprocess_document, get_preprocess_document, get_preprocess_document_embs,\
                          get_preprocess_document_labels, get_word_embs

## Data preprocess
1. filter special characters, punctuation (remain english & number character)
2. filter stopwords
3. filter by term frequency
4. pos tagging

## Parameters

### preprocess parameters:
1. min word frequency
2. max word frequency(max_df)
3. min word per doc(min_words)
4. pos tagging select

### training parameters:
1. decoder label
2. model parameters

## Load Data, Label
label -> bow, tf-idf, keybert, classification

In [2]:
dataset_name='agnews'
model_name = 'average'
label_type = 'tf-idf'
criterion = 'ListNet_sigmoid_L1'
n_time = 5
seed = 33
experiment_dir = f'{dataset_name}_{model_name}_{label_type}'
experiment_dir2 = 'test'

config = {}
config['experiment_dir'] = experiment_dir
config['experiment_dir2'] = experiment_dir2
config['dataset_name'] = dataset_name
config['model_name'] = model_name
config['label_type'] = label_type
config['criterion'] = criterion
config['n_time'] = n_time
config['seed'] = seed

save_dir = os.path.join('experiment', config['experiment_dir'], config['experiment_dir2'])
os.makedirs(save_dir, exist_ok=False)

In [3]:
with open(os.path.join('../chris/parameters', f'preprocess_config_{dataset_name}.json'), 'r') as f:
    preprocess_config = json.load(f)
        
unpreprocessed_docs ,preprocessed_docs = get_preprocess_document(**preprocess_config)
print('doc num', len(preprocessed_docs))

Getting preprocess documents: agnews
min_df: 100 max_df: 1.0 vocabulary_size: None min_doc_word: 15


Using custom data configuration default
Reusing dataset ag_news (/home/chrisliu/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


doc num 127542


In [4]:
doc_embs, doc_model = get_preprocess_document_embs(preprocessed_docs, model_name)
print('doc_embs', doc_embs.shape)

Getting preprocess documents embeddings
Using cuda 0 for training...


Batches:   0%|          | 0/638 [00:00<?, ?it/s]

doc_embs (127542, 300)


In [5]:
labels, vocabularys = get_preprocess_document_labels(preprocessed_docs)

Getting preprocess documents labels


In [6]:
targets = labels[config['label_type']] 
vocabularys = vocabularys[config['label_type']]
word_embs = get_word_embs(vocabularys)
print('word_embs', word_embs.shape)

Getting word embeddings


0it [00:00, ?it/s]

Number of words:400001
word_embs (4265, 300)


In [7]:
word_embs_tensor = torch.FloatTensor(word_embs)

## MLP Decoder

In [8]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'

In [9]:
class DNNDecoderDataset(Dataset):
    def __init__(self, doc_embs, targets):
        
        assert len(doc_embs) == len(targets)

        self.doc_embs = torch.FloatTensor(doc_embs)
        self.targets = torch.FloatTensor(targets)        
        self.targets_rank = torch.argsort(self.targets, dim=1, descending=True)
        self.topk = torch.sum(self.targets > 0, dim=1)
        
    def __getitem__(self, idx):
        return self.doc_embs[idx], self.targets[idx], self.targets_rank[idx], self.topk[idx]

    def __len__(self):
        return len(self.doc_embs)

In [10]:
def prepare_dataloader(doc_embs, targets, batch_size=100, train_valid_test_ratio=[0.7, 0.1, 0.2],\
                       target_normalize=False, seed=123):
    train_size = int(len(doc_embs) * train_valid_test_ratio[0])
    valid_size = int(len(doc_embs) * (train_valid_test_ratio[0] + train_valid_test_ratio[1])) - train_size
    test_size = len(doc_embs) - train_size - valid_size
    
    print('Preparing dataloader')
    print('train size', train_size)
    print('valid size', valid_size)
    print('test size', test_size)

    if target_normalize:
        # normalize target summation of each document to 1 
        norm = targets.sum(axis=1).reshape(-1, 1)
        targets = (targets / norm)
        # normalize target L2 norm of each document to 1
        # norm = np.linalg.norm(targets, axis=1).reshape(-1, 1)
        # targets = (targets / norm)

    # shuffle
    randomize = np.arange(len(doc_embs))
    np.random.seed(seed)
    np.random.shuffle(randomize)
    doc_embs = doc_embs[randomize]
    targets = targets[randomize]
    
    # dataloader
    train_dataset = DNNDecoderDataset(doc_embs[:train_size], targets[:train_size])
    train_loader  = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

    valid_dataset = DNNDecoderDataset(doc_embs[train_size:train_size+valid_size], targets[train_size:train_size+valid_size])
    valid_loader  = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

    test_dataset = DNNDecoderDataset(doc_embs[train_size+valid_size:], targets[train_size+valid_size:])
    test_loader  = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
    
    return train_loader, valid_loader, test_loader


In [11]:
# prepare dataloader
train_loader, valid_loader, test_loader = prepare_dataloader(doc_embs, targets, batch_size=100,\
                                                             train_valid_test_ratio=[0.7, 0.1, 0.2],target_normalize=False,\
                                                             seed=seed)

Preparing dataloader
train size 89279
valid size 12754
test size 25509


In [12]:
class DNNDecoder(nn.Module):
    def __init__(self, doc_emb_dim, num_words, h_dim=300):
        super().__init__()
        self.decoder = nn.Sequential(
            nn.Linear(doc_emb_dim, h_dim),
            # nn.Dropout(p=0.5),
            nn.Tanh(),
            nn.Linear(h_dim, h_dim),
            # nn.Dropout(p=0.5),
            nn.Tanh(),
            nn.Linear(h_dim, num_words),
            # nn.Dropout(p=0.5),
            # nn.Sigmoid(),
        )
    def forward(self, x):
        return self.decoder(x)

In [13]:
def evaluate_DNNDecoder(model, data_loader, config, pred_semantic=False):
    results = defaultdict(list)
    model.eval()
    
    # predict all data
    for data in data_loader:
        doc_embs, target, _, _ = data
        
        doc_embs = doc_embs.to(device)
        target = target.to(device)
                
        pred = model(doc_embs)
        if config['label_type'] == 'bow':
            # Precision / Recall / F1
            p, r, f = precision_recall_f1_all(pred, target)
            results['precision'].append(p)
            results['recall'].append(r)
            results['f1_score'].append(f)
        else:
            # Precision
            precision_scores = retrieval_precision_all(pred, target, k=config["valid_topk"])
            for k, v in precision_scores.items():
                results['precision@{}'.format(k)].append(v)

            # NDCG
            ndcg_scores = retrieval_normalized_dcg_all(pred, target, k=config["valid_topk"])
            for k, v in ndcg_scores.items():
                results['ndcg@{}'.format(k)].append(v)
            
            # Semantic Precision
            if pred_semantic:
                semantic_precision_scores, word_result = semantic_precision_all(pred, target, word_embs_tensor, vocabularys,\
                                                                                k=config["valid_topk"], th=0.7, display_word_result=False)
                for k, v in semantic_precision_scores.items():
                    results['semantic_precision@{}'.format(k)].append(v)

    for k in results:
        results[k] = np.mean(results[k])

    return results

In [14]:
def calculate_loss(train_train_config, criterion, pred, target, target_rank, target_topk):
    if train_config["criterion"] == "MultiLabelMarginLoss":
        target_rank[:, train_config["loss_topk"]] = -1
        loss = criterion(pred, target_rank)
    elif train_config["criterion"].startswith("MultiLabelMarginLossCustomV"):
        loss = criterion(pred, target_rank, target_topk)
    elif train_config["criterion"].startswith("MultiLabelMarginLossCustom"):
        loss = criterion(pred, target_rank, train_config["loss_topk"])
    else:
        loss = criterion(pred, target)
        
    return loss
    
def train_decoder(doc_embs, targets, train_config):
    model = DNNDecoder(doc_emb_dim=doc_embs.shape[1], num_words=targets.shape[1],\
                       h_dim=train_config["h_dim"]).to(device)
    model.train()

    opt = torch.optim.Adam(model.parameters(), lr=train_config["lr"], weight_decay=train_config["weight_decay"])
    # prepare loss
    if train_config["criterion"] == "MultiLabelMarginLoss":
        criterion = nn.MultiLabelMarginLoss(reduction='mean')
    elif train_config["criterion"] == "BCE":
        criterion = nn.BCEWithLogitsLoss(reduction='mean')
    elif train_config["criterion"].startswith("MultiLabelMarginLossCustomV"):
        def criterion(a, b, c): return MultiLabelMarginLossCustomV(
            a, b, c, float(train_config["criterion"].split(':')[-1]))
    elif train_config["criterion"].startswith("MultiLabelMarginLossCustom"):
        def criterion(a, b, c): return MultiLabelMarginLossCustom(
            a, b, c, float(train_config["criterion"].split(':')[-1]))
    else:
        criterion = eval(train_config["criterion"])

    results = []
    n_epoch = train_config["n_epoch"]
    valid_epoch = train_config["valid_epoch"]
    valid_verbose = train_config["valid_verbose"]

    for epoch in tqdm(range(n_epoch)):
        train_loss_his = []
        valid_loss_his = []

        model.train()

        for data in train_loader:
            doc_embs, target, target_rank, target_topk = data
            doc_embs = doc_embs.to(device)
            target = target.to(device)
            target_rank = target_rank.to(device)
            target_topk = target_topk.to(device)
            y_pos_id = target_rank[:, :4]
            y_neg_id = target_rank[:, 4:]
            # loss
            pred = model(doc_embs)
            loss = calculate_loss(train_config, criterion, pred, target, target_rank, target_topk)
            train_loss_his.append(loss.item())

            # Model backwarding
            model.zero_grad()
            loss.backward()
            opt.step()

        model.eval()
        for data in valid_loader:
            doc_embs, target, target_rank, target_topk = data
            doc_embs = doc_embs.to(device)
            target = target.to(device)
            target_rank = target_rank.to(device)
            target_topk = target_topk.to(device)

            # loss
            pred = model(doc_embs)
            loss = calculate_loss(train_config, criterion, pred, target, target_rank, target_topk)
            valid_loss_his.append(loss.item())

        print("Epoch", epoch, np.mean(train_loss_his), np.mean(valid_loss_his))

        # show decoder result
        if (valid_epoch > 0 and epoch % valid_epoch == 0) or epoch == n_epoch-1:
            res = {}
            res['epoch'] = epoch

            train_res_ndcg = evaluate_DNNDecoder(model, train_loader, train_config, epoch == n_epoch-1)
            valid_res_ndcg = evaluate_DNNDecoder(model, valid_loader, train_config, epoch == n_epoch-1)
            test_res_ndcg = evaluate_DNNDecoder(model, test_loader, train_config, epoch == n_epoch-1)
            
            res['train'] = train_res_ndcg
            res['valid'] = valid_res_ndcg
            res['test'] = test_res_ndcg 
            results.append(res)

            if valid_verbose:
                print()
                print('train', train_res_ndcg)
                print('valid', valid_res_ndcg)
    return results

def train_experiment(n_time):
    # train n_time in different seed
    results = []
    for _ in range(n_time):
        result = train_decoder(doc_embs, targets, train_config)
        results.append(result)

    with open(os.path.join(save_dir, 'result.json'), 'w') as f:
        json.dump(results, f)

    return results

In [18]:
train_config = {
    "n_time": config['n_time'],
    "lr": 0.001,
    "weight_decay": 0.0,
    "loss_topk": 15,
    
    "n_epoch": 100,
    "valid_epoch": 10,
    "valid_verbose": True,
    "valid_topk": [5, 10, 15],
    
    "h_dim": 300,
    "label_type": config['label_type'],
    "criterion": config['criterion']
}

In [19]:
train_experiment(train_config['n_time'])

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch 0 6.657649041824085 5.769884817302227

train defaultdict(<class 'list'>, {'precision@5': 0.49713169543900654, 'precision@10': 0.3697350823799146, 'precision@15': 0.3003241909044187, 'ndcg@5': 0.43561317926024534, 'ndcg@10': 0.401836245617968, 'ndcg@15': 0.4032687182722807, 'ndcg@all': 0.5988024110756689})
valid defaultdict(<class 'list'>, {'precision@5': 0.4913524240255356, 'precision@10': 0.36465798201970756, 'precision@15': 0.29605132224969566, 'ndcg@5': 0.4313194779679179, 'ndcg@10': 0.39652566192671657, 'ndcg@15': 0.39757651812396944, 'ndcg@all': 0.5928894239477813})
Epoch 1 5.319901653793865 5.017820030450821
Epoch 2 4.719868618323314 4.619564343243837
Epoch 3 4.3635093185963045 4.372267674654722
Epoch 4 4.123314377975891 4.203077433630824
Epoch 5 3.9489065225821194 4.079804016277194
Epoch 6 3.814673486241835 3.9846011213958263
Epoch 7 3.7079142440206394 3.9072022289037704
Epoch 8 3.6201517488494566 3.8479408230632544
Epoch 9 3.5466227592839905 3.7951154932379723
Epoch 10 3.

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch 0 6.646074637320114 5.778725769370794

train defaultdict(<class 'list'>, {'precision@5': 0.49117064989961406, 'precision@10': 0.367336099352041, 'precision@15': 0.2988934947755542, 'ndcg@5': 0.42840921328155884, 'ndcg@10': 0.3967905379956941, 'ndcg@15': 0.3988947809276495, 'ndcg@all': 0.5950948754857396})
valid defaultdict(<class 'list'>, {'precision@5': 0.4867962908465415, 'precision@10': 0.3614791645668447, 'precision@15': 0.29362423880957067, 'ndcg@5': 0.42471392918378115, 'ndcg@10': 0.39098270051181316, 'ndcg@15': 0.3923597189132124, 'ndcg@all': 0.5891727777197957})
Epoch 1 5.329282934161077 5.027341164648533
Epoch 2 4.725915474160121 4.624773558229208
Epoch 3 4.364485932357634 4.373117703944445
Epoch 4 4.121786274541532 4.198131542652845
Epoch 5 3.9465590882274633 4.0794781986624
Epoch 6 3.8131915601877724 3.984085079282522
Epoch 7 3.707009868685903 3.9086715802550316
Epoch 8 3.619915829260165 3.8501140158623457
Epoch 9 3.547316921518204 3.797074753791094
Epoch 10 3.48509350

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch 0 6.654915440122598 5.768376935273409

train defaultdict(<class 'list'>, {'precision@5': 0.4990825381751557, 'precision@10': 0.37192878160471343, 'precision@15': 0.30229257175663415, 'ndcg@5': 0.43652080224716383, 'ndcg@10': 0.40349286637808146, 'ndcg@15': 0.40526660105415846, 'ndcg@all': 0.6000173918210379})
valid defaultdict(<class 'list'>, {'precision@5': 0.49401562102138996, 'precision@10': 0.3671082102227956, 'precision@15': 0.29729939298704267, 'ndcg@5': 0.4319503572769463, 'ndcg@10': 0.3978617328684777, 'ndcg@15': 0.39862822205759585, 'ndcg@all': 0.5935744540765882})
Epoch 1 5.306821315312573 5.0082129538059235
Epoch 2 4.711125536701847 4.6181708090007305
Epoch 3 4.364339720628831 4.376937054097652
Epoch 4 4.128248900341961 4.207868106663227
Epoch 5 3.953803138796987 4.085012415423989
Epoch 6 3.8174282185574797 3.9835261832922697
Epoch 7 3.708000301380307 3.909377660602331
Epoch 8 3.618253831500302 3.843034414574504
Epoch 9 3.5436900524382895 3.791941350325942
Epoch 10 3.4

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch 0 6.660688318570811 5.778303008526564

train defaultdict(<class 'list'>, {'precision@5': 0.4950643662042073, 'precision@10': 0.3686228732913232, 'precision@15': 0.29957729427838514, 'ndcg@5': 0.43480788570101825, 'ndcg@10': 0.40106146529834186, 'ndcg@15': 0.40264692729024043, 'ndcg@all': 0.5979595409261953})
valid defaultdict(<class 'list'>, {'precision@5': 0.4886313620954752, 'precision@10': 0.364665794884786, 'precision@15': 0.29524171468801796, 'ndcg@5': 0.42950741830281913, 'ndcg@10': 0.39538748539052904, 'ndcg@15': 0.3960166478063911, 'ndcg@all': 0.5915395962074399})
Epoch 1 5.324945762202828 5.020228695124388
Epoch 2 4.721947786388846 4.624636489897966
Epoch 3 4.364448431491318 4.374059092253447
Epoch 4 4.122655459774569 4.2000237219035625
Epoch 5 3.946797343411077 4.07274348847568
Epoch 6 3.8128681762221146 3.980337616056204
Epoch 7 3.7067422033824684 3.9047782253473997
Epoch 8 3.61971631776312 3.8462825547903776
Epoch 9 3.547406292976217 3.7962284926325083
Epoch 10 3.4848

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch 0 6.664650877005309 5.773086246103048

train defaultdict(<class 'list'>, {'precision@5': 0.498257783132064, 'precision@10': 0.3714763878240831, 'precision@15': 0.30144016623230024, 'ndcg@5': 0.4348877727117677, 'ndcg@10': 0.4021953883990729, 'ndcg@15': 0.40384280828452457, 'ndcg@all': 0.5987313622723501})
valid defaultdict(<class 'list'>, {'precision@5': 0.49290277319960296, 'precision@10': 0.366080722771585, 'precision@15': 0.2971776705235243, 'ndcg@5': 0.43077777069993317, 'ndcg@10': 0.39703822741284966, 'ndcg@15': 0.3981902238447219, 'ndcg@all': 0.592971992213279})
Epoch 1 5.320013808242952 5.020988542586565
Epoch 2 4.72333452175606 4.626024711877108
Epoch 3 4.368852274682194 4.379685170948505
Epoch 4 4.128945562099957 4.2076164819300175
Epoch 5 3.9518732012719884 4.080541856586933
Epoch 6 3.8148112411862125 3.9893021062016487
Epoch 7 3.70516827525644 3.9042945951223373
Epoch 8 3.616240431720439 3.841662349179387
Epoch 9 3.541642903480658 3.794379720464349
Epoch 10 3.479154856

[[{'epoch': 0,
   'train': defaultdict(list,
               {'precision@5': 0.49713169543900654,
                'precision@10': 0.3697350823799146,
                'precision@15': 0.3003241909044187,
                'ndcg@5': 0.43561317926024534,
                'ndcg@10': 0.401836245617968,
                'ndcg@15': 0.4032687182722807,
                'ndcg@all': 0.5988024110756689}),
   'valid': defaultdict(list,
               {'precision@5': 0.4913524240255356,
                'precision@10': 0.36465798201970756,
                'precision@15': 0.29605132224969566,
                'ndcg@5': 0.4313194779679179,
                'ndcg@10': 0.39652566192671657,
                'ndcg@15': 0.39757651812396944,
                'ndcg@all': 0.5928894239477813}),
   'test': defaultdict(list,
               {'precision@5': 0.4971287190780661,
                'precision@10': 0.3697371660061139,
                'precision@15': 0.3003239919330345,
                'ndcg@5': 0.43561462896134523,

In [20]:
# save config, training config
with open(os.path.join(save_dir, 'config.json'), 'w') as f:
    json.dump(config, f)
with open(os.path.join(save_dir, 'train_config.json'), 'w') as f:
    json.dump(train_config, f)

## Result
Run 5 times, different model seed, same train/valid/test split, mean/std
1. precision, recall, f1
2. precision, ndcg, semantic precision

Exp:
1. different doc encoder
2. different dataset(mpnet)
3. cross domain(mpnet)
4. different target(mpnet, agnews)(bow, tf-idf, keybert, yake)

* bow:
    3 dataset * bce * 4 models
* tf-idf:
    3 dataset * listnet * 4 models
* keybert, yake:
    agnews * listnet * 4 models
* cross domain