# Demo baseline

### document embedding decoder
1. demo utils
2. demo loss
3. demo evaluation

In [1]:
import os
import sys
from collections import defaultdict

import numpy as np 
import pandas as pd
import random

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torch.optim as optim
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

sys.path.append('../')
from utils.eval import retrieval_normalized_dcg_all, retrieval_precision_all, semantic_precision_all
from utils.loss import *
from utils.data_loader import load_document
from utils.toolbox import preprocess_document, get_preprocess_document, get_preprocess_document_embs,\
                          get_preprocess_document_labels, get_word_embs

In [2]:
# Notebook-wide settings. 'label_type' is the key used to pick the decoder
# targets out of `labels`.
config = {'label_type': 'tf-idf'}

## Data preprocess
1. filter special characters and punctuation (keep only English letters and digits)
2. filter stopwords
3. filter by term frequency
4. pos tagging

## Parameters

### preprocess parameters:
1. min word frequency (`min_df`)
2. max word frequency (`max_df`)
3. min words per document (`min_doc_word`)
4. POS tag selection

### training parameters:
1. decoder label
2. model parameters

## Load Data, Label
Available label types: bow, tf-idf, keybert, classification

In [3]:
# Dataset / preprocessing settings.
dataset_name = 'agnews'
min_df = 1
max_df = 1.0
vocabulary_size = 2000
min_doc_word = 15
model_name = 'average'
seed = 33

# Seed every RNG the notebook relies on so that Restart & Run All reproduces
# the same model initialisation and batch order. Previously `seed` was only
# handed to prepare_dataloader, so the torch RNG stayed unseeded.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Keyword arguments forwarded verbatim to get_preprocess_document.
preprocess_config = {
    'dataset_name': dataset_name,
    'min_df': min_df,
    'max_df': max_df,
    'vocabulary_size': vocabulary_size,
    'min_doc_word': min_doc_word,
}

unpreprocessed_docs, preprocessed_docs = get_preprocess_document(**preprocess_config)

Getting preprocess documents: agnews
min_df: 1 max_df: 1.0 vocabulary_size: 2000 min_doc_word: 15


Using custom data configuration default
Reusing dataset ag_news (/home/chrisliu/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


In [4]:
# Compute embeddings for the preprocessed documents; `model_name` is passed
# straight through to the helper.
# NOTE(review): presumably returns (embedding matrix, encoder object) — confirm in utils.toolbox.
doc_embs, doc_model = get_preprocess_document_embs(preprocessed_docs, model_name)

Getting preprocess documents embeddings
Using cuda 0 for training...


Batches:   0%|          | 0/635 [00:00<?, ?it/s]

In [5]:
# Build the decoder labels and their vocabularies from the preprocessed documents.
# NOTE(review): both are indexed by a label-type string such as 'tf-idf' later in
# this notebook; the full set of keys is defined in utils.toolbox — confirm there.
labels, vocabularys = get_preprocess_document_labels(preprocessed_docs)

Getting preprocess documents labels


In [6]:
# Decoder targets for the configured label type.
targets = labels[config['label_type']] 
# NOTE(review): the vocabulary key is hard-coded to 'tf-idf' instead of
# config['label_type']; the two only agree while label_type == 'tf-idf'.
# Confirm whether `vocabularys` has an entry per label type before changing it.
word_embs = get_word_embs(vocabularys['tf-idf'])
word_embs.shape

Getting word embeddings


0it [00:00, ?it/s]

Number of words:400001


(1811, 300)

In [7]:
# Word embeddings as a torch.FloatTensor. In the visible part of this notebook it
# is only referenced by the commented-out semantic-precision evaluation.
word_embs_tensor = torch.FloatTensor(word_embs)

In [8]:
# Cut-offs (k) for top-k metrics.
# NOTE(review): the training cells in this notebook build their own `train_config`
# with the same "topk" list; nothing visible reads `training_config` — confirm it is still needed.
training_config = {"topk": [5, 10, 15]}

## MLP Decoder

In [9]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
# (Assign device = 'cpu' here instead to force CPU execution.)
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

In [10]:
class DNNDecoderDataset(Dataset):
    """Pairs each document embedding with its target vector and two derived tensors.

    Args:
        doc_embs: per-document embeddings; converted with ``torch.FloatTensor``.
        targets: per-document target scores, one row per document; must have
            the same length as ``doc_embs``.

    Each item is a 4-tuple ``(doc_emb, target, target_rank, topk)``:
        * ``target_rank`` - column indices of the target row, sorted by
          descending score.
        * ``topk`` - number of strictly positive entries in the target row.
    """

    def __init__(self, doc_embs, targets):
        assert len(doc_embs) == len(targets)

        embs_tensor = torch.FloatTensor(doc_embs)
        targets_tensor = torch.FloatTensor(targets)

        self.doc_embs = embs_tensor
        self.targets = targets_tensor
        # Both derived tensors are computed once, up front, over all rows.
        self.targets_rank = targets_tensor.argsort(dim=1, descending=True)
        self.topk = (targets_tensor > 0).sum(dim=1)

    def __len__(self):
        return len(self.doc_embs)

    def __getitem__(self, idx):
        fields = (self.doc_embs, self.targets, self.targets_rank, self.topk)
        return tuple(field[idx] for field in fields)

In [11]:
def prepare_dataloader(doc_embs, targets, batch_size=100, train_valid_test_ratio=[0.7, 0.1, 0.2],
                       target_normalize=False, seed=123):
    """Shuffle the documents once and split them into train/valid/test loaders.

    Args:
        doc_embs: array of document embeddings, one row per document (must
            support integer-array indexing, e.g. a NumPy array).
        targets: array of target scores aligned row-by-row with ``doc_embs``.
        batch_size: batch size used by all three loaders.
        train_valid_test_ratio: ``[train, valid, test]`` fractions. Only the
            first two are read; the test split takes whatever remains.
        target_normalize: when True, each target row is divided by its sum.
        seed: seed for the NumPy permutation that defines the split.

    Returns:
        ``(train_loader, valid_loader, test_loader)``.

    Side effects:
        Prints the split sizes and reseeds NumPy's global RNG with ``seed``.
    """
    n_docs = len(doc_embs)
    train_size = int(n_docs * train_valid_test_ratio[0])
    valid_size = int(n_docs * (train_valid_test_ratio[0] + train_valid_test_ratio[1])) - train_size
    test_size = n_docs - train_size - valid_size

    print('Preparing dataloader')
    print('train size', train_size)
    print('valid size', valid_size)
    print('test size', test_size)

    if target_normalize:
        # Normalize the target summation of each document to 1.
        norm = targets.sum(axis=1).reshape(-1, 1)
        # Rows that sum to zero are left as all-zero instead of becoming NaN.
        norm = np.where(norm == 0, 1, norm)
        targets = targets / norm

    # Shuffle documents and targets with the same seeded permutation.
    randomize = np.arange(n_docs)
    np.random.seed(seed)
    np.random.shuffle(randomize)
    doc_embs = doc_embs[randomize]
    targets = targets[randomize]

    valid_end = train_size + valid_size

    train_dataset = DNNDecoderDataset(doc_embs[:train_size], targets[:train_size])
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

    # Evaluation loaders are not shuffled, so per-batch metrics are computed
    # over the same batches on every pass.
    valid_dataset = DNNDecoderDataset(doc_embs[train_size:valid_end], targets[train_size:valid_end])
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)

    # Fix: the test loader previously wrapped train_dataset, so the held-out
    # split was never served.
    test_dataset = DNNDecoderDataset(doc_embs[valid_end:], targets[valid_end:])
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)

    return train_loader, valid_loader, test_loader


In [12]:
# prepare dataloader
# Feed the loaders the same `targets` (selected through config['label_type'])
# that train_decoder uses to size the model's output layer. The label type was
# previously hard-coded to 'tf-idf' here, which would silently diverge from
# `targets` as soon as config['label_type'] changed.
train_loader, valid_loader, test_loader = prepare_dataloader(doc_embs, targets, batch_size=100,
                                                             train_valid_test_ratio=[0.7, 0.1, 0.2],
                                                             target_normalize=False, seed=seed)

Preparing dataloader
train size 88892
valid size 12699
test size 25398


In [18]:
class DNNDecoder(nn.Module):
    """Three-layer MLP mapping a document embedding to one score per vocabulary word.

    Layout: Linear(doc_emb_dim, h_dim) -> Tanh -> Linear(h_dim, h_dim) -> Tanh
    -> Linear(h_dim, num_words). No activation follows the last layer, so
    ``forward`` returns raw scores.

    Args:
        doc_emb_dim: size of the input document embedding.
        num_words: number of output scores.
        h_dim: width of both hidden layers.
    """

    def __init__(self, doc_emb_dim, num_words, h_dim=300):
        super().__init__()
        layer_dims = [doc_emb_dim, h_dim, h_dim, num_words]

        layers = []
        for in_dim, out_dim in zip(layer_dims[:-1], layer_dims[1:]):
            layers.append(nn.Linear(in_dim, out_dim))
            layers.append(nn.Tanh())
        # Drop the trailing Tanh: the output layer stays linear.
        layers.pop()

        self.decoder = nn.Sequential(*layers)

    def forward(self, x):
        return self.decoder(x)

In [27]:
def evaluate_DNNDecoder(model, data_loader, config):
    """Score ``model`` on every batch of ``data_loader`` and average the metrics.

    Args:
        model: decoder called as ``model(doc_embs)``.
        data_loader: yields 4-tuples whose first two entries are the document
            embeddings and the target scores; the other two are ignored.
        config: dict; ``config["topk"]`` is forwarded as ``k`` to the metric helpers.

    Returns:
        A ``defaultdict`` mapping ``'precision@<k>'`` and ``'ndcg@<k>'`` to the
        mean of the per-batch values. This is an unweighted mean over batches,
        so a smaller final batch counts as much as a full one.

    Side effects:
        Leaves ``model`` in eval mode and reads the module-level ``device``.
    """
    results = defaultdict(list)
    model.eval()

    # Inference only: without no_grad every forward pass would build and keep
    # an autograd graph, which wastes memory when scoring a whole split.
    with torch.no_grad():
        for doc_embs, target, _, _ in data_loader:
            doc_embs = doc_embs.to(device)
            target = target.to(device)

            pred = model(doc_embs)

            # Precision
            precision_scores = retrieval_precision_all(pred, target, k=config["topk"])
            for k, v in precision_scores.items():
                results['precision@{}'.format(k)].append(v)

            # Semantic precision (semantic_precision_all with word_embs_tensor)
            # is intentionally not computed here.

            # NDCG
            ndcg_scores = retrieval_normalized_dcg_all(pred, target, k=config["topk"])
            for k, v in ndcg_scores.items():
                results['ndcg@{}'.format(k)].append(v)

    # Collapse each list of per-batch values into a single mean.
    for k in results:
        results[k] = np.mean(results[k])

    return results

In [28]:
def calculate_loss(config, criterion, pred, target, target_rank, target_topk):
    """Call ``criterion`` with the arguments its loss family expects.

    Dispatch is driven by ``config["criterion"]``:
        * ``"MultiLabelMarginLoss"`` -> ``criterion(pred, target_rank)`` after
          writing -1 into the ``target_rank`` columns indexed by ``config["topk"]``.
        * names starting with ``"MultiLabelMarginLossCustomV"`` ->
          ``criterion(pred, target_rank, target_topk)``.
        * names starting with ``"MultiLabelMarginLossCustom"`` ->
          ``criterion(pred, target_rank, config["topk"])``.
        * anything else -> ``criterion(pred, target)``.

    Returns:
        Whatever ``criterion`` returns.

    Note:
        The first branch modifies the caller's ``target_rank`` tensor in place.
    """
    name = config["criterion"]

    if name == "MultiLabelMarginLoss":
        # nn.MultiLabelMarginLoss only reads targets up to the first -1.
        target_rank[:, config["topk"]] = -1
        return criterion(pred, target_rank)

    # The "...CustomV" prefix must be tested first: it also matches "...Custom".
    if name.startswith("MultiLabelMarginLossCustomV"):
        return criterion(pred, target_rank, target_topk)

    if name.startswith("MultiLabelMarginLossCustom"):
        return criterion(pred, target_rank, config["topk"])

    return criterion(pred, target)

def train_decoder(doc_embs, targets, config):
    """Train a ``DNNDecoder`` with SGD and periodically evaluate it.

    Args:
        doc_embs: document embeddings; only ``doc_embs.shape[1]`` is read here,
            to size the model's input layer.
        targets: target scores; only ``targets.shape[1]`` is read here, to size
            the model's output layer.
        config: dict with the keys ``"h_dim"``, ``"lr"``, ``"momentum"``,
            ``"weight_decay"``, ``"criterion"``, ``"n_epoch"``,
            ``"valid_epoch"``, ``"verbose"`` and ``"topk"``.

    Returns:
        A list with one dict per evaluated epoch, holding ``'epoch'`` plus the
        validation metrics from ``evaluate_DNNDecoder``. This list was
        previously built and then discarded.

    Note:
        The batches come from the module-level ``train_loader`` and
        ``valid_loader``, not from the ``doc_embs`` / ``targets`` arguments;
        the module-level ``device`` is used as well.
    """
    model = DNNDecoder(doc_emb_dim=doc_embs.shape[1], num_words=targets.shape[1],
                       h_dim=config["h_dim"]).to(device)

    opt = torch.optim.SGD(model.parameters(), lr=config["lr"], momentum=config["momentum"],
                          weight_decay=config["weight_decay"])

    # prepare loss
    criterion_name = config["criterion"]
    if criterion_name == "MultiLabelMarginLoss":
        criterion = nn.MultiLabelMarginLoss(reduction='mean')
    elif criterion_name == "MultiLabel":
        criterion = nn.BCEWithLogitsLoss(reduction='mean')
    elif criterion_name.startswith("MultiLabelMarginLossCustomV"):
        # The number after the last ':' becomes the loss's 4th argument; it is
        # parsed once here instead of on every call.
        criterion_arg = float(criterion_name.split(':')[-1])

        def criterion(a, b, c):
            return MultiLabelMarginLossCustomV(a, b, c, criterion_arg)
    elif criterion_name.startswith("MultiLabelMarginLossCustom"):
        criterion_arg = float(criterion_name.split(':')[-1])

        def criterion(a, b, c):
            return MultiLabelMarginLossCustom(a, b, c, criterion_arg)
    else:
        # SECURITY NOTE: eval() executes arbitrary code. That is acceptable only
        # while the criterion name is written by the notebook author; never pass
        # an externally supplied string here.
        criterion = eval(criterion_name)

    def batch_loss(batch):
        """Move one loader batch to ``device`` and return its loss."""
        batch_embs, target, target_rank, target_topk = (t.to(device) for t in batch)
        pred = model(batch_embs)
        return calculate_loss(config, criterion, pred, target, target_rank, target_topk)

    results = []
    n_epoch = config["n_epoch"]
    valid_epoch = config["valid_epoch"]
    verbose = config["verbose"]

    for epoch in tqdm(range(n_epoch)):
        train_loss_his = []
        valid_loss_his = []

        # evaluate_DNNDecoder leaves the model in eval mode, so switch back
        # at the start of every epoch.
        model.train()
        for batch in train_loader:
            loss = batch_loss(batch)
            train_loss_his.append(loss.item())

            # Model backwarding
            model.zero_grad()
            loss.backward()
            opt.step()

        model.eval()
        # The validation loss is only reported, never back-propagated, so skip
        # autograd bookkeeping.
        with torch.no_grad():
            for batch in valid_loader:
                valid_loss_his.append(batch_loss(batch).item())

        print("Epoch", epoch, np.mean(train_loss_his), np.mean(valid_loss_his))

        # show decoder result
        if valid_epoch > 0 and epoch % valid_epoch == 0:
            train_res_ndcg = evaluate_DNNDecoder(model, train_loader, config)
            valid_res_ndcg = evaluate_DNNDecoder(model, valid_loader, config)

            res = {'epoch': epoch}
            res.update(valid_res_ndcg)
            results.append(res)

            if verbose:
                print()
                print('train', train_res_ndcg)
                print('valid', valid_res_ndcg)

    return results

In [None]:
def ListNet_sigmoid_L1(y_pred, y_true, eps=1e-10):
    """ListNet-style cross-entropy with sigmoid + L1 normalisation in place of softmax.

    Both inputs are squashed with a sigmoid and then L1-normalised along
    ``dim=1`` so that every row sums to 1. The loss is the cross-entropy
    between the two resulting row distributions, averaged over rows.

    Args:
        y_pred: decoded scores, one row per document
            (e.g. a predicted tf-idf score for each word).
        y_true: reference scores with the same layout as ``y_pred``.
        eps: small constant added before the log so that log(0) never occurs.

    Returns:
        Scalar tensor: mean over rows of ``sum(-true_dist * log(pred_dist + eps))``.

    NOTE(review): the sigmoid is applied to ``y_true`` as well. If the targets
    are small non-negative scores, the target distribution ends up close to
    uniform and the loss sits near log(num_words) — confirm this is intended.
    """
    def _row_distribution(scores):
        # sigmoid -> (0, 1), then divide each row by its L1 norm.
        return torch.nn.functional.normalize(torch.sigmoid(scores), dim=1, p=1)

    pred_dist = _row_distribution(y_pred)
    true_dist = _row_distribution(y_true)

    log_pred = torch.log(pred_dist + eps)
    per_row = torch.sum(-true_dist * log_pred, dim=1)
    return torch.mean(per_row)

In [29]:
# Run 1: ListNet_sigmoid_L1 loss, plain SGD with lr=0.5, 200 epochs,
# metrics evaluated every 10th epoch.
train_config = dict(
    # optimiser
    lr=0.5,
    momentum=0.0,
    weight_decay=0.0,
    # schedule / logging
    n_epoch=200,
    verbose=True,
    valid_epoch=10,
    # metric cut-offs
    topk=[5, 10, 15],
    # model / loss
    h_dim=3000,
    criterion="ListNet_sigmoid_L1",
)

train_decoder(doc_embs, targets, train_config)

  0%|          | 0/200 [00:00<?, ?it/s]

Epoch 0 7.501767903532837 7.501728005296602

train defaultdict(<class 'list'>, {'precision@5': 0.006756688158590393, 'precision@10': 0.006621607157890973, 'precision@15': 0.0065966974488167246, 'ndcg@5': 0.004793891958086736, 'ndcg@10': 0.00574605351323639, 'ndcg@15': 0.007053534532056469, 'ndcg@all': 0.2186133117657008})
valid defaultdict(<class 'list'>, {'precision@5': 0.006960948194032462, 'precision@10': 0.006913942620378193, 'precision@15': 0.006667250219189863, 'ndcg@5': 0.0050865688107313775, 'ndcg@10': 0.006069583674401456, 'ndcg@15': 0.007187034040102808, 'ndcg@all': 0.21909274888320232})
Epoch 1 7.501719417400381 7.501713737728089
Epoch 2 7.501710791957928 7.501708199658732
Epoch 3 7.5017062789007305 7.501704384961466
Epoch 4 7.501702877748402 7.501701339961976
Epoch 5 7.501699967378975 7.501698632878582
Epoch 6 7.5016974319355025 7.501696229919674
Epoch 7 7.501695173261404 7.501694052238164
Epoch 8 7.501693130209958 7.501692114852545
Epoch 9 7.501691263625807 7.5016903802165

Epoch 81 7.501653353730793 7.501653299556942
Epoch 82 7.501653136499017 7.501653111825778
Epoch 83 7.501652941258605 7.501652924094613
Epoch 84 7.501652735827074 7.501652665025606
Epoch 85 7.501652543268536 7.501652499822181
Epoch 86 7.501652362510243 7.5016523308641325
Epoch 87 7.501652176388203 7.501652116850605
Epoch 88 7.50165199777541 7.501651955401804
Epoch 89 7.501651811653369 7.501651778934509
Epoch 90 7.501651612658335 7.501651553657111

train defaultdict(<class 'list'>, {'precision@5': 0.316838555985012, 'precision@10': 0.21001075883684062, 'precision@15': 0.16182547412955184, 'ndcg@5': 0.28214632822839697, 'ndcg@10': 0.2544231286567832, 'ndcg@15': 0.256947159700104, 'ndcg@all': 0.4511785108943892})
valid defaultdict(<class 'list'>, {'precision@5': 0.3160599653176435, 'precision@10': 0.2091517528211038, 'precision@15': 0.16176702011757949, 'ndcg@5': 0.28177890263673827, 'ndcg@10': 0.253564078155465, 'ndcg@15': 0.2562863826751709, 'ndcg@all': 0.4499996330325059})
Epoch 91 7.50

Epoch 166 7.501642147789387 7.501642178362749
Epoch 167 7.501642064651301 7.5016421220434
Epoch 168 7.501641984195087 7.50164202066857
Epoch 169 7.501641920902866 7.501641975613091
Epoch 170 7.501641819528037 7.501641919293742

train defaultdict(<class 'list'>, {'precision@5': 0.4651457890281527, 'precision@10': 0.3173273308674047, 'precision@15': 0.24496611630956824, 'ndcg@5': 0.40520619330309643, 'ndcg@10': 0.37460058957707093, 'ndcg@15': 0.38100304089997833, 'ndcg@all': 0.555061140800622})
valid defaultdict(<class 'list'>, {'precision@5': 0.4616263431357586, 'precision@10': 0.3150881207364751, 'precision@15': 0.2437947026387913, 'ndcg@5': 0.4038074349793862, 'ndcg@10': 0.372575097900676, 'ndcg@15': 0.37901224958615043, 'ndcg@all': 0.5529456870762381})
Epoch 171 7.501641739608198 7.501641825428159
Epoch 172 7.501641650569988 7.5016417353172
Epoch 173 7.501641572259274 7.501641637696995
Epoch 174 7.501641485366564 7.501641585132268
Epoch 175 7.501641397937479 7.501641491266686
Epoch 1

In [None]:
# Run 2: same model and loss as the previous run, with lr=1 and 500 epochs.
train_config = dict(
    # optimiser
    lr=1,
    momentum=0.0,
    weight_decay=0.0,
    # schedule / logging
    n_epoch=500,
    verbose=True,
    valid_epoch=10,
    # metric cut-offs
    topk=[5, 10, 15],
    # model / loss
    h_dim=3000,
    criterion="ListNet_sigmoid_L1",
)

train_decoder(doc_embs, targets, train_config)

  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 0 7.501745184307292 7.501713632598636

train defaultdict(<class 'list'>, {'precision@5': 0.008272998606161758, 'precision@10': 0.008183107595745143, 'precision@15': 0.008100390002127271, 'ndcg@5': 0.005896730493299487, 'ndcg@10': 0.007019600477404851, 'ndcg@15': 0.008498310407804884, 'ndcg@all': 0.22099509793338948})
valid defaultdict(<class 'list'>, {'precision@5': 0.008504414346113097, 'precision@10': 0.008197486701267442, 'precision@15': 0.008179273478334814, 'ndcg@5': 0.005920075453033187, 'ndcg@10': 0.006942194800339933, 'ndcg@15': 0.008500429436813776, 'ndcg@all': 0.22120623888931876})
Epoch 1 7.501708239886838 7.501704118383213
Epoch 2 7.501701056756104 7.501698268680122
Epoch 3 7.501695944568303 7.501693778150663
Epoch 4 7.501691885284149 7.501690132411446
Epoch 5 7.501688553324164 7.501687057374969
Epoch 6 7.501685639200114 7.501684324009212
Epoch 7 7.501683147202997 7.50168204119825
Epoch 8 7.501680955039369 7.501679987419308
Epoch 9 7.501679023553872 7.501678117616909


Epoch 81 7.501642518424344 7.501642520033468
Epoch 82 7.5016423199656845 7.5016422797375775
Epoch 83 7.501642162271506 7.501642152080386
Epoch 84 7.5016419831223375 7.501641960594598
Epoch 85 7.501641820064411 7.501641862974392
Epoch 86 7.501641675243228 7.501641705280214
Epoch 87 7.501641506821554 7.501641528812919
Epoch 88 7.50164132820876 7.501641374873364
Epoch 89 7.501641160859837 7.501641172123707
Epoch 90 7.501641026766148 7.501641082012747

train defaultdict(<class 'list'>, {'precision@5': 0.47720672631022365, 'precision@10': 0.32649933604609443, 'precision@15': 0.25252911668057115, 'ndcg@5': 0.41745542246257494, 'ndcg@10': 0.38653486353608163, 'ndcg@15': 0.39373273606107256, 'ndcg@all': 0.5653792047795616})
valid defaultdict(<class 'list'>, {'precision@5': 0.47233531770743725, 'precision@10': 0.3241271726259096, 'precision@15': 0.2508117495559332, 'ndcg@5': 0.4145348158877666, 'ndcg@10': 0.3838937460906862, 'ndcg@15': 0.39049000298883035, 'ndcg@all': 0.5625948267658865})
Epoch

In [None]:
# Run 3: same model and loss again, with a smaller step size (lr=0.05), 500 epochs.
train_config = dict(
    # optimiser
    lr=0.05,
    momentum=0.0,
    weight_decay=0.0,
    # schedule / logging
    n_epoch=500,
    verbose=True,
    valid_epoch=10,
    # metric cut-offs
    topk=[5, 10, 15],
    # model / loss
    h_dim=3000,
    criterion="ListNet_sigmoid_L1",
)

train_decoder(doc_embs, targets, train_config)

In [None]:
# train_config["lr"] = 0.05
# train_config["criterion"] = "MultiLabelMarginLossCustomV:1"
# train_decoder(doc_embs, targets, train_config)

In [None]:
# train_config["lr"] = 0.1
# train_config["criterion"] = "MultiLabelMarginLossCustom:1"
# train_decoder(doc_embs, targets, train_config)