# MLP baseline

In [1]:
import os
import sys
from collections import defaultdict
import numpy as np 
import pandas as pd
import json

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torch.optim as optim
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

sys.path.append('../')
from utils.eval import retrieval_normalized_dcg_all, retrieval_precision_all, retrieval_precision_all_v2, semantic_precision_all, semantic_precision_all_v2, precision_recall_f1_all
from utils.loss import *
from utils.data_loader import load_document
from utils.toolbox import preprocess_document, get_preprocess_document, get_preprocess_document_embs,\
                          get_preprocess_document_labels, get_preprocess_document_labels_v2, get_word_embs,\
                          get_free_gpu, merge_targets


## Load Data, Label
Label types: bow, tf-idf, keybert, classification (selected with `label_type` in the configuration cell below).

In [2]:
# ---- experiment selection ----
dataset = '20news'
dataset2 = None  # second dataset for the cross-domain setting; None disables it
model_name = 'average'
label_type = 'tf-idf'
eval_f1 = False  # use binary (F1) evaluation or rank evaluation
criterion = 'BCE'  # alternative: 'ListNet_sigmoid_L1'
preprocess_config_dir = 'parameters_baseline2'  # which preprocess config to use
n_gram = 1

# ---- optimisation hyper-parameters ----
lr = 1e-3
n_epoch = 300
valid_epoch = 10
h_dim = 300
target_normalization = False

# ---- repetition / reproducibility ----
n_time = 1  # how many times to train
seed = 33

# The cross-domain naming is used whenever `dataset2` is truthy.
experiment_dir = (
    f'cross_{dataset}_{dataset2}_{model_name}_{label_type}_{criterion}'
    if dataset2
    else f'{dataset}_{model_name}_{label_type}_{criterion}'
)
save_dir = 'default'

# Everything above collected in one dictionary (key order is kept as before).
config = {
    'experiment_dir': experiment_dir,
    'preprocess_config_dir': preprocess_config_dir,
    'save_dir': save_dir,
    'dataset': dataset,
    'dataset2': dataset2,
    'model_name': model_name,
    'label_type': label_type,
    'eval_f1': eval_f1,
    'n_gram': n_gram,
    'criterion': criterion,
    'n_time': n_time,
    'seed': seed,
    'lr': lr,
    'n_epoch': n_epoch,
    'valid_epoch': valid_epoch,
    'h_dim': h_dim,
    'target_normalization': target_normalization,
}
        
# Output directory for this run: experiment/<experiment_dir>/<save_dir>.
save_dir_parts = ('experiment', config['experiment_dir'], config['save_dir'])
save_dir = os.path.join(*save_dir_parts)
# exist_ok keeps its default (False), so an already existing directory raises
# FileExistsError instead of being silently reused.
os.makedirs(save_dir)

In [3]:
def load_training_data(config, dataset):
    """Load documents, document embeddings and label targets for one dataset.

    Args:
        config: experiment configuration. The keys read here are
            'preprocess_config_dir', 'model_name', 'n_gram' and 'label_type'.
        dataset: dataset name; selects `preprocess_config_<dataset>.json`
            inside `../chris/<preprocess_config_dir>`.

    Returns:
        Tuple `(unpreprocessed_docs, preprocessed_docs, doc_embs, targets,
        vocabularys, device)`. `targets` is the label matrix of the configured
        label type converted to a dense array with `.toarray()`; `device` is
        whatever `get_preprocess_document_embs` returned.
    """
    preprocess_config_dir = config['preprocess_config_dir']
    preprocess_config_path = os.path.join(f'../chris/{preprocess_config_dir}',
                                          f'preprocess_config_{dataset}.json')
    with open(preprocess_config_path, 'r') as f:
        preprocess_config = json.load(f)

    # load preprocess dataset
    unpreprocessed_docs, preprocessed_docs = get_preprocess_document(**preprocess_config)
    print('doc num', len(preprocessed_docs))

    # get document embeddings
    # The encoder name is taken from `config` (not from the notebook-level
    # `model_name` variable) so the function only depends on its arguments.
    doc_embs, doc_model, device = get_preprocess_document_embs(preprocessed_docs, config['model_name'])
    print('doc_embs', doc_embs.shape)

    # load labels
    labels, vocabularys = get_preprocess_document_labels_v2(preprocessed_docs, preprocess_config,
                                                            preprocess_config_dir, config['n_gram'])
    # check nonzero numbers
    for k in labels:
        print(k, np.sum(labels[k] != 0), labels[k].shape)
    print(len(vocabularys))

    # select label type
    targets = labels[config['label_type']].toarray()

    return unpreprocessed_docs, preprocessed_docs, doc_embs, targets, vocabularys, device

In [4]:
# Load everything for the primary dataset named in the configuration.
(unpreprocessed_docs, preprocessed_docs, doc_embs,
 targets, vocabularys, device) = load_training_data(config, config['dataset'])

Getting preprocess documents: 20news
min_df: 62 max_df: 1.0 vocabulary_size: None min_doc_word: 15
doc num 18589
Getting preprocess documents embeddings
Using cuda 0 for training...


Batches:   0%|          | 0/372 [00:00<?, ?it/s]

doc_embs (18589, 300)
Getting preprocess documents labels
Finding precompute_keyword by preprocess_config {'dataset': '20news', 'min_df': 62, 'max_df': 1.0, 'vocabulary_size': None, 'min_doc_word': 15}
tf-idf 1092802 (18589, 4823)
bow 1092802 (18589, 4823)
keybert 1028492 (18589, 4823)
yake 892783 (18589, 4823)
4823


In [5]:
# Cross-domain setting: load the second dataset and merge both label sets.
cross_dataset = config['dataset2']
if cross_dataset is not None:
    (unpreprocessed_docs2, preprocessed_docs2, doc_embs2,
     targets2, vocabularys2, device) = load_training_data(config, cross_dataset)
    # NOTE(review): presumably aligns both target matrices on a shared vocabulary -
    # confirm against utils.toolbox.merge_targets.
    targets, targets2, vocabularys = merge_targets(targets, targets2, vocabularys, vocabularys2)
    

In [6]:
# Word embeddings for the vocabulary, kept both as returned and as a float tensor.
word_embs = get_word_embs(vocabularys)
word_embs_tensor = torch.FloatTensor(word_embs)
print('word_embs', word_embs.shape)

0it [00:00, ?it/s]

Number of words:400001
Getting [ndarray] word embeddings
word_embs (4823, 300)


## MLP Decoder

In [7]:
class DNNDecoderDataset(Dataset):
    """Pairs document embeddings with their word-level targets.

    Each item is `(doc_emb, target, target_rank, topk)` where `target_rank`
    holds the target indices sorted by descending target value and `topk` is
    the number of strictly positive target entries of that document.
    """

    def __init__(self, doc_embs, targets):
        n_docs, n_targets = len(doc_embs), len(targets)
        assert n_docs == n_targets

        self.doc_embs = torch.FloatTensor(doc_embs)
        self.targets = torch.FloatTensor(targets)
        # Word indices of every document ordered from highest to lowest target value.
        self.targets_rank = self.targets.argsort(dim=1, descending=True)
        # Number of entries with a positive target value per document.
        self.topk = (self.targets > 0).sum(dim=1)

    def __getitem__(self, idx):
        item = (self.doc_embs[idx], self.targets[idx], self.targets_rank[idx], self.topk[idx])
        return item

    def __len__(self):
        return self.doc_embs.shape[0]

In [8]:
def prepare_dataloader(doc_embs, targets, batch_size=100, train_valid_test_ratio=[0.7, 0.1, 0.2],\
                       target_normalize=False, seed=123):
    """Shuffle the documents and split them into train/valid/test loaders.

    Args:
        doc_embs: document embeddings; indexed with an integer index array, so
            it must support numpy-style fancy indexing.
        targets: per-document target matrix, row-aligned with `doc_embs`.
        batch_size: batch size used by all three loaders.
        train_valid_test_ratio: split fractions. Only the first two entries are
            read; the test split is whatever remains.
        target_normalize: if True, scale every target row so that it sums to 1.
        seed: seed for the shuffle. It is applied with `np.random.seed`, i.e. it
            also reseeds numpy's global random state.

    Returns:
        `(train_loader, valid_loader, test_loader)`.
    """
    n_docs = len(doc_embs)
    train_size = int(n_docs * train_valid_test_ratio[0])
    valid_size = int(n_docs * (train_valid_test_ratio[0] + train_valid_test_ratio[1])) - train_size
    test_size = n_docs - train_size - valid_size

    print('Preparing dataloader')
    print('train size', train_size)
    print('valid size', valid_size)
    print('test size', test_size)

    if target_normalize:
        # normalize target summation of each document to 1
        norm = targets.sum(axis=1).reshape(-1, 1)
        # Rows that sum to zero would otherwise be divided by zero and turn
        # into NaN/inf targets; dividing them by 1 leaves them unchanged.
        norm[norm == 0] = 1
        targets = (targets / norm)

    # shuffle
    randomize = np.arange(n_docs)
    np.random.seed(seed)
    np.random.shuffle(randomize)
    doc_embs = doc_embs[randomize]
    targets = targets[randomize]

    valid_end = train_size + valid_size

    # dataloader
    train_dataset = DNNDecoderDataset(doc_embs[:train_size], targets[:train_size])
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

    # Evaluation loaders are not shuffled: the metrics are averaged per batch,
    # so a fixed batch composition keeps repeated evaluations comparable.
    valid_dataset = DNNDecoderDataset(doc_embs[train_size:valid_end], targets[train_size:valid_end])
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)

    test_dataset = DNNDecoderDataset(doc_embs[valid_end:], targets[valid_end:])
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)

    return train_loader, valid_loader, test_loader


In [9]:
# prepare dataloader
# Both calls share the same split settings, so they are collected once.
loader_kwargs = dict(
    batch_size=64,
    train_valid_test_ratio=[0.7, 0.1, 0.2],
    target_normalize=config['target_normalization'],
    seed=seed,
)
train_loader, valid_loader, test_loader = prepare_dataloader(doc_embs, targets, **loader_kwargs)
if config['dataset2'] is not None:
    # Cross-domain: keep the train/valid loaders of the first dataset and take
    # only the test loader from the second one.
    _, _, test_loader = prepare_dataloader(doc_embs2, targets2, **loader_kwargs)

Preparing dataloader
train size 13012
valid size 1859
test size 3718


In [10]:
class DNNDecoder(nn.Module):
    """MLP that maps a document embedding to one score per vocabulary word.

    Two hidden layers of width `h_dim` with tanh activations, followed by a
    linear output layer of size `num_words`. No activation is applied to the
    output, so `forward` returns raw scores.
    """

    def __init__(self, doc_emb_dim, num_words, h_dim=300):
        super().__init__()
        layers = [
            nn.Linear(doc_emb_dim, h_dim),
            nn.Tanh(),
            nn.Linear(h_dim, h_dim),
            nn.Tanh(),
            nn.Linear(h_dim, num_words),
        ]
        self.decoder = nn.Sequential(*layers)

    def forward(self, x):
        scores = self.decoder(x)
        return scores

In [11]:
def evaluate_DNNDecoder(model, data_loader, config, pred_semantic=False):
    """Evaluate `model` on every batch of `data_loader` and average per batch.

    Relies on the notebook-level names `device`, `word_embs_tensor` and
    `vocabularys`.

    Args:
        model: decoder to evaluate; it is switched to eval mode and left there.
        data_loader: yields `(doc_embs, target, target_rank, topk)` batches;
            only the first two elements are used.
        config: reads 'eval_f1' and, for ranking evaluation, 'valid_topk'.
        pred_semantic: if True (ranking evaluation only), also compute the two
            semantic precision variants.

    Returns:
        Mapping from metric name to the mean of the per-batch values.
    """
    results = defaultdict(list)
    model.eval()

    # Pure inference: disable autograd so no graph is built for the predictions.
    with torch.no_grad():
        # predict all data
        for doc_embs, target, _, _ in data_loader:
            doc_embs = doc_embs.to(device)
            target = target.to(device)

            pred = model(doc_embs)

            if config['eval_f1']:
                # Precision / Recall / F1
                p, r, f = precision_recall_f1_all(pred, target)
                results['precision'].append(p)
                results['recall'].append(r)
                results['f1_score'].append(f)
                continue

            topk = config["valid_topk"]

            # Ranking metrics that all share the (pred, target, k) signature.
            ranking_metrics = (
                ('precision', retrieval_precision_all),
                ('precisionv2', retrieval_precision_all_v2),
                ('ndcg', retrieval_normalized_dcg_all),
            )
            for prefix, metric_fn in ranking_metrics:
                for k, v in metric_fn(pred, target, k=topk).items():
                    results['{}@{}'.format(prefix, k)].append(v)

            # Semantic Precision
            if pred_semantic:
                semantic_metrics = (
                    ('semantic_precision', semantic_precision_all),
                    ('semantic_precision_v2', semantic_precision_all_v2),
                )
                for prefix, metric_fn in semantic_metrics:
                    # The second return value (word_result) is not used here.
                    scores, _ = metric_fn(pred, target, word_embs_tensor, vocabularys,
                                          k=topk, th=0.5, display_word_result=False)
                    for k, v in scores.items():
                        results['{}@{}'.format(prefix, k)].append(v)

    # Collapse every list of per-batch values into its mean.
    for k in results:
        results[k] = np.mean(results[k])

    return results

In [12]:
def calculate_loss(train_train_config, criterion, pred, target, target_rank, target_topk):
    """Compute the training loss for one batch with the configured criterion.

    Args:
        train_train_config: training configuration; reads 'criterion' and, for
            the "MultiLabelMarginLossCustom" family, 'loss_topk'.
        criterion: callable loss matching the configured criterion name.
        pred: model output for the batch.
        target: target values for the batch.
        target_rank: per-row target indices sorted by descending target value.
        target_topk: per-row number of positive targets.

    Returns:
        Whatever `criterion` returns for the batch.
    """
    # Use the configuration that was passed in. The body previously read a
    # global `train_config`, which ignored this argument and raised NameError
    # whenever no such global existed.
    train_config = train_train_config
    loss_name = train_config["criterion"]

    if loss_name == "MultiLabelMarginLoss":
        assert target_rank.shape[0] == len(target_topk)
        # Work on a copy so the caller's batch tensor is not modified.
        target_rank = target_rank.clone()
        num_classes = target_rank.shape[1]
        for i in range(len(target_topk)):
            n_pos = int(target_topk[i])
            # nn.MultiLabelMarginLoss reads target indices up to the first -1.
            # When every class is positive there is no slot for the terminator
            # (and none is needed), so skip the write instead of indexing out
            # of bounds.
            if n_pos < num_classes:
                target_rank[i, n_pos] = -1
        loss = criterion(pred, target_rank)
    elif loss_name.startswith("MultiLabelMarginLossCustomV"):
        # Must be tested before the shorter "MultiLabelMarginLossCustom" prefix.
        loss = criterion(pred, target_rank, target_topk)
    elif loss_name.startswith("MultiLabelMarginLossCustom"):
        loss = criterion(pred, target_rank, train_config["loss_topk"])
    else:
        loss = criterion(pred, target)

    return loss
    
def train_decoder(doc_embs, targets, train_config):
    """Train one DNNDecoder and periodically evaluate it.

    `doc_embs` and `targets` are only used for their second dimension (input
    and output size of the model). The batches come from the notebook-level
    `train_loader`, `valid_loader` and `test_loader`, and tensors are moved to
    the notebook-level `device`.

    Args:
        doc_embs: document embeddings; `doc_embs.shape[1]` is the input size.
        targets: target matrix; `targets.shape[1]` is the output size.
        train_config: reads 'h_dim', 'lr', 'weight_decay', 'criterion',
            'n_epoch', 'valid_epoch' and 'valid_verbose' here, and is passed on
            to `calculate_loss` and `evaluate_DNNDecoder`.

    Returns:
        List with one dict per evaluated epoch, holding 'epoch' and the
        'train' / 'valid' / 'test' metric mappings.
    """
    model = DNNDecoder(doc_emb_dim=doc_embs.shape[1], num_words=targets.shape[1],\
                       h_dim=train_config["h_dim"]).to(device)
    model.train()

    opt = torch.optim.Adam(model.parameters(), lr=train_config["lr"], weight_decay=train_config["weight_decay"])
    # prepare loss
    if train_config["criterion"] == "MultiLabelMarginLoss":
        criterion = nn.MultiLabelMarginLoss(reduction='mean')
    elif train_config["criterion"] == "BCE":
        criterion = nn.BCEWithLogitsLoss(reduction='mean')
    elif train_config["criterion"].startswith("MultiLabelMarginLossCustomV"):
        # The number after the last ':' in the criterion name is forwarded as
        # the fourth argument of the loss.
        def criterion(a, b, c): return MultiLabelMarginLossCustomV(
            a, b, c, float(train_config["criterion"].split(':')[-1]))
    elif train_config["criterion"].startswith("MultiLabelMarginLossCustom"):
        def criterion(a, b, c): return MultiLabelMarginLossCustom(
            a, b, c, float(train_config["criterion"].split(':')[-1]))
    else:
        # NOTE(review): eval() executes the configured string as Python code.
        # Acceptable for a hand-written notebook config; never feed it
        # untrusted input.
        criterion = eval(train_config["criterion"])

    results = []
    n_epoch = train_config["n_epoch"]
    valid_epoch = train_config["valid_epoch"]
    valid_verbose = train_config["valid_verbose"]

    for epoch in tqdm(range(n_epoch)):
        train_loss_his = []
        valid_loss_his = []

        # evaluate_DNNDecoder leaves the model in eval mode, so switch back
        # at the start of every epoch.
        model.train()

        for batch in train_loader:
            # Batch-local names keep the `doc_embs` argument from being shadowed.
            batch_embs, target, target_rank, target_topk = (t.to(device) for t in batch)

            # loss
            pred = model(batch_embs)
            loss = calculate_loss(train_config, criterion, pred, target, target_rank, target_topk)
            train_loss_his.append(loss.item())

            # Model backwarding
            opt.zero_grad()
            loss.backward()
            opt.step()

        model.eval()
        # The validation loss is only reported, never back-propagated, so no
        # autograd graph is needed.
        with torch.no_grad():
            for batch in valid_loader:
                batch_embs, target, target_rank, target_topk = (t.to(device) for t in batch)

                # loss
                pred = model(batch_embs)
                loss = calculate_loss(train_config, criterion, pred, target, target_rank, target_topk)
                valid_loss_his.append(loss.item())

        print("Epoch", epoch, np.mean(train_loss_his), np.mean(valid_loss_his))

        # show decoder result
        if (valid_epoch > 0 and epoch % valid_epoch == 0) or epoch == n_epoch-1:
            res = {}
            res['epoch'] = epoch

            train_res_ndcg = evaluate_DNNDecoder(model, train_loader, train_config, True)
            valid_res_ndcg = evaluate_DNNDecoder(model, valid_loader, train_config, True)
            test_res_ndcg = evaluate_DNNDecoder(model, test_loader, train_config, True)

            res['train'] = train_res_ndcg
            res['valid'] = valid_res_ndcg
            res['test'] = test_res_ndcg
            results.append(res)

            if valid_verbose:
                print()
                print('train', train_res_ndcg)
                print('valid', valid_res_ndcg)
                print('test', test_res_ndcg)
    return results

def train_experiment(n_time):
    """Train the decoder `n_time` times and store all results as JSON.

    Uses the notebook-level `doc_embs`, `targets`, `train_config`, `seed` and
    `save_dir`.

    Args:
        n_time: number of independent training runs.

    Returns:
        List with the `train_decoder` result of every run. The same list is
        written to `<save_dir>/result.json`.
    """
    # train n_time in different seed
    results = []
    for run_idx in range(n_time):
        # Seed torch per run (weight initialisation, loader shuffling): every
        # run gets its own seed and the whole experiment can be reproduced.
        torch.manual_seed(seed + run_idx)
        result = train_decoder(doc_embs, targets, train_config)
        results.append(result)

    with open(os.path.join(save_dir, 'result.json'), 'w') as f:
        json.dump(results, f)

    return results

In [13]:
# Training settings: values copied from `config` plus a few fixed constants.
train_config = {
    **{name: config[name] for name in ('n_time', 'lr')},
    'weight_decay': 0.0,
    'loss_topk': 15,

    **{name: config[name] for name in ('n_epoch', 'valid_epoch')},
    'valid_verbose': True,
    'valid_topk': [5, 10, 15],

    **{name: config[name] for name in ('h_dim', 'label_type', 'eval_f1', 'criterion')},
}

In [None]:
# Run the configured number of training repetitions.
train_experiment(n_time=train_config['n_time'])

  0%|          | 0/300 [00:00<?, ?it/s]

Epoch 0 0.03918401733972132 0.00878793808321158

train defaultdict(<class 'list'>, {'precision@5': 0.7035968283227846, 'precision@10': 0.5397258045918801, 'precision@15': 0.42988359971958046, 'precisionv2@5': 0.005438112856948054, 'precisionv2@10': 0.014342831108746502, 'precisionv2@15': 0.030004086876835895, 'ndcg@5': 0.10569694722253903, 'ndcg@10': 0.10623866889406652, 'ndcg@15': 0.10661182384572777, 'ndcg@all': 0.3901283904617908, 'semantic_precision@5': 0.7176378676470588, 'semantic_precision@10': 0.5782291666666666, 'semantic_precision@15': 0.5771006944444445, 'semantic_precision_v2@5': 0.009025735294117647, 'semantic_precision_v2@10': 0.031421568627450984, 'semantic_precision_v2@15': 0.10375612745098038})
valid defaultdict(<class 'list'>, {'precision@5': 0.7156250178813934, 'precision@10': 0.5446875125169754, 'precision@15': 0.434016223748525, 'precisionv2@5': 0.005104166789290806, 'precisionv2@10': 0.014062500186264515, 'precisionv2@15': 0.030069446377456188, 'ndcg@5': 0.1063417

Epoch 31 0.007495969748489705 0.007659391965717077
Epoch 32 0.007424191275027161 0.007674167553583781
Epoch 33 0.00736150017702112 0.007550055999308825
Epoch 34 0.007299444229597701 0.007530270206431548
Epoch 35 0.007233985640364243 0.0074957319224874175
Epoch 36 0.007167161739084358 0.0074423435144126415
Epoch 37 0.007104630844539725 0.0073941457861413555
Epoch 38 0.007044324815711554 0.00738908932544291
Epoch 39 0.006978418326516654 0.00731164044700563
Epoch 40 0.0069180389628836925 0.007221362181007862

train defaultdict(<class 'list'>, {'precision@5': 0.7495496416208791, 'precision@10': 0.6404687569421881, 'precision@15': 0.5714512128455966, 'precisionv2@5': 0.32661765309817653, 'precisionv2@10': 0.29732077824426634, 'precisionv2@15': 0.28788502470535393, 'ndcg@5': 0.5749370826810014, 'ndcg@10': 0.5411022683861209, 'ndcg@15': 0.5199167528281025, 'ndcg@all': 0.6857049117485682, 'semantic_precision@5': 0.8153523284313725, 'semantic_precision@10': 0.7182720588235295, 'semantic_precisi

Epoch 71 0.005783513790982611 0.0068720333278179165
Epoch 72 0.005761692740990981 0.006839470146223903
Epoch 73 0.005742936260432151 0.006839777032534282
Epoch 74 0.005723404218716656 0.006860890441263715
Epoch 75 0.005701580835853284 0.006827092279369632
Epoch 76 0.005683851521918732 0.006840139115229249
Epoch 77 0.005666157999989011 0.006832362944260239
Epoch 78 0.005652447794929293 0.0069250843022018675
Epoch 79 0.005636152619605555 0.006865883649637302
Epoch 80 0.005618214098207068 0.006822077433268229

train defaultdict(<class 'list'>, {'precision@5': 0.9715778225777196, 'precision@10': 0.904739600478434, 'precision@15': 0.8314757443526212, 'precisionv2@5': 0.6173652147545534, 'precisionv2@10': 0.5773697965869716, 'precisionv2@15': 0.5535600749300975, 'ndcg@5': 0.878033299072116, 'ndcg@10': 0.8499383704335082, 'ndcg@15': 0.8224318746258231, 'ndcg@all': 0.8724753310283025, 'semantic_precision@5': 0.9658731617647058, 'semantic_precision@10': 0.9180116421568628, 'semantic_precision@1

In [None]:
# save config, training config
for file_name, payload in (('config.json', config), ('train_config.json', train_config)):
    with open(os.path.join(save_dir, file_name), 'w') as f:
        json.dump(payload, f)