## Load

In [1]:
import os
import sys

from collections import defaultdict

import numpy as np 
import pandas as pd
import random

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR
from tqdm.auto import tqdm

# Used to get the data
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
nltk.download('stopwords')

import matplotlib.pyplot as plt 
import matplotlib
matplotlib.use('Agg')

sys.path.append('../')
from utils.eval import retrieval_normalized_dcg_all, retrieval_precision_all
from utils.loss import ListNet, ListNet2, ListNet_origin, MultiLabelMarginLossCustom, MultiLabelMarginLossCustomV, MSE
from utils.data_processing import get_process_data

seed = 33

# Seed every RNG this notebook draws from so that a fresh "Restart & Run All"
# reproduces the same run: Python's `random`, NumPy's global generator (used
# by `np.random.shuffle` when the train/validation split is drawn) and
# PyTorch's default generator.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/chrisliu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Data-loading settings. `documentembedding_normalize` and `embedding_dim` are
# only referenced by the disabled keyword arguments in the call below.
embedding_type = ''
dataset = '20news'
documentembedding_normalize = False

embedding_dim = 128

# NOTE(review): the output saved with this cell ends in
# "TypeError: 'float' object cannot be interpreted as an integer" after the
# "eliminate freq words" message, so the failure presumably originates inside
# get_process_data -- confirm the helper runs before relying on later cells.
data = get_process_data(
    dataset=dataset,  # use the setting above instead of repeating the literal
    agg='IDF',
    embedding_type=embedding_type,
    word2embedding_path='../data/glove.6B.100d.txt',
    word2embedding_normalize=False,
    # documentembedding_normalize=documentembedding_normalize,
    # embedding_dim=embedding_dim,
    max_seq_length=128,
    # load_embedding=True
)

# Per-document word weights (training targets) and per-document embeddings
# (model inputs); the second axis of each sizes the decoder's output and
# input layers respectively.
document_TFIDF = np.array(data["document_word_weight"])
document_vectors = np.array(data["document_embedding"])

Loading word2embedding from ../data/glove.6B.100d.txt


0it [00:00, ?it/s]

Number of words:400000
Generating document tfidf representation...
Document TFIDF dim:(18846, 21365)


Start buiding vocabulary...:   0%|          | 0/18846 [00:00<?, ?it/s]

doc num 18846
eliminate freq words


TypeError: 'float' object cannot be interpreted as an integer

In [None]:
# Cut-offs k handed to the precision@k / ndcg@k metrics during evaluation.
config = {"topk": [10, 30, 50]}

## MLP Decoder

In [None]:
# Run on the first GPU when CUDA is usable, otherwise fall back to the CPU.
# (Assign 'cpu' here by hand to force CPU execution.)
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'


In [None]:
class MLPDecoderDataset(Dataset):
    """Pairs each document vector with its row of word weights.

    Indexing yields a 4-tuple ``(doc_vector, weights, ranking, n_positive)``:
    the document vector, its weight row, the column indices of that row
    ordered by descending weight, and how many weights in the row are
    strictly positive.
    """

    def __init__(self,
                 doc_vectors,
                 weight_ans):
        n_vectors, n_targets = len(doc_vectors), len(weight_ans)
        assert n_vectors == n_targets

        weights = torch.FloatTensor(weight_ans)

        self.doc_vectors = torch.FloatTensor(doc_vectors)
        self.weight_ans = weights
        # Column indices from the largest weight down to the smallest.
        self.weight_ans_s = weights.argsort(dim=1, descending=True)
        # Count of strictly positive weights in every row.
        self.topk = (weights > 0).sum(dim=1)

    def __len__(self):
        return len(self.doc_vectors)

    def __getitem__(self, idx):
        item = (
            self.doc_vectors[idx],
            self.weight_ans[idx],
            self.weight_ans_s[idx],
            self.topk[idx],
        )
        return item

In [None]:
def prepare_dataloader(batch_size=100, train_size_ratio=0.8, topk=50, TFIDF_normalize=False):
    """Shuffle the corpus once and wrap a train/validation split in DataLoaders.

    Reads the notebook-level arrays ``document_vectors`` (model inputs) and
    ``document_TFIDF`` (per-word targets); both are indexed with the same
    permutation, so row ``i`` of each is treated as the same document.

    Args:
        batch_size: batch size used by both loaders.
        train_size_ratio: fraction of the shuffled documents placed in the
            training split; the remainder becomes the validation split.
        topk: unused by this function; kept so existing calls that pass it
            keep working.
        TFIDF_normalize: when True, divide every target row by its sum.

    Returns:
        ``(train_loader, valid_loader)``.
    """
    n_docs = len(document_vectors)
    train_size = int(n_docs * train_size_ratio)

    print('train size', train_size)
    print('valid size', n_docs - train_size)

    if TFIDF_normalize:
        # Normalise the TFIDF mass of each document to 1. Rows that sum to 0
        # are divided by 1 instead, so they stay all-zero rather than NaN.
        # (An L2 variant would divide by np.linalg.norm(document_TFIDF, axis=1).)
        row_sum = document_TFIDF.sum(axis=1).reshape(-1, 1)
        row_sum = np.where(row_sum == 0, 1, row_sum)
        targets = document_TFIDF / row_sum
    else:
        targets = document_TFIDF

    # One shared permutation keeps vectors and targets aligned.
    randomize = np.arange(n_docs)
    np.random.shuffle(randomize)
    vectors = document_vectors[randomize]
    targets = targets[randomize]

    # Pinned host memory only speeds up host-to-GPU copies; without CUDA it is
    # useless and PyTorch warns about it.
    pin_memory = torch.cuda.is_available()

    train_dataset = MLPDecoderDataset(vectors[:train_size], targets[:train_size])
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, pin_memory=pin_memory)

    # Validation batches are only scored, so a fixed order keeps the reported
    # numbers repeatable between passes.
    valid_dataset = MLPDecoderDataset(vectors[train_size:], targets[train_size:])
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=batch_size, shuffle=False, pin_memory=pin_memory)

    return train_loader, valid_loader


In [None]:
class MLPDecoder(nn.Module):
    """One-hidden-layer MLP that maps a document embedding to one score per word.

    Args:
        doc_emb_dim: size of the input document embedding.
        num_words: number of output scores (one per vocabulary entry).
        h_dim: width of the hidden layer.
    """

    def __init__(self, doc_emb_dim, num_words, h_dim=300):
        super().__init__()

        # Attribute names are kept as-is (fc1 / fc4) so existing state_dict
        # checkpoints still load.
        self.fc1 = nn.Linear(doc_emb_dim, h_dim)
        self.fc4 = nn.Linear(h_dim, num_words)
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        """Return raw (unnormalised) word scores for a batch of embeddings."""
        # torch.tanh computes the same values as F.tanh, without the
        # deprecation warning some PyTorch releases emit for F.tanh.
        x = torch.tanh(self.fc1(x))
        x = self.dropout(x)  # only active in train() mode
        x = self.fc4(x)

        return x

In [None]:
def evaluate_MLPDecoder(model, data_loader):
    """Score ``model`` on every batch of ``data_loader`` with precision@k and ndcg@k.

    The cut-offs come from the notebook-level ``config["topk"]``. Both metric
    helpers are consumed as mappings from cut-off to score, collected under
    the keys ``'precision@<k>'`` and ``'ndcg@<k>'``.

    Args:
        model: module called on the document embeddings of each batch.
        data_loader: iterable of 4-tuples whose first two items are the
            document embeddings and the target word weights.

    Returns:
        A ``defaultdict`` mapping each metric key to the mean of its
        per-batch values (an unweighted mean over batches).

    Note:
        The model is switched to ``eval()`` mode and left that way.
    """
    results = defaultdict(list)
    model.eval()

    # Inference only: skip autograd bookkeeping so no graph is kept per batch.
    with torch.no_grad():
        for doc_embs, target, _, _ in data_loader:
            doc_embs = doc_embs.to(device)
            target = target.to(device)

            pred = model(doc_embs)

            # Precision
            precision_scores = retrieval_precision_all(pred, target, k=config["topk"])
            for k, v in precision_scores.items():
                results['precision@{}'.format(k)].append(v)

            # NDCG
            ndcg_scores = retrieval_normalized_dcg_all(pred, target, k=config["topk"])
            for k, v in ndcg_scores.items():
                results['ndcg@{}'.format(k)].append(v)

    # Collapse each list of per-batch scores into a single mean.
    for k in results:
        results[k] = np.mean(results[k])

    return results

In [None]:
# Debugging aid (disabled): build the loaders and print the tensor shapes of one training batch.
# train_loader, valid_loader = prepare_dataloader(batch_size=100,\
#                                                 train_size_ratio=0.8, topk=30,
#                                                 TFIDF_normalize=True)
# for data in train_loader:
#     doc_embs, target, target_rank, target_topk = data
#     print(doc_embs.shape)
#     print(target.shape)
#     print(target_rank.shape)
#     print(target_topk.shape)
#     break

In [None]:
def _build_decoder_criterion(name):
    """Return the loss callable selected by ``name``, or ``None`` if it is unknown."""
    if name == "MultiLabelMarginLoss":
        return nn.MultiLabelMarginLoss(reduction='mean')

    # The custom margin losses carry a number after a colon, e.g.
    # "MultiLabelMarginLossCustom:1"; it is forwarded as the loss's fourth
    # argument (presumably a margin -- confirm in utils.loss). It is parsed
    # once here rather than on every batch.
    # "...CustomV" must be tested first because "...Custom" is a prefix of it.
    if name.startswith("MultiLabelMarginLossCustomV"):
        extra = float(name.split(':')[-1])

        def criterion(a, b, c):
            return MultiLabelMarginLossCustomV(a, b, c, extra)
        return criterion
    if name.startswith("MultiLabelMarginLossCustom"):
        extra = float(name.split(':')[-1])

        def criterion(a, b, c):
            return MultiLabelMarginLossCustom(a, b, c, extra)
        return criterion

    if name == "ListNet":
        return ListNet_origin
    if name == "ListNet2":
        return ListNet2
    if name == "MSE":
        return MSE
    return None


def _decoder_batch_loss(model, criterion, name, topk, batch):
    """Move one batch to ``device``, run the model and return its loss under ``name``."""
    doc_embs, target, target_rank, target_topk = batch
    doc_embs = doc_embs.to(device)
    target = target.to(device)
    target_rank = target_rank.to(device)
    target_topk = target_topk.to(device)

    pred = model(doc_embs)
    if name == "MultiLabelMarginLoss":
        # nn.MultiLabelMarginLoss reads target indices up to the first -1,
        # so this keeps only the `topk` highest-ranked words as positives.
        target_rank[:, topk] = -1
        return criterion(pred, target_rank)
    if name.startswith("MultiLabelMarginLossCustomV"):
        # Variable cut-off: each document's own count of positive weights.
        return criterion(pred, target_rank, target_topk)
    if name.startswith("MultiLabelMarginLossCustom"):
        # Fixed cut-off shared by every document.
        return criterion(pred, target_rank, topk)
    # ListNet, ListNet2 and MSE compare the scores with the raw weights.
    return criterion(pred, target)


def train_decoder(config):
    """Train a fresh ``MLPDecoder`` on the notebook-level loaders.

    Uses the notebook-level ``train_loader``, ``valid_loader``, ``device``,
    ``document_vectors`` and ``document_TFIDF``. Note that the ``config``
    parameter shadows the notebook-level ``config`` dict inside this function
    only; ``evaluate_MLPDecoder`` still reads the notebook-level one.

    Args:
        config: dict with the keys ``lr``, ``momentum``, ``weight_decay``
            (SGD settings), ``h_dim`` (hidden width), ``criterion`` (loss
            name), ``topk`` (integer cut-off for the margin losses),
            ``n_epoch``, ``valid_epoch`` (metrics are computed every this
            many epochs) and ``verbose`` (print those metrics).

    Returns:
        A list with one dict per evaluated epoch, holding ``'epoch'`` plus
        the validation metrics from ``evaluate_MLPDecoder``; ``None`` when
        ``config["criterion"]`` is not recognised.
    """
    model = MLPDecoder(
        doc_emb_dim=document_vectors.shape[1], num_words=document_TFIDF.shape[1], h_dim=config["h_dim"]).to(device)
    model.train()

    opt = torch.optim.SGD(model.parameters(), lr=config["lr"], momentum=config["momentum"],
                          weight_decay=config["weight_decay"])

    # prepare loss
    name = config["criterion"]
    criterion = _build_decoder_criterion(name)
    if criterion is None:
        print("loss not found")
        return

    results = []
    n_epoch = config["n_epoch"]
    valid_epoch = config["valid_epoch"]
    verbose = config["verbose"]
    topk = config["topk"]

    for epoch in tqdm(range(n_epoch)):
        # ---- training pass ----
        model.train()
        train_loss_his = []
        for batch in train_loader:
            loss = _decoder_batch_loss(model, criterion, name, topk, batch)
            train_loss_his.append(loss.item())

            # Model backwarding
            model.zero_grad()
            loss.backward()
            opt.step()

        # ---- validation pass (no parameter update, so no autograd graph) ----
        model.eval()
        valid_loss_his = []
        with torch.no_grad():
            for batch in valid_loader:
                loss = _decoder_batch_loss(model, criterion, name, topk, batch)
                valid_loss_his.append(loss.item())

        print("Epoch", epoch, np.mean(train_loss_his), np.mean(valid_loss_his))

        # show decoder result
        if epoch % valid_epoch == 0:
            res = {}
            res['epoch'] = epoch

            # Metrics are read-only, so gradients are not needed here either.
            with torch.no_grad():
                train_res_ndcg = evaluate_MLPDecoder(model, train_loader)
                valid_res_ndcg = evaluate_MLPDecoder(model, valid_loader)

            res.update(valid_res_ndcg)
            results.append(res)

            if verbose:
                print()
                print('train', train_res_ndcg)
                print('valid', valid_res_ndcg)

    # Hand the collected validation history back instead of discarding it.
    return results

In [None]:
# Build the train/validation loaders shared by every training run below:
# 80/20 split, batches of 100, raw (un-normalised) TFIDF targets.
train_loader, valid_loader = prepare_dataloader(
    batch_size=100,
    train_size_ratio=0.8,
    TFIDF_normalize=False,
)

In [None]:
train_config = dict(
    # SGD optimiser settings
    lr=0.05,
    momentum=0.0,
    weight_decay=0.0,

    # schedule and logging
    n_epoch=200,
    verbose=True,
    valid_epoch=10,

    # cut-off used by MultiLabelMarginLoss / MultiLabelMarginLossCustom
    topk=50,

    # model width and training objective
    h_dim=3000,
    criterion="MultiLabelMarginLoss",  # "ListNet",
    # NOTE(review): train_decoder does not read this key in the code shown
    # here; the loaders are built separately by prepare_dataloader.
    TFIDF_normalize=False,
)

train_decoder(train_config)

In [None]:
# Re-run with the "ListNet" criterion (train_config is updated in place).
train_config.update(lr=0.05, criterion="ListNet")
train_decoder(train_config)

In [None]:
# Re-run with the "ListNet2" criterion at a much larger learning rate.
train_config.update(lr=2, criterion="ListNet2")
train_decoder(train_config)

In [None]:
# Re-run with MultiLabelMarginLossCustomV; the ":1" suffix is parsed by train_decoder.
train_config.update(lr=0.05, criterion="MultiLabelMarginLossCustomV:1")
train_decoder(train_config)

In [None]:
# Re-run with MultiLabelMarginLossCustom; the ":1" suffix is parsed by train_decoder.
train_config.update(lr=0.1, criterion="MultiLabelMarginLossCustom:1")
train_decoder(train_config)

In [None]:
# Re-run with the "MSE" criterion at a small learning rate.
train_config.update(lr=1e-4, criterion="MSE")
train_decoder(train_config)