## Load

In [1]:
import os
import sys

from collections import defaultdict

import numpy as np 
import pandas as pd
import random

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR
from tqdm.auto import tqdm

# Used to get the data
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
nltk.download('stopwords')

import matplotlib.pyplot as plt 
import matplotlib
matplotlib.use('Agg')

sys.path.append('../')
from utils.eval import retrieval_normalized_dcg_all, retrieval_precision_all
from utils.loss import ListNet, listNet_origin, MultiLabelMarginLossPos, MSE, KL
from utils.data_processing import get_process_data

# One seed for every RNG this notebook draws from, so that the shuffled
# train/valid split (np.random) and the model initialisation / DataLoader
# shuffling (torch) are repeatable after "Restart & Run All".
seed = 33
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/chrisliu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
embedding_type = ''
dataset = '20news'
documentembedding_normalize = True

embedding_dim = 128
# Pass the `dataset` variable instead of repeating the literal so that changing
# it above actually changes what is loaded.
# NOTE(review): the GloVe file name says 100d while embedding_dim is 128 --
# confirm which of the two get_process_data honours.
data = get_process_data(dataset=dataset, agg='IDF', embedding_type=embedding_type,
                     word2embedding_path='../data/glove.6B.100d.txt', word2embedding_normalize=False,
                     documentembedding_normalize=documentembedding_normalize,
                     embedding_dim=embedding_dim, max_seq_length=128,
                     load_embedding=True)

# 2-D arrays indexed by document: per-word weights (the decoder's targets) and
# document embeddings (the decoder's inputs).
document_TFIDF = np.array(data["document_word_weight"])
document_vectors = np.array(data["document_embedding"])

Loading word2embedding from ../data/glove.6B.100d.txt


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Number of words:400000


HBox(children=(IntProgress(value=0, description='Start buiding vocabulary...', max=18846, style=ProgressStyle(…


doc num 18846
eliminate freq words
Load from saving
delete items 150


In [3]:
# Evaluation settings read by evaluate_MLPDecoder: the cut-off ranks passed as
# `k` to the precision / NDCG helpers.
config = {"topk": [10, 30, 50]}

## MLP Decoder

In [4]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

In [5]:
class MLPDecoderDataset1(Dataset):
    """Samples of (document vector, word weights, truncated word ranking).

    Intended for the MultiLabelMarginLoss and ListNet criteria. The ranking
    holds word indices sorted by descending weight; every position from
    ``topk`` onwards is overwritten with -1.

    Args:
        doc_vectors: per-document input vectors (one row per document).
        weight_ans: per-document word weights, same number of rows.
        topk: number of leading ranked word indices to keep.
    """

    def __init__(self, doc_vectors, weight_ans, topk=50):
        assert len(doc_vectors) == len(weight_ans)

        self.doc_vectors = torch.FloatTensor(doc_vectors)
        self.weight_ans = torch.FloatTensor(weight_ans)

        # Word indices ordered from heaviest to lightest weight, then padded
        # with -1 after the first `topk` entries.
        ranking = torch.argsort(self.weight_ans, dim=1, descending=True)
        ranking[:, topk:] = -1
        self.weight_ans_s = ranking

    def __len__(self):
        return len(self.doc_vectors)

    def __getitem__(self, idx):
        sample = (
            self.doc_vectors[idx],
            self.weight_ans[idx],
            self.weight_ans_s[idx],
        )
        return sample

In [6]:
class MLPDecoderDataset2(Dataset):
    """Samples of (document vector, word weights, (positive ids, negative ids)).

    Intended for the MultiLabelMarginLossPos criterion. Word indices are sorted
    by descending weight; the first ``topk`` form the positive set and the
    remainder the negative set.

    Args:
        doc_vectors: per-document input vectors (one row per document).
        weight_ans: per-document word weights, same number of rows.
        topk: number of top-ranked word indices treated as positives.
    """

    def __init__(self,
                 doc_vectors,
                 weight_ans,
                 topk=50):
        # Validate before any tensor work (same order as MLPDecoderDataset1) so
        # mismatched inputs fail fast with the length check rather than with a
        # conversion error or after a costly argsort.
        assert len(doc_vectors) == len(weight_ans)

        self.doc_vectors = torch.FloatTensor(doc_vectors)
        self.weight_ans = torch.FloatTensor(weight_ans)
        self.weight_ans_s = torch.argsort(self.weight_ans, dim=1, descending=True)

        # Split the full ranking into the top-k head and the remaining tail.
        self.weight_ans_s_pos = self.weight_ans_s[:, :topk]
        self.weight_ans_s_neg = self.weight_ans_s[:, topk:]

    def __getitem__(self, idx):
        return self.doc_vectors[idx], self.weight_ans[idx], (self.weight_ans_s_pos[idx], self.weight_ans_s_neg[idx])

    def __len__(self):
        return len(self.doc_vectors)

In [7]:
def prepare_dataloader(batch_size=100, train_size_ratio=0.8, topk=50, TFIDF_normalize=False):
    """Build train/valid DataLoaders over a random split of the corpus.

    Reads the notebook-level ``document_vectors`` and ``document_TFIDF`` arrays,
    shuffles the documents with ``np.random`` and splits them at
    ``train_size_ratio``.

    Args:
        batch_size: batch size used by all four loaders.
        train_size_ratio: fraction of documents placed in the training split.
        topk: forwarded to the dataset classes (number of top-ranked words kept).
        TFIDF_normalize: when True, each document's weights are divided by
            their row sum before use.

    Returns:
        ``(train_loader, valid_loader, train_loader2, valid_loader2)`` -- the
        first pair wraps MLPDecoderDataset1, the second MLPDecoderDataset2.
        Both pairs share the same split.
    """
    num_docs = len(document_vectors)
    train_size = int(num_docs * train_size_ratio)

    print('train size', train_size)
    print('valid size', num_docs - train_size)

    if TFIDF_normalize:
        norm = document_TFIDF.sum(axis=1).reshape(-1, 1)
        # A document whose weights sum to zero would otherwise turn into
        # NaN/inf; dividing by 1 leaves such a row unchanged.
        norm = np.where(norm == 0, 1, norm)
        document_TFIDF_ = document_TFIDF / norm
        # Alternative that was considered: divide by the row-wise L2 norm
        # (np.linalg.norm(document_TFIDF, axis=1)) instead of the row sum.
    else:
        document_TFIDF_ = document_TFIDF

    # One permutation applied to both arrays keeps vectors and weights aligned.
    randomize = np.arange(num_docs)
    np.random.shuffle(randomize)

    document_vectors_ = document_vectors[randomize]
    document_TFIDF_ = document_TFIDF_[randomize]

    # Pinned host memory is only useful when batches are copied to a GPU.
    pin_memory = torch.cuda.is_available()

    def make_loader(dataset_cls, split, shuffle):
        # Wrap one split in the requested dataset class and a DataLoader.
        dataset = dataset_cls(document_vectors_[split], document_TFIDF_[split], topk=topk)
        return torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                           shuffle=shuffle, pin_memory=pin_memory)

    train_split = slice(None, train_size)
    valid_split = slice(train_size, None)

    # Only training batches are shuffled; a fixed validation order keeps the
    # per-batch averaged metrics repeatable between evaluations.
    train_loader = make_loader(MLPDecoderDataset1, train_split, shuffle=True)
    valid_loader = make_loader(MLPDecoderDataset1, valid_split, shuffle=False)
    train_loader2 = make_loader(MLPDecoderDataset2, train_split, shuffle=True)
    valid_loader2 = make_loader(MLPDecoderDataset2, valid_split, shuffle=False)

    return train_loader, valid_loader, train_loader2, valid_loader2


In [8]:
class MLPDecoder(nn.Module):
    """One-hidden-layer MLP mapping a document embedding to per-word scores.

    Args:
        doc_emb_dim: size of the input document embedding.
        num_words: size of the output (one score per vocabulary word).
        h_dim: width of the hidden layer.
    """

    def __init__(self, doc_emb_dim, num_words, h_dim=300):
        super().__init__()

        # Attribute names are kept as-is: they are the state_dict keys.
        self.fc1 = nn.Linear(doc_emb_dim, h_dim)
        self.fc4 = nn.Linear(h_dim, num_words)
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        """Return raw (unnormalised) word scores for a batch of embeddings."""
        # torch.tanh instead of F.tanh: the functional alias raises a
        # deprecation warning on older PyTorch releases.
        x = torch.tanh(self.fc1(x))
        x = self.dropout(x)
        x = self.fc4(x)

        return x

In [9]:
def evaluate_MLPDecoder(model, data_loader):
    """Score `model` on the whole of `data_loader` in a single metric call.

    Predictions and targets of every batch are concatenated first and the
    precision / NDCG helpers are then called once on the full tensors, using
    the cut-offs in the notebook-level ``config["topk"]``.

    NOTE: a second function with the same name is defined in a later cell of
    this notebook; once that cell has run it replaces this definition.

    Args:
        model: the decoder to evaluate; it is switched to eval mode and left
            in that mode.
        data_loader: yields ``(doc_embs, target, _)`` batches.

    Returns:
        dict with ``'precision@<k>'`` and ``'ndcg@<k>'`` entries, one per key
        returned by the metric helpers.
    """
    results = {}
    model.eval()

    pred_all = []
    target_all = []

    # Inference only: without no_grad every stored prediction would keep its
    # autograd graph alive until the concatenated tensors are released.
    with torch.no_grad():
        for doc_embs, target, _ in data_loader:
            doc_embs = doc_embs.to(device)
            target = target.to(device)

            pred_all.append(model(doc_embs))
            target_all.append(target)

    pred_all = torch.cat(pred_all, dim=0)
    target_all = torch.cat(target_all, dim=0)

    # Precision
    precision_scores = retrieval_precision_all(pred_all, target_all, k=config["topk"])
    for k, v in precision_scores.items():
        results['precision@{}'.format(k)] = v

    # NDCG
    ndcg_scores = retrieval_normalized_dcg_all(pred_all, target_all, k=config["topk"])
    for k, v in ndcg_scores.items():
        results['ndcg@{}'.format(k)] = v

    return results

In [10]:
def evaluate_MLPDecoder(model, data_loader):
    """Score `model` batch by batch and average each metric over the batches.

    For every batch the precision / NDCG helpers are called with the cut-offs
    in the notebook-level ``config["topk"]``; the per-batch values are then
    averaged with ``np.mean`` (each batch counts equally, whatever its size).

    Args:
        model: the decoder to evaluate; it is switched to eval mode and left
            in that mode.
        data_loader: yields ``(doc_embs, target, _)`` batches.

    Returns:
        Plain dict with ``'precision@<k>'`` and ``'ndcg@<k>'`` entries holding
        the mean over batches.
    """
    per_batch = defaultdict(list)
    model.eval()

    # Inference only: skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for doc_embs, target, _ in data_loader:
            doc_embs = doc_embs.to(device)
            target = target.to(device)

            pred = model(doc_embs)

            # Precision
            precision_scores = retrieval_precision_all(pred, target, k=config["topk"])
            for k, v in precision_scores.items():
                per_batch['precision@{}'.format(k)].append(v)

            # NDCG
            ndcg_scores = retrieval_normalized_dcg_all(pred, target, k=config["topk"])
            for k, v in ndcg_scores.items():
                per_batch['ndcg@{}'.format(k)].append(v)

    # Return a plain dict of scalars rather than a defaultdict(list) whose
    # values were overwritten with floats.
    return {name: np.mean(values) for name, values in per_batch.items()}

In [11]:
# Loss names accepted in config["criterion"].
_SUPPORTED_CRITERIA = ("MultiLabelMarginLoss", "MultiLabelMarginLossPos", "ListNet", "KL", "MSE")


def _build_criterion(name):
    """Return the loss callable registered under `name`, or raise ValueError."""
    if name not in _SUPPORTED_CRITERIA:
        raise ValueError("Unknown criterion {!r}; expected one of {}".format(
            name, ", ".join(_SUPPORTED_CRITERIA)))
    if name == "MultiLabelMarginLoss":
        return nn.MultiLabelMarginLoss(reduction='mean')
    return {
        "MultiLabelMarginLossPos": MultiLabelMarginLossPos,
        "ListNet": listNet_origin,
        "KL": KL,
        "MSE": MSE,
    }[name]


def _batch_loss(criterion, name, pred, target, target_rank):
    """Call `criterion` with the target layout that loss `name` expects."""
    if name == "MultiLabelMarginLoss":
        # Ranked word indices padded with -1.
        return criterion(pred, target_rank.to(device))
    if name == "MultiLabelMarginLossPos":
        # (positive indices, negative indices) pair.
        return criterion(pred, target_rank[0].to(device), target_rank[1].to(device))
    # ListNet / KL / MSE compare the scores with the raw word weights.
    return criterion(pred, target.to(device))


def train_decoder(config):
    """Train an MLPDecoder with SGD and report losses / ranking metrics.

    Args:
        config: dict with keys ``criterion`` (one of _SUPPORTED_CRITERIA),
            ``h_dim``, ``lr``, ``momentum``, ``weight_decay``, ``topk``,
            ``TFIDF_normalize``, ``n_epoch``, ``valid_epoch`` and ``verbose``.
            Optional keys ``batch_size`` (default 100) and
            ``train_size_ratio`` (default 0.8) are forwarded to
            prepare_dataloader.
            Note this parameter only shadows the notebook-level ``config``
            inside this function; evaluate_MLPDecoder still reads the global
            one for its metric cut-offs.

    Returns:
        List of dicts, one per evaluated epoch, each holding ``'epoch'`` plus
        the validation metrics returned by evaluate_MLPDecoder.

    Raises:
        ValueError: if ``config["criterion"]`` is not a supported loss name.
    """
    # Resolve the loss first so a typo fails immediately with a clear message
    # instead of surfacing as an unbound `loss` in the first training step.
    criterion_name = config["criterion"]
    criterion = _build_criterion(criterion_name)

    model = MLPDecoder(doc_emb_dim=document_vectors.shape[1], num_words=document_TFIDF.shape[1], h_dim=config["h_dim"]).to(device)

    opt = torch.optim.SGD(model.parameters(), lr=config["lr"], momentum=config["momentum"],
                          weight_decay=config["weight_decay"])

    # prepare dataloader
    train_loader1, valid_loader1, train_loader2, valid_loader2 = prepare_dataloader(
        batch_size=config.get("batch_size", 100),
        train_size_ratio=config.get("train_size_ratio", 0.8),
        topk=config["topk"],
        TFIDF_normalize=config["TFIDF_normalize"])
    if criterion_name == "MultiLabelMarginLossPos":
        # This loss needs the (positive, negative) index pairs of dataset 2.
        train_loader, valid_loader = train_loader2, valid_loader2
    else:
        train_loader, valid_loader = train_loader1, valid_loader1

    results = []
    n_epoch = config["n_epoch"]
    valid_epoch = config["valid_epoch"]
    verbose = config["verbose"]

    for epoch in tqdm(range(n_epoch)):
        train_loss_his = []
        valid_loss_his = []

        # evaluate_MLPDecoder leaves the model in eval mode, so re-enable
        # training mode (dropout) at the start of every epoch.
        model.train()
        for doc_embs, target, target_rank in train_loader:
            pred = model(doc_embs.to(device))
            loss = _batch_loss(criterion, criterion_name, pred, target, target_rank)

            # Model backwarding
            opt.zero_grad()
            loss.backward()
            opt.step()

            train_loss_his.append(loss.item())

        model.eval()
        # Validation never back-propagates, so skip building autograd graphs.
        with torch.no_grad():
            for doc_embs, target, target_rank in valid_loader:
                pred = model(doc_embs.to(device))
                loss = _batch_loss(criterion, criterion_name, pred, target, target_rank)
                valid_loss_his.append(loss.item())

        print("Epoch", epoch, np.mean(train_loss_his), np.mean(valid_loss_his))

        if epoch % valid_epoch == 0:
            res = {}
            res['epoch'] = epoch

            # Training-set metrics are only ever printed, so compute them only
            # when they will be shown.
            train_res_ndcg = evaluate_MLPDecoder(model, train_loader) if verbose else None
            valid_res_ndcg = evaluate_MLPDecoder(model, valid_loader)

            res.update(valid_res_ndcg)
            results.append(res)

            if verbose:
                print()
                print('train', train_res_ndcg)
                print('valid', valid_res_ndcg)

    # Previously the collected history was built and then discarded.
    return results

In [None]:
# Hyper-parameters for one train_decoder run; later cells override
# "criterion" / "TFIDF_normalize" on this same dict before re-running.
train_config = dict(
    # SGD optimiser
    lr=0.05,
    momentum=0.0,
    weight_decay=0.0,

    # training schedule: metrics are evaluated every `valid_epoch` epochs
    n_epoch=600,
    verbose=True,
    valid_epoch=10,

    # number of top-weighted words kept per document by the datasets
    topk=50,

    # model and loss
    h_dim=3000,
    criterion="MultiLabelMarginLoss",  # "ListNet", "MultiLabelMarginLossPos"
    TFIDF_normalize=False,
)

train_decoder(train_config)

train size 14956
valid size 3740


HBox(children=(IntProgress(value=0, max=600), HTML(value='')))



Epoch 0 12.894923826853434 10.416469624167995

train defaultdict(<class 'list'>, {'precision@10': 0.01845285722054541, 'precision@30': 0.024642540545513233, 'precision@50': 0.03920685471345981, 'ndcg@10': 0.012078703642667582, 'ndcg@30': 0.01772142723513146, 'ndcg@50': 0.030151826726893583, 'ndcg@all': 0.31091484864552815})
valid defaultdict(<class 'list'>, {'precision@10': 0.01831578951034891, 'precision@30': 0.02416666790744976, 'precision@50': 0.03832631362111945, 'ndcg@10': 0.012313411080915677, 'ndcg@30': 0.017787261571931213, 'ndcg@50': 0.02971278567259249, 'ndcg@all': 0.308475524187088})
Epoch 1 8.6802729733785 8.635530647478605
Epoch 2 6.937849006652832 7.688166743830631


* train defaultdict(<class 'list'>, {'precision@10': 0.4468299978971481, 'precision@30': 0.4765825545787811, 'precision@50': 0.4932941671212514, 'ndcg@10': 0.35141815225283307, 'ndcg@30': 0.4388885551691055, 'ndcg@50': 0.5403420054912567, 'ndcg@all': 0.6485507190227509})
* valid defaultdict(<class 'list'>, {'precision@10': 0.28502631030584635, 'precision@30': 0.21985088249570445, 'precision@50': 0.19671841318670072, 'ndcg@10': 0.2641608922889358, 'ndcg@30': 0.2646897682233861, 'ndcg@50': 0.28631100842827245, 'ndcg@all': 0.5073321830285223})

In [None]:
# Same hyper-parameters, ListNet loss on the raw (un-normalised) weights.
train_config.update(criterion="ListNet", TFIDF_normalize=False)
train_decoder(train_config)

* train defaultdict(<class 'list'>, {'precision@10': 0.5408376163244247, 'precision@30': 0.2708717542886734, 'precision@50': 0.19191084623336793, 'ndcg@10': 0.7237670310338338, 'ndcg@30': 0.5995183233420054, 'ndcg@50': 0.5832904160022736, 'ndcg@all': 0.7355261572202046})
* valid defaultdict(<class 'list'>, {'precision@10': 0.4000394736465655, 'precision@30': 0.21653948371347628, 'precision@50': 0.15969472613773847, 'ndcg@10': 0.49008390542707947, 'ndcg@30': 0.4215040944124523, 'ndcg@50': 0.41643378530677994, 'ndcg@all': 0.5964698571907846})

In [None]:
# Same hyper-parameters, MultiLabelMarginLossPos on the raw weights.
train_config.update(criterion="MultiLabelMarginLossPos", TFIDF_normalize=False)
train_decoder(train_config)

* train defaultdict(<class 'list'>, {'precision@10': 0.48348952174186705, 'precision@30': 0.4728728707631429, 'precision@50': 0.48005197564760843, 'ndcg@10': 0.38053884088993073, 'ndcg@30': 0.4542527727286021, 'ndcg@50': 0.5518478057781855, 'ndcg@all': 0.6627423493067424})
* valid defaultdict(<class 'list'>, {'precision@10': 0.3156052620003098, 'precision@30': 0.22210965619275444, 'precision@50': 0.19529209207547338, 'ndcg@10': 0.29573119588588415, 'ndcg@30': 0.281982432854803, 'ndcg@50': 0.3005151176138928, 'ndcg@all': 0.5244677521680531})

In [None]:
# train_config["criterion"] = "MultiLabelMarginLossPos"
# train_config["TFIDF_normalize"] = True
# train_decoder(train_config)

* train defaultdict(<class 'list'>, {'precision@10': 0.5554014225800832, 'precision@30': 0.5590281081199646, 'precision@50': 0.5573797816038132, 'ndcg@10': 0.444824323852857, 'ndcg@30': 0.5369966959953308, 'ndcg@50': 0.6349835149447123, 'ndcg@all': 0.7023906556765238})
* valid defaultdict(<class 'list'>, {'precision@10': 0.32681578397750854, 'precision@30': 0.23064035804648148, 'precision@50': 0.20169999097522937, 'ndcg@10': 0.3154387238778566, 'ndcg@30': 0.30100254165498835, 'ndcg@50': 0.31951345973893214, 'ndcg@all': 0.5373694473191312})

In [None]:
# Same hyper-parameters, MSE loss on the raw weights.
train_config.update(criterion="MSE", TFIDF_normalize=False)
train_decoder(train_config)

In [None]:
# Same hyper-parameters, KL loss on the raw weights.
train_config.update(criterion="KL", TFIDF_normalize=False)
train_decoder(train_config)