In [1]:
import sys
sys.path.append("../..")

import torch
import time
import random
import argparse
import numpy as np
import torch.optim as optim
import torch.nn as nn

from sklearn.feature_extraction.text import TfidfVectorizer
# from model import Decoder, Seq2Seq
from torch.utils.data import DataLoader, Dataset
from utils.toolbox import same_seeds, show_settings, get_preprocess_document, \
                            get_preprocess_document_embs, get_free_gpu, get_preprocess_document_labels


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class LSTMDecoderDataset(Dataset):
    def __init__(self, doc_embs, targets, labels):
        
        assert len(doc_embs) == len(targets)

        self.doc_embs = torch.FloatTensor(doc_embs)
        self.targets = torch.LongTensor(targets)      
        self.labels = torch.FloatTensor(labels)        # TFIDF
        # self.targets_rank = torch.argsort(self.targets, dim=1, descending=True)
        # self.topk = torch.sum(self.targets > 0, dim=1)
        
    def __getitem__(self, idx):
        return self.doc_embs[idx], self.targets[idx], self.labels[idx]

    def __len__(self):
        return len(self.doc_embs)

def pad_sequence(sentence, word2idx, sen_len):
    # 將每個句子變成一樣的長度
    if len(sentence) > sen_len:
        sentence = sentence[:sen_len]
    else:
        pad_len = sen_len - len(sentence)
        for _ in range(pad_len):
            sentence.append(word2idx["<PAD>"])
    assert len(sentence) == sen_len
    return sentence

def prepare_dataloader(doc_embs, targets, labels, batch_size=100, train_valid_test_ratio=[0.7, 0.1, 0.2]):
    train_size = int(len(doc_embs) * train_valid_test_ratio[0])
    valid_size = int(len(doc_embs) * (train_valid_test_ratio[0] + train_valid_test_ratio[1])) - train_size
    test_size = len(doc_embs) - train_size - valid_size
    
    print('Preparing dataloader')
    print('train size', train_size)
    print('valid size', valid_size)
    print('test size', test_size)

    # shuffle
    randomize = np.arange(len(doc_embs))
    np.random.shuffle(randomize)
    doc_embs = doc_embs[randomize]
    targets = targets[randomize]
    labels = labels[randomize]
    
    # dataloader
    train_dataset = LSTMDecoderDataset(doc_embs[:train_size], targets[:train_size], labels[:train_size])
    train_loader  = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

    valid_dataset = LSTMDecoderDataset(doc_embs[train_size:train_size+valid_size], targets[train_size:train_size+valid_size], labels[train_size:train_size+valid_size])
    valid_loader  = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)

    test_dataset = LSTMDecoderDataset(doc_embs[train_size+valid_size:], targets[train_size+valid_size:], labels[train_size+valid_size:])
    test_loader  = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)
    
    return train_loader, valid_loader, test_loader

def get_document_labels(texts, max_len=50):
    word2idx = {"<SOS>": 0, "<EOS>": 1, "<PAD>": 2, "<UNK>" : 3}
    idx2word = {0 : "<SOS>", 1 : "<EOS", 2 : "<PAD>", 3 : "<UNK>"}
    # Build dictionary
    for text in texts:
        for word in text:
            if (word2idx.get(word, -1) == -1):
                idx2word[len(word2idx)] = word
                word2idx[word] = len(word2idx)
    
    # Build labels
    # 把句子裡面的字轉成相對應的 index
    sentence_list = []
    for i, sen in enumerate(texts):
        sentence_idx = [word2idx["<SOS>"]]
        for word in sen:
            if (word in word2idx.keys()):
                sentence_idx.append(word2idx[word])
            else:
                sentence_idx.append(word2idx["<UNK>"])
        # 將每個句子變成一樣的長度
        sentence_idx = pad_sequence(sentence_idx, word2idx, max_len)
        sentence_idx[-1] = word2idx["<EOS>"]
        sentence_list.append(sentence_idx)

    labels = torch.LongTensor(sentence_list)
    return word2idx, idx2word, labels


In [3]:
# parser = argparse.ArgumentParser(description='document decomposition.')
# parser.add_argument('--model', type=str, default="ZTM")
# parser.add_argument('--dataset', type=str, default="20news")
# parser.add_argument('--min_df', type=int, default=1)
# parser.add_argument('--max_df', type=float, default=1.0)
# parser.add_argument('--max_len', type=int, default=50)
# parser.add_argument('--num_epoch', type=int, default=50)
# parser.add_argument('--min_doc_word', type=int, default=15)
# parser.add_argument('--min_doc_len', type=int, default=15)
# parser.add_argument('--encoder', type=str, default='bert')
# parser.add_argument('--seed', type=int, default=123)
# args = parser.parse_args()
# config = vars(args)

config = {
    "model": "ZTM",
    "dataset": "agnews",
    "max_len": 30,
    "num_epoch": 50,
    "min_doc_len": 15,
    "encoder": "bert",
    "seed": 123,
    "topk": [5, 10, 15],
    "target": "tf-idf"
}

if config['dataset'] == '20news':
    config['min_df'], config['max_df'], config['min_doc_word'] = 62, 1.0, 15
elif config['dataset'] == 'agnews':
    config['min_df'], config['max_df'], config['min_doc_word'] = 425, 1.0, 15
elif config['dataset'] == 'IMDB':
    config['min_df'], config['max_df'], config['min_doc_word'] = 166, 1.0, 15
elif config['dataset'] == 'wiki':
    config['min_df'], config['max_df'], config['min_doc_word'] = 2872, 1.0, 15
elif config['dataset'] == 'tweet':
    config['min_df'], config['max_df'], config['min_doc_word'] = 5, 1.0, 15

In [4]:
show_settings(config)
same_seeds(config["seed"])

# data preprocessing
unpreprocessed_corpus, preprocessed_corpus = get_preprocess_document(**config)

# for testing
preprocessed_corpus = preprocessed_corpus

texts = [text.split() for text in preprocessed_corpus]

word2idx, idx2word, labels = get_document_labels(texts, max_len=config["max_len"])

# Create tfidf target
vectorizer = TfidfVectorizer()
targets = vectorizer.fit_transform(preprocessed_corpus).toarray()
tfidf_word2idx = vectorizer.vocabulary_

# generating document embedding
doc_embs, doc_model, device = get_preprocess_document_embs(preprocessed_corpus, config['encoder'])
print("Get doc embedding done.")

vocabulary_size = len(word2idx)
embedding_size = 512
hidden_size = doc_embs.shape[1]
num_layer = 1
drop_out = 0

print("doc_emb shape: {}".format(doc_embs.shape))
print("voc size: {}".format(vocabulary_size))
print("labels size: {}".format(labels.size()))

train_loader, valid_loader, test_loader = prepare_dataloader(doc_embs, labels, targets, batch_size=32)


-------- Info ---------
model: ZTM
dataset: 20news
max_len: 30
num_epoch: 50
min_doc_len: 15
encoder: bert
seed: 123
topk: [5, 10, 15]
target: tf-idf
min_df: 62
max_df: 1.0
min_doc_word: 15

-----------------------
Getting preprocess documents: 20news
min_df: 62 max_df: 1.0 vocabulary_size: None min_doc_word: 15
Getting preprocess documents embeddings
Using cuda 0 for training...


Some weights of the model checkpoint at /home/coffree/.cache/torch/sentence_transformers/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Batches: 100%|██████████| 372/372 [00:23<00:00, 15.62it/s]


Get doc embedding done.
doc_emb shape: (18589, 768)
voc size: 4833
labels size: torch.Size([18589, 30])
Preparing dataloader
train size 13012
valid size 1859
test size 3718


In [5]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        
        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
                
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #seq len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]
        
        prediction = self.fc_out(output.squeeze(0))
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden, cell

def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

class Seq2Seq(nn.Module):
    def __init__(self, decoder, device):
        super().__init__()

        self.device = device
        self.decoder = decoder
        self.apply(init_weights)
        
    def forward(self, doc_emb, trg, teacher_forcing_ratio = 0.5):
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #doc_emb = [batch size, embedding_dim]
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        # hidden = [n layers, batch size, hid dim]
        # cell = [n layers, batch size, hid dim]
        hidden = torch.unsqueeze(doc_emb, 0)
        cell = torch.unsqueeze(doc_emb, 0)
        
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1)
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1
        
        return outputs

    def predict(self, doc_emb, word2idx, idx2word, max_len=50):
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #doc_emb = [batch size, embedding_dim]
        trg_len = max_len
        batch_size = len(doc_emb)
        trg_vocab_size = len(word2idx)
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        prediction = torch.zeros(trg_len, batch_size)
        predict_voc = torch.zeros(batch_size, len(tfidf_word2idx))
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        # hidden = [n layers, batch size, hid dim]
        # cell = [n layers, batch size, hid dim]
        hidden = torch.unsqueeze(doc_emb, 0)
        cell = torch.unsqueeze(doc_emb, 0)
        
        #first input to the decoder is the <sos> tokens
        input = torch.LongTensor([word2idx["<SOS>"]] * len(doc_emb)).to(self.device)
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #get the highest predicted token from our predictions
            input = output.argmax(1)

            prediction[t] = input

            for b in range(batch_size):
                if (input[b] != word2idx["<SOS>"] and input[b] != word2idx["<PAD>"]
                     and input[b] != word2idx["<EOS>"] and input[b] != word2idx["<UNK>"]):
                    input_idx = int(input[b])
                    predict_label = tfidf_word2idx.get(idx2word[input_idx], -1)
                    if (predict_label != -1):
                        predict_voc[b][predict_label] += 1

        return prediction.transpose(0, 1), predict_voc

In [6]:
# We only need decoder part
dec = Decoder(vocabulary_size, embedding_size, hidden_size, num_layer, drop_out)
model = Seq2Seq(dec, device).to(device)

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=word2idx["<PAD>"])

CLIP = 1

In [7]:
def train(model, iterator, optimizer, criterion, clip):

    model.train()

    epoch_loss = 0

    for i, batch in enumerate(iterator):

        doc_emb, trg, _ = batch
        doc_emb = doc_emb.to(device)
        trg = torch.transpose(trg, 0, 1).to(device)
        # doc_emb = [batch_size, emb_dim]
        # trg = [trg len, batch size]
        # output = [trg len, batch size, output dim]
        output = model(doc_emb, trg)

        output_dim = output.shape[-1]

        # print(trg.size())
        # print(output.size())

        trg = trg[1:].reshape(-1)
        output = output[1:].view(-1, output_dim)

        # trg = [(trg len - 1) * batch size]
        # output = [(trg len - 1) * batch size, output dim]

        loss = criterion(output, trg)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)


In [8]:
for epoch in range(config["num_epoch"]):
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    if ((epoch + 1) % 10 == 0): 
        print("Epoch:{}/{}, train_loss:{}".format(epoch+1, config["num_epoch"], train_loss))

Epoch:10/50, train_loss:2.988927815997337
Epoch:20/50, train_loss:1.3029007848709162
Epoch:30/50, train_loss:0.3408559844154105
Epoch:40/50, train_loss:0.1577367298579626
Epoch:50/50, train_loss:0.03332847275105316


In [9]:
docemb = torch.FloatTensor(doc_embs[:2]).to(device)
prediction, predict_voc = model.predict(docemb, word2idx, idx2word, config["max_len"])

In [10]:
print(prediction.shape)
for token in prediction[0]:
    idx = int(token)
    print(idx2word[idx], end=" ")

torch.Size([2, 30])
<SOS> berkeley edu cubs article organization university california berkeley lines posting host berkeley pilot net writes era run year cubs think pitcher season helped lead era rotation cubs era <EOS 

In [11]:
print(predict_voc.size())
print(len(word2idx))
print(len(tfidf_word2idx))

torch.Size([2, 4823])
4833
4823


In [12]:
from collections import defaultdict
from utils.eval import retrieval_normalized_dcg_all, retrieval_precision_all, retrieval_precision_all_v2

def evaluate_Decoder(model, data_loader, config):
    results = defaultdict(list)
    model.eval()
    
    # predict all data
    for data in data_loader:
        doc_embs, _, target = data
        
        doc_embs = doc_embs.to(device)
        target = target.to(device)
        _, pred = model.predict(doc_embs, word2idx, idx2word)
        pred = pred.to(device)

        # Precision
        precision_scores = retrieval_precision_all(pred, target, k=config["topk"])
        for k, v in precision_scores.items():
            results['precision@{}'.format(k)].append(v)

        # Precision
        precision_scores = retrieval_precision_all_v2(pred, target, k=config["topk"])
        for k, v in precision_scores.items():
            results['precisionv2@{}'.format(k)].append(v)

        # NDCG
        ndcg_scores = retrieval_normalized_dcg_all(pred, target, k=config["topk"])
        for k, v in ndcg_scores.items():
            results['ndcg@{}'.format(k)].append(v)

    for k in results:
        results[k] = np.mean(results[k])

    return results

In [13]:
res = evaluate_Decoder(model, test_loader, config)
for key, val in res.items():
    print(f"{key}:{val:.4f}")

precision@5:0.9749
precision@10:0.9693
precision@15:0.9692
precisionv2@5:0.3955
precisionv2@10:0.3889
precisionv2@15:0.4272
ndcg@5:0.6484
ndcg@10:0.6548
ndcg@15:0.6639
ndcg@all:0.7293
