## Siamese network 
Steps:
1. load word embeding and document embedding
2. create pytorch dataset and dataloader
3. Try Contrastive loss and triplet loss
4. further improve negative sampling (e.g. hard negative or word2vec negative sampling)

#### raw data
* word embedding: glove
* doc text: ./data/IMDB.txt

### preprocess
1. truncate smallest k word in IDF
2. stemming

### model
1. k highest freq words
2. CBOW
3. Triplet
4.

### evaluation
1. F1
2. F1 weighted by TF-IDF

In [93]:
from collections import defaultdict
import math
import numpy as np 
import re
import torch
import torch.nn as nn
from itertools import cycle
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from tqdm.auto import tqdm


import matplotlib.pyplot as plt 
import pandas as pd

In [94]:
n_document = 10000
min_word_freq_threshold = 20
topk_word_freq_threshold = 0
document_vector_agg = 'mean'
select_topk_TFIDF = None

In [95]:
# load word embedding
embedding_file = "../data/glove.6B.100d.txt"

word2embedding = dict()
word_dim = int(re.findall(r".(\d+)d",embedding_file)[0])

with open(embedding_file,"r") as f:
    for line in tqdm(f):
        line = line.strip().split()
        word = line[0]
        embedding = list(map(float,line[1:]))
        word2embedding[word] = embedding

print("Number of words:%d" % len(word2embedding))

0it [00:00, ?it/s]

Number of words:400000


In [96]:
class Vocabulary:
    def __init__(self, word2embedding, min_word_freq_threshold=0, topk_word_freq_threshold=0):
        # The low frequency words will be assigned as <UNK> token
        self.itos = {0: "<UNK>"}
        self.stoi = {"<UNK>": 0}
        self.word2embedding = word2embedding
        self.min_word_freq_threshold = min_word_freq_threshold
        self.topk_word_freq_threshold = topk_word_freq_threshold
        
    def __len__(self):
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
        return text.strip().split()

    def build_vocabulary(self, sentence_list):
        self.word_freq_in_corpus = defaultdict(int)
        self.doc_freq = defaultdict(int) # # of document a word appear
        self.document_num = len(sentence_list)
        
        self.word_vectors = [[0]*word_dim] # init zero padding
        self.mask_word = set()
        idx = 1
        
        for sentence in tqdm(sentence_list, desc="Preprocessing documents"):
            # for doc_freq
            document_words = set()
            
            for word in self.tokenizer_eng(sentence):
                # pass unknown word
                if word not in self.word2embedding:
                    continue
                    
                # calculate word freq
                self.word_freq_in_corpus[word] += 1
                
                # validate word if it is more than min_word_freq_threshold times
                if self.word_freq_in_corpus[word] == self.min_word_freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    self.word_vectors.append(self.word2embedding[word])
                    idx += 1
                
                document_words.add(word)
                
            for word in document_words:
                self.doc_freq[word] += 1
        
        # calculate IDF
        self.IDF = {}
        print('doc num', self.document_num)
        for word, freq in self.doc_freq.items():
            self.IDF[word] = math.log(self.document_num / (freq+1))
        
        # eliminate smallest K IDF words
        IDF = [(word, freq) for word, freq in self.IDF.items()]
        IDF.sort(key=lambda x: x[1])
        
        print('eliminate words')
        for i in range(self.topk_word_freq_threshold):
            if IDF[i][0] not in self.stoi:
                continue
            print(IDF[i][0])
            idx = self.stoi[IDF[i][0]]
            del self.stoi[IDF[i][0]]
            del self.itos[idx]
            del self.word_freq_in_corpus[IDF[i][0]]
                
    def calculate_document_vector(self, sentence_list, agg, select_topk_TFIDF=None):
        document_vectors = []
        document_answers = []
        
        for sentence in tqdm(sentence_list, desc="calculate document vectors"):
            document_vector = np.zeros(len(self.word_vectors[0]))
            select_words = []
            for word in self.tokenizer_eng(sentence):
                # pass unknown word
                if word not in self.stoi:
                    continue
                else:
                    select_words.append(word)

            # select topk TDIDF
            if select_topk_TFIDF is not None:
                doc_TFIDF = defaultdict(float)
                for word in select_words:    
                    doc_TFIDF[word] += self.IDF[word]

                doc_TFIDF_l = [(word, TFIDF) for word, TFIDF in doc_TFIDF.items()]
                doc_TFIDF_l.sort(key=lambda x:x[1], reverse=True)
                
                select_topk_words = set(list(map(lambda x:x[0], doc_TFIDF_l[:select_topk_TFIDF])))
                select_words = [word for word in select_words if word in select_topk_words]
            else:
                pass
            
            # aggregate to doc vectors
            for word in select_words:
                if agg == 'mean':
                    document_vector += self.word2embedding[word]
                elif agg == 'TF-IDF':
                    document_vector += np.array(self.word2embedding[word]) * self.IDF[word]

            if len(select_words) == 0:
                print('error', sentence)
                return -1
            else:
                document_vector /= len(select_words)
            
            document_vectors.append(document_vector)
            document_answers.append(select_words)
        
        # get answers
        document_answers_idx = []    
        for ans in document_answers:
            ans_idx = []
            for token in ans:
                if token in self.stoi:
                    ans_idx.append(self.stoi[token])                    
            document_answers_idx.append(ans_idx)
            
        return document_vectors, document_answers_idx
        
    def numericalize(self, text):
        tokenized_text = self.tokenizer_eng(text)

        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokenized_text
        ]

In [97]:
class CBowDataset(Dataset):
    def __init__(self, 
                 raw_data_file_path,
                 word2embedding,
                 skip_header = False,
                 n_document = None, # read first n document
                 min_word_freq_threshold = 20, # eliminate less freq words
                 topk_word_freq_threshold = 5, # eliminate smallest k IDF words
                 select_topk_TFIDF = None, # select topk tf-idf as ground-truth
                 document_vector_agg = 'mean',
                 ):

        assert document_vector_agg in ['mean', 'TF-IDF']
        
        # raw documents
        self.documents = []
        # document vectors
        self.document_vectors = []
        
        with open(raw_data_file_path,'r',encoding='utf-8') as f:
            if skip_header:
                f.readline()
            for line in tqdm(f, desc="Loading documents"):
                # read firt n document
                if n_document is not None and len(self.documents) >= n_document:
                    break    
                self.documents.append(line.strip("\n"))

        # build vocabulary
        self.vocab = Vocabulary(word2embedding, min_word_freq_threshold, topk_word_freq_threshold)
        self.vocab.build_vocabulary(self.documents)
        self.vocab_size = len(self.vocab)
        
#         self.words_tokenized = [self.vocab.numericalize(text) for text in self.documents]
        
        # calculate document vectors
        self.document_vectors, self.words_tokenized = self.vocab.calculate_document_vector(self.documents, \
                                                                                           document_vector_agg, select_topk_TFIDF)
        # train-test split
        # training
        self.train_length = int(len(self.words_tokenized)*0.8)
        self.train_vectors = self.document_vectors[:self.train_length]
        self.train_words = self.words_tokenized[:self.train_length]
        self.document_ids = list(range(self.train_length))
        self.generator = cycle(self.context_target_generator())
        self.dataset_size = sum([len(s) for s in self.train_words])
        
        # testing
        self.test_vectors = self.document_vectors[self.train_length:]
        self.test_words = self.words_tokenized[self.train_length:]

    def context_target_generator(self):
        np.random.shuffle(self.document_ids) # inplace shuffle

        # randomly select a document and create its training example
        for document_id in self.document_ids: 
            word_list = set(self.train_words[document_id])
            negative_sample_space = list(set(range(self.vocab_size)) - word_list)
            negative_samples = np.random.choice(negative_sample_space,size=len(word_list),replace = False)
            for word_id, negative_wordID in zip(word_list, negative_samples):
                yield [document_id, word_id, negative_wordID]
                
    def __getitem__(self, idx):
        doc_id, word_id, negative_wordID = next(self.generator)
        doc_id = torch.FloatTensor(self.document_vectors[doc_id])
        word_id = torch.FloatTensor(self.vocab.word_vectors[word_id])
        negative_word = torch.FloatTensor(self.vocab.word_vectors[negative_wordID])

        return doc_id, word_id, negative_word

    def __len__(self):
        return self.dataset_size 


In [98]:
# load and build torch dataset
data_file_path = '../data/IMDB.txt'

print("Building dataset....")
dataset = CBowDataset(
                    raw_data_file_path=data_file_path,
                    word2embedding=word2embedding,
                    skip_header=False,
                    n_document = n_document,
                    min_word_freq_threshold = min_word_freq_threshold,
                    topk_word_freq_threshold = topk_word_freq_threshold,
                    document_vector_agg = document_vector_agg,
                    select_topk_TFIDF = select_topk_TFIDF
                    )
print("Finish building dataset!")
print(f"Number of documents:{len(dataset.documents)}")
print(f"Number of words:{dataset.vocab_size}")

Building dataset....


Loading documents: 0it [00:00, ?it/s]

Preprocessing documents:   0%|          | 0/10000 [00:00<?, ?it/s]

doc num 10000
eliminate words


calculate document vectors:   0%|          | 0/10000 [00:00<?, ?it/s]

Finish building dataset!
Number of documents:10000
Number of words:7533


In [99]:
class TestDataset(Dataset):
    def __init__(self, 
                 doc_vectors,
                 ans_words,
                 ):
        self.doc_vectors = doc_vectors
        self.ans_words = ans_words
        assert len(doc_vectors) == len(ans_words)
        
    def __getitem__(self, idx):
        doc_vec = torch.FloatTensor(self.doc_vectors[idx])
        ans_w = torch.tensor(list(set(self.ans_words[idx])))
        return doc_vec, ans_w

    def collate_fn(self,batch):
        # Batch: List of tuples [(batch1), (batch2)]
        
        doc_vec = torch.cat([item[0].unsqueeze(0) for item in batch], dim=0)
        ans_w = [item[1] for item in batch]
        ans_w = pad_sequence(ans_w, batch_first=True, padding_value=-1)
        
        return doc_vec, ans_w 

    def __len__(self):
        return len(self.doc_vectors)


In [100]:
class TripletNet(nn.Module):
    def __init__(self, hdim):
        super(TripletNet, self).__init__()
        self.fc = nn.Sequential(nn.Linear(hdim, 256),
                        nn.PReLU(),
                        nn.Linear(256, 256),
                        nn.PReLU(),
                        nn.Linear(256, 2)
                        )


    def forward(self, x1, x2, x3):
        output1 = self.fc(x1)
        output2 = self.fc(x2)
        output3 = self.fc(x3)
        return output1, output2, output3

    def get_embedding(self, x):
        return self.fc(x)

In [101]:
class TripletLoss(nn.Module):
    """
    Triplet loss
    Takes embeddings of an anchor sample, a positive sample and a negative sample
    """

    def __init__(self, margin):
        super(TripletLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative, size_average=True):
        distance_positive = (anchor - positive).pow(2).sum(1)  # .pow(.5)
        distance_negative = (anchor - negative).pow(2).sum(1)  # .pow(.5)
        losses = F.relu(distance_positive - distance_negative + self.margin)
        return losses.mean() if size_average else losses.sum()

In [102]:
margin = 1.
BATCH_SIZE = 1024
EPOCH = 300

device = "cuda:0"
model = TripletNet(word_dim).to(device)
loss_fn = TripletLoss(margin).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)

In [103]:
train_loader = DataLoader(
                        dataset, 
                        batch_size=BATCH_SIZE,
                        num_workers=4,
                        shuffle=True,
                        )

In [104]:
test_docvec = dataset.test_vectors
test_ans = dataset.test_words
test_dataset = TestDataset(test_docvec,test_ans)
test_loader = DataLoader(test_dataset,                         
                         batch_size=BATCH_SIZE,
                         num_workers=4,
                         collate_fn=test_dataset.collate_fn)
word_embedding_tensor = torch.FloatTensor(dataset.vocab.word_vectors).to(device)

In [105]:
def evaluate(test_word_emb, loader,Ks = [50,100,150,200]):
    avg_precision, avg_recall = [], []
    for batch in test_loader:
        batch = [item.to(device) for item in batch]
        emb, ans = batch
        emb = model.get_embedding(emb)
        scores = torch.cdist(emb, test_word_emb)
        ans_length = torch.sum((~ans.eq(-1)).float(), dim=-1)
        mask = ~ans.eq(-1).unsqueeze(-1)
        
        # calculate precision and recall
        tmp_pr, tmp_re = [],[]
        for K in Ks:
            top_indices = torch.argsort(scores,dim=1)[:,:K]
            hit = top_indices.unsqueeze(-2) == ans.unsqueeze(-1)
            hit = torch.sum((hit * mask).flatten(1),dim=-1)
            precision = hit / K
            recall = hit / ans_length
            tmp_pr.append(precision)
            tmp_re.append(recall)
        tmp_pr = torch.stack(tmp_pr).T.detach().cpu().numpy().tolist()
        tmp_re = torch.stack(tmp_re).T.detach().cpu().numpy().tolist()
        avg_precision.extend(tmp_pr)
        avg_recall.extend(tmp_re)
        
    avg_precision = np.mean(avg_precision,axis=0)
    avg_recall = np.mean(avg_recall, axis=0)
    for idx, kval in enumerate(Ks):
        print(f"[K={kval}] Precision:{avg_precision[idx]:.4f} Recall:{avg_recall[idx]:.4f}")
    return avg_precision, avg_recall

In [106]:
for epoch in range(EPOCH):
    avg_loss = []
    model.train()
    for batch in tqdm(train_loader):
        batch = [item.to(device) for item in batch]
        doc_id,pos_w,neg_w = batch
        optimizer.zero_grad()
        loss = loss_fn(*model(doc_id,pos_w,neg_w))
        loss.backward()
        optimizer.step()
        avg_loss.append(loss.item())
    avg_loss = np.mean(avg_loss)
    print(f"Loss:{avg_loss:4f}")
    
    # evaluate
    model.eval()
    test_word_emb = model.get_embedding(word_embedding_tensor)
    res = evaluate(test_word_emb,test_loader)

  0%|          | 0/1684 [00:00<?, ?it/s]

Loss:0.322057
[K=50] Precision:0.5557 Recall:0.2523
[K=100] Precision:0.4120 Recall:0.3627
[K=150] Precision:0.3307 Recall:0.4294
[K=200] Precision:0.2791 Recall:0.4788


  0%|          | 0/1684 [00:00<?, ?it/s]

Loss:0.311204
[K=50] Precision:0.5526 Recall:0.2506
[K=100] Precision:0.4097 Recall:0.3604
[K=150] Precision:0.3316 Recall:0.4307
[K=200] Precision:0.2807 Recall:0.4823


  0%|          | 0/1684 [00:00<?, ?it/s]

Loss:0.308062
[K=50] Precision:0.5721 Recall:0.2596
[K=100] Precision:0.4158 Recall:0.3657
[K=150] Precision:0.3312 Recall:0.4305
[K=200] Precision:0.2789 Recall:0.4796


  0%|          | 0/1684 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [174]:
from sklearn.metrics import ndcg_score

def evaluate_NDCG(test_word_emb, loader):
    NDCGs = []
    
    for batch in tqdm(test_loader):
        batch = [item.to(device) for item in batch]
        emb, ans = batch
        TFIDF_ans = np.zeros((len(ans), test_word_emb.shape[0]))
        for i in range(len(ans)):
            ans_row = ans[i]
            ans_row = ans_row[~ans_row.eq(-1)]
            for word_id in ans_row:
                word_id = word_id.item()
                word = dataset.vocab.itos[word_id]
                TFIDF_ans[i][word_id] += dataset.vocab.IDF[word]
             
        emb = model.get_embedding(emb)
        scores = -torch.cdist(emb, test_word_emb).cpu().detach().numpy()

        true_relevance = TFIDF_ans
        NDCG_score = ndcg_score(true_relevance, scores)
        NDCGs.append(NDCG_score)
        
    
    avg_NDCGs = np.mean(NDCGs)
    print('avg_NDCG', avg_NDCGs)
    return avg_NDCGs

In [175]:
evaluate_NDCG(test_word_emb,test_loader)

  0%|          | 0/2 [00:00<?, ?it/s]

avg_NDCG 0.4466589128981244


0.4466589128981244

## DNN model

## Lasso

In [122]:
# select > 0
def metric1(binary_x, answer, verbose=0):
    binary_x = binary_x > 0
    pred = np.arange(len(binary_x))[binary_x]
    
    hit = np.intersect1d(pred, answer)
    hit_num = len(hit)
    recall = hit_num / len(answer)
    precision = hit_num / len(pred)
    if verbose == 1:
        print('answer:', word_list[answer])
        print('hit:', word_list[hit])    
    return {"recall": recall, "precision": precision}

# select > 0.5
def metric2(binary_x, answer, th=0.5, verbose=0):
#     binary_x = np.array([int(np.round(i)) for i in binary_x])
    pred = np.arange(len(binary_x))[binary_x>=th]

    hit = np.intersect1d(pred, answer)
    hit_num = len(hit)
    recall = hit_num / len(answer)
    precision = hit_num / len(pred)
    if verbose == 1:
        print('answer:', word_list[answer])
        print('hit:', word_list[hit])    
    return {"recall": recall, "precision": precision}

# select #answer largest pred
def metric3(binary_x, answer, w_idx=None, scale=1, verbose=0):
    select_num = int(len(answer) * scale)
    if w_idx is not None:
        pred = w_idx[np.argsort(binary_x)[-select_num:]]
    else:
        pred = np.arange(len(binary_x))[np.argsort(binary_x)[-select_num:]]
    
    hit = np.intersect1d(pred, answer)
    hit_num = len(hit)
    recall = hit_num / len(answer)
    precision = hit_num / len(pred)
    if verbose == 1:
        print('answer:', word_list[answer])
        print('hit:', word_list[hit])
    return {"recall": recall, "precision": precision}

# select #answer largest pred
def metric4(binary_x, answer, w_idx=None, topk=50, verbose=0):
    select_num = topk
    answer = list(set(answer))
    
    if w_idx is not None:
        pred = w_idx[np.argsort(binary_x)[-select_num:]]
    else:
        pred = np.arange(len(binary_x))[np.argsort(binary_x)[-select_num:]]
    
    hit = np.intersect1d(pred, answer)
    hit_num = len(hit)
    recall = hit_num / len(answer)
    precision = hit_num / len(pred)
    if verbose == 1:
        print('answer:', word_list[answer])
        print('hit:', word_list[hit])
    return {"recall": recall, "precision": precision}

In [115]:
from sklearn.metrics import r2_score

class PyTorchLinearRegression:
    ''' Class that implemnets Multiple Linear Regression with PyTorch'''
    def __init__(self, num_of_features, lr, constraintHigh, constraintLow, total, init_type=0, L1=0, L2=0):
        if init_type == 0:
            self.w = torch.zeros(num_of_features, requires_grad=True)
        elif init_type == 1:
            self.w = torch.ones(num_of_features, requires_grad=True)
        elif init_type == 2:  
            self.w = torch.rand(num_of_features, requires_grad=True)
        elif init_type == 3:
            self.w = -torch.ones(num_of_features, requires_grad=True)

        self.learning_rate = lr
        self.high = constraintHigh
        self.low = constraintLow
        self.total = total
        self.rg2 = total / num_of_features
        self.L1 = L1
        self.L2 = L2
        
    def _model(self, X):
        return X @ self.w.t()# + self.b
    
    def _mse(self, pred, real):
        difference = pred - real
        return torch.sum(difference * difference) / difference.numel()
    
    def _regularization_weightdist(self):
        difference = self.w - 1
        return -torch.sum(difference * difference) / difference.numel()
    
    def _regularization_weightsum(self):
        difference = torch.sum(self.w) - self.total
        return difference * difference / self.w.numel()
        
#     def _regularization_L1(self):
#         return self.w.norm(1)#torch.sum(torch.abs(self.w))
    
#     def _regularization_L2(self):
#         return self.w.norm(2)
    
    def fit(self, X, y, epochs):
        print(loss_weight)
        X = torch.from_numpy(X).float()
        y = torch.from_numpy(y).float()
        
        for i in range(epochs):
            predictions = self._model(X)
            loss1 = self._mse(predictions, y)
            loss2 = self._regularization_weightdist()
            loss3 = self._regularization_weightsum()
            loss = loss1 * loss_weight[0] + loss2 * loss_weight[1] + loss3 * loss_weight[2]
#             loss = loss1
            
            if (i % (epochs//20)) == 0:
                print(f'Epoch: {i} - Loss: {loss1}')
            
            loss.backward()
            with torch.no_grad():
                self.w -= (self.w.grad) * self.learning_rate + torch.sign(self.w)*self.L1 + self.w*self.L2
                self.w.grad.zero_()
                self.w.data.clamp_(min=self.low, max=self.high)

#             x = 100
#             if i % x == x-1:
# #                 self.w=torch.tensor(self.low + (self.high-self.low)*(self.w - torch.min(self.w))/(torch.max(self.w) - torch.min(self.w)), requires_grad=True)
#                 self.w.data.clamp_(min=self.low, max=self.high)
#                 pass
                
    def predict(self, X):
        X = torch.from_numpy(X).float()
        return self._model(X)
    
    def score(self, X, y):
        X = torch.from_numpy(X).float()
        y_pred = self._model(X).detach().numpy()
        return r2_score(y, y_pred)

In [125]:
word_embs = np.array(dataset.vocab.word_vectors)
doc_embs = np.array(dataset.document_vectors)
doc_answers = dataset.words_tokenized
word_list = dataset.vocab.itos

print(word_embs.shape)
print(doc_embs.shape)
print(len(doc_answers))


(7533, 100)
(10000, 100)
10000


In [139]:
pr, re = [[],[],[]], [[],[],[]]
lr = 0.001
epochs = 1000
constraintHigh=1
constraintLow=0
# constraintHigh=float('inf')
# constraintLow=-float('inf')
loss_weight = [1, 10, 0]
L1, L2 = 1e-5, 0
rand_type = 0

total_mul = 1

for uid, uemb in enumerate(tqdm(doc_embs[:100])):
    x = word_embs.T
    y = uemb
    total = len(doc_answers[uid])

    torch_model = PyTorchLinearRegression(x.shape[1], lr, constraintHigh, constraintLow, int(total*total_mul), rand_type, L1, L2)
    torch_model.fit(x, y, epochs)
    
    m1 = metric4(torch_model.w.detach().numpy(), doc_answers[select_user], w_idx=None, topk=50, verbose=0)
    m2 = metric4(torch_model.w.detach().numpy(), doc_answers[select_user], w_idx=None, topk=100, verbose=0)
    m3 = metric4(torch_model.w.detach().numpy(), doc_answers[select_user], w_idx=None, topk=200, verbose=0)
    pr[0].append(m1["precision"])
    re[0].append(m1["recall"])
    pr[1].append(m2["precision"])
    re[1].append(m2["recall"])
    pr[2].append(m3["precision"])
    re[2].append(m3["recall"])

  0%|          | 0/100 [00:00<?, ?it/s]

[1, 10, 0]
Epoch: 0 - Loss: 0.17827610671520233
Epoch: 50 - Loss: 0.0025473786517977715
Epoch: 100 - Loss: 0.0017679232405498624
Epoch: 150 - Loss: 0.001450026291422546
Epoch: 200 - Loss: 0.0012604641960933805
Epoch: 250 - Loss: 0.001124239875935018
Epoch: 300 - Loss: 0.001023078104481101
Epoch: 350 - Loss: 0.0009503070032224059
Epoch: 400 - Loss: 0.0008962148567661643
Epoch: 450 - Loss: 0.0008509302278980613
Epoch: 500 - Loss: 0.0008104497683234513
Epoch: 550 - Loss: 0.0007780898595228791
Epoch: 600 - Loss: 0.000749944883864373
Epoch: 650 - Loss: 0.0007277996046468616
Epoch: 700 - Loss: 0.0007051937282085419
Epoch: 750 - Loss: 0.0006865779869258404
Epoch: 800 - Loss: 0.0006683788960799575
Epoch: 850 - Loss: 0.0006555995205417275
Epoch: 900 - Loss: 0.0006414588424377143
Epoch: 950 - Loss: 0.0006292320322245359
[1, 10, 0]
Epoch: 0 - Loss: 0.17931674420833588
Epoch: 50 - Loss: 0.0025776217225939035
Epoch: 100 - Loss: 0.001732667675241828
Epoch: 150 - Loss: 0.0013651929330080748
Epoch: 20

Epoch: 250 - Loss: 0.0011074277572333813
Epoch: 300 - Loss: 0.0010322825983166695
Epoch: 350 - Loss: 0.0009716759668663144
Epoch: 400 - Loss: 0.0009281123639084399
Epoch: 450 - Loss: 0.0008911907789297402
Epoch: 500 - Loss: 0.0008642126340419054
Epoch: 550 - Loss: 0.0008403874817304313
Epoch: 600 - Loss: 0.0008194331894628704
Epoch: 650 - Loss: 0.0008022725814953446
Epoch: 700 - Loss: 0.0007856122683733702
Epoch: 750 - Loss: 0.0007722590235061944
Epoch: 800 - Loss: 0.0007598865195177495
Epoch: 850 - Loss: 0.0007508056587539613
Epoch: 900 - Loss: 0.0007410405669361353
Epoch: 950 - Loss: 0.0007326100021600723
[1, 10, 0]
Epoch: 0 - Loss: 0.1747533082962036
Epoch: 50 - Loss: 0.0021771984174847603
Epoch: 100 - Loss: 0.0014987352769821882
Epoch: 150 - Loss: 0.0011870298767462373
Epoch: 200 - Loss: 0.0010040339548140764
Epoch: 250 - Loss: 0.0008834835607558489
Epoch: 300 - Loss: 0.0007968731224536896
Epoch: 350 - Loss: 0.0007320552249439061
Epoch: 400 - Loss: 0.0006809664773754776
Epoch: 450 

Epoch: 700 - Loss: 0.0006487315404228866
Epoch: 750 - Loss: 0.0006318443920463324
Epoch: 800 - Loss: 0.0006137600285001099
Epoch: 850 - Loss: 0.0005992481601424515
Epoch: 900 - Loss: 0.0005861297249794006
Epoch: 950 - Loss: 0.0005736635648645461
[1, 10, 0]
Epoch: 0 - Loss: 0.16584837436676025
Epoch: 50 - Loss: 0.0027074026875197887
Epoch: 100 - Loss: 0.0018831308698281646
Epoch: 150 - Loss: 0.0015532454708591104
Epoch: 200 - Loss: 0.0013630569446831942
Epoch: 250 - Loss: 0.0012291350867599249
Epoch: 300 - Loss: 0.0011241261381655931
Epoch: 350 - Loss: 0.0010476458119228482
Epoch: 400 - Loss: 0.0009843426523730159
Epoch: 450 - Loss: 0.0009321675170212984
Epoch: 500 - Loss: 0.0008930459152907133
Epoch: 550 - Loss: 0.0008595000253990293
Epoch: 600 - Loss: 0.0008321077330037951
Epoch: 650 - Loss: 0.0008067035814747214
Epoch: 700 - Loss: 0.0007866839878261089
Epoch: 750 - Loss: 0.0007673271466046572
Epoch: 800 - Loss: 0.000750822713598609
Epoch: 850 - Loss: 0.0007375401328317821
Epoch: 900 

Epoch: 900 - Loss: 0.0006704070256091654
Epoch: 950 - Loss: 0.0006537676090374589
[1, 10, 0]
Epoch: 0 - Loss: 0.16816118359565735
Epoch: 50 - Loss: 0.0023565683513879776
Epoch: 100 - Loss: 0.0016410652315244079
Epoch: 150 - Loss: 0.0013496286701411009
Epoch: 200 - Loss: 0.0011838997015729547
Epoch: 250 - Loss: 0.0010638957610353827
Epoch: 300 - Loss: 0.00097485794685781
Epoch: 350 - Loss: 0.0009060262236744165
Epoch: 400 - Loss: 0.0008498824900016189
Epoch: 450 - Loss: 0.0008046538569033146
Epoch: 500 - Loss: 0.0007644615252502263
Epoch: 550 - Loss: 0.000733960245270282
Epoch: 600 - Loss: 0.0007076047477312386
Epoch: 650 - Loss: 0.0006843068986199796
Epoch: 700 - Loss: 0.0006653477903455496
Epoch: 750 - Loss: 0.0006464075413532555
Epoch: 800 - Loss: 0.0006318718660622835
Epoch: 850 - Loss: 0.0006150745321065187
Epoch: 900 - Loss: 0.0006012500962242484
Epoch: 950 - Loss: 0.0005899625830352306
[1, 10, 0]
Epoch: 0 - Loss: 0.19794349372386932
Epoch: 50 - Loss: 0.00261895009316504
Epoch: 10

Epoch: 50 - Loss: 0.002484549768269062
Epoch: 100 - Loss: 0.0017592598451301455
Epoch: 150 - Loss: 0.001465404755435884
Epoch: 200 - Loss: 0.0013025104999542236
Epoch: 250 - Loss: 0.0011879961239174008
Epoch: 300 - Loss: 0.0011024209670722485
Epoch: 350 - Loss: 0.0010280914139002562
Epoch: 400 - Loss: 0.0009682632517069578
Epoch: 450 - Loss: 0.0009182748035527766
Epoch: 500 - Loss: 0.0008794620516709983
Epoch: 550 - Loss: 0.0008440089295618236
Epoch: 600 - Loss: 0.0008134423405863345
Epoch: 650 - Loss: 0.000786221818998456
Epoch: 700 - Loss: 0.0007624243735335767
Epoch: 750 - Loss: 0.0007420868496410549
Epoch: 800 - Loss: 0.0007224330329336226
Epoch: 850 - Loss: 0.0007047282997518778
Epoch: 900 - Loss: 0.0006919457809999585
Epoch: 950 - Loss: 0.0006773598724976182
[1, 10, 0]
Epoch: 0 - Loss: 0.18059608340263367
Epoch: 50 - Loss: 0.002042071195319295
Epoch: 100 - Loss: 0.0014313501305878162
Epoch: 150 - Loss: 0.0011641350574791431
Epoch: 200 - Loss: 0.0010123620741069317
Epoch: 250 - Lo

Epoch: 100 - Loss: 0.0014710001414641738
Epoch: 150 - Loss: 0.0011710218386724591
Epoch: 200 - Loss: 0.0009983573108911514
Epoch: 250 - Loss: 0.0008835679036565125
Epoch: 300 - Loss: 0.0008063804125413299
Epoch: 350 - Loss: 0.0007410216494463384
Epoch: 400 - Loss: 0.000696481904014945
Epoch: 450 - Loss: 0.0006600160268135369
Epoch: 500 - Loss: 0.0006296074134297669
Epoch: 550 - Loss: 0.000605473353061825
Epoch: 600 - Loss: 0.0005857882788404822
Epoch: 650 - Loss: 0.0005661834729835391
Epoch: 700 - Loss: 0.0005483645363710821
Epoch: 750 - Loss: 0.000534384569618851
Epoch: 800 - Loss: 0.0005244703497737646
Epoch: 850 - Loss: 0.0005153842503204942
Epoch: 900 - Loss: 0.000507102464325726
Epoch: 950 - Loss: 0.0004978244542144239
[1, 10, 0]
Epoch: 0 - Loss: 0.19884270429611206
Epoch: 50 - Loss: 0.0028918650932610035
Epoch: 100 - Loss: 0.0020153566729277372
Epoch: 150 - Loss: 0.0016469416441395879
Epoch: 200 - Loss: 0.0014326616656035185
Epoch: 250 - Loss: 0.0012912581441923976
Epoch: 300 - L

Epoch: 500 - Loss: 0.0011278889141976833
Epoch: 550 - Loss: 0.001095327315852046
Epoch: 600 - Loss: 0.0010663222055882215
Epoch: 650 - Loss: 0.0010421487968415022
Epoch: 700 - Loss: 0.0010213416535407305
Epoch: 750 - Loss: 0.0010029349941760302
Epoch: 800 - Loss: 0.0009823970030993223
Epoch: 850 - Loss: 0.0009675237815827131
Epoch: 900 - Loss: 0.0009525562054477632
Epoch: 950 - Loss: 0.0009373400243930519
[1, 10, 0]
Epoch: 0 - Loss: 0.16304561495780945
Epoch: 50 - Loss: 0.0024826182052493095
Epoch: 100 - Loss: 0.001786702312529087
Epoch: 150 - Loss: 0.0014980777632445097
Epoch: 200 - Loss: 0.0013269054470583797
Epoch: 250 - Loss: 0.0012112538097426295
Epoch: 300 - Loss: 0.001124634756706655
Epoch: 350 - Loss: 0.001051798346452415
Epoch: 400 - Loss: 0.0009902145247906446
Epoch: 450 - Loss: 0.0009409834165126085
Epoch: 500 - Loss: 0.0009026575717143714
Epoch: 550 - Loss: 0.0008742252830415964
Epoch: 600 - Loss: 0.000845101079903543
Epoch: 650 - Loss: 0.0008205960621125996
Epoch: 700 - Lo

Epoch: 850 - Loss: 0.0008061318076215684
Epoch: 900 - Loss: 0.000790842401329428
Epoch: 950 - Loss: 0.000776616099756211
[1, 10, 0]
Epoch: 0 - Loss: 0.1888064593076706
Epoch: 50 - Loss: 0.0027573055122047663
Epoch: 100 - Loss: 0.0018675258615985513
Epoch: 150 - Loss: 0.001497673336416483
Epoch: 200 - Loss: 0.0012769411550834775
Epoch: 250 - Loss: 0.0011370584834367037
Epoch: 300 - Loss: 0.0010339929722249508
Epoch: 350 - Loss: 0.0009522709297016263
Epoch: 400 - Loss: 0.0008894415223039687
Epoch: 450 - Loss: 0.0008369356510229409
Epoch: 500 - Loss: 0.000795426603872329
Epoch: 550 - Loss: 0.0007611719775013626
Epoch: 600 - Loss: 0.0007322314195334911
Epoch: 650 - Loss: 0.0007067622500471771
Epoch: 700 - Loss: 0.0006815873784944415
Epoch: 750 - Loss: 0.0006615808815695345
Epoch: 800 - Loss: 0.0006468411302193999
Epoch: 850 - Loss: 0.0006305871065706015
Epoch: 900 - Loss: 0.0006146127125248313
Epoch: 950 - Loss: 0.0006053465767763555
[1, 10, 0]
Epoch: 0 - Loss: 0.18492966890335083
Epoch: 5

Epoch: 850 - Loss: 0.000716599402949214
Epoch: 900 - Loss: 0.000706146820448339
Epoch: 950 - Loss: 0.0006945326458662748
[1, 10, 0]
Epoch: 0 - Loss: 0.18244224786758423
Epoch: 50 - Loss: 0.002383534563705325
Epoch: 100 - Loss: 0.001665937015786767
Epoch: 150 - Loss: 0.0013599314261227846
Epoch: 200 - Loss: 0.0011794946622103453
Epoch: 250 - Loss: 0.0010512302396818995
Epoch: 300 - Loss: 0.0009606469538994133
Epoch: 350 - Loss: 0.0008958487887866795
Epoch: 400 - Loss: 0.0008411644957959652
Epoch: 450 - Loss: 0.00079882494173944
Epoch: 500 - Loss: 0.0007660843548364937
Epoch: 550 - Loss: 0.0007371437386609614
Epoch: 600 - Loss: 0.000714646652340889
Epoch: 650 - Loss: 0.0006948694353923202
Epoch: 700 - Loss: 0.0006783563876524568
Epoch: 750 - Loss: 0.0006631431169807911
Epoch: 800 - Loss: 0.0006501146126538515
Epoch: 850 - Loss: 0.0006378639372996986
Epoch: 900 - Loss: 0.0006282342947088182
Epoch: 950 - Loss: 0.0006135145085863769
[1, 10, 0]
Epoch: 0 - Loss: 0.1624622344970703
Epoch: 50 -

Epoch: 50 - Loss: 0.0024254852905869484
Epoch: 100 - Loss: 0.001698460429906845
Epoch: 150 - Loss: 0.0013948738342151046
Epoch: 200 - Loss: 0.0012213146546855569
Epoch: 250 - Loss: 0.0011039874516427517
Epoch: 300 - Loss: 0.0010138702346011996
Epoch: 350 - Loss: 0.0009454736718907952
Epoch: 400 - Loss: 0.0008881325484253466
Epoch: 450 - Loss: 0.0008447114960290492
Epoch: 500 - Loss: 0.0008105452870950103
Epoch: 550 - Loss: 0.0007832199335098267
Epoch: 600 - Loss: 0.0007566851563751698
Epoch: 650 - Loss: 0.0007364132907241583
Epoch: 700 - Loss: 0.0007165978895500302
Epoch: 750 - Loss: 0.000702673802152276
Epoch: 800 - Loss: 0.0006869733333587646
Epoch: 850 - Loss: 0.0006762867560610175
Epoch: 900 - Loss: 0.0006660722428932786
Epoch: 950 - Loss: 0.0006550212274305522
[1, 10, 0]
Epoch: 0 - Loss: 0.1824425905942917
Epoch: 50 - Loss: 0.0033487901091575623
Epoch: 100 - Loss: 0.002369505353271961
Epoch: 150 - Loss: 0.001977580366656184
Epoch: 200 - Loss: 0.001749497838318348
Epoch: 250 - Loss

In [140]:
print(f"Precision:{np.mean(pr[0]):.4f} Recall:{np.mean(re[0]):.4f}")
print(f"Precision:{np.mean(pr[1]):.4f} Recall:{np.mean(re[1]):.4f}")
print(f"Precision:{np.mean(pr[2]):.4f} Recall:{np.mean(re[2]):.4f}")


Precision:0.3566 Recall:0.1415
Precision:0.2631 Recall:0.2088
Precision:0.1834 Recall:0.2910


## Top K freq word

In [107]:
word_freq = [(word, freq) for word, freq in dataset.vocab.word_freq_in_corpus.items()]
word_freq.sort(key=lambda x:x[1], reverse=True)
word_freq[:10]

[('the', 137959),
 ('and', 71459),
 ('a', 66359),
 ('of', 61514),
 ('to', 52823),
 ('is', 45511),
 ('in', 39808),
 ('it', 31652),
 ('i', 29011),
 ('this', 27891)]

In [108]:
def topk_word_evaluation(k=50):
    topk_word = [word for (word, freq) in word_freq[:k]]

    pr, re = [], []
    for ans in tqdm(test_ans):
        ans = set(ans)
        ans = [dataset.vocab.itos[a] for a in ans]

        hit = []
        for word in ans:
            if word in topk_word:
                hit.append(word)

        precision = len(hit) / k
        recall = len(hit) / len(ans)
        pr.append(precision)
        re.append(recall)

    print('top {} word'.format(k))
    print('percision', np.mean(pr))
    print('recall', np.mean(re))

topk_word_evaluation(k=50)
topk_word_evaluation(k=100)
topk_word_evaluation(k=200)


  0%|          | 0/2000 [00:00<?, ?it/s]

top 50 word
percision 0.57312
recall 0.25995568997152513


  0%|          | 0/2000 [00:00<?, ?it/s]

top 100 word
percision 0.42140999999999995
recall 0.3705359126710024


  0%|          | 0/2000 [00:00<?, ?it/s]

top 200 word
percision 0.2830875
recall 0.48635047389378483
