## Siamese network 
Steps:
1. load word embedding and document embedding
2. create pytorch dataset and dataloader
3. Try Contrastive loss and triplet loss
4. further improve negative sampling (e.g. hard negative or word2vec negative sampling)

## Dataset
Goal:
    for dataloader
1. Read raw file
2. 

CBOW:
    local words and center word
DNN:
    Document vectors and words emb
Triplet:
    

#### raw data
* word embedding: glove
* doc text: ./data/IMDB.txt

### preprocess
1. drop the k words with the smallest IDF
2. stemming

### model
1. k highest freq words
2. CBOW
3. Triplet
4.

### evaluation
1. F1
2. F1 weighted by TF-IDF

In [1]:
from collections import defaultdict
import math
import numpy as np 
import re
import torch
import torch.nn as nn
from itertools import cycle
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from tqdm.auto import tqdm


import matplotlib.pyplot as plt 
import pandas as pd

In [25]:
# Experiment settings consumed when the dataset is built below.
config = {
    # read only the first n documents of the corpus
    "n_document": 10000,
    # words seen fewer times than this in the corpus are dropped
    "min_word_freq_threshold": 20,
    # number of lowest-IDF words to drop (0 disables the filter)
    "topk_word_freq_threshold": 0,
    # how word vectors are aggregated into a document vector
    "document_vector_agg": 'TF-IDF',
    # keep only the top-k TF-IDF words per document (None keeps all)
    "select_topk_TFIDF": None,
}


In [26]:
# Load pretrained word embeddings into a {word: list-of-floats} dict.
embedding_file = "../data/glove.6B.100d.txt"

word2embedding = dict()
# The embedding width is parsed from the file name ("...100d..." -> 100).
word_dim = int(re.findall(r".(\d+)d",embedding_file)[0])

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale encoding, which can fail or mis-decode on non-UTF-8 systems.
with open(embedding_file, "r", encoding="utf-8") as f:
    for line in tqdm(f):
        fields = line.strip().split()
        if not fields:
            # tolerate blank lines instead of raising IndexError on fields[0]
            continue
        # first field is the token, the remaining fields are its vector
        word2embedding[fields[0]] = list(map(float, fields[1:]))

print("Number of words:%d" % len(word2embedding))

0it [00:00, ?it/s]

Number of words:400000


In [27]:
class Vocabulary:
    """Word/index mapping plus corpus statistics for a list of documents.

    Index 0 is reserved for the "<UNK>" token, whose vector is all zeros.
    Only words present in ``word2embedding`` are counted; after
    ``build_vocabulary`` the instance exposes ``stoi``/``itos``,
    ``word_vectors`` (row i is the embedding of word id i), ``IDF`` and
    ``word_freq_in_corpus``.
    """

    def __init__(self, word2embedding, min_word_freq_threshold=0, topk_word_freq_threshold=0):
        """
        Args:
            word2embedding: mapping word -> embedding (sequence of floats).
            min_word_freq_threshold: words with a corpus frequency below this
                value are removed from the vocabulary.
            topk_word_freq_threshold: number of lowest-IDF words to remove.
        """
        # The low frequency words will be assigned as <UNK> token
        self.itos = {0: "<UNK>"}
        self.stoi = {"<UNK>": 0}

        self.word2embedding = word2embedding
        self.min_word_freq_threshold = min_word_freq_threshold
        self.topk_word_freq_threshold = topk_word_freq_threshold

        self.word_freq_in_corpus = defaultdict(int)
        self.IDF = {}

    def __len__(self):
        """Return the vocabulary size, including the <UNK> entry."""
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        """Strip every character except ASCII letters, digits and spaces, then split on whitespace."""
        # Imported locally on purpose: other cells of this notebook rebind the
        # global name `re` to a list (`pr, re = ...`), which would otherwise
        # break tokenisation when this method is called afterwards.
        import re as regex

        text = regex.sub(r'[^A-Za-z0-9 ]+', '', text)
        return text.strip().split()

    def build_vocabulary(self, sentence_list):
        """Collect word statistics from ``sentence_list`` and build the index maps.

        Args:
            sentence_list: list of raw document strings.
        """
        self.doc_freq = defaultdict(int) # # of document a word appear
        self.document_num = len(sentence_list)
        # Take the embedding width from the embeddings themselves instead of a
        # notebook-level global, so the class works on its own.
        embedding_dim = len(next(iter(self.word2embedding.values()), []))
        self.word_vectors = [[0] * embedding_dim] # unknown word emb

        for sentence in tqdm(sentence_list, desc="Preprocessing documents"):
            # distinct known words of this document, for doc_freq
            document_words = set()

            for word in self.tokenizer_eng(sentence):
                # pass unknown word
                if word not in self.word2embedding:
                    continue

                # calculate word freq
                self.word_freq_in_corpus[word] += 1
                document_words.add(word)

            for word in document_words:
                self.doc_freq[word] += 1

        # calculate IDF (smoothed with +1 in the denominator)
        print('doc num', self.document_num)
        for word, freq in self.doc_freq.items():
            self.IDF[word] = math.log(self.document_num / (freq+1))

        # delete less freq words
        delete_words = [
            word
            for word, count in self.word_freq_in_corpus.items()
            if count < self.min_word_freq_threshold
        ]
        for word in delete_words:
            del self.IDF[word]
            del self.word_freq_in_corpus[word]

        # delete too freq words: the ones with the smallest IDF
        print('eliminate freq words')
        idf_ascending = sorted(self.IDF.items(), key=lambda item: item[1])
        # Slicing (rather than indexing IDF[i]) keeps this safe when the
        # threshold exceeds the number of remaining words.
        n_drop = max(0, self.topk_word_freq_threshold)
        for word, _ in idf_ascending[:n_drop]:
            # report the word that is actually being removed
            print(word)
            del self.IDF[word]
            del self.word_freq_in_corpus[word]

        # construct word_vectors; ids start at 1 because 0 is <UNK>
        idx = 1
        for word in self.word_freq_in_corpus:
            self.word_vectors.append(self.word2embedding[word])
            self.stoi[word] = idx
            self.itos[idx] = word
            idx += 1

    def calculate_document_vector(self, sentence_list, agg, select_topk_TFIDF=None):
        """Aggregate word embeddings into one vector per document.

        Args:
            sentence_list: list of raw document strings.
            agg: 'mean' sums the raw embeddings, 'TF-IDF' sums the embeddings
                scaled by each word's IDF; both are divided by the number of
                selected tokens. Any other value leaves the vector at zero.
            select_topk_TFIDF: if not None, keep only tokens whose word is
                among the k highest summed-IDF words of the document.

        Returns:
            ``(document_vectors, document_answers_idx)``: a list of numpy
            vectors and a parallel list of word-id lists (the selected tokens
            of each document). Returns ``-1`` as soon as a document has no
            in-vocabulary token (after printing it).
        """
        document_vectors = []
        document_answers = []

        for sentence in tqdm(sentence_list, desc="calculate document vectors"):
            document_vector = np.zeros(len(self.word_vectors[0]))
            # keep in-vocabulary tokens only (duplicates preserved)
            select_words = [
                word for word in self.tokenizer_eng(sentence) if word in self.stoi
            ]

            # select topk TFIDF
            if select_topk_TFIDF is not None:
                # summing IDF once per occurrence yields TF * IDF
                doc_TFIDF = defaultdict(float)
                for word in select_words:
                    doc_TFIDF[word] += self.IDF[word]

                doc_TFIDF_l = sorted(doc_TFIDF.items(), key=lambda x: x[1], reverse=True)
                select_topk_words = {word for word, _ in doc_TFIDF_l[:select_topk_TFIDF]}
                select_words = [word for word in select_words if word in select_topk_words]

            # aggregate to doc vectors
            for word in select_words:
                if agg == 'mean':
                    document_vector += self.word2embedding[word]
                elif agg == 'TF-IDF':
                    document_vector += np.array(self.word2embedding[word]) * self.IDF[word]

            if len(select_words) == 0:
                # NOTE: callers that unpack the result must handle this sentinel.
                print('error', sentence)
                return -1
            else:
                document_vector /= len(select_words)

            document_vectors.append(document_vector)
            document_answers.append(select_words)

        # get answers: map the selected tokens to vocabulary ids
        document_answers_idx = [
            [self.stoi[token] for token in ans if token in self.stoi]
            for ans in document_answers
        ]

        return document_vectors, document_answers_idx

    def numericalize(self, text):
        """Tokenise ``text`` and map each token to its id (<UNK> id for unknown tokens)."""
        tokenized_text = self.tokenizer_eng(text)

        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokenized_text
        ]

In [28]:
class CBowDataset(Dataset):
    """Triplet dataset of (document vector, positive word vector, negative word vector).

    Documents are read from a text file (one per line), a ``Vocabulary`` is
    built over them, document vectors are computed, and the documents are
    split 80/20 (in file order) into train and test parts. Training triplets
    are produced by an endlessly cycled generator, so ``__getitem__`` ignores
    its index argument.
    """

    def __init__(self,
                 raw_data_file_path,
                 word2embedding,
                 skip_header = False,
                 n_document = None, # read first n document
                 min_word_freq_threshold = 20, # eliminate less freq words
                 topk_word_freq_threshold = 5, # eliminate smallest k IDF words
                 select_topk_TFIDF = None, # select topk tf-idf as ground-truth
                 document_vector_agg = 'mean',
                 ):
        """
        Raises:
            ValueError: if the document vectors cannot be computed because a
                document contains no in-vocabulary word.
        """
        assert document_vector_agg in ['mean', 'TF-IDF']

        # raw documents
        self.documents = []
        # document vectors
        self.document_vectors = []

        with open(raw_data_file_path,'r',encoding='utf-8') as f:
            if skip_header:
                f.readline()
            for line in tqdm(f, desc="Loading documents"):
                # read first n document
                if n_document is not None and len(self.documents) >= n_document:
                    break
                self.documents.append(line.strip("\n"))

        # build vocabulary
        self.vocab = Vocabulary(word2embedding, min_word_freq_threshold, topk_word_freq_threshold)
        self.vocab.build_vocabulary(self.documents)
        self.vocab_size = len(self.vocab)

        # calculate document vectors
        result = self.vocab.calculate_document_vector(
            self.documents, document_vector_agg, select_topk_TFIDF)
        # calculate_document_vector signals failure with a non-tuple sentinel;
        # fail here with a clear message instead of an opaque unpacking error.
        if not isinstance(result, tuple):
            raise ValueError(
                "Could not compute document vectors: at least one document "
                "has no in-vocabulary word (see the 'error' line printed above)."
            )
        self.document_vectors, self.words_tokenized = result

        # train-test split
        # training
        self.train_length = int(len(self.words_tokenized)*0.8)
        self.train_vectors = self.document_vectors[:self.train_length]
        self.train_words = self.words_tokenized[:self.train_length]
        self.document_ids = list(range(self.train_length))
        self.generator = cycle(self.context_target_generator())
        # NOTE(review): this counts repeated tokens, whereas the generator
        # yields one triplet per *distinct* word of a document.
        self.dataset_size = sum([len(s) for s in self.train_words])

        # testing
        self.test_vectors = self.document_vectors[self.train_length:]
        self.test_words = self.words_tokenized[self.train_length:]

    def context_target_generator(self):
        """Yield ``[document_id, word_id, negative_word_id]`` for every training document.

        Documents are visited in shuffled order; for each distinct word of a
        document one word id that does not occur in it is drawn (without
        replacement) as the negative sample.
        """
        np.random.shuffle(self.document_ids) # inplace shuffle

        # loop-invariant: build the full id set once, not once per document
        all_word_ids = set(range(self.vocab_size))

        # randomly select a document and create its training example
        for document_id in self.document_ids:
            word_list = set(self.train_words[document_id])
            negative_sample_space = list(all_word_ids - word_list)
            negative_samples = np.random.choice(negative_sample_space,size=len(word_list),replace = False)
            for word_id, negative_wordID in zip(word_list, negative_samples):
                yield [document_id, word_id, negative_wordID]

    def __getitem__(self, idx):
        """Return the next generated triplet as float tensors; ``idx`` is not used."""
        doc_id, word_id, negative_wordID = next(self.generator)
        doc_id = torch.FloatTensor(self.document_vectors[doc_id])
        word_id = torch.FloatTensor(self.vocab.word_vectors[word_id])
        negative_word = torch.FloatTensor(self.vocab.word_vectors[negative_wordID])

        return doc_id, word_id, negative_word

    def __len__(self):
        """Return the total number of tokens in the training documents."""
        return self.dataset_size


In [29]:
# Build the torch dataset from the raw corpus, driven by the `config` dict.
data_file_path = '../data/IMDB.txt'

print("Building dataset....")
dataset = CBowDataset(
    raw_data_file_path=data_file_path,
    word2embedding=word2embedding,
    skip_header=False,
    # every remaining constructor option is taken verbatim from `config`
    **{
        option: config[option]
        for option in (
            "n_document",
            "min_word_freq_threshold",
            "topk_word_freq_threshold",
            "document_vector_agg",
            "select_topk_TFIDF",
        )
    },
)
print("Finish building dataset!")
print(f"Number of documents:{len(dataset.documents)}")
print(f"Number of words:{dataset.vocab_size}")

Building dataset....


Loading documents: 0it [00:00, ?it/s]

Preprocessing documents:   0%|          | 0/10000 [00:00<?, ?it/s]

doc num 10000
eliminate freq words


calculate document vectors:   0%|          | 0/10000 [00:00<?, ?it/s]

Finish building dataset!
Number of documents:10000
Number of words:7533


In [30]:
class TestDataset(Dataset):
    """Evaluation dataset pairing each document vector with its answer word ids."""

    def __init__(self, doc_vectors, ans_words):
        # both sequences must describe the same documents
        assert len(doc_vectors) == len(ans_words)
        self.doc_vectors = doc_vectors
        self.ans_words = ans_words

    def __len__(self):
        return len(self.doc_vectors)

    def __getitem__(self, idx):
        """Return (float document vector, tensor of de-duplicated answer ids)."""
        unique_ids = list(set(self.ans_words[idx]))
        return torch.FloatTensor(self.doc_vectors[idx]), torch.tensor(unique_ids)

    def collate_fn(self, batch):
        """Stack the document vectors and right-pad the answer ids with -1.

        ``batch`` is a list of ``(doc_vec, ans_w)`` pairs as returned by
        ``__getitem__``.
        """
        vectors, answers = zip(*batch)
        doc_vec = torch.stack(vectors, dim=0)
        # answers have different lengths; -1 marks padding positions
        ans_w = pad_sequence(list(answers), batch_first=True, padding_value=-1)
        return doc_vec, ans_w


In [31]:
class TripletNet(nn.Module):
    """Shared MLP that projects ``hdim``-dimensional inputs to 2-d embeddings.

    The same network is applied to the anchor, the positive and the negative
    input of a triplet.
    """

    def __init__(self, hdim):
        super().__init__()
        layers = [
            nn.Linear(hdim, 256),
            nn.PReLU(),
            nn.Linear(256, 256),
            nn.PReLU(),
            nn.Linear(256, 2),
        ]
        self.fc = nn.Sequential(*layers)

    def get_embedding(self, x):
        """Embed a single batch of inputs."""
        return self.fc(x)

    def forward(self, x1, x2, x3):
        """Embed the three inputs with the shared network and return them as a tuple."""
        return tuple(self.fc(inp) for inp in (x1, x2, x3))

In [32]:
class TripletLoss(nn.Module):
    """
    Margin-based triplet loss on squared Euclidean distances.

    For each triplet the loss is ``max(0, d(a, p) - d(a, n) + margin)`` where
    ``d`` is the squared distance summed over dimension 1.
    """

    def __init__(self, margin):
        super().__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative, size_average=True):
        """Return the mean (``size_average=True``) or the sum of the per-triplet losses."""
        pos_dist = torch.sum(torch.square(anchor - positive), dim=1)
        neg_dist = torch.sum(torch.square(anchor - negative), dim=1)
        per_triplet = F.relu(pos_dist - neg_dist + self.margin)
        if size_average:
            return per_triplet.mean()
        return per_triplet.sum()

In [33]:
# Training hyper-parameters
margin = 1.
BATCH_SIZE = 1024
EPOCH = 300

# Use the first GPU when one is available; otherwise fall back to the CPU so
# the notebook still runs on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = TripletNet(word_dim).to(device)
loss_fn = TripletLoss(margin).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)

In [34]:
# Batched loader over the training triplets.
# NOTE(review): CBowDataset.__getitem__ ignores the index and pulls from a
# stateful generator, so `shuffle` does not choose which triplet is returned,
# and each worker process holds its own copy of that generator - confirm the
# workers do not emit overlapping samples.
train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)

In [35]:
# Held-out documents and their answer word ids (the last 20% of the corpus).
test_docvec, test_ans = dataset.test_vectors, dataset.test_words
test_dataset = TestDataset(test_docvec, test_ans)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    num_workers=4,
    # answers have variable length, so they are padded per batch
    collate_fn=test_dataset.collate_fn,
)
# Embedding matrix of the whole vocabulary, moved to the training device.
word_embedding_tensor = torch.FloatTensor(dataset.vocab.word_vectors).to(device)

In [36]:
def evaluate(test_word_emb, loader, Ks=(50, 100, 150, 200)):
    """Compute mean precision@K and recall@K of the nearest-word retrieval.

    Each document vector in ``loader`` is embedded with the global ``model``
    and the vocabulary words are ranked by their distance to it (closest
    first). A hit is a retrieved word id that appears in the document's
    answer ids.

    Args:
        test_word_emb: embedded vocabulary, one row per word id.
        loader: iterable of ``(doc_vectors, answer_ids)`` batches where the
            answer ids are padded with -1.
        Ks: cut-offs at which precision and recall are reported.

    Returns:
        ``(avg_precision, avg_recall)``: arrays with one entry per value in ``Ks``.
    """
    avg_precision, avg_recall = [], []
    # evaluation only: no autograd graph is needed
    with torch.no_grad():
        # iterate the loader that was passed in, not the global test_loader
        for batch in loader:
            emb, ans = [item.to(device) for item in batch]
            emb = model.get_embedding(emb)
            scores = torch.cdist(emb, test_word_emb)
            valid = ~ans.eq(-1)  # False on padding positions
            ans_length = torch.sum(valid.float(), dim=-1)
            mask = valid.unsqueeze(-1)

            # rank once per batch; every K only needs a prefix of the ranking
            ranking = torch.argsort(scores, dim=1)

            # calculate precision and recall
            tmp_pr, tmp_re = [], []
            for K in Ks:
                top_indices = ranking[:, :K]
                # compare every retrieved id with every answer id of the document
                hit = top_indices.unsqueeze(-2) == ans.unsqueeze(-1)
                hit = torch.sum((hit * mask).flatten(1), dim=-1)
                tmp_pr.append(hit / K)
                tmp_re.append(hit / ans_length)
            # one row per document, one column per K
            avg_precision.extend(torch.stack(tmp_pr).T.detach().cpu().numpy().tolist())
            avg_recall.extend(torch.stack(tmp_re).T.detach().cpu().numpy().tolist())

    avg_precision = np.mean(avg_precision,axis=0)
    avg_recall = np.mean(avg_recall, axis=0)
    for idx, kval in enumerate(Ks):
        print(f"[K={kval}] Precision:{avg_precision[idx]:.4f} Recall:{avg_recall[idx]:.4f}")
    return avg_precision, avg_recall

In [37]:
# for epoch in range(EPOCH):
#     avg_loss = []
#     model.train()
#     for batch in tqdm(train_loader):
#         batch = [item.to(device) for item in batch]
#         doc_id,pos_w,neg_w = batch
#         optimizer.zero_grad()
#         loss = loss_fn(*model(doc_id,pos_w,neg_w))
#         loss.backward()
#         optimizer.step()
#         avg_loss.append(loss.item())
#     avg_loss = np.mean(avg_loss)
#     print(f"Loss:{avg_loss:4f}")
    
#     # evaluate
#     model.eval()
#     test_word_emb = model.get_embedding(word_embedding_tensor)
#     res = evaluate(test_word_emb,test_loader)

In [38]:
from sklearn.metrics import ndcg_score

def evaluate_NDCG(test_word_emb, loader, topk=None):
    """Compute per-batch NDCG of the nearest-word ranking against IDF-weighted answers.

    Args:
        test_word_emb: embedded vocabulary, one row per word id.
        loader: iterable of ``(doc_vectors, answer_ids)`` batches where the
            answer ids are padded with -1.
        topk: currently unused; the cut-offs 50, 100, 200 and "all" are
            always evaluated.

    Returns:
        dict mapping 'top50', 'top100', 'top200' and 'ALL' to the list of
        per-batch NDCG scores.
    """
    NDCGs = defaultdict(list)
    vocab = dataset.vocab

    # evaluation only: no autograd graph is needed
    with torch.no_grad():
        # iterate the loader that was passed in, not the global test_loader
        for batch in loader:
            emb, ans = [item.to(device) for item in batch]

            # Ground-truth relevance: the IDF of each answer word, 0 elsewhere.
            TFIDF_ans = np.zeros((len(ans), test_word_emb.shape[0]))
            # a single host transfer instead of one .item() call per id
            for i, ans_row in enumerate(ans.tolist()):
                for word_id in ans_row:
                    # skip padding (-1) and the <UNK> id (0)
                    if word_id == -1 or word_id == 0:
                        continue
                    TFIDF_ans[i][word_id] += vocab.IDF[vocab.itos[word_id]]

            emb = model.get_embedding(emb)
            # negate the distances so that closer words get higher scores
            scores = -torch.cdist(emb, test_word_emb).cpu().detach().numpy()
            true_relevance = TFIDF_ans

            NDCGs['top50'].append(ndcg_score(true_relevance, scores, k=50))
            NDCGs['top100'].append(ndcg_score(true_relevance, scores, k=100))
            NDCGs['top200'].append(ndcg_score(true_relevance, scores, k=200))
            NDCGs['ALL'].append(ndcg_score(true_relevance, scores, k=None))

    print('NDCG top50', np.mean(NDCGs['top50']))
    print('NDCG top100', np.mean(NDCGs['top100']))
    print('NDCG top200', np.mean(NDCGs['top200']))
    print('NDCG ALL', np.mean(NDCGs['ALL']))
    return NDCGs

In [39]:
# validation_history = []

# for epoch in range(EPOCH):
#     avg_loss = []
#     model.train()
#     for batch in tqdm(train_loader):
#         batch = [item.to(device) for item in batch]
#         doc_id,pos_w,neg_w = batch
#         optimizer.zero_grad()
#         loss = loss_fn(*model(doc_id,pos_w,neg_w))
#         loss.backward()
#         optimizer.step()
#         avg_loss.append(loss.item())
#     avg_loss = np.mean(avg_loss)
#     print(f"Loss:{avg_loss:4f}")
    
#     # evaluate
#     model.eval()
#     test_word_emb = model.get_embedding(word_embedding_tensor)
#     ndcg_res = evaluate_NDCG(test_word_emb,test_loader)
#     validation_history.append(ndcg_res)

## Lasso

In [47]:
# Precision / recall of the `topk` highest-scoring entries against the answer set
def metric4(binary_x, answer, w_idx=None, topk=50, verbose=0):
    """Score the ``topk`` largest entries of ``binary_x`` against ``answer``.

    Args:
        binary_x: 1-d numpy array with one score per candidate.
        answer: iterable of ground-truth ids (duplicates are ignored).
        w_idx: optional numpy array mapping positions of ``binary_x`` to ids;
            when None the positions themselves are the ids.
        topk: number of highest-scoring candidates to select; None selects all.
        verbose: when 1, print the answer words and the hit words, looked up
            in the global ``word_list`` mapping.

    Returns:
        dict with "recall" (hits / number of distinct answers) and
        "precision" (hits / number of selected candidates); a ratio with an
        empty denominator is reported as 0.0.
    """
    answer = list(set(answer))

    order = np.argsort(binary_x)
    top_positions = order if topk is None else order[-topk:]
    pred = w_idx[top_positions] if w_idx is not None else top_positions

    hit = np.intersect1d(pred, answer)
    hit_num = len(hit)
    # guard the ratios so an empty answer or selection does not raise
    recall = hit_num / len(answer) if answer else 0.0
    precision = hit_num / len(pred) if len(pred) else 0.0
    if verbose == 1:
        # word_list is an id -> word mapping, so it must be indexed id by id
        print('answer:', [word_list[a] for a in answer])
        print('hit:', [word_list[h] for h in hit.tolist()])
    return {"recall": recall, "precision": precision}

# NDCG@topk of the predicted scores against IDF-weighted ground truth
def metric_ndcg(binary_x, answer, topk=50, verbose=0):
    """Return the NDCG of ``binary_x`` for one document.

    The relevance of a word id is the sum of its IDF over its occurrences in
    ``answer`` (id 0, the <UNK> token, is ignored); every other id has
    relevance 0. ``topk`` is forwarded to ``ndcg_score`` as ``k``.
    """
    vocab = dataset.vocab
    relevance = np.zeros(len(binary_x))
    for word_idx in answer:
        if word_idx != 0:
            relevance[word_idx] += vocab.IDF[vocab.itos[word_idx]]

    # ndcg_score expects 2-d inputs: one row per query
    NDCG_score = ndcg_score(relevance.reshape(1,-1), binary_x.reshape(1,-1), k=topk)

    if verbose == 1:
        print('NDCG_score:', NDCG_score)
    return NDCG_score

In [48]:
from sklearn.metrics import r2_score

class PyTorchLinearRegression:
    """Linear regression without intercept, fitted by full-batch gradient descent.

    The weights are kept inside ``[constraintLow, constraintHigh]`` by
    clamping after every step, and optional L1 / L2 shrinkage terms are added
    to the update.
    """

    def __init__(self, num_of_features, lr, constraintHigh, constraintLow, total, init_type=0, L1=0, L2=0):
        """
        Args:
            num_of_features: number of weights.
            lr: gradient-descent step size.
            constraintHigh: upper clamp applied to the weights after each step.
            constraintLow: lower clamp applied to the weights after each step.
            total: target value for the sum of the weights (used by the
                weight-sum regulariser).
            init_type: 0 = zeros, 1 = ones, 2 = uniform random in [0, 1),
                3 = minus ones.
            L1: coefficient of the sign(w) term in the update.
            L2: coefficient of the w term in the update.

        Raises:
            ValueError: if ``init_type`` is not one of 0, 1, 2, 3.
        """
        if init_type == 0:
            self.w = torch.zeros(num_of_features, requires_grad=True)
        elif init_type == 1:
            self.w = torch.ones(num_of_features, requires_grad=True)
        elif init_type == 2:
            self.w = torch.rand(num_of_features, requires_grad=True)
        elif init_type == 3:
            # Create the values first and mark the result as requiring grad:
            # negating a tensor that already requires grad yields a non-leaf
            # tensor whose .grad stays None, which breaks fit().
            self.w = torch.full((num_of_features,), -1.0, requires_grad=True)
        else:
            raise ValueError(f"init_type must be 0, 1, 2 or 3, got {init_type!r}")

        self.learning_rate = lr
        self.high = constraintHigh
        self.low = constraintLow
        self.total = total
        self.rg2 = total / num_of_features
        self.L1 = L1
        self.L2 = L2

    def _model(self, X):
        """Linear prediction X @ w (no bias term)."""
        return X @ self.w

    def _mse(self, pred, real):
        """Mean squared error between ``pred`` and ``real``."""
        difference = pred - real
        return torch.sum(difference * difference) / difference.numel()

    def _regularization_weightdist(self):
        """Negative mean squared distance of the weights from 1."""
        difference = self.w - 1
        return -torch.sum(difference * difference) / difference.numel()

    def _regularization_weightsum(self):
        """Squared gap between sum(w) and ``total``, divided by the number of weights."""
        difference = torch.sum(self.w) - self.total
        return difference * difference / self.w.numel()

    def fit(self, X, y, epochs, loss_weight=None):
        """Run ``epochs`` gradient-descent steps on the weighted loss.

        Args:
            X: numpy array of shape (n_samples, num_of_features).
            y: numpy array of targets, one per sample.
            epochs: number of update steps.
            loss_weight: three coefficients for (MSE, weight-distance
                regulariser, weight-sum regulariser). When None, a
                module-level ``loss_weight`` is used if one is defined (the
                previous behaviour), otherwise ``(1, 0, 0)``.
        """
        if loss_weight is None:
            loss_weight = globals().get("loss_weight", (1, 0, 0))
        print(loss_weight)
        X = torch.from_numpy(X).float()
        y = torch.from_numpy(y).float()

        # report about 20 times per run; never 0, which would make the modulo fail
        log_every = max(1, epochs // 20)

        for i in range(epochs):
            predictions = self._model(X)
            loss1 = self._mse(predictions, y)
            loss2 = self._regularization_weightdist()
            loss3 = self._regularization_weightsum()
            loss = loss1 * loss_weight[0] + loss2 * loss_weight[1] + loss3 * loss_weight[2]

            if i % log_every == 0:
                print(f'Epoch: {i} - Loss: {loss1}')

            loss.backward()
            with torch.no_grad():
                # gradient step plus L1 / L2 shrinkage
                self.w -= (self.w.grad) * self.learning_rate + torch.sign(self.w)*self.L1 + self.w*self.L2
                self.w.grad.zero_()
                # project the weights back into the allowed interval
                self.w.data.clamp_(min=self.low, max=self.high)

    def predict(self, X):
        """Return the predictions for numpy input ``X`` as a torch tensor."""
        X = torch.from_numpy(X).float()
        return self._model(X)

    def score(self, X, y):
        """Return the R^2 score of the predictions for ``X`` against ``y``."""
        X = torch.from_numpy(X).float()
        y_pred = self._model(X).detach().numpy()
        return r2_score(y, y_pred)

In [49]:
# Plain arrays / lookups used by the per-document experiments below.
word_list = dataset.vocab.itos                   # word id -> word
doc_answers = dataset.words_tokenized            # word ids per document
doc_embs = np.array(dataset.document_vectors)    # one row per document
word_embs = np.array(dataset.vocab.word_vectors) # one row per word id

for summary in (word_embs.shape, doc_embs.shape, len(doc_answers)):
    print(summary)


(7533, 100)
(10000, 100)
10000


In [50]:
# Per-document sparse regression: express each document vector as a
# constrained combination of word vectors and rank words by their weight.
# NOTE(review): `re` below rebinds the name of the imported `re` module for
# the rest of the session; the result-printing cell relies on this name.
pr, re = [[],[],[]], [[],[],[]]
ndcgs = defaultdict(list)

lr = 0.001
epochs = 1000
constraintHigh=1
constraintLow=0
# constraintHigh=float('inf')
# constraintLow=-float('inf')
loss_weight = [1, 0, 0]
L1, L2 = 1e-5, 0
rand_type = 0

total_mul = 1

for uid, uemb in enumerate(tqdm(doc_embs[:10])):
    # features are the word vectors (one column per word), target is the document vector
    x = word_embs.T
    y = uemb
    total = len(doc_answers[uid])

    torch_model = PyTorchLinearRegression(x.shape[1], lr, constraintHigh, constraintLow, int(total*total_mul), rand_type, L1, L2)
    torch_model.fit(x, y, epochs)

    # the learned weight of each word is its relevance score
    weights = torch_model.w.detach().numpy()
    answers = doc_answers[uid]

    # precision / recall at 50, 100 and 200
    for slot, cutoff in enumerate((50, 100, 200)):
        result = metric4(weights, answers, w_idx=None, topk=cutoff, verbose=0)
        pr[slot].append(result["precision"])
        re[slot].append(result["recall"])

    # NDCG at 50, 100, 200 and over the full ranking (stored under "-1")
    for key, cutoff in (("50", 50), ("100", 100), ("200", 200), ("-1", None)):
        ndcgs[key].append(metric_ndcg(weights, answers, topk=cutoff, verbose=0))

  0%|          | 0/10 [00:00<?, ?it/s]

[1, 0, 0]
Epoch: 0 - Loss: 0.4937790334224701
Epoch: 50 - Loss: 0.008251945488154888
Epoch: 100 - Loss: 0.004072097595781088
Epoch: 150 - Loss: 0.002774542896077037
Epoch: 200 - Loss: 0.002185232238844037
Epoch: 250 - Loss: 0.0018591865664348006
Epoch: 300 - Loss: 0.0016590289305895567
Epoch: 350 - Loss: 0.001522210892289877
Epoch: 400 - Loss: 0.0014211998786777258
Epoch: 450 - Loss: 0.0013432650594040751
Epoch: 500 - Loss: 0.0012790452456101775
Epoch: 550 - Loss: 0.0012316247448325157
Epoch: 600 - Loss: 0.0011922030244022608
Epoch: 650 - Loss: 0.0011571780778467655
Epoch: 700 - Loss: 0.0011258076410740614
Epoch: 750 - Loss: 0.0010991323506459594
Epoch: 800 - Loss: 0.0010731748770922422
Epoch: 850 - Loss: 0.0010474775917828083
Epoch: 900 - Loss: 0.0010238197864964604
Epoch: 950 - Loss: 0.0010019266046583652
[1, 0, 0]
Epoch: 0 - Loss: 0.3999672830104828
Epoch: 50 - Loss: 0.0017954212380573153
Epoch: 100 - Loss: 0.0011469894088804722
Epoch: 150 - Loss: 0.0009346501319669187
Epoch: 200 - 

In [51]:
# Mean precision / recall for the three cut-offs, in the order they were collected
for slot in range(3):
    print(f"Precision:{np.mean(pr[slot]):.4f} Recall:{np.mean(re[slot]):.4f}")
# Mean NDCG per cut-off; the full ranking is stored under the key '-1'
for label, key in (("50", "50"), ("100", "100"), ("200", "200"), ("all", "-1")):
    print(f"NDCG {label}:{np.mean(ndcgs[key]):.4f}")


Precision:0.4260 Recall:0.1797
Precision:0.3290 Recall:0.2738
Precision:0.2495 Recall:0.4015
NDCG 50:0.2574
NDCG 100:0.2772
NDCG 200:0.3099
NDCG all:0.5547


## Top K freq word

In [40]:
# (word, corpus frequency) pairs, most frequent first
word_freq = sorted(
    dataset.vocab.word_freq_in_corpus.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
word_freq[:10]

[('the', 137959),
 ('and', 71459),
 ('a', 66359),
 ('of', 61514),
 ('to', 52823),
 ('is', 45511),
 ('in', 39808),
 ('it', 31652),
 ('i', 29011),
 ('this', 27891)]

In [41]:
def topk_word_evaluation(k=50):
    """Baseline: predict the ``k`` most frequent corpus words for every test document.

    Prints the mean precision (hits / k) and mean recall
    (hits / number of distinct answer words) over the global ``test_ans``.
    """
    # a set gives O(1) membership tests inside the per-document loop
    topk_word = {word for word, _ in word_freq[:k]}

    # local names deliberately avoid shadowing the `re` module
    precisions, recalls = [], []
    for ans in tqdm(test_ans):
        ans_words = [dataset.vocab.itos[a] for a in set(ans)]
        hit_count = sum(word in topk_word for word in ans_words)

        precisions.append(hit_count / k)
        recalls.append(hit_count / len(ans_words))

    print('top {} word'.format(k))
    print('percision', np.mean(precisions))
    print('recall', np.mean(recalls))

topk_word_evaluation(k=50)
topk_word_evaluation(k=100)
topk_word_evaluation(k=200)


  0%|          | 0/2000 [00:00<?, ?it/s]

top 50 word
percision 0.57312
recall 0.25995568997152513


  0%|          | 0/2000 [00:00<?, ?it/s]

top 100 word
percision 0.42140999999999995
recall 0.3705359126710024


  0%|          | 0/2000 [00:00<?, ?it/s]

top 200 word
percision 0.2830875
recall 0.48635047389378483


In [45]:
# ## test dcg
# from sklearn.metrics import ndcg_score, dcg_score
# k=2

# true_relevance = np.asarray([[1, 2, 3, 4]])
# scores = np.asarray([[1, 2, 3, 2.5]])
# print('dcg',dcg_score(true_relevance, scores,k=k))
# print('ndcg',ndcg_score(true_relevance, scores,k=k))


# w = 1 / (np.log(np.arange(true_relevance.shape[1])[:k] + 2) / np.log(2))
# dcg = true_relevance[0][np.argsort(scores)[0][::-1][:k]].dot(w)
# print(dcg)

# idcg = np.sort(true_relevance[0])[::-1][:k].dot(w)
# print(dcg/idcg)

In [46]:
def topk_word_evaluation_NDCG(k=50):
    """Baseline NDCG@k when words are ranked by corpus frequency.

    Every word id is scored by its frequency rank (most frequent highest),
    and the ground-truth relevance of an answer word is its IDF summed over
    its occurrences in the document. Prints the mean NDCG over ``test_ans``.
    """
    vocab = dataset.vocab
    vocab_size = len(vocab.word_vectors)

    # word ids ordered from most to least frequent
    ranked_ids = [vocab.stoi[word] for word, _ in word_freq if word in vocab.stoi]

    # same score vector for every document: a higher value means more frequent
    scores = np.zeros(vocab_size)
    for rank, idx in enumerate(ranked_ids):
        scores[idx] = vocab_size - rank

    NDCGs = []
    for ans in tqdm(test_ans):
        relevance = np.zeros(vocab_size)
        for word_idx in ans:
            # id 0 is the <UNK> token and carries no relevance
            if word_idx != 0:
                relevance[word_idx] += vocab.IDF[vocab.itos[word_idx]]

        NDCGs.append(ndcg_score(relevance.reshape(1,-1), scores.reshape(1,-1), k=k))

    print('top {} NDCG:{}'.format(k, np.mean(NDCGs)))

for cutoff in (50, 100, 200, None):
    topk_word_evaluation_NDCG(k=cutoff)


  0%|          | 0/2000 [00:00<?, ?it/s]

top 50 NDCG:0.08714991654386776


  0%|          | 0/2000 [00:00<?, ?it/s]

top 100 NDCG:0.11619786532653996


  0%|          | 0/2000 [00:00<?, ?it/s]

top 200 NDCG:0.15694766452897216


  0%|          | 0/2000 [00:00<?, ?it/s]

top None NDCG:0.43048368192931924


## Nearest Guessing, so bad

In [52]:
# Copy of the word-embedding matrix with every row scaled by that word's IDF
# (the <UNK> row has no IDF entry and is left unchanged).
word_embs_IDF = word_embs.copy()

for token, idf_weight in dataset.vocab.IDF.items():
    word_embs_IDF[dataset.vocab.stoi[token]] *= idf_weight

In [62]:
# Nearest-guessing baseline: score every word by the dot product between its
# IDF-scaled embedding and the document vector.
# NOTE(review): `re` below rebinds the name of the imported `re` module for
# the rest of the session; the result-printing cell relies on this name.
pr, re = [[],[],[]], [[],[],[]]
ndcgs = defaultdict(list)

for uid, uemb in enumerate(tqdm(doc_embs[:100])):
    y = uemb
    word_weight = np.dot(word_embs_IDF, y)
    answers = doc_answers[uid]

    # precision / recall at 50, 100 and 200
    for slot, cutoff in enumerate((50, 100, 200)):
        result = metric4(word_weight, answers, w_idx=None, topk=cutoff, verbose=0)
        pr[slot].append(result["precision"])
        re[slot].append(result["recall"])

    # NDCG at 50, 100, 200 and over the full ranking (stored under "-1")
    for key, cutoff in (("50", 50), ("100", 100), ("200", 200), ("-1", None)):
        ndcgs[key].append(metric_ndcg(word_weight, answers, topk=cutoff, verbose=0))

  0%|          | 0/100 [00:00<?, ?it/s]

In [63]:
# Mean precision / recall for the three cut-offs, in the order they were collected
for slot in range(3):
    print(f"Precision:{np.mean(pr[slot]):.4f} Recall:{np.mean(re[slot]):.4f}")
# Mean NDCG per cut-off; the full ranking is stored under the key '-1'
for label, key in (("50", "50"), ("100", "100"), ("200", "200"), ("all", "-1")):
    print(f"NDCG {label}:{np.mean(ndcgs[key]):.4f}")


Precision:0.0120 Recall:0.0054
Precision:0.0110 Recall:0.0089
Precision:0.0100 Recall:0.0164
NDCG 50:0.0159
NDCG 100:0.0193
NDCG 200:0.0262
NDCG all:0.3377
