## Siamese network 
Steps:
1. Load the word embeddings and the document embeddings.
2. Create the PyTorch `Dataset` and `DataLoader`.
3. Try contrastive loss and triplet loss.
4. Further improve negative sampling (e.g. hard negatives or word2vec-style negative sampling).

In [1]:
import numpy as np 
import re
import torch
import torch.nn as nn
from itertools import cycle
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from tqdm import tqdm

In [2]:
# load word embedding
# Maps every token of the pre-trained embedding file to its vector
# (a list of floats).
word2embedding = dict()
embedding_file = "../../word/glove.6B.100d.txt"
# The vector width is parsed from the "<N>d" part of the file name.
word_dim = int(re.findall(r".(\d+)d",embedding_file)[0])
# Decode explicitly as UTF-8: without an explicit encoding the platform's
# locale encoding is used, which can fail (or silently mangle tokens) on
# non-ASCII vocabulary entries.
with open(embedding_file, "r", encoding="utf-8") as f:
    for line in f:
        # Each row is "<token> <v1> ... <vN>", whitespace separated.
        fields = line.strip().split()
        word = fields[0]
        embedding = list(map(float, fields[1:]))
        word2embedding[word] = embedding

print("Number of words:%d" % len(word2embedding))

Number of words:400000


In [3]:
class Vocabulary:
    """Token <-> index mapping restricted to words that have an embedding.

    Index 0 is reserved for ``<UNK>``; its row in ``word_vectors`` is an
    all-zero vector.  Every other index ``i`` satisfies
    ``word_vectors[i] == word2embedding[itos[i]]``.

    Args:
        freq_threshold: a word enters the vocabulary at the moment its
            running count reaches exactly this value, so values below 1
            never admit any word.
        word2embedding: mapping from token to its embedding vector.  Tokens
            missing from this mapping are ignored when building the
            vocabulary.
    """

    def __init__(self, freq_threshold, word2embedding):
        # The low frequency words will be assigned as <UNK> token
        self.itos = {0: "<UNK>"}
        self.stoi = {"<UNK>": 0}
        self.freq_threshold = freq_threshold
        # Keep the table that was passed in so the instance does not depend
        # on module-level globals.
        self.word2embedding = word2embedding
        self.frequencies = {}
        self.word_vectors = [[0] * self._embedding_dim()]

    def __len__(self):
        return len(self.itos)

    def _embedding_dim(self):
        """Return the vector width of the embedding table (0 if it is empty)."""
        return len(next(iter(self.word2embedding.values()), ()))

    @staticmethod
    def tokenizer_eng(text):
        """Drop every character except ASCII letters, digits and spaces, then split.

        Case is preserved.
        """
        # NOTE(review): tokens are not lower-cased; if the embedding table is
        # uncased, capitalised words will never match it - confirm intent.
        text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
        return text.strip().split()

    def build_vocabulary(self, sentence_list):
        """Count embedded words in ``sentence_list`` and index the frequent ones.

        Rebuilds ``stoi``, ``itos``, ``frequencies`` and ``word_vectors`` from
        scratch so that repeated calls cannot leave stale entries behind.
        """
        embeddings = self.word2embedding
        self.itos = {0: "<UNK>"}
        self.stoi = {"<UNK>": 0}
        self.frequencies = {}
        self.word_vectors = [[0] * self._embedding_dim()]  # row 0: zero vector for <UNK>
        idx = 1

        for sentence in tqdm(sentence_list, desc="Preprocessing documents"):
            for word in self.tokenizer_eng(sentence):
                if word not in embeddings:
                    continue
                count = self.frequencies.get(word, 0) + 1
                self.frequencies[word] = count

                # Register the word exactly once: when it first hits the threshold.
                if count == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    self.word_vectors.append(embeddings[word])
                    idx += 1

    def numericalize(self, text):
        """Tokenize ``text`` and map each token to its index (0 for unknown tokens)."""
        unk_index = self.stoi["<UNK>"]
        return [
            self.stoi.get(token, unk_index)
            for token in self.tokenizer_eng(text)
        ]

In [4]:
class CBowDataset(Dataset):
    """Training set of (document vector, positive word vector, negative word vector).

    Documents are read line by line from ``data_file_path``; the n-th line of
    ``doc_vector_path`` holds the whitespace-separated document vector of the
    n-th document read.  The first 80% of the documents form the training
    split, the remainder is exposed through ``test_vectors`` / ``test_words``.

    Args:
        data_file_path: text file with one document per line.
        word2embedding: mapping from token to embedding vector, forwarded to
            ``Vocabulary``.
        freq_threshold: minimum count for a word to enter the vocabulary.
        skip_header: when true, the first line of ``data_file_path`` is
            skipped (the vector file is not affected).
        max_length: optional cap on the number of documents to load.
        doc_vector_path: file with one document vector per line.

    Raises:
        ValueError: if ``doc_vector_path`` has fewer lines than the number of
            documents loaded.
    """

    def __init__(self, 
                 data_file_path,
                 word2embedding,
                 freq_threshold=20,
                 skip_header = False,
                 max_length = None,
                 doc_vector_path="../data/docvector.txt",
                 ):
        # read data
        self.documents = []
        self.document_vectors = []
        # Both files are opened in one `with` so the vector file is closed
        # too, even if parsing fails half-way.
        with open(data_file_path, 'r', encoding='utf-8') as f, \
                open(doc_vector_path, "r", encoding="utf-8") as docEmb_file:
            if skip_header:
                f.readline()
            for line in tqdm(f, desc="Loading documents"):
                if max_length is not None and len(self.documents) >= max_length:
                    break
                vector_line = docEmb_file.readline()
                if not vector_line:
                    # readline() returns "" only at end of file.
                    raise ValueError(
                        f"{doc_vector_path} has no vector for document "
                        f"{len(self.documents)} of {data_file_path}"
                    )
                self.documents.append(line.strip("\n"))
                self.document_vectors.append(
                    list(map(float, vector_line.strip().split()))
                )
        
        # build vocabulary
        self.vocab = Vocabulary(freq_threshold,word2embedding)
        self.vocab.build_vocabulary(self.documents)
        self.vocab_size = len(self.vocab)
        self.words_tokenized = [self.vocab.numericalize(text) for text in self.documents]
        
        # train-test split
        # training
        self.train_length = int(len(self.words_tokenized)*0.8)
        self.train_vectors = self.document_vectors[:self.train_length]
        self.train_words = self.words_tokenized[:self.train_length]
        self.document_ids = list(range(self.train_length))
        # itertools.cycle stores the items of the first pass and replays
        # them, so document order and negatives are fixed after that pass.
        self.generator = cycle(self.context_target_generator())
        # Total number of tokens (duplicates included) in the training split.
        # One generator pass yields one triple per *distinct* word id per
        # document, so this is an upper bound on the length of a pass.
        self.dataset_size = sum(len(s) for s in self.train_words)
        
        # testing
        self.test_vectors = self.document_vectors[self.train_length:]
        self.test_words = self.words_tokenized[self.train_length:]

    def context_target_generator(self):
        """Yield ``[document_id, word_id, negative_wordID]`` for one pass over the training split.

        For every training document (visited in shuffled order) each distinct
        word id of the document is paired with one word id that does not
        occur in the document, drawn without replacement.
        """
        np.random.shuffle(self.document_ids) # inplace shuffle
        all_word_ids = set(range(self.vocab_size))

        # randomly select a document and create its training example
        for document_id in self.document_ids: 
            word_list = set(self.train_words[document_id])
            negative_sample_space = list(all_word_ids - word_list)
            negative_samples = np.random.choice(negative_sample_space,size=len(word_list),replace = False)
            for word_id, negative_wordID in zip(word_list, negative_samples):
                yield [document_id, word_id, negative_wordID]
                
    def __getitem__(self, idx):
        """Return the next triple from the generator as three float tensors.

        ``idx`` is ignored: samples come from the internal generator, so the
        order is decided by the generator and not by the sampler.
        """
        doc_id, word_id, negative_wordID = next(self.generator)
        # Training documents are the first `train_length` entries, so the
        # ids index `document_vectors` directly.
        doc_vec = torch.FloatTensor(self.document_vectors[doc_id])
        word_vec = torch.FloatTensor(self.vocab.word_vectors[word_id])
        negative_word = torch.FloatTensor(self.vocab.word_vectors[negative_wordID])

        return doc_vec, word_vec, negative_word

    def __len__(self):
        return self.dataset_size 


In [5]:
# load and build torch dataset
# Source corpus: one document per line.
data_file_path = '../data/IMDB.txt'
# checkpoint_path = "doc2vecC_lr0.001.pt"
print("Building dataset....")
# No header line to skip and no cap on the number of documents; a word needs
# 20 occurrences to enter the vocabulary.
dataset = CBowDataset(
    data_file_path,
    word2embedding,
    freq_threshold=20,
    skip_header=False,
    max_length=None,
)
print("Finish building dataset!")
print(f"Number of documents:{len(dataset.documents)}")
print(f"Number of words:{dataset.vocab_size}")

Loading documents: 0it [00:00, ?it/s]

Building dataset....


Loading documents: 100000it [00:03, 29048.47it/s]
Preprocessing documents: 100%|███████| 100000/100000 [00:09<00:00, 10147.13it/s]


Finish building dataset!
Number of documents:100000
Number of words:27961


In [6]:
class TestDataset(Dataset):
    """Evaluation set pairing each document vector with its answer word ids.

    Args:
        doc_vectors: sequence of per-document vectors (lists of floats).
        ans_words: sequence, parallel to ``doc_vectors``, holding the word
            ids of each document; duplicates are allowed and are removed on
            access.
    """

    def __init__(self, 
                 doc_vectors,
                 ans_words,
                 ):
        assert len(doc_vectors) == len(ans_words)
        self.doc_vectors = doc_vectors
        self.ans_words = ans_words
        
    def __getitem__(self, idx):
        """Return ``(doc_vec, ans_w)``: a float vector and the sorted distinct word ids."""
        doc_vec = torch.FloatTensor(self.doc_vectors[idx])
        # Force an integer dtype: torch.tensor([]) would otherwise be float32
        # for a document without any word id, which breaks the comparison
        # against integer ranks and mixes dtypes inside a batch.
        ans_w = torch.tensor(sorted(set(self.ans_words[idx])), dtype=torch.long)
        return doc_vec, ans_w

    def collate_fn(self,batch):
        """Stack document vectors and right-pad the id lists with -1.

        Args:
            batch: list of ``(doc_vec, ans_w)`` tuples from ``__getitem__``.

        Returns:
            ``(doc_vec, ans_w)`` where ``doc_vec`` has one row per item and
            ``ans_w`` is padded to the longest id list of the batch; -1 marks
            padding.
        """
        doc_vec = torch.stack([item[0] for item in batch], dim=0)
        ans_w = pad_sequence([item[1] for item in batch],
                             batch_first=True, padding_value=-1)
        
        return doc_vec, ans_w 

    def __len__(self):
        return len(self.doc_vectors)


In [7]:
class TripletNet(nn.Module):
    """Three-layer MLP applied with shared weights to each member of a triplet.

    Args:
        hdim: width of the input vectors.  The network maps them through two
            256-unit PReLU layers to a 2-dimensional embedding.
    """

    def __init__(self, hdim):
        super().__init__()
        hidden_width, embed_width = 256, 2
        layers = [
            nn.Linear(hdim, hidden_width),
            nn.PReLU(),
            nn.Linear(hidden_width, hidden_width),
            nn.PReLU(),
            nn.Linear(hidden_width, embed_width),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x1, x2, x3):
        """Embed the three inputs with the same network and return them as a tuple."""
        first, second, third = (self.fc(x) for x in (x1, x2, x3))
        return first, second, third

    def get_embedding(self, x):
        """Embed a single batch of vectors."""
        return self.fc(x)

In [8]:
class TripletLoss(nn.Module):
    """
    Margin-based triplet loss on squared Euclidean distances.

    For each row computes ``relu(d(anchor, positive) - d(anchor, negative) + margin)``
    where ``d`` is the squared distance summed over dimension 1.
    """

    def __init__(self, margin):
        super().__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative, size_average=True):
        """Return the mean (or the sum when ``size_average`` is false) of the per-row losses."""
        gap_to_positive = ((anchor - positive) ** 2).sum(dim=1)
        gap_to_negative = ((anchor - negative) ** 2).sum(dim=1)
        hinge = F.relu(gap_to_positive - gap_to_negative + self.margin)
        if size_average:
            return hinge.mean()
        return hinge.sum()

In [9]:
# Hyper-parameters of the triplet training run.
margin = 1.
BATCH_SIZE = 1024
EPOCH = 300

# Use the first GPU when one is available; otherwise fall back to the CPU so
# the notebook still runs on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = TripletNet(word_dim).to(device)
loss_fn = TripletLoss(margin).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)

In [10]:
# CBowDataset.__getitem__ ignores the index it is given and pulls the next
# triple from a generator stored on the dataset.  Worker processes each hold
# their own copy of the dataset, so with num_workers > 0 every worker replays
# its own copy of that stream instead of sharing one pass over the data.
# Loading in the main process keeps a single stream.  Shuffling indices has
# no effect on an index-ignoring dataset (the generator already shuffles the
# documents), so it is turned off.
train_loader = DataLoader(
                        dataset, 
                        batch_size=BATCH_SIZE,
                        num_workers=0,
                        shuffle=False,
                        )

In [11]:
# Held-out split: document vectors and the word ids each document contains.
test_docvec, test_ans = dataset.test_vectors, dataset.test_words
test_dataset = TestDataset(test_docvec, test_ans)
test_loader = DataLoader(
    test_dataset,
    collate_fn=test_dataset.collate_fn,
    batch_size=BATCH_SIZE,
    num_workers=4,
)
# All vocabulary word vectors as one tensor on the compute device; row i
# belongs to word id i.
word_embedding_tensor = torch.FloatTensor(
    dataset.vocab.word_vectors
).to(device)

In [12]:
def evaluate(test_word_emb, loader, Ks=(50, 100, 150, 200)):
    """Print and return precision@K / recall@K of word retrieval per document.

    Each document vector is embedded with the module-level ``model`` and the
    candidate words are ranked by ascending Euclidean distance to it.

    Args:
        test_word_emb: embedded vocabulary, one row per word id.
        loader: iterable of ``(doc_vectors, answer_ids)`` batches where
            ``answer_ids`` is padded with -1.
        Ks: cut-offs at which precision and recall are computed.

    Returns:
        ``(avg_precision, avg_recall)``: two arrays with one entry per K,
        averaged over all documents.
    """
    avg_precision, avg_recall = [], []
    # Evaluation needs no gradients; skipping the autograd graph saves memory.
    with torch.no_grad():
        for batch in loader:
            emb, ans = [item.to(device) for item in batch]
            emb = model.get_embedding(emb)
            scores = torch.cdist(emb, test_word_emb)
            valid = ~ans.eq(-1)  # False on padding positions
            # Clamp so a document without answers gets recall 0 instead of NaN.
            ans_length = valid.float().sum(dim=-1).clamp(min=1)
            mask = valid.unsqueeze(-1)
            # Rank once per batch; every K only needs a prefix of the ranking.
            ranked = torch.argsort(scores, dim=1)

            # calculate precision and recall
            tmp_pr, tmp_re = [], []
            for K in Ks:
                top_indices = ranked[:, :K]
                # (batch, answers, K): True where a top-K word is an answer.
                hit = top_indices.unsqueeze(-2) == ans.unsqueeze(-1)
                hit = torch.sum((hit * mask).flatten(1), dim=-1)
                tmp_pr.append(hit / K)
                tmp_re.append(hit / ans_length)
            # Transpose to one row per document, one column per K.
            avg_precision.extend(torch.stack(tmp_pr).T.cpu().numpy().tolist())
            avg_recall.extend(torch.stack(tmp_re).T.cpu().numpy().tolist())

    avg_precision = np.mean(avg_precision,axis=0)
    avg_recall = np.mean(avg_recall, axis=0)
    for idx, kval in enumerate(Ks):
        print(f"[K={kval}] Precision:{avg_precision[idx]:.4f} Recall:{avg_recall[idx]:.4f}")
    return avg_precision, avg_recall

In [None]:
# One epoch = one full pass over train_loader followed by an evaluation on
# the held-out documents.
for epoch in range(EPOCH):
    avg_loss = []
    model.train()
    for batch in tqdm(train_loader):
        batch = [item.to(device) for item in batch]
        doc_id,pos_w,neg_w = batch
        optimizer.zero_grad()
        loss = loss_fn(*model(doc_id,pos_w,neg_w))
        loss.backward()
        optimizer.step()
        avg_loss.append(loss.item())
    avg_loss = np.mean(avg_loss)
    print(f"Loss:{avg_loss:4f}")
    
    # evaluate
    model.eval()
    # The vocabulary embeddings are only used for ranking, so compute them
    # without building an autograd graph.
    with torch.no_grad():
        test_word_emb = model.get_embedding(word_embedding_tensor)
    res = evaluate(test_word_emb,test_loader)

100%|█████████████████████████████████████| 18094/18094 [04:21<00:00, 69.24it/s]

Loss:0.204756



  0%|                                                 | 0/18094 [00:00<?, ?it/s]

[K=50] Precision:0.1689 Recall:0.0716
[K=100] Precision:0.1643 Recall:0.1372
[K=150] Precision:0.1544 Recall:0.1915
[K=200] Precision:0.1460 Recall:0.2401


100%|█████████████████████████████████████| 18094/18094 [04:23<00:00, 68.65it/s]

Loss:0.200234



  0%|                                                 | 0/18094 [00:00<?, ?it/s]

[K=50] Precision:0.5202 Recall:0.2220
[K=100] Precision:0.4141 Recall:0.3451
[K=150] Precision:0.3317 Recall:0.4088
[K=200] Precision:0.2816 Recall:0.4579


 12%|████▋                                 | 2242/18094 [00:35<03:14, 81.51it/s]