## Siamese network 
Steps:
1. load word embeding and document embedding
2. create pytorch dataset and dataloader
3. Try Contrastive loss and triplet loss
4. further improve negative sampling (e.g. hard negative or word2vec negative sampling)

#### raw data
* word embedding: glove
* doc text: ./data/IMDB.txt

### preprocess
1. truncate smallest k word in IDF
2. stemming

### model
1. k highest freq words
2. CBOW
3. Triplet
4.

### evaluation
1. F1
2. F1 weighted by TF-IDF

In [1]:
from collections import defaultdict
import math
import numpy as np 
import re
import torch
import torch.nn as nn
from itertools import cycle
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from tqdm.auto import tqdm

In [2]:
n_document = 10000
min_word_freq_threshold = 20
topk_word_freq_threshold = 100
document_vector_agg = 'mean'

In [3]:
# load word embedding
embedding_file = "../data/glove.6B.100d.txt"

word2embedding = dict()
word_dim = int(re.findall(r".(\d+)d",embedding_file)[0])

with open(embedding_file,"r") as f:
    for line in tqdm(f):
        line = line.strip().split()
        word = line[0]
        embedding = list(map(float,line[1:]))
        word2embedding[word] = embedding

print("Number of words:%d" % len(word2embedding))

0it [00:00, ?it/s]

Number of words:400000


In [4]:
class Vocabulary:
    def __init__(self, word2embedding, min_word_freq_threshold=0, topk_word_freq_threshold=0):
        # The low frequency words will be assigned as <UNK> token
        self.itos = {0: "<UNK>"}
        self.stoi = {"<UNK>": 0}
        self.word2embedding = word2embedding
        self.min_word_freq_threshold = min_word_freq_threshold
        self.topk_word_freq_threshold = topk_word_freq_threshold
        
    def __len__(self):
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
        return text.strip().split()

    def build_vocabulary(self, sentence_list):
        self.word_freq_in_corpus = defaultdict(int)
        self.doc_freq = defaultdict(int) # # of document a word appear
        self.document_num = len(sentence_list)
        
        self.word_vectors = [[0]*word_dim] # init zero padding
        self.mask_word = set()
        idx = 1
        
        for sentence in tqdm(sentence_list, desc="Preprocessing documents"):
            # for doc_freq
            document_words = set()
            
            for word in self.tokenizer_eng(sentence):
                # pass unknown word
                if word not in self.word2embedding:
                    continue
                    
                # calculate word freq
                self.word_freq_in_corpus[word] += 1
                
                # validate word if it is more than min_word_freq_threshold times
                if self.word_freq_in_corpus[word] == self.min_word_freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    self.word_vectors.append(self.word2embedding[word])
                    idx += 1
                
                document_words.add(word)
                
            for word in document_words:
                self.doc_freq[word] += 1
        
        # calculate IDF
        self.IDF = {}
        print('doc num', self.document_num)
        for word, freq in self.doc_freq.items():
            self.IDF[word] = math.log(self.document_num / (freq+1))
        
        # eliminate smallest K IDF words
        IDF = [(word, freq) for word, freq in self.IDF.items()]
        IDF.sort(key=lambda x: x[1])
        
        print('eliminate words')
        for i in range(self.topk_word_freq_threshold):
            if IDF[i][0] not in self.stoi:
                continue
            print(IDF[i][0])
            idx = self.stoi[IDF[i][0]]
            del self.stoi[IDF[i][0]]
            del self.itos[idx]
            del self.word_freq_in_corpus[IDF[i][0]]
                
    def calculate_document_vector(self, sentence_list, agg):
        document_vectors = []
        
        for sentence in tqdm(sentence_list, desc="calculate document vectors"):
            document_vector = np.zeros(len(self.word_vectors[0]))
            document_word_count = 0
            
            for word in self.tokenizer_eng(sentence):
                # pass unknown word
                if word not in self.stoi:
                    continue
                else:
                    document_word_count += 1
                    
                    if agg == 'mean':
                        document_vector += self.word2embedding[word]
                    elif agg == 'TF-IDF':
                        document_vector += self.word2embedding[word] * self.IDF[word]
                
            if document_word_count == 0:
                print('error', sentence)
                return -1
            else:
                document_vector /= document_word_count
            
            document_vectors.append(document_vector)
            
        return document_vectors
        
    def numericalize(self, text):
        tokenized_text = self.tokenizer_eng(text)

        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokenized_text
        ]

In [5]:
class CBowDataset(Dataset):
    def __init__(self, 
                 raw_data_file_path,
                 word2embedding,
                 skip_header = False,
                 n_document = None,
                 min_word_freq_threshold = 2,
                 topk_word_freq_threshold = 5,
                 document_vector_agg = 'mean',
                 ):

        assert document_vector_agg in ['mean', 'TF-IDF']
        
        # raw documents
        self.documents = []
        # document vectors
        self.document_vectors = []
        
        with open(raw_data_file_path,'r',encoding='utf-8') as f:
            if skip_header:
                f.readline()
            for line in tqdm(f, desc="Loading documents"):
                # read firt n document
                if n_document is not None and len(self.documents) >= n_document:
                    break    
                self.documents.append(line.strip("\n"))

        # build vocabulary
        self.vocab = Vocabulary(word2embedding, min_word_freq_threshold, topk_word_freq_threshold)
        self.vocab.build_vocabulary(self.documents)
        self.vocab_size = len(self.vocab)
        self.words_tokenized = [self.vocab.numericalize(text) for text in self.documents]
        
        # calculate document vectors
        self.document_vectors = self.vocab.calculate_document_vector(self.documents, document_vector_agg)
        # train-test split
        # training
        self.train_length = int(len(self.words_tokenized)*0.8)
        self.train_vectors = self.document_vectors[:self.train_length]
        self.train_words = self.words_tokenized[:self.train_length]
        self.document_ids = list(range(self.train_length))
        self.generator = cycle(self.context_target_generator())
        self.dataset_size = sum([len(s) for s in self.train_words])
        
        # testing
        self.test_vectors = self.document_vectors[self.train_length:]
        self.test_words = self.words_tokenized[self.train_length:]

    def context_target_generator(self):
        np.random.shuffle(self.document_ids) # inplace shuffle

        # randomly select a document and create its training example
        for document_id in self.document_ids: 
            word_list = set(self.train_words[document_id])
            negative_sample_space = list(set(range(self.vocab_size)) - word_list)
            negative_samples = np.random.choice(negative_sample_space,size=len(word_list),replace = False)
            for word_id, negative_wordID in zip(word_list, negative_samples):
                yield [document_id, word_id, negative_wordID]
                
    def __getitem__(self, idx):
        doc_id, word_id, negative_wordID = next(self.generator)
        doc_id = torch.FloatTensor(self.document_vectors[doc_id])
        word_id = torch.FloatTensor(self.vocab.word_vectors[word_id])
        negative_word = torch.FloatTensor(self.vocab.word_vectors[negative_wordID])

        return doc_id, word_id, negative_word

    def __len__(self):
        return self.dataset_size 


In [6]:
# load and build torch dataset
data_file_path = '../data/IMDB.txt'

print("Building dataset....")
dataset = CBowDataset(
                    raw_data_file_path=data_file_path,
                    word2embedding=word2embedding,
                    skip_header=False,
                    n_document = n_document,
                    min_word_freq_threshold = min_word_freq_threshold,
                    topk_word_freq_threshold = topk_word_freq_threshold,
                    document_vector_agg = document_vector_agg,
                    )
print("Finish building dataset!")
print(f"Number of documents:{len(dataset.documents)}")
print(f"Number of words:{dataset.vocab_size}")

Building dataset....


Loading documents: 0it [00:00, ?it/s]

Preprocessing documents:   0%|          | 0/10000 [00:00<?, ?it/s]

doc num 10000
eliminate words
the
and
a
of
to
is
this
in
it
that
i
for
with
but
as
on
was
one
not
are
film
movie
be
have
all
its
an
you
at
by
from
who
his
has
so
like
he
about
out
if
very
more
good
they
when
just
what
some
or
there
time
great
story
see
my
well
up
can
also
which
only
would
their
really
most
me
her
other
will
had
even
much
than
first
into
were
no
get
best
been
way
people
how
made
do
love
after
because
we
many
films
too
seen
she
him
watch
them
think
movies
two


calculate document vectors:   0%|          | 0/10000 [00:00<?, ?it/s]

Finish building dataset!
Number of documents:10000
Number of words:7433


In [7]:
class TestDataset(Dataset):
    def __init__(self, 
                 doc_vectors,
                 ans_words,
                 ):
        self.doc_vectors = doc_vectors
        self.ans_words = ans_words
        assert len(doc_vectors) == len(ans_words)
        
    def __getitem__(self, idx):
        doc_vec = torch.FloatTensor(self.doc_vectors[idx])
        ans_w = torch.tensor(list(set(self.ans_words[idx])))
        return doc_vec, ans_w

    def collate_fn(self,batch):
        # Batch: List of tuples [(batch1), (batch2)]
        
        doc_vec = torch.cat([item[0].unsqueeze(0) for item in batch], dim=0)
        ans_w = [item[1] for item in batch]
        ans_w = pad_sequence(ans_w, batch_first=True, padding_value=-1)
        
        return doc_vec, ans_w 

    def __len__(self):
        return len(self.doc_vectors)


In [8]:
class TripletNet(nn.Module):
    def __init__(self, hdim):
        super(TripletNet, self).__init__()
        self.fc = nn.Sequential(nn.Linear(hdim, 256),
                        nn.PReLU(),
                        nn.Linear(256, 256),
                        nn.PReLU(),
                        nn.Linear(256, 2)
                        )


    def forward(self, x1, x2, x3):
        output1 = self.fc(x1)
        output2 = self.fc(x2)
        output3 = self.fc(x3)
        return output1, output2, output3

    def get_embedding(self, x):
        return self.fc(x)

In [9]:
class TripletLoss(nn.Module):
    """
    Triplet loss
    Takes embeddings of an anchor sample, a positive sample and a negative sample
    """

    def __init__(self, margin):
        super(TripletLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative, size_average=True):
        distance_positive = (anchor - positive).pow(2).sum(1)  # .pow(.5)
        distance_negative = (anchor - negative).pow(2).sum(1)  # .pow(.5)
        losses = F.relu(distance_positive - distance_negative + self.margin)
        return losses.mean() if size_average else losses.sum()

In [10]:
margin = 1.
BATCH_SIZE = 1024
EPOCH = 300

device = "cuda:0"
model = TripletNet(word_dim).to(device)
loss_fn = TripletLoss(margin).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)

In [11]:
train_loader = DataLoader(
                        dataset, 
                        batch_size=BATCH_SIZE,
                        num_workers=4,
                        shuffle=True,
                        )

In [13]:
test_docvec = dataset.test_vectors
test_ans = dataset.test_words
test_dataset = TestDataset(test_docvec,test_ans)
test_loader = DataLoader(test_dataset,                         
                         batch_size=BATCH_SIZE,
                         num_workers=4,
                         collate_fn=test_dataset.collate_fn)
word_embedding_tensor = torch.FloatTensor(dataset.vocab.word_vectors).to(device)

In [14]:
def evaluate(test_word_emb, loader,Ks = [50,100,150,200]):
    avg_precision, avg_recall = [], []
    for batch in test_loader:
        batch = [item.to(device) for item in batch]
        emb, ans = batch
        emb = model.get_embedding(emb)
        scores = torch.cdist(emb, test_word_emb)
        ans_length = torch.sum((~ans.eq(-1)).float(), dim=-1)
        mask = ~ans.eq(-1).unsqueeze(-1)
        
        # calculate precision and recall
        tmp_pr, tmp_re = [],[]
        for K in Ks:
            top_indices = torch.argsort(scores,dim=1)[:,:K]
            hit = top_indices.unsqueeze(-2) == ans.unsqueeze(-1)
            hit = torch.sum((hit * mask).flatten(1),dim=-1)
            precision = hit / K
            recall = hit / ans_length
            tmp_pr.append(precision)
            tmp_re.append(recall)
        tmp_pr = torch.stack(tmp_pr).T.detach().cpu().numpy().tolist()
        tmp_re = torch.stack(tmp_re).T.detach().cpu().numpy().tolist()
        avg_precision.extend(tmp_pr)
        avg_recall.extend(tmp_re)
        
    avg_precision = np.mean(avg_precision,axis=0)
    avg_recall = np.mean(avg_recall, axis=0)
    for idx, kval in enumerate(Ks):
        print(f"[K={kval}] Precision:{avg_precision[idx]:.4f} Recall:{avg_recall[idx]:.4f}")
    return avg_precision, avg_recall

In [15]:
for epoch in range(EPOCH):
    avg_loss = []
    model.train()
    for batch in tqdm(train_loader):
        batch = [item.to(device) for item in batch]
        doc_id,pos_w,neg_w = batch
        optimizer.zero_grad()
        loss = loss_fn(*model(doc_id,pos_w,neg_w))
        loss.backward()
        optimizer.step()
        avg_loss.append(loss.item())
    avg_loss = np.mean(avg_loss)
    print(f"Loss:{avg_loss:4f}")
    
    # evaluate
    model.eval()
    test_word_emb = model.get_embedding(word_embedding_tensor)
    res = evaluate(test_word_emb,test_loader)

  0%|          | 0/1826 [00:00<?, ?it/s]

Loss:0.444908
[K=50] Precision:0.1672 Recall:0.1088
[K=100] Precision:0.1461 Recall:0.1877
[K=150] Precision:0.1301 Recall:0.2500
[K=200] Precision:0.1165 Recall:0.2973


  0%|          | 0/1826 [00:00<?, ?it/s]

Loss:0.422311
[K=50] Precision:0.1594 Recall:0.1013
[K=100] Precision:0.1443 Recall:0.1840
[K=150] Precision:0.1298 Recall:0.2482
[K=200] Precision:0.1183 Recall:0.3010


  0%|          | 0/1826 [00:00<?, ?it/s]

Loss:0.417343
[K=50] Precision:0.1395 Recall:0.0914
[K=100] Precision:0.1350 Recall:0.1740
[K=150] Precision:0.1245 Recall:0.2398
[K=200] Precision:0.1140 Recall:0.2913


  0%|          | 0/1826 [00:00<?, ?it/s]

Loss:0.416710
[K=50] Precision:0.1574 Recall:0.1040
[K=100] Precision:0.1425 Recall:0.1858
[K=150] Precision:0.1283 Recall:0.2488
[K=200] Precision:0.1169 Recall:0.2997


  0%|          | 0/1826 [00:00<?, ?it/s]

Loss:0.414358
[K=50] Precision:0.1500 Recall:0.0980
[K=100] Precision:0.1332 Recall:0.1718
[K=150] Precision:0.1214 Recall:0.2336
[K=200] Precision:0.1114 Recall:0.2849


  0%|          | 0/1826 [00:00<?, ?it/s]

Loss:0.414695
[K=50] Precision:0.1723 Recall:0.1133
[K=100] Precision:0.1462 Recall:0.1887
[K=150] Precision:0.1298 Recall:0.2497
[K=200] Precision:0.1172 Recall:0.2994


  0%|          | 0/1826 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Top K freq word

In [16]:
word_freq = [(word, freq) for word, freq in dataset.vocab.word_freq_in_corpus.items()]
word_freq.sort(key=lambda x:x[1], reverse=True)
word_freq[:10]

[('life', 3108),
 ('characters', 2892),
 ('show', 2837),
 ('still', 2675),
 ('dont', 2645),
 ('then', 2621),
 ('character', 2618),
 ('never', 2589),
 ('being', 2572),
 ('little', 2565)]

In [17]:
def topk_word_evaluation(k=50):
    topk_word = [word for (word, freq) in word_freq[:k]]

    pr, re = [], []
    for ans in tqdm(test_ans):
        ans = set(ans)
        ans = [dataset.vocab.itos[a] for a in ans]

        hit = []
        for word in ans:
            if word in topk_word:
                hit.append(word)

        precision = len(hit) / k
        recall = len(hit) / len(ans)
        pr.append(precision)
        re.append(recall)

    print('top {} word'.format(k))
    print('percision', np.mean(pr))
    print('recall', np.mean(re))

topk_word_evaluation(k=50)
topk_word_evaluation(k=100)
topk_word_evaluation(k=200)


  0%|          | 0/2000 [00:00<?, ?it/s]

top 50 word
percision 0.16796000000000003
recall 0.10540392218034676


  0%|          | 0/2000 [00:00<?, ?it/s]

top 100 word
percision 0.14424
recall 0.1826121353120183


  0%|          | 0/2000 [00:00<?, ?it/s]

top 200 word
percision 0.11602250000000001
recall 0.2920237813334744
