In [1]:
import itertools
from collections import OrderedDict 
import re
import nltk
from nltk.corpus import brown, gutenberg
from nltk.probability import FreqDist
from nltk.corpus import stopwords

## Corpus

In [23]:
# Show which Gutenberg file sits at position 3 of the installed corpus listing;
# the preprocessing cell that follows reads the corpus at this same position.
gutenberg.fileids()[3]

'bible-kjv.txt'

In [2]:
# Build the training corpus from the King James Bible in NLTK's Gutenberg collection.
# The file is addressed by name instead of by its position in `gutenberg.fileids()`,
# so the cell does not depend on how the installed corpus happens to be ordered.
CORPUS_FILE_ID = 'bible-kjv.txt'
MIN_SENT_TOKENS = 5  # a sentence is kept only if MORE than this many tokens survive

samples = gutenberg.sents(CORPUS_FILE_ID)
pattern = re.compile("[A-Za-z]+")
stop_w = set(stopwords.words('english'))

corpus = []
for sent in samples:
    # One pass per sentence: lower-case each token, drop English stop words and keep
    # only purely alphabetic tokens. A token containing a newline (or the space it
    # used to be replaced with) can never fully match `pattern`, so no separate
    # newline clean-up step is needed.
    tokens = [
        w for w in (tok.lower() for tok in sent)
        if w not in stop_w and pattern.fullmatch(w)
    ]
    if len(tokens) > MIN_SENT_TOKENS:
        corpus.append(tokens)

In [3]:
# Count every token of the corpus in a single pass (sentences are chained in order,
# so words are first seen in the same order as when counting sentence by sentence),
# then keep only the words that occur more than 5 times as a plain word -> count dict.
token_counts = FreqDist(itertools.chain.from_iterable(corpus))
fre_dist = {word: count for word, count in token_counts.items() if count > 5}

In [4]:
# Give every retained word a dense integer id, in the iteration order of `fre_dist`,
# and keep the lookup table in both directions.
vocab_size = len(fre_dist)
word_to_idx = {word: idx for idx, word in enumerate(fre_dist)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

In [5]:
# Encode each sentence as vocabulary ids. Words that are not in the vocabulary are
# dropped, and a sentence is kept only if more than 5 ids remain afterwards.
corpus_indexed = []
for sent in corpus:
    ids = [word_to_idx[word] for word in sent if word in word_to_idx]
    if len(ids) > 5:
        corpus_indexed.append(ids)

# The word frequencies again, this time keyed by vocabulary id instead of by word.
fre_dist_indexed = {word_to_idx[word]: freq for word, freq in fre_dist.items()}

## CBOW with softmax
$$
P(\text{center} \mid \text{context};\theta) = P(w_O \mid w_I; \theta) = \cfrac{\exp(h^\top \text{v}'_{w_O})}{\sum_{w_i \in V}\exp(h^\top \text{v}'_{w_i})}
$$

In [6]:
import torch
import numpy as np
# NOTE(review): the alias `F` bound by the next line is rebound straight away by the
# `torch.nn.functional` import after it, so `F` refers to torch.nn.functional from
# then on. The first of the two imports looks like a leftover.
import torch.functional as F
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import tqdm

In [7]:
class CBOWDataset(torch.utils.data.Dataset):
    """(context, center) training pairs for CBOW, built from id-encoded sentences.

    For every position in a sentence that has `windows_size` words on BOTH sides,
    one example is produced: the `2 * windows_size` surrounding ids (left part
    first, then right part) as the context and the id at that position as the
    center. Positions too close to either end of the sentence are skipped, so
    every context has the same length.

    Args:
        corpus: iterable of sentences, each a sequence of integer word ids.
        windows_size: number of context words taken on each side of the center.
        sentence_length_threshold: sentences shorter than this are ignored.

    Attributes:
        contexts: list of context id lists, one per example.
        centers: list of center ids, aligned with `contexts`.
    """

    def __init__(self, corpus, windows_size=2, sentence_length_threshold=5):
        self.windows_size = windows_size
        self.sentence_length_threshold = sentence_length_threshold
        self.contexts, self.centers = self._generate_pairs(corpus, windows_size)

    def _generate_pairs(self, corpus, windows_size):
        """Return `(contexts, centers)` for all full-window positions in `corpus`."""
        contexts = []
        centers = []

        # A negative window can never yield a full context, so nothing is produced.
        if windows_size < 0:
            return contexts, centers

        for sent in corpus:
            if len(sent) < self.sentence_length_threshold:
                continue

            # Only positions with a complete window on both sides are used, which is
            # exactly the index range [windows_size, len(sent) - windows_size).
            for center_pos in range(windows_size, len(sent) - windows_size):
                left = sent[center_pos - windows_size:center_pos]
                right = sent[center_pos + 1:center_pos + windows_size + 1]
                contexts.append(list(left) + list(right))
                centers.append(sent[center_pos])
        return contexts, centers

    def __len__(self):
        """Number of (context, center) examples."""
        return len(self.centers)

    def __getitem__(self, index):
        """Return example `index` as `(context, center)` NumPy arrays.

        `context` has shape `(2 * windows_size,)` and `center` has shape `(1,)`.
        Both are explicitly int64: NumPy's default integer width is platform
        dependent, while the ids are consumed as embedding indices and loss
        targets, which are expected to be 64-bit integers.
        """
        context = np.asarray(self.contexts[index], dtype=np.int64)
        center = np.asarray([self.centers[index]], dtype=np.int64)
        return context, center

In [8]:
class CBOWSoftmax(nn.Module):
    """CBOW model scored with a full softmax over the vocabulary.

    `syn0` holds the input (context) embeddings; `syn1` maps the averaged context
    embedding to one score per vocabulary word.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.syn0 = nn.Embedding(vocab_size, embedding_dim)
        self.syn1 = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context, center):
        """Return the mean negative log-likelihood of `center` given `context`.

        Args:
            context: integer ids of the context words, one row per example
                (batch dimension first, context words along dim 1).
            center: integer ids of the words to predict; flattened to one id
                per example before the loss is computed.

        Returns:
            A scalar tensor: the loss averaged over the batch.
        """
        # Average the context embeddings along dim 1 -> [b_size, embedding_dim].
        hidden = self.syn0(context).mean(dim=1)
        # One unnormalised score per vocabulary word -> [b_size, vocab_size].
        logits = self.syn1(hidden)
        targets = center.view(-1)
        # cross_entropy = log-softmax over the scores followed by the NLL loss.
        return F.cross_entropy(logits, targets, reduction='mean')
        

In [9]:
# Hyper-parameters of the softmax CBOW model and its optimizer.
EMBEDDING_DIM = 50
LEARNING_RATE = 0.001
WEIGHT_DECAY = 1e-6
SEED = 0

# Seed PyTorch's random number generator before the model is built, so that the
# randomly initialised weights are the same on every fresh run of the notebook.
torch.manual_seed(SEED)
model = CBOWSoftmax(vocab_size, EMBEDDING_DIM)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

In [10]:
# Turn the id-encoded corpus into CBOW pairs (dataset defaults for window size and
# sentence-length threshold) and serve them in batches of 100 from the main process.
dataset = CBOWDataset(corpus=corpus_indexed)
data_loader = DataLoader(dataset=dataset, batch_size=100, num_workers=0)

In [11]:
# Train for 10 passes over `data_loader`. The progress bar's `loss` field shows the
# mean batch loss over the most recent `log_interval` batches; it is refreshed only
# when a full interval has been completed.
log_interval = 100
for epoch_i in range(10):
    model.train()
    tk0 = tqdm.tqdm(data_loader, smoothing=0, mininterval=1.0)
    total_loss = 0
    for i, (context, center) in enumerate(tk0):
        # Clear the gradients of the previous step, then forward / backward / update.
        model.zero_grad()
        loss = model(context, center)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if (i + 1) % log_interval != 0:
            continue
        # A full interval has elapsed: report its average and start a new one.
        tk0.set_postfix(loss=total_loss / log_interval)
        total_loss = 0

100%|██████████| 2404/2404 [01:13<00:00, 32.69it/s, loss=7.22]
100%|██████████| 2404/2404 [00:58<00:00, 40.83it/s, loss=6.85]
100%|██████████| 2404/2404 [01:26<00:00, 27.89it/s, loss=6.63]
100%|██████████| 2404/2404 [00:45<00:00, 52.58it/s, loss=6.46]
100%|██████████| 2404/2404 [00:49<00:00, 49.05it/s, loss=6.33]
100%|██████████| 2404/2404 [01:30<00:00, 26.70it/s, loss=6.23]
100%|██████████| 2404/2404 [01:19<00:00, 30.39it/s, loss=6.13]
100%|██████████| 2404/2404 [00:57<00:00, 41.80it/s, loss=6.05]
100%|██████████| 2404/2404 [01:06<00:00, 36.06it/s, loss=5.98]
100%|██████████| 2404/2404 [00:52<00:00, 45.42it/s, loss=5.91]


### fetch word embedding

In [15]:
# Take the input-embedding table out of the model. `.detach()` gives a tensor that
# is cut off from autograd (the supported replacement for the legacy `.data`
# attribute), and `.cpu()` makes the NumPy conversion work wherever the model lives.
syn0 = model.syn0.weight.detach()

w2v_embedding = syn0.cpu().numpy()
# Scale every row to unit L2 norm, so that the dot product of two rows is their
# cosine similarity. The division creates a new array; the model weights are untouched.
l2norm = np.linalg.norm(w2v_embedding, 2, axis=1, keepdims=True)
w2v_embedding = w2v_embedding / l2norm


In [17]:
# One row per vocabulary word, one column per embedding dimension.
w2v_embedding.shape

(4533, 50)

# Evaluation

In [26]:
class CosineSimilarity:
    """Nearest-neighbour lookup over a word-embedding matrix by cosine similarity.

    Args:
        word_embedding: 2-D array with one row per word. The rows are assumed to
            be L2-normalised already; only then is the dot product computed here
            equal to the cosine similarity.
        idx_to_word_dict: mapping from row index to word.
        word_to_idx_dict: mapping from word to row index.
    """

    def __init__(self, word_embedding, idx_to_word_dict, word_to_idx_dict):
        self.word_embedding = word_embedding  # normed already
        self.idx_to_word_dict = idx_to_word_dict
        self.word_to_idx_dict = word_to_idx_dict

    def get_synonym(self, word, topK=10):
        """Return the `topK` words most similar to `word`.

        Args:
            word: query word; must be a key of `word_to_idx_dict`.
            topK: maximum number of neighbours to return.

        Returns:
            A list of `(word, similarity)` tuples ordered from most to least
            similar. The query word is not excluded, so it is normally the
            first entry.

        Raises:
            KeyError: if `word` is not in the vocabulary.
        """
        idx = self.word_to_idx_dict[word]
        embed = self.word_embedding[idx]

        # Score against the matrix this instance was built with, not against a
        # notebook-level global that may be missing or hold different embeddings.
        cos_similarity = self.word_embedding @ embed

        # Negate so that argsort's ascending order puts the largest scores first.
        topK_index = np.argsort(-cos_similarity)[:topK]
        return [(self.idx_to_word_dict[i], cos_similarity[i]) for i in topK_index]
        
    
    

In [27]:
# Build the similarity helper on the row-normalised embeddings and list the
# nearest neighbours of 'christ' (the query word itself is part of the result).
cosineSim = CosineSimilarity(w2v_embedding, idx_to_word, word_to_idx)
cosineSim.get_synonym('christ')

[('christ', 0.9999999),
 ('appearing', 0.67836726),
 ('nazareth', 0.6352371),
 ('partakers', 0.6006965),
 ('apostle', 0.59606194),
 ('repented', 0.56955314),
 ('preaching', 0.5495355),
 ('saviour', 0.5484799),
 ('remission', 0.5454745),
 ('repentance', 0.53892404)]

In [22]:
# Nearest neighbours of 'jesus', using the `cosineSim` helper created in the cell before this one.
cosineSim.get_synonym('jesus')

[('jesus', 1.0),
 ('corinth', 0.61979365),
 ('preaching', 0.5921793),
 ('john', 0.5870693),
 ('paul', 0.5545162),
 ('apostle', 0.55235624),
 ('obedience', 0.54686654),
 ('considered', 0.54658234),
 ('lazarus', 0.54609567),
 ('gospel', 0.54567015)]

#### debug

In [20]:
# NOTE(review): `emebdding` is not assigned in any cell shown in this notebook, and
# this cell's execution count is out of sequence with its neighbours. The lookup
# presumably relied on leftover kernel state; confirm it still runs on a fresh
# kernel, or remove it.
emebdding[word_to_idx['woodhouse']]

tensor([-3.2198,  1.6870, -1.1216,  0.2148,  0.5650, -1.5975,  2.0637, -0.0704,
        -0.5054,  0.1672, -0.6360, -1.1686,  0.2833, -0.5476,  1.6006, -1.8558,
         2.0661,  1.4087, -0.9098, -0.1002,  0.1536,  0.6631,  0.4492,  1.7913,
        -2.0220,  1.4267,  2.7691, -2.6137,  0.7928, -1.2498,  0.2200,  0.5134,
         0.1859,  0.9686,  0.5371,  1.5631, -0.0983, -0.8209, -0.1594, -0.2577,
        -1.6348, -0.7915,  0.8425, -3.2552, -0.3174,  0.9666,  0.9690,  0.2145,
        -1.5016, -1.5281])

In [12]:
# `context` and `center` are presumably the loop variables left behind by the
# training cell, i.e. the last batch it processed. Inspect their shapes.
context.shape, center.shape

(torch.Size([49, 4]), torch.Size([49, 1]))

In [13]:
# First row of `context`: the vocabulary ids of one example's context words.
context[0]

tensor([  43,   63,  632, 2073])

In [14]:
# First row of `center`: the vocabulary id of the word that example should predict.
center[0]

tensor([65])