In [1]:
import torch
from torch.optim import SGD
from torch.autograd import Variable, profiler
import numpy as np
import torch.functional as F
import torch.nn.functional as F

In [2]:
# Toy training corpus: one lower-case, whitespace-tokenized sentence per entry.
corpus = [
    # royalty / gender sentences
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    # country / capital sentences
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',
]

In [3]:
# Build the vocabulary: every distinct token of `corpus`, in order of first
# appearance, so that word ids are deterministic for a given corpus.
# dict.fromkeys de-duplicates with a hash lookup per token (the former
# `word not in words` list scan was linear in the vocabulary for every token)
# and keeps insertion order (guaranteed by the language since Python 3.7).
words = list(dict.fromkeys(
    token for sentence_text in corpus for token in sentence_text.split()
))

# Lookup tables in both directions between a token and its integer id.
word2idx = {w: idx for (idx, w) in enumerate(words)}
idx2word = dict(enumerate(words))

vocabulary_size = len(word2idx)

In [4]:
def get_word_embedding(word):
    """Return the one-hot encoding of ``word`` over the vocabulary.

    Despite the name, this is not a learned embedding: the result is a
    float64 NumPy vector of length ``vocabulary_size`` that is zero
    everywhere except for a 1 at position ``word2idx[word]``.

    Raises:
        KeyError: if ``word`` is not a key of ``word2idx``.
    """
    one_hot = np.zeros(vocabulary_size, dtype=np.float64)
    hot_position = word2idx[word]
    one_hot[hot_position] = 1.0
    return one_hot

In [5]:
# Skip-gram hyper-parameters.
window_size = 2        # context positions considered on each side of the center word
embedding_dims = 10    # length of every learned word vector

In [6]:
def train_generator():
    """Yield skip-gram ``(center, context)`` training pairs from ``corpus``.

    Every token position of every sentence acts once as the center word.
    Each other position of the same sentence that lies at most
    ``window_size`` tokens away contributes one pair, in left-to-right order.

    Yields:
        tuple: ``(center_vec_one_hot, context_id)`` where the first item is a
        freshly allocated float64 NumPy vector of length ``vocabulary_size``
        with a 1 at the center word's id, and the second item is the
        vocabulary id of the context word.
    """
    for sentence in corpus:
        token_ids = [word2idx[token] for token in sentence.split()]
        sentence_len = len(token_ids)
        for center_pos, center_id in enumerate(token_ids):
            # Clamp the window to the sentence up front instead of
            # bounds-checking each offset individually.
            first = max(0, center_pos - window_size)
            stop = min(sentence_len, center_pos + window_size + 1)
            for context_pos in range(first, stop):
                if context_pos == center_pos:
                    continue  # a word is never its own context
                # A new array per pair, so no two yielded vectors share memory.
                center_vec_one_hot = np.zeros(vocabulary_size)
                center_vec_one_hot[center_id] = 1
                yield center_vec_one_hot, token_ids[context_pos]
 

In [9]:
# Network definition: a two-layer linear skip-gram model.
#   W1 (embedding_dims x vocabulary_size): multiplying by a one-hot vector
#       selects the column holding that center word's vector.
#   W2 (vocabulary_size x embedding_dims): produces one score per vocabulary
#       word from the hidden vector.
# The tensors are created with requires_grad=True directly; the deprecated
# torch.autograd.Variable wrapper is not needed for autograd.
W1 = torch.randn(embedding_dims, vocabulary_size, requires_grad=True)
W2 = torch.randn(vocabulary_size, embedding_dims, requires_grad=True)

# Training hyper-parameters (previously inline magic numbers).
learning_rate = 0.002
num_epochs = 501     # 501 rather than 500 so that epoch 500 is reported too
report_every = 100   # print the mean loss every this many epochs

for epo in range(num_epochs):
    avg_loss = 0
    samples = 0
    for data, target in train_generator():
        # The generator yields a float64 one-hot vector; cast to float32 to
        # match the dtype of W1.
        x = torch.from_numpy(data).float()
        # nll_loss takes a batch of int64 class ids; here the batch has size 1.
        y_true = torch.tensor([target], dtype=torch.long)

        samples += len(y_true)

        a1 = torch.matmul(W1, x)     # hidden vector of the center word
        a2 = torch.matmul(W2, a1)    # unnormalized score per context word

        log_softmax = F.log_softmax(a2, dim=0)
        # The former `network_pred_dist = F.softmax(log_softmax, ...)` line was
        # never read, so it has been dropped.
        loss = F.nll_loss(log_softmax.view(1, -1), y_true)
        avg_loss += loss.item()
        loss.backward()

        # Plain SGD step. no_grad() keeps the in-place weight update out of
        # the autograd graph (replaces direct mutation through `.data`).
        with torch.no_grad():
            W1 -= learning_rate * W1.grad
            W2 -= learning_rate * W2.grad

        # Gradients accumulate across backward() calls, so reset them.
        W1.grad.zero_()
        W2.grad.zero_()

    if epo % report_every == 0:
        print(avg_loss / samples)

4.496821568693433
2.0961487259183613
1.7688039268766131
1.6510580139500755
1.6036454788276127
1.5816143589360374


In [None]:
%matplotlib inline
from scikitplot.decomposition import plot_pca_2d_projection
from sklearn.decomposition import PCA

In [None]:
# Project the W1 word vectors onto their first two principal components and
# label every point with its word.
embedding_matrix = W1.data.numpy().T   # transposed so each row is one vocabulary word
pca = PCA(n_components=2)
pca.fit(embedding_matrix)
proj = pca.transform(embedding_matrix)
# NOTE(review): `feature_labels` receives the per-sample word list here;
# presumably it is only consulted for biplots - confirm against scikit-plot.
ax = plot_pca_2d_projection(
    pca,
    embedding_matrix,
    np.array(words),
    feature_labels=words,
    figsize=(12, 12),
    text_fontsize=12,
)
for label, (pc1, pc2) in zip(words, proj):
    ax.annotate(label, (pc1, pc2), size=16)

In [11]:
def get_word_vector_v(word):
    """Return the column of ``W1`` that belongs to ``word`` as a NumPy array.

    The column index is ``word2idx[word]``. The result is detached from
    autograd and is 1-D with one entry per row of ``W1``.

    Raises:
        KeyError: if ``word`` is not a key of ``word2idx``.
    """
    column = word2idx[word]
    return W1.detach()[:, column].numpy()

def get_word_vector_u(word):
    """Return the row of ``W2`` that belongs to ``word`` as a NumPy array.

    The row index is ``word2idx[word]``. The result is detached from
    autograd and is 1-D with one entry per column of ``W2``.

    Raises:
        KeyError: if ``word`` is not a key of ``word2idx``.
    """
    row = word2idx[word]
    return W2.detach()[row, :].numpy()

In [None]:
# Analogy: Poland is to Warsaw as Germany is to ?

In [13]:
# Represent each word by the equally weighted sum of its W1 vector
# (get_word_vector_v) and its W2 vector (get_word_vector_u).
pol, ger, waw = (
    get_word_vector_v(name) + get_word_vector_u(name)
    for name in ('poland', 'germany', 'warsaw')
)

# Analogy query vector: warsaw - poland + germany.
yyy = waw - pol + ger

In [14]:
from scipy.spatial.distance import cosine

# For every vocabulary word, the cosine distance between the query vector
# `yyy` and that word's summed (W2 row + W1 column) vector.
distances = [
    (word, cosine(yyy, get_word_vector_u(word) + get_word_vector_v(word)))
    for word in words
]

In [15]:
# Poland is to Warsaw as Germany is to Berlin

In [16]:
# Show the (word, cosine distance to `yyy`) pairs in vocabulary order;
# a smaller distance means the word's vector points in a more similar direction.
distances

[('he', 0.8570210635662079),
 ('is', 1.0280788782984018),
 ('a', 1.1927961856126785),
 ('king', 1.1952263414859772),
 ('she', 1.0590651705861092),
 ('queen', 0.6953186988830566),
 ('man', 0.8267989605665207),
 ('woman', 1.4676547944545746),
 ('warsaw', 0.35097044706344604),
 ('poland', 1.1727249324321747),
 ('capital', 0.9477489851415157),
 ('berlin', 0.2761566638946533),
 ('germany', 0.28735482692718506),
 ('paris', 1.1595857292413712),
 ('france', 1.3894160687923431)]

In [None]:
# In what contexts does Paris appear?

In [12]:
# Feed the W1 vector of 'paris' through the output layer W2 and print the
# resulting softmax probability for every vocabulary word.
context_to_predict = get_word_vector_v('paris')
# torch.from_numpy yields a tensor directly; the deprecated Variable wrapper
# added nothing here.
hidden = torch.from_numpy(context_to_predict).float()
# Inference only: W2 requires grad, so skip building an autograd graph.
with torch.no_grad():
    a = torch.matmul(W2, hidden)
    probs = F.softmax(a, dim=0).numpy()
for context, prob in zip(words, probs):
    print(f'{context}: {prob:.2f}')

he: 0.01
is: 0.45
a: 0.03
king: 0.01
she: 0.01
queen: 0.00
man: 0.02
woman: 0.00
warsaw: 0.00
poland: 0.03
capital: 0.00
berlin: 0.00
germany: 0.01
paris: 0.00
france: 0.42
