In [0]:
#import library
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F

In [0]:
corpus = ['Recent methods for learning vector space representations of words have succeeded in capturing fine-grained semantic and syntactic regularities using vector arithmetic, but the origin of these regularities has remained opaque',
          'We analyze and make explicit the model properties needed for such regularities to emerge in word vectors','The result is a new global logbilinear regression model that combines the advantages of the two major model families in the literature: global matrix factorization and local context window methods']

In [105]:
#obtain the list of words.
def tokenize_corpus(corpus):
    tokens = [x.split() for x in corpus]
    return tokens

tokenized_corpus = tokenize_corpus(corpus)
print(tokenized_corpus)

[['Recent', 'methods', 'for', 'learning', 'vector', 'space', 'representations', 'of', 'words', 'have', 'succeeded', 'in', 'capturing', 'fine-grained', 'semantic', 'and', 'syntactic', 'regularities', 'using', 'vector', 'arithmetic,', 'but', 'the', 'origin', 'of', 'these', 'regularities', 'has', 'remained', 'opaque'], ['We', 'analyze', 'and', 'make', 'explicit', 'the', 'model', 'properties', 'needed', 'for', 'such', 'regularities', 'to', 'emerge', 'in', 'word', 'vectors'], ['The', 'result', 'is', 'a', 'new', 'global', 'logbilinear', 'regression', 'model', 'that', 'combines', 'the', 'advantages', 'of', 'the', 'two', 'major', 'model', 'families', 'in', 'the', 'literature:', 'global', 'matrix', 'factorization', 'and', 'local', 'context', 'window', 'methods']]


In [0]:
#corpus[2].lower()

In [0]:
#create dictionary and assign the index for words
my_dict = []
for sentence in tokenized_corpus:
    for token in sentence:
        if token not in my_dict:
            my_dict.append(token)

word2idx = {w: idx for (idx, w) in enumerate(my_dict)}
idx2word = {idx: w for (idx, w) in enumerate(my_dict)}

my_dict_size = len(my_dict)

In [108]:
word2idx

{'Recent': 0,
 'The': 39,
 'We': 27,
 'a': 42,
 'advantages': 49,
 'analyze': 28,
 'and': 15,
 'arithmetic,': 19,
 'but': 20,
 'capturing': 12,
 'combines': 48,
 'context': 57,
 'emerge': 36,
 'explicit': 30,
 'factorization': 55,
 'families': 52,
 'fine-grained': 13,
 'for': 2,
 'global': 44,
 'has': 24,
 'have': 9,
 'in': 11,
 'is': 41,
 'learning': 3,
 'literature:': 53,
 'local': 56,
 'logbilinear': 45,
 'major': 51,
 'make': 29,
 'matrix': 54,
 'methods': 1,
 'model': 31,
 'needed': 33,
 'new': 43,
 'of': 7,
 'opaque': 26,
 'origin': 22,
 'properties': 32,
 'regression': 46,
 'regularities': 17,
 'remained': 25,
 'representations': 6,
 'result': 40,
 'semantic': 14,
 'space': 5,
 'succeeded': 10,
 'such': 34,
 'syntactic': 16,
 'that': 47,
 'the': 21,
 'these': 23,
 'to': 35,
 'two': 50,
 'using': 18,
 'vector': 4,
 'vectors': 38,
 'window': 58,
 'word': 37,
 'words': 8}

In [0]:
window_size = 2
idx_pairs = []
# for each sentence
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    # for each word, threated as center word
    for center_word_pos in range(len(indices)):
        # for each window position
        for w in range(-window_size, window_size + 1):
            context_word_pos = center_word_pos + w
            # make soure not jump out sentence
            if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
                continue
            context_word_idx = indices[context_word_pos]
            idx_pairs.append((indices[center_word_pos], context_word_idx))

idx_pairs = np.array(idx_pairs) # it will be useful to have this as numpy array
  

In [110]:
idx_pairs[:10]

array([[0, 1],
       [0, 2],
       [1, 0],
       [1, 2],
       [1, 3],
       [2, 0],
       [2, 1],
       [2, 3],
       [2, 4],
       [3, 1]])

In [0]:
def get_input_layer(word_idx):
    x = torch.zeros(my_dict_size).float()
    x[word_idx] = 1.0
    return x
  
  #Input layer is just the center word encoded in one-hot manner. It dimensions are [1, vocabulary_size]
  

In [79]:
embedding_dims = 5
W1 = Variable(torch.randn(embedding_dims, my_dict_size).float(), requires_grad=True)
W2 = Variable(torch.randn(my_dict_size, embedding_dims).float(), requires_grad=True)
num_epochs = 1010
learning_rate = 0.001

for epo in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        x = Variable(get_input_layer(data)).float()
        y_true = Variable(torch.from_numpy(np.array([target])).long())

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)
    
        log_softmax = F.log_softmax(z2, dim=0)

        loss = F.nll_loss(log_softmax.view(1,-1), y_true)
        loss_val += loss.data.item()
        loss.backward()
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data

        W1.grad.data.zero_()
        W2.grad.data.zero_()
    if epo % 10 == 0:    
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

Loss at epo 0: 6.15251525599381
Loss at epo 10: 5.709854064316585
Loss at epo 20: 5.389699952766813
Loss at epo 30: 5.143794337223316
Loss at epo 40: 4.947761688561275
Loss at epo 50: 4.787342763769216
Loss at epo 60: 4.653403744204291
Loss at epo 70: 4.539722133093867
Loss at epo 80: 4.441879593914953
Loss at epo 90: 4.356644013010222
Loss at epo 100: 4.281588464358757
Loss at epo 110: 4.214856257931939
Loss at epo 120: 4.154997875361607
Loss at epo 130: 4.1008699223913
Loss at epo 140: 4.051554217009709
Loss at epo 150: 4.006308357468967
Loss at epo 160: 3.9645252897821623
Loss at epo 170: 3.925702892500779
Loss at epo 180: 3.8894254766661547
Loss at epo 190: 3.8553448245443147
Loss at epo 200: 3.8231700231289043
Loss at epo 210: 3.792655052398813
Loss at epo 220: 3.7635921375504857
Loss at epo 230: 3.735805899932467
Loss at epo 240: 3.7091466747481245
Loss at epo 250: 3.6834885806872926
Loss at epo 260: 3.658723433675437
Loss at epo 270: 3.6347598639027825
Loss at epo 280: 3.6115198

In [112]:
#check the index of word
for ind, token in enumerate(my_dict):
  word2idx[token] = ind
  idx2word[ind] = token
print(word2idx['methods'])

1


In [0]:
#create a zero matrix with size of length dictionary
length=len(my_dict)
cooc = np.zeros([length,length], np.float32)

In [0]:
#create a function that loops over the words on a sentence and updates the co-ocurrence matrix
def process_sentence(sentence):
    words_in_sentence = word2idx
    list_of_indeces = [my_dict.index(word) for word in words_in_sentence]
    for index1 in list_of_indeces:
        for index2 in list_of_indeces:
            if index1 != index2:
                cooc[index1,index2] +=1

In [0]:
#go through all
for sentence in my_dict:
    process_sentence(sentence)

In [128]:
print(cooc)

[[ 0. 59. 59. ... 59. 59. 59.]
 [59.  0. 59. ... 59. 59. 59.]
 [59. 59.  0. ... 59. 59. 59.]
 ...
 [59. 59. 59. ...  0. 59. 59.]
 [59. 59. 59. ... 59.  0. 59.]
 [59. 59. 59. ... 59. 59.  0.]]


In [0]:
coocs = np.transpose(np.nonzero(cooc))

In [130]:
print(coocs)

[[ 0  1]
 [ 0  2]
 [ 0  3]
 ...
 [58 55]
 [58 56]
 [58 57]]


In [0]:
def weight_func(x, x_max, alpha):
    wx = (x/x_max)**alpha
    wx = torch.min(wx, torch.ones_like(wx))
    return wx.cuda()  