In [2]:
import torch
import random
import numpy as np
import torch.nn.functional as F
from torch.autograd import Variable

In [3]:
# Toy training corpus: seven short, lowercase sentences of four
# whitespace-separated words each.
corpus = [
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',
]

In [4]:
def tokenize_corpus(corpus):
    """Split every sentence of ``corpus`` into whitespace-separated tokens.

    Args:
        corpus: iterable of strings, one sentence per item.

    Returns:
        A list holding one token list per sentence, in the original order.
    """
    return [sentence.split() for sentence in corpus]

# Tokenize the corpus once; the cells below reuse this list of token lists.
tokenized_corpus = tokenize_corpus(corpus)

In [5]:
# Inspect the tokenized sentences.
tokenized_corpus

[['he', 'is', 'a', 'king'],
 ['she', 'is', 'a', 'queen'],
 ['he', 'is', 'a', 'man'],
 ['she', 'is', 'a', 'woman'],
 ['warsaw', 'is', 'poland', 'capital'],
 ['berlin', 'is', 'germany', 'capital'],
 ['paris', 'is', 'france', 'capital']]

In [6]:
# Sanity check on a single sentence: str.split() with no arguments splits on
# runs of whitespace.
print(corpus[1].split())

['she', 'is', 'a', 'queen']


In [7]:
# Build the vocabulary: every distinct token, in order of first appearance.
# `seen_tokens` mirrors `vocabulary` as a set so each membership test is O(1)
# on average instead of a linear scan of the growing list.
# `sentence` and `token` are ordinary loop variables, so they remain bound to
# the last sentence / last token once the loop has finished.
vocabulary = []
seen_tokens = set()
for sentence in tokenized_corpus:
    for token in sentence:
        if token not in seen_tokens:
            seen_tokens.add(token)
            vocabulary.append(token)

In [8]:
# Inspect the vocabulary (distinct tokens, in order of first appearance).
vocabulary


['he',
 'is',
 'a',
 'king',
 'she',
 'queen',
 'man',
 'woman',
 'warsaw',
 'poland',
 'capital',
 'berlin',
 'germany',
 'paris',
 'france']

In [9]:
# Preview the (index, word) pairs that enumerate() yields for the vocabulary.
list(enumerate(vocabulary))

[(0, 'he'),
 (1, 'is'),
 (2, 'a'),
 (3, 'king'),
 (4, 'she'),
 (5, 'queen'),
 (6, 'man'),
 (7, 'woman'),
 (8, 'warsaw'),
 (9, 'poland'),
 (10, 'capital'),
 (11, 'berlin'),
 (12, 'germany'),
 (13, 'paris'),
 (14, 'france')]

In [10]:
# Two-way lookup between words and their integer ids, where a word's id is its
# position in `vocabulary`.
idx2word = dict(enumerate(vocabulary))
word2idx = {word: position for position, word in idx2word.items()}

vocabulary_size = len(vocabulary)

In [11]:
# Inspect the word -> index mapping.
word2idx


{'he': 0,
 'is': 1,
 'a': 2,
 'king': 3,
 'she': 4,
 'queen': 5,
 'man': 6,
 'woman': 7,
 'warsaw': 8,
 'poland': 9,
 'capital': 10,
 'berlin': 11,
 'germany': 12,
 'paris': 13,
 'france': 14}

In [12]:
# Encode one example sentence (the last one in the corpus) as vocabulary indices.
# The sentence is selected explicitly rather than read from a `sentence` loop
# variable left over from an earlier cell, so the result does not depend on
# which loop happened to run most recently in the kernel.
indices = [word2idx[word] for word in tokenized_corpus[-1]]

In [13]:
# Display the tokenized corpus together with the current `indices` list.
tokenized_corpus,indices


([['he', 'is', 'a', 'king'],
  ['she', 'is', 'a', 'queen'],
  ['he', 'is', 'a', 'man'],
  ['she', 'is', 'a', 'woman'],
  ['warsaw', 'is', 'poland', 'capital'],
  ['berlin', 'is', 'germany', 'capital'],
  ['paris', 'is', 'france', 'capital']],
 [13, 1, 14, 10])

In [14]:
# Generate skip-gram training pairs: for every word in every sentence, emit one
# (center index, context index) pair per neighbour at most `window_size`
# positions away on either side.
window_size = 2
idx_pairs = []
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    print(indices)
    sentence_len = len(indices)
    # Treat each position in turn as the center word.
    for center_pos, center_idx in enumerate(indices):
        # Clamp the window to the sentence so no context position falls outside it.
        window_start = max(0, center_pos - window_size)
        window_stop = min(sentence_len, center_pos + window_size + 1)
        for context_pos in range(window_start, window_stop):
            # A word is never paired with itself.
            if context_pos != center_pos:
                idx_pairs.append((center_idx, indices[context_pos]))

    print("==================================")
    print(idx_pairs)

# Convert the list of pairs to a NumPy array for later use.
idx_pairs = np.array(idx_pairs)

[0, 1, 2, 3]
[(0, 1), (0, 2), (1, 0), (1, 2), (1, 3), (2, 0), (2, 1), (2, 3), (3, 1), (3, 2)]
[4, 1, 2, 5]
[(0, 1), (0, 2), (1, 0), (1, 2), (1, 3), (2, 0), (2, 1), (2, 3), (3, 1), (3, 2), (4, 1), (4, 2), (1, 4), (1, 2), (1, 5), (2, 4), (2, 1), (2, 5), (5, 1), (5, 2)]
[0, 1, 2, 6]
[(0, 1), (0, 2), (1, 0), (1, 2), (1, 3), (2, 0), (2, 1), (2, 3), (3, 1), (3, 2), (4, 1), (4, 2), (1, 4), (1, 2), (1, 5), (2, 4), (2, 1), (2, 5), (5, 1), (5, 2), (0, 1), (0, 2), (1, 0), (1, 2), (1, 6), (2, 0), (2, 1), (2, 6), (6, 1), (6, 2)]
[4, 1, 2, 7]
[(0, 1), (0, 2), (1, 0), (1, 2), (1, 3), (2, 0), (2, 1), (2, 3), (3, 1), (3, 2), (4, 1), (4, 2), (1, 4), (1, 2), (1, 5), (2, 4), (2, 1), (2, 5), (5, 1), (5, 2), (0, 1), (0, 2), (1, 0), (1, 2), (1, 6), (2, 0), (2, 1), (2, 6), (6, 1), (6, 2), (4, 1), (4, 2), (1, 4), (1, 2), (1, 7), (2, 4), (2, 1), (2, 7), (7, 1), (7, 2)]
[8, 1, 9, 10]
[(0, 1), (0, 2), (1, 0), (1, 2), (1, 3), (2, 0), (2, 1), (2, 3), (3, 1), (3, 2), (4, 1), (4, 2), (1, 4), (1, 2), (1, 5), (2, 4), (

In [15]:
# Inspect the (center, context) index pairs, now held in a NumPy array.
idx_pairs


array([[ 0,  1],
       [ 0,  2],
       [ 1,  0],
       [ 1,  2],
       [ 1,  3],
       [ 2,  0],
       [ 2,  1],
       [ 2,  3],
       [ 3,  1],
       [ 3,  2],
       [ 4,  1],
       [ 4,  2],
       [ 1,  4],
       [ 1,  2],
       [ 1,  5],
       [ 2,  4],
       [ 2,  1],
       [ 2,  5],
       [ 5,  1],
       [ 5,  2],
       [ 0,  1],
       [ 0,  2],
       [ 1,  0],
       [ 1,  2],
       [ 1,  6],
       [ 2,  0],
       [ 2,  1],
       [ 2,  6],
       [ 6,  1],
       [ 6,  2],
       [ 4,  1],
       [ 4,  2],
       [ 1,  4],
       [ 1,  2],
       [ 1,  7],
       [ 2,  4],
       [ 2,  1],
       [ 2,  7],
       [ 7,  1],
       [ 7,  2],
       [ 8,  1],
       [ 8,  9],
       [ 1,  8],
       [ 1,  9],
       [ 1, 10],
       [ 9,  8],
       [ 9,  1],
       [ 9, 10],
       [10,  1],
       [10,  9],
       [11,  1],
       [11, 12],
       [ 1, 11],
       [ 1, 12],
       [ 1, 10],
       [12, 11],
       [12,  1],
       [12, 10],
       [10,  1

In [24]:
# A 4x3 integer tensor used to illustrate how view() regroups elements.
a = torch.tensor(
    [
        [1, 2, 3],
        [4, 5, 6],
        [4, 5, 6],
        [4, 5, 6],
    ]
)

# view() keeps the same 12 elements in row-major order and regroups them
# into 6 rows of 2.
a.view(6, 2)

tensor([[ 1,  2],
        [ 3,  4],
        [ 5,  6],
        [ 4,  5],
        [ 6,  4],
        [ 5,  6]])

In [25]:
# Wrap a single integer in a 1-element NumPy array, convert it to a torch
# tensor, and cast it to a 64-bit integer tensor with .long().
target=3
torch.from_numpy(np.array([target])).long()

tensor([ 3])

In [26]:
# Log-softmax of a 3-element score vector, taken along its only axis.
raw_scores = torch.tensor([1.0, 2.0, 3.0])
log_softmax = F.log_softmax(raw_scores, dim=0)

# Add a leading axis of size 1: shape (3,) -> (1, 3).
log_softmax.view(1, -1)

tensor([[-2.4076, -1.4076, -0.4076]])

NameError: name 'W1' is not defined