In [145]:
import tensorflow as tf
import numpy as np 

import string

import spacy
spc_nlp = spacy.load('en')

In [233]:
with open('test1.txt') as f:
    words = []
    for line in f:
        words += [word.translate(None, string.punctuation).lower() for word in line.split()]

In [234]:
unique_words = set(words)

word2int = {}
int2word = {}

vocab = len(unique_words)

In [235]:
for i, word in enumerate(unique_words):
    word2int[word] = i
    int2word[i] = word

In [238]:
with open ('test1.txt') as f:
    content = ''.join([line.replace('\r\n', ' ').lower() for line in f if line.strip() != ''])

In [239]:
tokenize_sents = [[token.text for token in spc_nlp(s.text) if token.pos_!='PUNCT'] for s in spc_nlp(content.decode('utf-8')).sents]

## Generate training data

### Skipgram

In [240]:
data_skipgram = []

WINDOW_SIZE = 2

for sent in tokenize_sents:
    for word_idx, word in enumerate(sent):
        for nb_word in sent[max(word_idx-WINDOW_SIZE, 0):min(word_idx+WINDOW_SIZE, len(sent)-1)+1]:
            if nb_word != word:
                data_skipgram.append([word, nb_word])

In [241]:
data_skipgram[:50]

[[u'what', u'a'],
 [u'what', u'beautiful'],
 [u'a', u'what'],
 [u'a', u'beautiful'],
 [u'a', u'day'],
 [u'beautiful', u'what'],
 [u'beautiful', u'a'],
 [u'beautiful', u'day'],
 [u'day', u'a'],
 [u'day', u'beautiful'],
 [u'any', u'day'],
 [u'any', u'there'],
 [u'day', u'any'],
 [u'day', u'there'],
 [u'day', u'is'],
 [u'there', u'any'],
 [u'there', u'day'],
 [u'there', u'is'],
 [u'there', u'rainbow'],
 [u'is', u'day'],
 [u'is', u'there'],
 [u'is', u'rainbow'],
 [u'is', u'it'],
 [u'rainbow', u'there'],
 [u'rainbow', u'is'],
 [u'rainbow', u'it'],
 [u'rainbow', u'\u2019s'],
 [u'it', u'is'],
 [u'it', u'rainbow'],
 [u'it', u'\u2019s'],
 [u'it', u'a'],
 [u'\u2019s', u'rainbow'],
 [u'\u2019s', u'it'],
 [u'\u2019s', u'a'],
 [u'\u2019s', u'nice'],
 [u'a', u'it'],
 [u'a', u'\u2019s'],
 [u'a', u'nice'],
 [u'a', u'day'],
 [u'nice', u'\u2019s'],
 [u'nice', u'a'],
 [u'nice', u'day'],
 [u'day', u'a'],
 [u'day', u'nice'],
 [u'a', u'nice'],
 [u'a', u'day'],
 [u'nice', u'a'],
 [u'nice', u'day'],
 [u'nice'

### CBOW

In [242]:
data_cbow = []

WINDOW_SIZE = 2

for sent in tokenize_sents:
    for word_idx, word in enumerate(sent):
        for nb_word in sent[max(word_idx-WINDOW_SIZE, 0):min(word_idx+WINDOW_SIZE, len(sent)-1)+1]:
            if nb_word != word:
                data_cbow.append([nb_word, word])

In [243]:
data_cbow[:50]

[[u'a', u'what'],
 [u'beautiful', u'what'],
 [u'what', u'a'],
 [u'beautiful', u'a'],
 [u'day', u'a'],
 [u'what', u'beautiful'],
 [u'a', u'beautiful'],
 [u'day', u'beautiful'],
 [u'a', u'day'],
 [u'beautiful', u'day'],
 [u'day', u'any'],
 [u'there', u'any'],
 [u'any', u'day'],
 [u'there', u'day'],
 [u'is', u'day'],
 [u'any', u'there'],
 [u'day', u'there'],
 [u'is', u'there'],
 [u'rainbow', u'there'],
 [u'day', u'is'],
 [u'there', u'is'],
 [u'rainbow', u'is'],
 [u'it', u'is'],
 [u'there', u'rainbow'],
 [u'is', u'rainbow'],
 [u'it', u'rainbow'],
 [u'\u2019s', u'rainbow'],
 [u'is', u'it'],
 [u'rainbow', u'it'],
 [u'\u2019s', u'it'],
 [u'a', u'it'],
 [u'rainbow', u'\u2019s'],
 [u'it', u'\u2019s'],
 [u'a', u'\u2019s'],
 [u'nice', u'\u2019s'],
 [u'it', u'a'],
 [u'\u2019s', u'a'],
 [u'nice', u'a'],
 [u'day', u'a'],
 [u'\u2019s', u'nice'],
 [u'a', u'nice'],
 [u'day', u'nice'],
 [u'a', u'day'],
 [u'nice', u'day'],
 [u'nice', u'a'],
 [u'day', u'a'],
 [u'a', u'nice'],
 [u'day', u'nice'],
 [u'means

## Convert word pair to numeric value via one_hot_encode

In [167]:
def to_one_hot(data_point_index, vocab_size):
    #print data_point_index
    temp = np.zeros(vocab_size)
    temp[data_point_index] = 1
    return temp

In [213]:
x_train = [] # input word
y_train = [] # output word

# data = data_skipgram
data = data_cbow

for word_pair in data:
    x_train.append(to_one_hot(word2int[word_pair[0]], vocab))
    y_train.append(to_one_hot(word2int[word_pair[1]], vocab))

# convert them to numpy arrays
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)

In [214]:
y_train

array([[ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [215]:
x_train.shape

(2178, 288)

In [216]:
y_train.shape

(2178, 288)

## Tensorflow Modeling

In [217]:
# making placeholders for x_train and y_train

x = tf.placeholder(tf.float32, shape=(None, vocab))
y_label = tf.placeholder(tf.float32, shape=(None, vocab))

In [218]:
# Set word vector dimension
EMBEDDING_DIM = 50

In [219]:
W1 = tf.Variable(tf.random_normal([vocab, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM])) #bias
hidden_representation = tf.add(tf.matmul(x,W1), b1)

In [220]:
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, vocab]))
b2 = tf.Variable(tf.random_normal([vocab]))
prediction = tf.nn.softmax(tf.add( tf.matmul(hidden_representation, W2), b2))

### Train the network

In [221]:
sess = tf.Session()
init = tf.initialize_all_variables()
sess.run(init) #make sure to do this!

# define the loss function:
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), reduction_indices=[1]))
# define the training step:
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)
n_iters = 1000
# train for n_iter iterations
for _ in range(n_iters):
    sess.run(train_step, feed_dict={x: x_train, y_label: y_train})
    print('loss is : ', sess.run(cross_entropy_loss, feed_dict={x: x_train, y_label: y_train}))

('loss is : ', 29.221334)
('loss is : ', 27.948202)
('loss is : ', 26.929453)
('loss is : ', 26.103209)
('loss is : ', 25.413389)
('loss is : ', 24.830915)
('loss is : ', 24.330421)
('loss is : ', 23.888725)
('loss is : ', 23.492037)
('loss is : ', 23.132746)
('loss is : ', 22.806057)
('loss is : ', 22.508224)
('loss is : ', 22.235979)
('loss is : ', 21.986441)
('loss is : ', 21.757002)
('loss is : ', 21.545298)
('loss is : ', 21.349331)
('loss is : ', 21.167337)
('loss is : ', 20.997841)
('loss is : ', 20.839502)
('loss is : ', 20.691187)
('loss is : ', 20.551844)
('loss is : ', 20.420528)
('loss is : ', 20.296402)
('loss is : ', 20.178682)
('loss is : ', 20.066677)
('loss is : ', 19.959784)
('loss is : ', 19.857471)
('loss is : ', 19.759296)
('loss is : ', 19.664879)
('loss is : ', 19.573906)
('loss is : ', 19.486099)
('loss is : ', 19.401239)
('loss is : ', 19.319115)
('loss is : ', 19.239553)
('loss is : ', 19.162416)
('loss is : ', 19.087561)
('loss is : ', 19.014866)
('loss is : 

('loss is : ', 12.754557)
('loss is : ', 12.743352)
('loss is : ', 12.73218)
('loss is : ', 12.721033)
('loss is : ', 12.709922)
('loss is : ', 12.698827)
('loss is : ', 12.687767)
('loss is : ', 12.676744)
('loss is : ', 12.665738)
('loss is : ', 12.654762)
('loss is : ', 12.643823)
('loss is : ', 12.632902)
('loss is : ', 12.622016)
('loss is : ', 12.611155)
('loss is : ', 12.600315)
('loss is : ', 12.589508)
('loss is : ', 12.578734)
('loss is : ', 12.567973)
('loss is : ', 12.557247)
('loss is : ', 12.546551)
('loss is : ', 12.535882)
('loss is : ', 12.52523)
('loss is : ', 12.514607)
('loss is : ', 12.504019)
('loss is : ', 12.493449)
('loss is : ', 12.48291)
('loss is : ', 12.472396)
('loss is : ', 12.461904)
('loss is : ', 12.451439)
('loss is : ', 12.441005)
('loss is : ', 12.430588)
('loss is : ', 12.420206)
('loss is : ', 12.409843)
('loss is : ', 12.399501)
('loss is : ', 12.389194)
('loss is : ', 12.378907)
('loss is : ', 12.368636)
('loss is : ', 12.358404)
('loss is : ', 

('loss is : ', 10.145569)
('loss is : ', 10.139293)
('loss is : ', 10.133018)
('loss is : ', 10.126766)
('loss is : ', 10.120513)
('loss is : ', 10.114268)
('loss is : ', 10.108038)
('loss is : ', 10.10181)
('loss is : ', 10.095592)
('loss is : ', 10.089386)
('loss is : ', 10.083188)
('loss is : ', 10.076994)
('loss is : ', 10.070811)
('loss is : ', 10.064632)
('loss is : ', 10.058467)
('loss is : ', 10.052304)
('loss is : ', 10.046158)
('loss is : ', 10.040012)
('loss is : ', 10.033877)
('loss is : ', 10.027756)
('loss is : ', 10.021638)
('loss is : ', 10.015528)
('loss is : ', 10.009424)
('loss is : ', 10.003331)
('loss is : ', 9.997242)
('loss is : ', 9.9911661)
('loss is : ', 9.9850988)
('loss is : ', 9.9790344)
('loss is : ', 9.9729824)
('loss is : ', 9.9669333)
('loss is : ', 9.9608984)
('loss is : ', 9.9548712)
('loss is : ', 9.9488487)
('loss is : ', 9.9428358)
('loss is : ', 9.9368267)
('loss is : ', 9.930829)
('loss is : ', 9.9248381)
('loss is : ', 9.9188576)
('loss is : ', 

('loss is : ', 8.4996395)
('loss is : ', 8.4953346)
('loss is : ', 8.4910364)
('loss is : ', 8.4867411)
('loss is : ', 8.4824505)
('loss is : ', 8.4781628)
('loss is : ', 8.4738836)
('loss is : ', 8.4696074)
('loss is : ', 8.4653349)
('loss is : ', 8.4610653)
('loss is : ', 8.4568024)
('loss is : ', 8.4525452)
('loss is : ', 8.4482946)
('loss is : ', 8.4440432)
('loss is : ', 8.4397974)
('loss is : ', 8.4355555)
('loss is : ', 8.431324)
('loss is : ', 8.4270935)
('loss is : ', 8.422863)
('loss is : ', 8.418643)
('loss is : ', 8.414422)
('loss is : ', 8.4102068)
('loss is : ', 8.4059992)
('loss is : ', 8.4017954)
('loss is : ', 8.3975925)
('loss is : ', 8.3933973)
('loss is : ', 8.3892069)
('loss is : ', 8.3850212)
('loss is : ', 8.3808355)
('loss is : ', 8.3766565)
('loss is : ', 8.3724813)
('loss is : ', 8.3683138)
('loss is : ', 8.3641453)
('loss is : ', 8.3599854)
('loss is : ', 8.3558302)
('loss is : ', 8.3516779)
('loss is : ', 8.3475285)
('loss is : ', 8.3433838)
('loss is : ', 8

In [224]:
vectors = sess.run(W1 + b1)

print(vectors[word2int['kingdom']])

[ 2.57650733 -0.27166796  0.64199615  1.26238966 -1.78765273  0.10659375
  0.51200849 -0.53586954 -2.03098536 -0.21292004 -2.89792585  0.38323617
  0.69113439  0.49926409  1.26502991 -0.05600855 -0.82178038 -0.96936756
 -0.76439178  1.22169304  0.43693709 -0.91187274 -0.04682606  0.92630064
 -0.0269774   0.64881718 -0.79485232 -0.06609856  0.42604586  0.57804835
  0.66148746  0.3997677  -1.95709348  1.71351779  0.08956528 -0.6018551
 -0.26850778 -1.65666246  0.2356502   1.7306453   0.02674896 -0.51682341
 -0.945108   -1.30101657  1.4007746  -1.36493492  1.29641473  0.01350327
 -0.81573242 -0.24666321]


### Check similarity

In [225]:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim(vec1, vec2):
    return cosine_similarity(vec1, vec2)

def closest_word(word, vec):
    vec1 = vec[word2int[word]]
    dist = 1
    idx = 0
    for i, v in enumerate(vec):
        if(dist > cosine_sim(vec1, v)):
            dist = cosine_sim(vec1, v)
            idx = i
    return int2word[idx]

In [231]:
import warnings
warnings.filterwarnings('ignore')
closest_word('united', vectors)

'new'