In [2]:
import numpy as np
import re
from matplotlib import pyplot as plt
%matplotlib inline

import keras
from keras.layers import Dense, Activation, Input, Dropout
from keras.models import Model

In [3]:
# Reading a text file. The with-block guarantees the handle is closed even if
# read() raises, which the manual open()/close() pair did not.
with open('./data.txt') as f:
    d = f.read()

# Preprocessing text to include only the characters
data = d[1260:]
data = data.lower().decode('utf-8')
p = re.sub('[^A-Za-z]+', ' ', data)

ds = p.split()
print len(ds), 'words in text'

words, freq = np.unique(ds, return_counts=True)
print len(words), 'distinct words with total frequency', sum(freq), '(just to verify)'
print words

40057 words in text
5484 distinct words with total frequency 40057 (just to verify)
[u'a' u'abasement' u'abhorred' ..., u'youth' u'zeus' u'zip']


In [7]:
# Forming a mapping from words->index and index->word for one-hot encoding. Index is unique

bow = {}
rev_bow = {}
i = 0
for ix in range(len(words)):
    if freq[ix] > 2:
        bow[i] = words[ix]
        rev_bow[words[ix]] = i
        i += 1

print len(bow)
print bow
print rev_bow

1781
{0: u'a', 1: u'abject', 2: u'able', 3: u'abnegation', 4: u'about', 5: u'above', 6: u'absolute', 7: u'absolutely', 8: u'absurdity', 9: u'accept', 10: u'accepted', 11: u'access', 12: u'accompanying', 13: u'accordance', 14: u'according', 15: u'accordingly', 16: u'account', 17: u'accurately', 18: u'accustomed', 19: u'ache', 20: u'acquire', 21: u'acquires', 22: u'act', 23: u'action', 24: u'actions', 25: u'acts', 26: u'actual', 27: u'actually', 28: u'added', 29: u'additional', 30: u'adequately', 31: u'admiration', 32: u'admire', 33: u'admit', 34: u'advance', 35: u'advanced', 36: u'advancement', 37: u'advantage', 38: u'advantageous', 39: u'advantages', 40: u'advocate', 41: u'aesthetic', 42: u'affairs', 43: u'afford', 44: u'affording', 45: u'affords', 46: u'after', 47: u'again', 48: u'against', 49: u'age', 50: u'ages', 51: u'ago', 52: u'agree', 53: u'agreeable', 54: u'agreement', 55: u'aid', 56: u'aim', 57: u'aims', 58: u'air', 59: u'alike', 60: u'alive', 61: u'all', 62: u'allied', 63: u'

In [8]:
# Supplementary functions for one-hot encoding

def get_one_hot_vector(word, word_to_index=None):
    """Return the one-hot encoding of ``word`` as a 1-D float array.

    Parameters
    ----------
    word : str
        Word to encode.
    word_to_index : dict, optional
        Mapping word -> index. When omitted, the notebook-level ``rev_bow``
        table is used, which keeps existing one-argument calls working.
        The indices are assumed to be exactly 0 .. len(mapping)-1.

    Returns
    -------
    numpy.ndarray
        Vector of length ``len(word_to_index)`` filled with zeros except for
        a single 1.0 at the index of ``word``.

    Raises
    ------
    KeyError
        If ``word`` is not present in the mapping (e.g. it was too rare to be
        given an index).
    """
    if word_to_index is None:
        word_to_index = rev_bow
    # Look the word up first so an unknown word fails before any allocation.
    index = word_to_index[word]
    vec = np.zeros((len(word_to_index),))
    vec[index] = 1.0
    return vec

def get_word_from_vec(vec, index_to_word=None):
    """Return the word whose index is the position of the largest entry of ``vec``.

    Parameters
    ----------
    vec : array-like
        One-hot vector (or any score vector); the arg-max position is decoded.
    index_to_word : dict, optional
        Mapping index -> word. When omitted, the notebook-level ``bow`` table
        is used, which keeps existing one-argument calls working.

    Returns
    -------
    The word stored under the arg-max index.

    Raises
    ------
    KeyError
        If the arg-max index has no entry in the mapping.
    """
    if index_to_word is None:
        index_to_word = bow
    # int() gives a plain Python key regardless of the numpy integer type.
    ind = int(np.argmax(vec))
    return index_to_word[ind]

a = get_one_hot_vector('tree')
print len(a), a
a_ = get_word_from_vec(a)
print a_

1781 [ 0.  0.  0. ...,  0.  0.  0.]
tree


In [9]:
# Converting the whole text into one hot encoding form ie Each word encoded as a one hot vector one after the other

dataset = []
for w in range(len(ds)):
    try:
        dataset.append(get_one_hot_vector(ds[w]))
    except:
        pass

dataset = np.asarray(dataset)
print dataset.shape # Why is the shape (35456, 1781)? Why is the shape not (40057, 1781)?

# Demonstrating accessing a word using its index
for ix in range(10):
    print bow[dataset[ix].argmax()]

(35456, 1781)
it
is
often
enough
and
always
with
great
to
me


In [10]:
# Each input row is the concatenation of three consecutive one-hot word vectors
# (rows ix, ix+1, ix+2 of dataset); the word at ix+3 is its prediction target.
#
# X has dataset.shape[0]-3 rows. The earlier loop iterated over
# range(X.shape[0]-3), so the last three rows of X were never filled and stayed
# all-zero while still being paired with real targets. Building X from three
# shifted views fills every row.
X = np.hstack((dataset[:-3], dataset[1:-2], dataset[2:-1]))


a = np.asarray([1,2,3,4])
b = np.asarray([5,6,7,8])
print a, a.shape
print b, b.shape
c = np.zeros((1,8))
c[0] = np.hstack((a,b))
print c, c.shape
print '\n'
print dataset[0], dataset[0].shape


y = dataset[3:]

print X.shape, y.shape

[1 2 3 4] (4,)
[5 6 7 8] (4,)
[[ 1.  2.  3.  4.  5.  6.  7.  8.]] (1, 8)


[ 0.  0.  0. ...,  0.  0.  0.] (1781,)
(35453, 5343) (35453, 1781)


In [11]:
split = int(0.85 * X.shape[0])

X_train = X[:split]
X_val = X[split:]
y_train = y[:split]
y_val = y[split:]

print X_train.shape, X_val.shape
print y_train.shape, y_val.shape

(30135, 5343) (5318, 5343)
(30135, 1781) (5318, 1781)


In [54]:
# Width of the hidden layer whose activations are later used as word vectors
embedding = 100

# Layer widths are read from the training arrays instead of being hard-coded
# (previously 5343 and 1781), so the model cannot drift out of sync with the
# vocabulary size or the context length.
input_dim = X_train.shape[1]
vocab_size = y_train.shape[1]

inp = Input(shape=(input_dim,))
emb = Dense(embedding, activation='tanh')(inp)
emb = Dropout(0.32)(emb)
out = Dense(vocab_size, activation='softmax')(emb)

# Full next-word classifier
model = Model(inputs=inp, outputs=out)
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Shares the input and hidden layers with `model`; its output is the tensor
# taken after the Dropout layer (i.e. the hidden representation).
encoder = Model(inputs=inp, outputs=emb)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 5343)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 100)               534400    
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 1781)              179881    
Total params: 714,281.0
Trainable params: 714,281.0
Non-trainable params: 0.0
_________________________________________________________________


In [55]:
# Fit the classifier; hist keeps the value returned by fit().
# verbose=2 prints one summary line per epoch.
hist = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=25,
    batch_size=100,
    shuffle=True,
    verbose=2
)

Train on 30135 samples, validate on 5318 samples
Epoch 1/25
32s - loss: 6.2173 - acc: 0.0770 - val_loss: 6.4149 - val_acc: 0.0778
Epoch 2/25
19s - loss: 5.6455 - acc: 0.1027 - val_loss: 6.4763 - val_acc: 0.0933
Epoch 3/25
22s - loss: 5.4927 - acc: 0.1187 - val_loss: 6.4688 - val_acc: 0.1042
Epoch 4/25
23s - loss: 5.3334 - acc: 0.1383 - val_loss: 6.4679 - val_acc: 0.1132
Epoch 5/25
21s - loss: 5.1695 - acc: 0.1575 - val_loss: 6.4761 - val_acc: 0.1232
Epoch 6/25
20s - loss: 5.0084 - acc: 0.1728 - val_loss: 6.4812 - val_acc: 0.1269
Epoch 7/25
21s - loss: 4.8515 - acc: 0.1864 - val_loss: 6.5174 - val_acc: 0.1314
Epoch 8/25
20s - loss: 4.6916 - acc: 0.2000 - val_loss: 6.5248 - val_acc: 0.1299
Epoch 9/25
19s - loss: 4.5282 - acc: 0.2144 - val_loss: 6.5526 - val_acc: 0.1290
Epoch 10/25
19s - loss: 4.3748 - acc: 0.2324 - val_loss: 6.5791 - val_acc: 0.1258
Epoch 11/25
19s - loss: 4.2230 - acc: 0.2467 - val_loss: 6.6145 - val_acc: 0.1232
Epoch 12/25
19s - loss: 4.0648 - acc: 0.2648 - val_loss: 6

In [68]:
a = encoder.predict(X)

w2v = {}
alpha = 0.8
for ix in range(X.shape[0]):
    try:
        old_vec = w2v[bow[y[ix].argmax()]]
        new_vec = alpha*old_vec + (1-alpha)*a[ix]
        w2v[bow[y[ix].argmax()]] = new_vec
    except:
        w2v[bow[y[ix].argmax()]] = a[ix]
print len(w2v.keys())
print w2v.keys()

1781
[u'limited', u'writings', u'demand', u'desirable', u'four', u'sleep', u'thirst', u'perverted', u'consists', u'hate', u'relationships', u'whose', u'founder', u'under', u'pride', u'worth', u'invariable', u'compassion', u'void', u'every', u'gratification', u'utmost', u'school', u'unconditioned', u'triumph', u'strivings', u'dogmas', u'estimates', u'direct', u'second', u'even', u'established', u'errors', u'organisms', u'children', u'lights', u'above', u'conduct', u'new', u'net', u'ever', u'seeks', u'told', u'deemed', u'simultaneously', u'notions', u'never', u'richest', u'here', u'protection', u'represented', u'path', u'obtained', u'substance', u'rests', u'changed', u'credit', u'superstitious', u'natures', u'punishment', u'classification', u'explained', u'feelings', u'brought', u'moral', u'total', u'charge', u'would', u'negative', u'distributing', u'call', u'calm', u'until', u'holy', u'brings', u'aware', u'dogma', u'hold', u'must', u'me', u'word', u'work', u'my', u'example', u'ascetic',

In [69]:
# Show the vector stored in w2v for a single word
print w2v['tree']

[-0.00688785  0.21504846  0.08065619 -0.15410466 -0.07233584 -0.0559564
  0.1292907  -0.29712903  0.17873082  0.06926979  0.28190422  0.49738419
 -0.2431061  -0.4593814   0.02840119 -0.18153399 -0.62248743 -0.34955448
 -0.19996282 -0.2468074   0.10581502  0.12925176 -0.24986763 -0.37405765
  0.46253192  0.16779752  0.42856669  0.10447274 -0.11720264 -0.2649653
  0.50423688  0.14361604 -0.02982341  0.29967031  0.16057612  0.09657104
 -0.23618641 -0.29968101 -0.03715049 -0.13997726  0.21051803  0.20425323
 -0.04957343  0.12208879  0.08335635 -0.1920609   0.03536489  0.04371502
 -0.14213647 -0.05467362 -0.09199621 -0.25094211  0.03898845 -0.26410046
  0.32695845  0.07644249 -0.04239016  0.17274012  0.09326854  0.05499697
  0.1270173  -0.439928    0.00730894 -0.03198957  0.19082475 -0.21577984
  0.01973508 -0.11397059 -0.3503862  -0.00740688 -0.13036299  0.06425832
 -0.16137736  0.00884709 -0.29775107 -0.17380336 -0.29464948 -0.23670667
 -0.17037193  0.11723574 -0.30239826 -0.19256113 -0.1

In [93]:
def similarity(a1, a2):
    """Cosine similarity between two vectors.

    Parameters
    ----------
    a1, a2 : array-like
        1-D vectors of equal length (lists are accepted and converted).

    Returns
    -------
    float
        dot(a1, a2) / (|a1| * |a2|). If either vector has zero norm the
        cosine is undefined; 0.0 is returned instead of dividing by zero.
    """
    a1 = np.asarray(a1)
    a2 = np.asarray(a2)
    norm = np.sqrt((a1**2).sum()*(a2**2).sum())
    if norm == 0:
        # At least one vector is all zeros -- avoid a nan / division warning.
        return 0.0
    return np.dot(a1, a2)/norm

v1 = w2v['man']
v2 = w2v['men']
print similarity(v1, v2)

v1 = w2v['man']
v2 = w2v['tree']
print similarity(v1, v2)

v1 = w2v['it']
v2 = w2v['a']
print similarity(v1, v2)

v1 = w2v['it']
v2 = w2v['greek']
print similarity(v1, v2)

v1 = w2v['religion']
v2 = w2v['god']
print similarity(v1, v2)

v1 = w2v['religion']
v2 = w2v['faith']
print similarity(v1, v2)

0.56733
0.145347
0.68698
0.0471217
0.517246
0.321595
