In [1]:
from cbow import CBOW
import numpy as np

In [2]:
def cos(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [3]:
with open("data.txt", "r") as f:
    text = f.read()
text = np.array(text.split(' '))
text = text[np.where(text != '')]

In [10]:
window = 2
n = 100
lr = 1e-4
epochs = 50

cbow = CBOW(text=text, window=window, n=n, learning_rate=lr)

for epoch in range(epochs):
    cbow.fit(epoch)

Epoch 0 :: Loss 0.004113562090457002
Epoch 1 :: Loss 0.004116589946487178
Epoch 2 :: Loss 0.0041219786953716146
Epoch 3 :: Loss 0.004142251172659111
Epoch 4 :: Loss 0.004194229457356439
Epoch 5 :: Loss 0.004292346077262836
Epoch 6 :: Loss 0.004422282537824453
Epoch 7 :: Loss 0.004558905611478135
Epoch 8 :: Loss 0.004704529366826366
Epoch 9 :: Loss 0.004866786743311899
Epoch 10 :: Loss 0.005045614704954271
Epoch 11 :: Loss 0.00523452901708704
Epoch 12 :: Loss 0.0054192181073356525
Epoch 13 :: Loss 0.005592103239059441
Epoch 14 :: Loss 0.005738544243988657
Epoch 15 :: Loss 0.005856978722920833
Epoch 16 :: Loss 0.005957515437786534
Epoch 17 :: Loss 0.006039497139267918
Epoch 18 :: Loss 0.006099870123480497
Epoch 19 :: Loss 0.006147543001444731
Epoch 20 :: Loss 0.006185421282753168
Epoch 21 :: Loss 0.006216417983760533
Epoch 22 :: Loss 0.006241796629561732
Epoch 23 :: Loss 0.0062603918211285945
Epoch 24 :: Loss 0.006273561236578151
Epoch 25 :: Loss 0.00628352023798211
Epoch 26 :: Loss 0.00

<h3>Closest words</h3>

In [11]:
def word2vec(word):
    index = cbow.word2index[word]
    return (cbow.V[:, index] + cbow.U[index, :]) / 2

In [12]:
vecs = []
for word in cbow.vocab:
    vecs.append(word2vec(word))
vecs = np.array(vecs)

In [13]:
def get_dists(word):
    v = word2vec(word)
    dists = np.array([cos(v, w) for w in vecs])
    return dists

In [14]:
def get_top_5(word):
    dists = get_dists(word)
    sorted_dists = np.flip(np.argsort(dists))
    return cbow.vocab[sorted_dists][:6], dists[sorted_dists][:6]

In [15]:
print(get_top_5("stock"))
print(get_top_5("mortgage"))
print(get_top_5("dollar"))

(array(['stock', 'value', 'out', 'mr', 'is', 'year'], dtype='<U22'), array([1., 1., 1., 1., 1., 1.]))
(array(['mortgage', 'pernod', 'down', '5m', 'share', 'rosneft'],
      dtype='<U22'), array([1., 1., 1., 1., 1., 1.]))
(array(['dollar', 'market', 'firm', 'has', 'insurer', 'financial'],
      dtype='<U22'), array([1., 1., 1., 1., 1., 1.]))


In [16]:
v_fall = word2vec("fall")
ind_fall = cbow.word2index["fall"]
v_rise = word2vec("rise")
ind_rise = cbow.word2index["rise"]
v_low = word2vec("low")
ind_low = cbow.word2index["low"]

In [17]:
add_obj_val = np.array([cos(w, v_rise - v_fall + v_low) for w in vecs])
mul_obj_val = np.array([cos(w, v_rise) * cos(w, v_low) / (cos(w, v_fall) + 1e-8) for w in vecs])

In [18]:
d = np.flip(np.argsort(add_obj_val))
print([(cbow.vocab[d][i], add_obj_val[d][i]) for i in range(10)])

[('from', 0.9999999997810832), ('move', 0.9999999996492474), ('low', 0.9999999996476001), ('year', 0.9999999996075158), ('is', 0.9999999996072111), ('fund', 0.9999999995739877), ('even', 0.9999999995434932), ('however', 0.9999999995006482), ('decline', 0.9999999994904881), ('which', 0.9999999994879907)]


In [19]:
d = np.flip(np.argsort(mul_obj_val))
print([(cbow.vocab[d][i], mul_obj_val[d][i]) for i in range(10)])

[('by', 0.9999999978327363), ('and', 0.9999999976982104), ('to', 0.9999999976908518), ('at', 0.9999999976427397), ('yuan', 0.9999999976222278), ('case', 0.9999999976196763), ('prices', 0.9999999976135749), ('may', 0.9999999975352913), ('past', 0.9999999974999253), ('economy', 0.9999999974251517)]
