In [1]:
%pylab inline
import torch

Populating the interactive namespace from numpy and matplotlib


In [2]:
figsize(10, 8)
plt.style.use(['dark_background'])

In [3]:
import pickle

corpus_text = pickle.load(
    open("corpus_text.pickle", "rb")
)

In [4]:
word2id, id2word = {}, {}

uniques = set()
for words in corpus_text:
    for w in words:
        uniques.add(w)

count_id = 0
for word in uniques:
    word2id[word] = count_id
    id2word[count_id] = word
    count_id += 1

In [5]:
id2word[23]

'resultou'

In [6]:
len(uniques), len(corpus_text)

(618, 61)

In [7]:
window = 2
pair_ids = []

text_size = len(corpus_text)

corpus_text = np.array(corpus_text)
mask = np.array([i for i in range(-window, window+1) if i is not 0])

for paragraph in corpus_text:
    paragraph = np.array(paragraph)
    text_size = len(paragraph)

    for center_word in range(window, text_size-window):
        center_word_id = word2id[paragraph[center_word]]
        context_words = [word2id[i] for i in paragraph[mask + center_word]]

        pair_ids.append([context_words, center_word_id])

In [8]:
from IPython.display import Markdown

In [9]:
ex_out = "|contexto | central | contexto | central |\n|--|--|--|--|\n"
for i in range(95, 106):
    w0, w1 = pair_ids[i]
    ex_out += f"|{w0} | {w1} | {[id2word[w] for w in w0]} | {id2word[w1]}|\n"

Markdown(ex_out)

|contexto | central | contexto | central |
|--|--|--|--|
|[525, 134, 175, 608] | 18 | ['sobre', 'base', 'morfogênese', 'previu'] | química|
|[134, 18, 608, 282] | 175 | ['base', 'química', 'previu', 'reações'] | morfogênese|
|[18, 175, 282, 441] | 608 | ['química', 'morfogênese', 'reações', 'químicas'] | previu|
|[175, 608, 441, 36] | 282 | ['morfogênese', 'previu', 'químicas', 'oscilantes'] | reações|
|[608, 282, 36, 556] | 441 | ['previu', 'reações', 'oscilantes', 'reação'] | químicas|
|[282, 441, 556, 501] | 36 | ['reações', 'químicas', 'reação', 'belousov'] | oscilantes|
|[441, 36, 501, 148] | 556 | ['químicas', 'oscilantes', 'belousov', 'zhabotinsky'] | reação|
|[36, 556, 148, 531] | 501 | ['oscilantes', 'reação', 'zhabotinsky', 'observadas'] | belousov|
|[556, 501, 531, 78] | 148 | ['reação', 'belousov', 'observadas', 'primeira'] | zhabotinsky|
|[501, 148, 78, 54] | 531 | ['belousov', 'zhabotinsky', 'primeira', 'vez'] | observadas|
|[148, 531, 54, 237] | 78 | ['zhabotinsky', 'observadas', 'vez', 'década'] | primeira|


In [10]:
class CBOW(torch.nn.Module):
    def __init__(self, vocab_size, emb_size, context_size):
        super(CBOW, self).__init__()
        
        self.embeddings = torch.nn.Embedding(vocab_size, emb_size)
        
        self.linear0 =  torch.nn.Linear(2*emb_size*context_size, 512)
        self.linear1 = torch.nn.Linear(512, vocab_size)

        self.log_softmax = torch.nn.LogSoftmax(dim=1)
        
    def forward(self, x):
        out = self.embeddings(x).view(1, -1)

        out = self.linear0(out)
        out = self.linear1(out)
        
        out = self.log_softmax(out)
        return out
    
    def get_word_emb(self, word_id):
        word = torch.LongTensor([word_id])
        return self.embeddings(word).view(1, -1)

In [11]:
cbow = CBOW(len(uniques), 10, 2)

In [12]:
teste_data = torch.LongTensor(pair_ids[100][0])
target_data = torch.LongTensor(pair_ids[100][1])
res = cbow(teste_data).argmax().item()
id2word[res]

'acidental'

In [None]:
nll_loss = torch.nn.NLLLoss()
optimizer = torch.optim.SGD(cbow.parameters(), lr=0.001)

indexes = np.arange(len(pair_ids))
losses = []
for epoch in range(101):
    np.random.shuffle(indexes)
    for index in indexes:
        context, target = pair_ids[index]
        optimizer.zero_grad()
        cbow.zero_grad()
        
        X = torch.LongTensor(context)
        Y = torch.LongTensor([target])
        out_prob = cbow(X)
        loss = nll_loss(out_prob, Y)
        loss.backward()
        optimizer.step()
        
        losses.append(loss.item())
    if epoch %20 == 0:
        print(f"{epoch:<2} - {loss.item():.3f}")

0  - 6.144
20 - 3.571
40 - 0.209
60 - 0.198


In [None]:
fig = figure()
ax = fig.gca()
ax.plot(losses)
savefig("../images/word2vec-cbow-loss.png", dpi="figure", transparent=True)

In [None]:
_pos = 28
teste_data = torch.LongTensor(pair_ids[_pos][0])
target_data = torch.LongTensor(pair_ids[_pos][1])
res = cbow(teste_data).argmax().item()
print(f"data:{[id2word[i] for i in pair_ids[_pos][0]]} | target: {id2word[pair_ids[_pos][1]]}")
print(f"{id2word[res]}")


In [None]:
cbow.get_word_emb(_pos).detach().numpy()

In [None]:
from sklearn.decomposition import PCA
import pandas as pd

In [None]:
dataf = pd.DataFrame({
    "words":list(uniques),
    "id": [word2id[i] for i in uniques],
    "emb": [cbow.get_word_emb(word2id[i]).detach().numpy()[0] for i in uniques],
    "x" : [0]*len(uniques),
    "y": [0]*len(uniques)
})

In [None]:
dataf.head()

In [None]:
pca = PCA(n_components=2)
a = np.array([cbow.get_word_emb(word2id[i]).detach().numpy()[0] for i in uniques],)
XY = pca.fit_transform(a)

In [None]:
dataf["x"] = XY[:, 0]
dataf["y"] = XY[:, 1]
dataf.head()

In [None]:
import altair as alt

In [None]:
alt.themes.enable('opaque')
base = alt.Chart(dataf).properties(
        width=500,
        height=500
    )

base.mark_circle(size=180).encode(x="x", y="y", tooltip="words")

In [None]:
from knn_eucl_cos import knn

In [None]:
w = word2id["turing"]

matrix = np.array(dataf.values[:, 3:5], dtype=np.float64)
result_pos, result_values = knn(matrix, pos=w)

for i, j in zip(result_pos, result_values):
    print(id2word[i], f"{j:.3f}")