In [2]:
from model_utils import sample_negative, get_batches
from word2vecs import Word2Vec_negative, Word2Vec, EarlyStopping
import pandas as pd
import itertools

In [3]:
df = pd.read_csv("dataset copy.csv", sep=";")

In [4]:
corpus = []

for phrase in df["phrase"]:
    corpus.append([word for word in phrase.split() if word != ' '])

In [5]:
from collections import Counter
import random, math

In [6]:
def subsample_frequent_words(corpus):
  filtered_corpus = []
  word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
  total = sum(list(word_counts.values()))
  # proportion of each word in corpus
  word_counts = {word: word_counts[word]/float(total) for word in word_counts}

  for text in corpus:
    filtered_corpus.append([])
    for word in text:
      if random.random() < (1+math.sqrt(word_counts[word]*1e3) * 1e-3 / word_counts[word]):
        filtered_corpus[-1].append(word)
  return filtered_corpus

In [7]:
corpus = subsample_frequent_words(corpus)
vocabulary = set(itertools.chain.from_iterable(corpus))

word_to_index = {word: index for (index, word) in enumerate(vocabulary)}
index_to_word = {index: word for (index, word) in enumerate(vocabulary)}

In [8]:
context = []
window = 4

In [9]:
for text in corpus:
  for i, word in enumerate(text):
    start = max(0, i-window)
    end = min(i+window, len(text))
    for j in range(start, end):
      if i != j:
        context.append((word,text[j]))

print(f"Tem-se um total de {len(context)} pares de palavras alvo e palavras contextuais")

Tem-se um total de 2018 pares de palavras alvo e palavras contextuais


In [10]:
import torch.optim as optim
import torch.nn as nn
import torch
import torch.autograd as autograd
import numpy as np

word2vec sem negativo

In [11]:
vocab_size = len(vocabulary)

w2v = Word2Vec(embedding_size= 200, vocab_size = vocab_size)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(w2v.parameters())
early_stopping = EarlyStopping()
context_tensor = []

In [12]:
for target, meaning in context:
  target_tensor = autograd.Variable(torch.LongTensor([word_to_index[target]]))
  meaning_tensor = autograd.Variable(torch.LongTensor([word_to_index[meaning]]))
  context_tensor.append((target_tensor, meaning_tensor))

In [13]:
while True:
  losses = []
  context_batches = get_batches(word_to_index, context= context, batch_size=1000)
  for i in range(len(context_batches)):
    w2v.zero_grad()
    target_tensor, meaning_tensor = context_batches[i]
    loss = w2v(target_tensor, meaning_tensor)
    loss.backward()
    optimizer.step()
    losses.append(loss.data)
  print("loss: ", np.mean(losses))
  early_stopping.update_loss(np.mean(losses))
  if early_stopping.stop_training():
    break

loss:  5.477333
loss:  3.180909
Ganho de custo: 489.66
loss:  1.5443022
Ganho de custo: 519.54
loss:  0.6335019
Ganho de custo: 536.17
loss:  0.22166914
Ganho de custo: 543.69
loss:  0.06377351
Ganho de custo: 316.09
loss:  0.014253848
Ganho de custo: 153.51
loss:  0.004004623
Ganho de custo: 62.72
loss:  0.0005276215
Ganho de custo: 21.93
loss:  0.00021491354
Ganho de custo: 6.04
loss:  8.510561e-05
Ganho de custo: 0.83


Word2vec com negativo

In [14]:
vocab_size = len(vocabulary)

w2v = Word2Vec_negative(embedding_size= 200, vocab_size = vocab_size)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(w2v.parameters())
early_stopping = EarlyStopping()
context_tensor = []

In [15]:
context = []
window = 4
negative_example = sample_negative(corpus, 1)

In [16]:
for text in corpus:
  for i, word in enumerate(text):
    
    start = max(0, i-window)
    end = min(i+window, len(text))
    print(text)
    for j in range(start, end):
      if i != j:
        aux = (word,text[j], next(negative_example)[0])
        print(aux[2], " --> ", aux[1])
        if aux[1] == aux[2]:
          print('cheguei aq')
        context.append(aux)

print(f"Tem-se um total de {len(context)} pares de palavras alvo e palavras contextuais")

['im', 'feeling', 'rather', 'rotten', 'so', 'im', 'not', 'very', 'ambitious', 'right', 'now']
wanna  -->  feeling
katniss  -->  rather
the  -->  rotten
['im', 'feeling', 'rather', 'rotten', 'so', 'im', 'not', 'very', 'ambitious', 'right', 'now']
despite  -->  im
today  -->  rather
baba  -->  rotten
i  -->  so
['im', 'feeling', 'rather', 'rotten', 'so', 'im', 'not', 'very', 'ambitious', 'right', 'now']
have  -->  im
i  -->  feeling
did  -->  rotten
position  -->  so
but  -->  im
['im', 'feeling', 'rather', 'rotten', 'so', 'im', 'not', 'very', 'ambitious', 'right', 'now']
separate  -->  im
of  -->  feeling
walrus  -->  rather
purples  -->  so
vain  -->  im
what  -->  not
['im', 'feeling', 'rather', 'rotten', 'so', 'im', 'not', 'very', 'ambitious', 'right', 'now']
rich  -->  im
updating  -->  feeling
find  -->  rather
probably  -->  rotten
in  -->  im
again  -->  not
to  -->  very
['im', 'feeling', 'rather', 'rotten', 'so', 'im', 'not', 'very', 'ambitious', 'right', 'now']
have  -->  feel

In [17]:
while True:
  losses = []
  context_batches = get_batches(word_to_index, negative= True, context= context, batch_size=1000)
  for i in range(len(context_batches)):
    w2v.zero_grad()
    target_tensor, meaning_tensor, negative_tensor = context_batches[i]
    loss = w2v(target_tensor, meaning_tensor, negative_tensor)
    loss.backward()
    optimizer.step()
    losses.append(loss.data)
  print("loss: ", np.mean(losses))
  early_stopping.update_loss(np.mean(losses))
  if early_stopping.stop_training():
    break

loss:  7592.9995
loss:  7446.691
Ganho de custo: 759201.86
loss:  7322.768
Ganho de custo: 759203.52
loss:  7204.01
Ganho de custo: 759205.08
loss:  7088.2026
Ganho de custo: 759206.59
loss:  6975.56
Ganho de custo: 744575.44
loss:  6863.3784
Ganho de custo: 732183.06
loss:  6753.031
Ganho de custo: 720307.23
loss:  6644.643
Ganho de custo: 708726.51
loss:  6536.9546
Ganho de custo: 697462.3
loss:  6430.7188
Ganho de custo: 686244.14
loss:  6325.891
Ganho de custo: 675209.42
loss:  6221.144
Ganho de custo: 664370.7
loss:  6117.6714
Ganho de custo: 653601.86
loss:  6016.539
Ganho de custo: 642978.32
loss:  5915.2114
Ganho de custo: 632495.61
loss:  5815.724
Ganho de custo: 622020.9
loss:  5716.481
Ganho de custo: 611673.68
loss:  5618.35
Ganho de custo: 601560.55
loss:  5521.9995
Ganho de custo: 591427.78
loss:  5425.992
Ganho de custo: 581479.1
loss:  5331.287
Ganho de custo: 571554.83
loss:  5237.875
Ganho de custo: 561741.8
loss:  5145.0557
Ganho de custo: 552106.79
loss:  5053.3574


KeyboardInterrupt: 

não converge com o negativo