Implementation of word2vec model based on https://rguigoures.github.io/word2vec_pytorch/ tutorial

In [1]:
import itertools
import nltk
# nltk.download('brown')
# from nltk.corpus import brown
import re
from numpy.random import multinomial
import pandas as pd

In [11]:
# corpus = []

# # faz-se a tokenização do dataset de noticias:
# for cat in ["news"]:
#   for text_id in brown.fileids(cat):
#         raw_text = list(itertools.chain.from_iterable(brown.sents(text_id)))
#         text = ' '.join(raw_text)
#         text = text.lower()
#         text.replace('\n', ' ')
#         text = re.sub('[^a-z ]+', '', text)
#         corpus.append([w for w in text.split() if w != ''])

In [2]:
df = pd.read_csv("dataset copy.csv", sep=";")
df

Unnamed: 0,phrase,emotion
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness
5,i cant walk into a shop anywhere where i do no...,fear
6,i felt anger when at the end of a telephone call,anger
7,i explain why i clung to a relationship with a...,joy
8,i like to have the same breathless feeling as ...,joy
9,i jest i feel grumpy tired and pre menstrual w...,anger


In [3]:
corpus = []

for phrase in df["phrase"]:
    corpus.append([word for word in phrase.split() if word != ' '])

In [4]:
corpus

[['im',
  'feeling',
  'rather',
  'rotten',
  'so',
  'im',
  'not',
  'very',
  'ambitious',
  'right',
  'now'],
 ['im', 'updating', 'my', 'blog', 'because', 'i', 'feel', 'shitty'],
 ['i',
  'never',
  'make',
  'her',
  'separate',
  'from',
  'me',
  'because',
  'i',
  'don',
  't',
  'ever',
  'want',
  'her',
  'to',
  'feel',
  'like',
  'i',
  'm',
  'ashamed',
  'with',
  'her'],
 ['i',
  'left',
  'with',
  'my',
  'bouquet',
  'of',
  'red',
  'and',
  'yellow',
  'tulips',
  'under',
  'my',
  'arm',
  'feeling',
  'slightly',
  'more',
  'optimistic',
  'than',
  'when',
  'i',
  'arrived'],
 ['i',
  'was',
  'feeling',
  'a',
  'little',
  'vain',
  'when',
  'i',
  'did',
  'this',
  'one'],
 ['i',
  'cant',
  'walk',
  'into',
  'a',
  'shop',
  'anywhere',
  'where',
  'i',
  'do',
  'not',
  'feel',
  'uncomfortable'],
 ['i',
  'felt',
  'anger',
  'when',
  'at',
  'the',
  'end',
  'of',
  'a',
  'telephone',
  'call'],
 ['i',
  'explain',
  'why',
  'i',
  'clung

In [5]:
from collections import Counter
import random, math

Agora, vamos calcular as ocorrências das palavras no corpus e dps calcular a probabilidade de manter a palavra no corpus, que é definida por:
$$P(w_i) = \frac{10^{-3}}{p_i}(\sqrt{10^3p_i} + 1)$$

In [6]:
def subsample_frequent_words(corpus):
  filtered_corpus = []
  word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
  total = sum(list(word_counts.values()))
  # proportion of each word in corpus
  word_counts = {word: word_counts[word]/float(total) for word in word_counts}

  for text in corpus:
    filtered_corpus.append([])
    for word in text:
      if random.random() < (1+math.sqrt(word_counts[word]*1e3) * 1e-3 / word_counts[word]):
        filtered_corpus[-1].append(word)
  return filtered_corpus

In [7]:
corpus = subsample_frequent_words(corpus)
vocabulary = set(itertools.chain.from_iterable(corpus))

word_to_index = {word: index for (index, word) in enumerate(vocabulary)}
index_to_word = {index: word for (index, word) in enumerate(vocabulary)}

Agora construindo o Bag of Words

In [8]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random

In [9]:
def get_batches(context, batch_size = 100):
  random.shuffle(context)
  batches = []
  batch_target, batch_meaning= [], []
  for i in range(len(context)):

    batch_target.append(word_to_index[context[i][0]])
    batch_meaning.append(word_to_index[context[i][1]])

    if (i+1) % batch_size or i == len(context) - 1:
      tensor_target = autograd.Variable(torch.from_numpy(np.array(batch_target)).long())
      tensor_meaning = autograd.Variable(torch.from_numpy(np.array(batch_meaning)).long())
      batches.append((tensor_target, tensor_meaning))
      batch_target, batch_meaning = [], []
  return batches


In [10]:
def sample_negative(size):
  probability = {}
  word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
  normalizing_factor = sum([v**0.75 for v in word_counts.values()])

  for word in word_counts:
    probability[word] = word_counts[word]**0.75 / normalizing_factor

  words = np.array(list(word_counts.keys()))

  while True:
    word_list = []
    index = np.array(multinomial(size, list(probability.values())))
    for i, count in enumerate(index):
      for _ in range(count):
        word_list.append(words[i])
    yield word_list

In [11]:
context = []
window = 4
# negative_samples = sample_negative(8)

dar uma olhada no nest_asyncio para paralelização do bloco abaixo e se possível melhorar o resto do programa

In [12]:
for text in corpus:
  for i, word in enumerate(text):
    start = max(0, i-window)
    end = min(i+window, len(text))
    for j in range(start, end):
      if i != j:
        context.append((word,text[j]))

print(f"Tem-se um total de {len(context)} pares de palavras alvo e palavras contextuais")

Tem-se um total de 2018 pares de palavras alvo e palavras contextuais


# Criando de fato o word2vec

In [13]:
class Word2Vec(nn.Module):
  def __init__(self, embedding_size, vocab_size):
    super(Word2Vec, self).__init__()
    self.target_embeddings = nn.Embedding(vocab_size, embedding_size)
    self.context_embeddings = nn.Embedding(vocab_size, embedding_size)

  def forward(self, target_word, context_word):
    emb_target = self.target_embeddings(target_word)
    emb_context = self.context_embeddings(context_word)
    emb_product = torch.mul(emb_target, emb_context)
    emb_product = torch.sum(emb_product, dim=1)
    output = torch.sum(F.logsigmoid(emb_product))
    # emb_negative = self.context_embeddings(negative_example)
    # emb_product = torch.bmm(emb_negative, emb_target.unsqueeze(2))
    # emb_product = torch.sum(emb_product, dim=1)
    # output += torch.sum(F.logsigmoid(-emb_product))
    return -output

Definição de uma função que para o aprendizado assim que a função de custo parar de decrescer significantemente

In [14]:
class EarlyStopping():
  def __init__(self, patience=5, min_percent_gain=1):
    self.patience = patience
    self.loss = []
    self.min_percent_gain = min_percent_gain/100.

  def update_loss(self, loss):
    self.loss.append(loss)
    if len(self.loss) > self.patience:
      del self.loss[0]

  def stop_training(self):
    if len(self.loss) == 1:
      return False

    gain = (max(self.loss) - min(self.loss)/max(self.loss))
    print(f"Ganho de custo: {round(100*gain,2)}")
    return gain < self.min_percent_gain

Etapa de aprendizado

In [15]:
vocab_size = len(vocabulary)

w2v = Word2Vec(embedding_size= 200, vocab_size = vocab_size)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(w2v.parameters())
early_stopping = EarlyStopping()
context_tensor = []

In [16]:
for target, meaning in context:
  target_tensor = autograd.Variable(torch.LongTensor([word_to_index[target]]))
  meaning_tensor = autograd.Variable(torch.LongTensor([word_to_index[meaning]]))
  context_tensor.append((target_tensor, meaning_tensor))

In [17]:
while True:
  losses = []
  context_batches = get_batches(context= context, batch_size=1000)
  for i in range(len(context_batches)):
    w2v.zero_grad()
    target_tensor, meaning_tensor = context_batches[i]
    loss = w2v(target_tensor, meaning_tensor)
    loss.backward()
    optimizer.step()
    losses.append(loss.data)
  print("loss: ", np.mean(losses))
  early_stopping.update_loss(np.mean(losses))
  if early_stopping.stop_training():
    break

loss:  5.37983
loss:  3.1180847
Ganho de custo: 480.02
loss:  1.5449657
Ganho de custo: 509.27
loss:  0.66552913
Ganho de custo: 525.61
loss:  0.25723094
Ganho de custo: 533.2
loss:  0.078618094
Ganho de custo: 309.29
loss:  0.016831018
Ganho de custo: 153.41
loss:  0.0041913274
Ganho de custo: 65.92
loss:  0.0005354312
Ganho de custo: 25.51
loss:  0.00022546829
Ganho de custo: 7.58
loss:  8.80563e-05
Ganho de custo: 1.16
loss:  3.4798308e-05
Ganho de custo: -0.41


testar com um dataset pequeno (20 coisas), com o sample de negativos e sem