In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from torch import nn
from torch.nn import init
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

import string

from collections import Counter
import collections
import random

import torch.nn.functional as F



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
data = pd.read_csv('./data/train.csv')
data.head(10)

Unnamed: 0,id,sentence,label
0,fde9e435-8186-4cb3-8ec1-1be67ddb5f96,"""Трудно е класически оркестър и рок банда да с...",OBJ
1,bb522430-40f0-4781-9910-92a1aefd013b,"Следователно, Москва е пазителка на православн...",OBJ
2,d6a84f01-9153-4f3b-bca6-ed2b2edc6a9e,От Washington Post са изготвили подробен матер...,OBJ
3,3623488a-c528-4509-a92d-9ad4b49099ec,И пак така относно заслугите за постигнатото о...,OBJ
4,587b0e27-6ac8-433f-9b99-adf8d9c7c0a2,Понякога удобството да разтвориш набързо стран...,OBJ
5,f2d2aab0-25af-4bef-b678-badd5b390e8d,"Вчера Барак Обама, отиващият си тъжен стопанин...",SUBJ
6,f94840f8-7fb6-42e3-aa84-0a250d58af5b,Дали защото Първият черен президент на САЩ си ...,SUBJ
7,b30bd670-07fd-40ac-82d7-c67988f57cc3,"И като доказателство за това, гръмна и следващ...",SUBJ
8,512b57e5-9b65-43a8-8dec-01bef61a3ad6,"Последният път, когато Америка се обърна навът...",SUBJ
9,d6bad64f-59f2-491e-9d99-103d2748d647,"Шок, бомба, ужас!",SUBJ


In [4]:
word_data = []
for sent in data['sentence'].values:
  words = word_tokenize(sent)
  words = [word for word in words if word not in string.punctuation]
  word_data.extend(words)
word_data[:15]

['``',
 'Трудно',
 'е',
 'класически',
 'оркестър',
 'и',
 'рок',
 'банда',
 'да',
 'свирят',
 'заедно',
 'имаме',
 'физически',
 'проблеми',
 'защото']

In [5]:
cnt = Counter(word_data)
cnt

Counter({'на': 570,
         'и': 473,
         'да': 348,
         'в': 283,
         'се': 272,
         'е': 264,
         'от': 257,
         'за': 213,
         '``': 178,
         'че': 171,
         "''": 170,
         'не': 151,
         'с': 132,
         'са': 111,
         'ще': 110,
         'си': 102,
         'това': 91,
         'като': 86,
         'по': 74,
         '–': 66,
         'които': 56,
         'а': 51,
         'има': 46,
         'но': 43,
         'много': 42,
         'му': 38,
         'след': 37,
         'хора': 37,
         'което': 36,
         'до': 35,
         'години': 34,
         'който': 32,
         'която': 32,
         'още': 32,
         'В': 32,
         'го': 31,
         'И': 31,
         'през': 31,
         'ни': 30,
         'един': 30,
         'може': 29,
         'или': 28,
         'към': 28,
         'бъде': 27,
         'г.': 26,
         'България': 25,
         'тези': 25,
         'им': 24,
         'време': 24,
         'В

In [6]:
vocabulary_size = 15000

def build_dataset(words, n_words):
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(n_words - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0

  unk_idx = dictionary['UNK']
  for word in words:
    index = dictionary.get(word, unk_idx)
    if index == unk_idx:
      unk_count += 1
    data.append(index)
  count[0][1] = unk_count
  reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data, count, dictionary, reversed_dictionary


In [7]:
data, count, dictionary, reverse_dictionary = build_dataset(word_data,
                                                            vocabulary_size)

In [8]:
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size, 1), dtype=np.longlong)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.longlong)
  span = 2 * skip_window + 1  # [ | skip_window | target | skip_window | ]
  buffer = collections.deque(maxlen=span)
  if data_index + span > len(data):
    data_index = 0
  buffer.extend(data[data_index:data_index + span])
  data_index += span
  for i in range(batch_size // num_skips):
    context_words = [w for w in range(span) if w != skip_window]
    words_to_use = random.sample(context_words, num_skips)
    for j, context_word in enumerate(words_to_use):
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[context_word]
    if data_index == len(data):
      buffer.extend(data[:span])
      data_index = span
    else:
      buffer.append(data[data_index])
      data_index += 1
  # Backtrack a little bit to avoid skipping words in the end of a batch
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels

batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
  print(batch[i, 0], reverse_dictionary[batch[i, 0]],
        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

1327 Трудно -> 6 е
1327 Трудно -> 9 ``
6 е -> 1327 Трудно
6 е -> 1328 класически
1328 класически -> 1329 оркестър
1328 класически -> 6 е
1329 оркестър -> 2 и
1329 оркестър -> 1328 класически


In [None]:
class SkipGram(nn.Module):
  def __init__(self, vocabulary_size, emb_size):
    super(SkipGram, self).__init__()
    self.vocab_size = vocabulary_size
    self.emb_size = emb_size

    '''
    Define and initialize embeddings. For which layers? What will be the initial values?
    '''

  def forward(self, target, neighbor, negative_sample):
    emb_target = self.target_emb(target)
    emb_neighbor = self.neighbor_emb(neighbor)
    emb_negative = self.neighbor_emb(negative_sample)

    score = torch.sum(torch.mul(emb_target, emb_neighbor))
    score = torch.clamp(score,max=10, min=10)
    score = -F.logsigmoid(score)

    neg_score = torch.bmm(emb_negative, emb_target.permute(0,2,1))
    neg_score = torch.clamp(neg_score,max=10, min=10)
    neg_score = torch.sum(-F.logsigmoid(-neg_score),dim=1)

    return torch.mean(score + neg_score)

In [10]:
embedding_size=100
model = SkipGram(vocabulary_size, embedding_size)
optimizer = torch.optim.SGD(model.parameters(), lr=1.)

In [11]:
batch_size = 256
embedding_size = 100  # Dimension of the embedding vector.
skip_window = 10       # How many words to consider left and right.
num_skips = 4         # How many times to reuse an input to generate a label.
num_sampled = 128      # Number of negative examples to sample.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. These 3 variables are used only for
# displaying model accuracy, they don't affect calculation.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, (valid_size,1), replace=False)

In [None]:
num_steps = 20001

def sample_negative(batch_size, vocabulary_size, negative_size):
  return np.random.randint(0, vocabulary_size, (batch_size, negative_size))

model.train()

average_loss = 0
for step in tqdm(range(1, num_steps)):
  optimizer.zero_grad()

  pos_u, pos_v = generate_batch(batch_size, num_skips=num_skips, skip_window=skip_window)
  pos_u = torch.from_numpy(pos_u)
  pos_v = torch.from_numpy(pos_v)
  neg_v = sample_negative(batch_size, vocabulary_size, num_sampled)
  neg_v = torch.LongTensor(neg_v)

  loss = model(pos_u, pos_v, neg_v)
  average_loss += loss.detach().cpu().item()

  loss.backward()
  optimizer.step()

  if step % 2000 == 0:
    if step > 0:
      average_loss /= 2000
    # The average loss is an estimate of the loss over the last 2000 batches.
    print('Average loss at step ', step, ': ', average_loss)
    average_loss = 0

 10%|█         | 2013/20000 [00:25<03:50, 77.89it/s]

Average loss at step  2000 :  1280.005859375


 20%|██        | 4013/20000 [00:54<03:53, 68.49it/s]

Average loss at step  4000 :  1280.005859375


 30%|███       | 6006/20000 [01:23<03:51, 60.45it/s]

Average loss at step  6000 :  1280.005859375


 40%|████      | 8006/20000 [01:55<03:09, 63.27it/s]

Average loss at step  8000 :  1280.005859375


 50%|████▉     | 9999/20000 [02:26<02:26, 68.17it/s]


Average loss at step  10000 :  1280.005859375


AttributeError: 'SkipGram' object has no attribute 'get_nearest'