In [1]:
import pandas as pd
import numpy as np

import torch
import tensorflow as tf

from sklearn.feature_extraction.text import TfidfVectorizer
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
csv = pd.read_csv('./data/train.csv')
csv.head(10)

Unnamed: 0,id,sentence,label
0,fde9e435-8186-4cb3-8ec1-1be67ddb5f96,"""Трудно е класически оркестър и рок банда да с...",OBJ
1,bb522430-40f0-4781-9910-92a1aefd013b,"Следователно, Москва е пазителка на православн...",OBJ
2,d6a84f01-9153-4f3b-bca6-ed2b2edc6a9e,От Washington Post са изготвили подробен матер...,OBJ
3,3623488a-c528-4509-a92d-9ad4b49099ec,И пак така относно заслугите за постигнатото о...,OBJ
4,587b0e27-6ac8-433f-9b99-adf8d9c7c0a2,Понякога удобството да разтвориш набързо стран...,OBJ
5,f2d2aab0-25af-4bef-b678-badd5b390e8d,"Вчера Барак Обама, отиващият си тъжен стопанин...",SUBJ
6,f94840f8-7fb6-42e3-aa84-0a250d58af5b,Дали защото Първият черен президент на САЩ си ...,SUBJ
7,b30bd670-07fd-40ac-82d7-c67988f57cc3,"И като доказателство за това, гръмна и следващ...",SUBJ
8,512b57e5-9b65-43a8-8dec-01bef61a3ad6,"Последният път, когато Америка се обърна навът...",SUBJ
9,d6bad64f-59f2-491e-9d99-103d2748d647,"Шок, бомба, ужас!",SUBJ


In [3]:
MAX_FEATURES = 20000
MAX_LEN = 30  # cut texts after this number of words (among top max_features most common words)
BATCH_SIZE = 32
LSTM_UNITS = 128
EMBEDDING_SIZE = 300
DISPLAY_STEP = 1

In [4]:
csv['label'] = [0 if lb == 'OBJ' else 1 for lb in csv['label'] ]
csv['label'] = csv['label'].astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(csv['sentence'].values, csv['label'].values, test_size=0.33, random_state=42)

In [6]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer

In [7]:
tokenizer = Tokenizer(num_words=MAX_LEN)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_train = sequence.pad_sequences(X_train, maxlen=MAX_LEN)
X_test = tokenizer.texts_to_sequences(X_test)
X_test = sequence.pad_sequences(X_test, maxlen=MAX_LEN)

In [8]:
class LstmDataset(Dataset):
  def __init__(self, x, y):
        self.x = x
        self.y = y
  def __len__(self):
        return len(self.x)

  def __getitem__(self, idx):
        return (np.array(self.x[idx], dtype=np.longlong),
                np.array(self.y[idx], dtype=np.float32))
train_data = LstmDataset(X_train, y_train)
test_data = LstmDataset(X_test, y_test)

In [12]:
class LSTM(nn.Module):

  def __init__(self, embedding_size, hidden_size, words_count, dropout_rate=0.1):
    super(LSTM, self).__init__()
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size


    self.word_embedding = nn.Embedding(words_count, self.embedding_size, padding_idx=0)
    self.lstm = nn.LSTM(self.embedding_size,
                                  self.hidden_size)
    self.proj = nn.Linear(self.hidden_size, 1)
    self.dropout= nn.Dropout(dropout_rate)

  def forward(self, x, lengths):
    embeddings = self.word_embedding(x)
    padded_input = nn.utils.rnn.pack_padded_sequence(embeddings, lengths, batch_first=True)

    seq_output, (h_n, c_n) = self.lstm(padded_input)
    seq_output, _ = nn.utils.rnn.pad_packed_sequence(seq_output, batch_first=True)
    out = seq_output.sum(dim=1).div(lengths.float().unsqueeze(dim=1))
    seq_output = self.dropout(out)
    logits = self.proj(seq_output)

    return logits

model = LSTM(EMBEDDING_SIZE, LSTM_UNITS, MAX_FEATURES)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.BCEWithLogitsLoss()

In [13]:
def collate_fn(batch):
  # We want to sort the batch by seq length,
  # in order to make the computation more efficient
  batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)

  inputs = [torch.LongTensor(x[0]) for x in batch]
  padded_input = nn.utils.rnn.pad_sequence(inputs, batch_first=True)
  lengths = torch.LongTensor([len(x[0]) for x in batch])
  y = torch.FloatTensor(np.array([x[1] for x in batch])).reshape(-1, 1)

  return padded_input, lengths, y

In [14]:
def train(model, optimizer,loss_fn, dataset, epochs=10):
  cost = []
  for epoch in range(1, epochs+1):
    model.train()
    dataloader = DataLoader(dataset,batch_size=8,shuffle=True,drop_last=False, collate_fn=collate_fn)
    for x, l, y in tqdm(dataloader):
      optimizer.zero_grad()
      y_hat = model(x, l)
      loss = loss_fn(y_hat, y)
      loss.backward()
      optimizer.step()
      cost.append(loss.item())
    if (epoch) % DISPLAY_STEP == 0:
      print("Epoch: {:04d} mean cost={:.9f}".format(epoch, np.mean(cost)))
train(model, optimizer, loss_fn, train_data,2)

100%|██████████| 67/67 [00:02<00:00, 26.99it/s]


Epoch: 0001 mean cost=0.687860724


100%|██████████| 67/67 [00:02<00:00, 24.37it/s]

Epoch: 0002 mean cost=0.678192906



