In [2]:
import pandas as pd
import numpy as np

import torch

from sklearn.feature_extraction.text import TfidfVectorizer
from torch import nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')

import string
import collections

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [170]:
# Load the training split; the `sentence` and `label` columns are the ones
# consumed by the cells below.
# NOTE(review): the name `csv` would shadow the standard-library module of the
# same name if that module were ever imported in this notebook.
csv = pd.read_csv('./data/train.csv')
csv.head(n=10)

Unnamed: 0,id,sentence,label
0,fde9e435-8186-4cb3-8ec1-1be67ddb5f96,"""Трудно е класически оркестър и рок банда да с...",OBJ
1,bb522430-40f0-4781-9910-92a1aefd013b,"Следователно, Москва е пазителка на православн...",OBJ
2,d6a84f01-9153-4f3b-bca6-ed2b2edc6a9e,От Washington Post са изготвили подробен матер...,OBJ
3,3623488a-c528-4509-a92d-9ad4b49099ec,И пак така относно заслугите за постигнатото о...,OBJ
4,587b0e27-6ac8-433f-9b99-adf8d9c7c0a2,Понякога удобството да разтвориш набързо стран...,OBJ
5,f2d2aab0-25af-4bef-b678-badd5b390e8d,"Вчера Барак Обама, отиващият си тъжен стопанин...",SUBJ
6,f94840f8-7fb6-42e3-aa84-0a250d58af5b,Дали защото Първият черен президент на САЩ си ...,SUBJ
7,b30bd670-07fd-40ac-82d7-c67988f57cc3,"И като доказателство за това, гръмна и следващ...",SUBJ
8,512b57e5-9b65-43a8-8dec-01bef61a3ad6,"Последният път, когато Америка се обърна навът...",SUBJ
9,d6bad64f-59f2-491e-9d99-103d2748d647,"Шок, бомба, ужас!",SUBJ


In [294]:
# Encode the target for BCEWithLogitsLoss: 'OBJ' -> 0.0, any other value -> 1.0.
# The dtype guard keeps this cell idempotent. Without it, re-running the cell
# after the column was already converted finds no 'OBJ' strings and silently
# turns every label into 1.
if not pd.api.types.is_numeric_dtype(csv['label']):
    csv['label'] = csv['label'] != 'OBJ'
csv['label'] = csv['label'].astype(np.float32)

In [295]:
# Flatten the corpus into one list of lower-cased tokens.
# NOTE: `token not in string.punctuation` is a substring test against the
# punctuation string, so single punctuation characters are dropped while
# multi-character tokens such as '``' are kept. The id lookup further down
# applies the same filter; the two must stay in sync or the lookup will
# raise KeyError.
word_data = [
    token.lower()
    for sentence in csv['sentence'].values
    for token in word_tokenize(sentence)
    if token not in string.punctuation
]
word_data[:15]

['``',
 'трудно',
 'е',
 'класически',
 'оркестър',
 'и',
 'рок',
 'банда',
 'да',
 'свирят',
 'заедно',
 'имаме',
 'физически',
 'проблеми',
 'защото']

In [296]:
from collections import Counter

## Vocabulary: most frequent token gets id 1, the next one id 2, and so on.
## Ids start at 1 so that 0 stays free for the padding value used by pad_sequences.
counts = Counter(word_data)
vocab = [token for token, _ in counts.most_common()]
vocab_to_int = {token: idx for idx, token in enumerate(vocab, start=1)}

## Encode every sentence as a list of ids, using the same tokenisation and
## punctuation filter that produced `word_data`.
sent_int = [
    [
        vocab_to_int[token.lower()]
        for token in word_tokenize(sentence)
        if token not in string.punctuation
    ]
    for sentence in csv['sentence'].values
]

In [297]:
# Fixed number of token ids per sentence; passed to pad_sequences as seq_length.
MAX_LEN = 30
def pad_sequences(sent_ints, seq_length):
    '''
        Left-pad with 0's or truncate each sentence to exactly seq_length ids.

        Args:
            sent_ints: sized iterable of integer id lists, one per sentence.
            seq_length: number of columns of the returned matrix.

        Returns:
            np.ndarray of shape (len(sent_ints), seq_length) with dtype int.
            Short rows are right-aligned (zeros in front), long rows keep
            their first seq_length ids, empty rows stay all zeros.
    '''

    # getting the correct rows x cols shape
    sequences = np.zeros((len(sent_ints), seq_length), dtype=int)

    for i, row in enumerate(sent_ints):
        kept = np.asarray(row)[:seq_length]
        # An empty row must be skipped: a `-0:` slice would select the whole
        # row and assigning a zero-length array to it raises a broadcast
        # ValueError.
        if kept.size:
            sequences[i, -kept.size:] = kept

    return sequences
# Preview the padded id matrix: one row per sentence, MAX_LEN columns.
pad_sequences(sent_int, MAX_LEN)

array([[   0,    9,  221, ...,    7,  283,   11],
       [   0,    0,    0, ...,   40, 1312,  404],
       [   0,    0,    0, ...,    2,  649,  405],
       ...,
       [   0,    0,    0, ..., 5317,  999, 5318],
       [   0,    0,    0, ...,   98, 5320,   50],
       [   0,    0,    0, ..., 5321,    7, 1090]])

In [298]:
# Number of distinct tokens that received an id, followed by a blank line.
print('Unique words: ', len(vocab_to_int), end='\n\n')

# Id sequence of the first sentence (kept wrapped in an outer list).
print('Tokenized review: \n', sent_int[:1])

Unique words:  5321

Tokenized review: 
 [[9, 221, 7, 1302, 1303, 2, 644, 645, 3, 1304, 402, 163, 1305, 222, 74, 52, 644, 187, 403, 2, 3, 1306, 1307, 43, 83, 145, 7, 283, 11]]


In [300]:
# Pad/truncate every sentence to MAX_LEN ids and pair the matrix with the
# float labels in a TensorDataset.
# NOTE(review): `train_loader` is reassigned by the RNNDataset cell that
# follows; `train` is still consumed there.
train = pad_sequences(sent_int, MAX_LEN)
train_dataset = TensorDataset(
    torch.from_numpy(train),
    torch.from_numpy(csv['label'].to_numpy()),
)
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)



In [301]:
class RNNDataset(Dataset):
    """Map-style dataset pairing id sequences with their labels.

    Each item is returned as a pair of numpy arrays: the ids cast to
    ``np.longlong`` and the label cast to ``np.float32``.
    """

    def __init__(self, x, y):
        """Store the indexable features ``x`` and labels ``y`` as given."""
        self.x, self.y = x, y

    def __len__(self):
        """Number of samples, taken from the features container."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return ``(ids, label)`` for position ``idx`` as numpy arrays."""
        ids = np.array(self.x[idx], dtype=np.longlong)
        label = np.array(self.y[idx], dtype=np.float32)
        return ids, label
# Build the loader on top of RNNDataset; this replaces the TensorDataset-based
# `train_loader` assigned in the previous cell.
train_data = RNNDataset(x=train, y=csv['label'].to_numpy())
train_loader = DataLoader(dataset=train_data, batch_size=16, shuffle=True)


In [None]:
# Hyper-parameters for the recurrent classifier.
MAX_FEATURES = 20000  # NOTE(review): not referenced by any visible cell
MAX_LEN = 30  # sentences are padded/truncated to this many token ids
BATCH_SIZE = 32  # NOTE(review): the visible DataLoaders hard-code batch_size=16
HIDDEN_DIM = 128
EMBEDDING_SIZE = 300
DISPLAY_STEP = 1  # NOTE(review): not referenced by any visible cell
N_LAYERS = 2  # NOTE(review): the model defaults to one layer; pass num_layers=N_LAYERS to use it
CLASSES = 1


class RNN(nn.Module):
    """Embedding -> vanilla RNN -> linear head producing raw logits.

    Args:
        vocab_size: number of rows in the embedding table (largest id + 1).
        embedding_dim: width of each token embedding.
        hidden_dim: width of the RNN hidden state.
        num_layers: number of stacked RNN layers (1 keeps the original
            single-layer architecture).
        num_classes: number of output logits per sample.
        padding_idx: id used for padding; its embedding is fixed at zero and
            receives no gradient, so padding does not become a learned token.
    """

    def __init__(self, vocab_size, embedding_dim=EMBEDDING_SIZE, hidden_dim=HIDDEN_DIM,
                 num_layers=1, num_classes=CLASSES, padding_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx,
        )
        self.rnn = nn.RNN(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.linear = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for a (batch, seq) id tensor."""
        embeddings = self.embedding(x)
        output, _ = self.rnn(embeddings)
        # Only the last time step of the top layer feeds the classifier head.
        output = output[:, -1, :]
        return self.linear(output)

In [303]:
# Ids run from 1 to len(vocab_to_int) and 0 is the padding value, hence the +1.
model = RNN(vocab_size=len(vocab_to_int) + 1)

In [304]:
# The model emits raw logits, so the loss is the logits variant of binary
# cross-entropy; Adam updates all model parameters.
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [305]:
# Train for a fixed number of epochs and report the sample-weighted mean
# training loss after each one.
# TODO: no validation data is evaluated here, so the printed loss measures
# fit to the training set only.
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    total_samples = 0
    for x, y in train_loader:
        n_in_batch = x.size(0)
        optimizer.zero_grad()
        # (batch, 1) -> (batch,). Squeezing only the last dimension keeps a
        # batch of one as a 1-D tensor, so no special case is needed to match
        # the shape of `y`.
        outputs = model(x).squeeze(-1)
        loss = loss_fn(outputs, y)
        loss.backward()
        optimizer.step()

        # loss is a batch mean; weight it by batch size so the epoch average
        # is per sample even when the last batch is smaller.
        total_loss += loss.item() * n_in_batch
        total_samples += n_in_batch

    avg_loss = total_loss / total_samples
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_loss:.4f}") #Val Loss: {avg_val_loss:.4f}

Epoch 1/5, Train Loss: 0.2429
Epoch 2/5, Train Loss: 0.0195
Epoch 3/5, Train Loss: 0.0058
Epoch 4/5, Train Loss: 0.0022
Epoch 5/5, Train Loss: 0.0012
