In [1]:
import ast
import pandas as pd
import numpy as np
import spacy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.vocab import FastText
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

torch.manual_seed(2021)

import podium
from podium import BucketIterator
from podium.datasets import TabularDataset
from podium import Vocab
from podium.storage import Field
from podium.storage import LabelField
from podium.storage.vectorizers import GloVe

import toxic_util as TU

In [2]:
# inicijalizacija tokenizatora i lematizatora
tokenizer = podium.preproc.get_tokenizer('spacy')
lemmatizer = podium.preproc.SpacyLemmatizer()    

# inicijalizacija praznog vokabulara
vocab = Vocab()

# definicija polja, odnosno stupaca dataseta
fields = {
    'text': Field(
        'text', 
        pretokenize_hooks=[str.lower],   # prije tokeniziranja baci sve u lowercase
        tokenizer=tokenizer,             # tokeniziraj s definiranim tokenizatorom
        posttokenize_hooks=[lemmatizer], # nakon tokenizacije uzmi samo lemme riječi 
        numericalizer=vocab,             # veza dataseta i vokabulara, tj. embedding matrice
        keep_raw=True                    # debug svrhe
    ),
    'spans': Field(
        'spans', 
        tokenizer=ast.literal_eval,      # jer se čita iz csva, eval obavlja evaluaciju stringa kao da je kod
        disable_batch_matrix=True
    ),
    'labels': Field(
        'labels',
        tokenizer=ast.literal_eval,      # isto kao i iznad
        is_target=True,
        numericalizer=np.array,          # treba za padding
        padding_token=-100
    )
}

# učitaj dataset
original = TabularDataset(
    path = 'data/tsd_train_with_labels.csv', # prepravljen dataset s labelama već instaliranim
    format = 'csv', 
    fields = fields
)

# split na set za treniranje i set za validaciju
train, valid = original.split(0.8)

# inicijaliziraj embedding matricu
glove = GloVe()
embeddings = glove.load_vocab(vocab)

In [3]:
for t in train:
    assert len(t.text[1]) == len(t.labels[1]), f'pad'

In [4]:
# ako je ovo True, ispisuju se DEBUG izjave i model napravi samo jednu epohu s jednim retkom
dbg = False
def DEBUG(fstring):
    if(dbg): print(fstring)
        
vbs = False
def VERBOSE(fstring):
    if(vbs): print(fstring)

In [5]:
def seq_lens(batch):
    ret = []
    batch = batch.tolist()
    for seq in batch:
        seq_len = len(seq)
        for x in seq:
            if x == 1:
                seq_len -= 1
        ret.append(seq_len)
    return ret

class LSTMTagger(nn.Module):

    def __init__(self, word_embeds, vocab, hidden_dim):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.vocab = vocab
        self.vocab_size = word_embeds.shape[0]
        self.embeddings = word_embeds
        self.embed_dim = word_embeds.shape[1]
        self.lstm1 = nn.LSTM(word_embeds.shape[1], hidden_dim*2, batch_first = True)
        self.lstm2 = nn.LSTM(hidden_dim*2, hidden_dim, batch_first = True)
        self.hidden2tag = nn.Linear(hidden_dim, 3)

    def forward(self, text_batch):
        DEBUG('######################### FORWARD ############################')
        DEBUG(f'text_batch\n{text_batch}')
        
        input_lens = seq_lens(text_batch)
        
        DEBUG(f'input_lens\n{input_lens}')
        
        sent_vec = torch.from_numpy(embeddings[text_batch])
        sent_vec.requires_grad = False
        sent_len = len(text_batch[0])
        
        DEBUG(f'sent_vec\n{sent_vec.shape}\n{sent_vec}')
        
        packed_sent_vec = torch.nn.utils.rnn.pack_padded_sequence(
            sent_vec.view(-1, sent_len, self.embed_dim),
            input_lens,
            batch_first=True,
            enforce_sorted=False
        ) 
        
        # ova var se ne koristi, sluzi samo za debug purposes
        unpacked_sent_vec, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_sent_vec, batch_first=True)
        DEBUG(f'unpacked_sent_vec\n{unpacked_sent_vec.shape}\n{unpacked_sent_vec}')
        
        DEBUG(f'packed_sent_vec\n{packed_sent_vec}')
        
        packed_lstm_out, _ = self.lstm1(packed_sent_vec.float())
        packed_lstm_out, _ = self.lstm2(packed_lstm_out)
        
        DEBUG(f'packed_lstm_out\n{packed_lstm_out}')
        
        lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_lstm_out, batch_first=True)
    
        DEBUG(f'lstm_out\n{lstm_out.shape}\n{lstm_out}')
        
        tag_space = self.hidden2tag(lstm_out.view(-1, sent_len, self.hidden_dim))
        
        DEBUG(f'tag_space\n{tag_space.shape}\n{tag_space}')
        
        DEBUG('######################### END FORWARD ########################')
        return tag_space
    
    def predict(self, numericalized_sent):
        with torch.no_grad():
            tag_scores = self(numericalized_sent)
            labels = TU.labels_from_tensor(tag_scores.view(-1, 3))
            return labels

In [6]:
def instance_length(row):
    # row ... instanca objekta Example, i.e. redak u datasetu
    _, tokenized = row.text
    # row.text u sebi sadrži i raw i tokenized podatke, prvi se odbacuje
    return len(tokenized)

def fit(BSIZE, LR, NEPOCHS, embeddings, vocab):    
    trainit = BucketIterator(train, batch_size=BSIZE, bucket_sort_key=instance_length)
    model = LSTMTagger(embeddings, vocab, 300)
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LR)
    
    print(f'BSIZE {BSIZE} LR {LR} NEPOCHS {NEPOCHS}')
    
    for epoch in range(NEPOCHS):
        for input_batch, target_batch in trainit:
            if input_batch.text.shape[0] != BSIZE: continue
                
            DEBUG('############## FIT ###############')
            
            text_batch = input_batch.text
            label_batch = torch.tensor(target_batch.labels)
            text_batch = torch.tensor(text_batch)
            
            DEBUG(f'text_batch\n{text_batch}')
            DEBUG(f'label_batch\n{label_batch}')
            
            model.zero_grad()            
            label_scores = model(text_batch)
            
            DEBUG(f'label_scores\n{label_scores.shape}\n{label_scores}')
            DEBUG(f'label_batch\n{label_batch.shape}\n{label_batch}')
            # radi dobro
            loss = loss_function(label_scores.view(BSIZE, 3, -1), label_batch.view(BSIZE, -1).long())
            
            DEBUG(f'loss\n{type(loss)}\n{loss}\n{loss.grad_fn}')
            VERBOSE(f'loss {loss}')
            loss.backward()
        
            optimizer.step()
            if(dbg): return None
     

    nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
    score = 0
    failed = 0
    succeeded = 0
    for row in valid:
        spans = row.spans[1]
        text = row.text[1]
        labels = row.labels[1]
        numericalized = [vocab.stoi[word] for word in text]
        numericalized = torch.tensor(numericalized)

        try:
            predicted_labels = model.predict(numericalized.view(1, -1))
            predicted_spans = TU.spans_from_labels(row.text[0], predicted_labels, nlp)
            score += TU.f1(predicted_spans, spans)
            succeeded += 1
        except: 
            failed += 1 # za svaki slucaj
            
    score = float(score) / succeeded
    print(f'F1 {score}')
    if(failed != 0): print(f'FAILED {failed}')

    return model, score, f'BSIZE {BSIZE} LR {LR} NEPOCHS {NEPOCHS}'

if dbg: fit(32, 0.001, 1, embeddings, vocab)

In [7]:
# ovako to izgleda interno za jedan batch s 32 inputa, vidimo da su 0 dobro postavljene
# loss tensor([[1.0987, 0.0000, 0.0000],
#         [1.1112, 0.0000, 0.0000],
#         [1.0995, 0.0000, 0.0000],
#         [1.0841, 0.0000, 0.0000],
#         [1.1162, 0.0000, 0.0000],
#         [1.0818, 0.0000, 0.0000],
#         [1.1162, 0.0000, 0.0000],
#         [1.1371, 0.0000, 0.0000],
#         [1.1340, 0.0000, 0.0000],
#         [1.0991, 0.0000, 0.0000],
#         [1.1141, 0.0000, 0.0000],
#         [1.0872, 1.1260, 0.0000],
#         [1.0942, 1.0652, 0.0000],
#         [0.9729, 1.1223, 0.0000],
#         [1.1731, 1.1258, 0.0000],
#         [1.0989, 1.1069, 0.0000],
#         [1.0427, 1.0527, 0.0000],
#         [0.9409, 1.1290, 0.0000],
#         [0.9859, 1.1236, 0.0000],
#         [1.0108, 1.0835, 0.0000],
#         [1.1639, 1.1037, 0.0000],
#         [1.0723, 1.0049, 0.0000],
#         [0.9820, 1.0845, 0.0000],
#         [1.1454, 1.1492, 0.0000],
#         [1.0974, 1.1455, 1.0829],
#         [1.0626, 1.0231, 1.0930],
#         [1.0369, 1.1696, 1.1302],
#         [1.1067, 1.0430, 1.1548],
#         [1.0622, 1.0870, 1.1068],
#         [1.1639, 1.0571, 1.0314],
#         [1.1049, 1.1777, 1.1405],
#         [1.1957, 1.0839, 0.9871]], grad_fn=<ViewBackward>)

In [8]:
def grid_search(batch_sizes, learning_rates, n_epochs, embeddings, vocab):
    best_model = None
    best_score = 0.0
    best_desc = ''
    for bs in batch_sizes:
        for lr in learning_rates:
            for n in n_epochs:
                model, score, model_desc = fit(bs, lr, n, embeddings, vocab)
                if score > best_score:
                    best_model = model
                    best_score = score
                    best_desc = model_desc
    return best_model, best_score, best_desc

In [9]:
# dobavi najbolji model
model, score, desc = grid_search([32], [0.1], [1], embeddings, vocab)
print(f'Best score > {score}')
print(f'Achieved for hyperparameters > {desc}')

BSIZE 32 LR 0.1 NEPOCHS 1
F1 0.23429456762504963
Best score > 0.23429456762504963
Achieved for hyperparameters > BSIZE 32 LR 0.1 NEPOCHS 1


In [10]:
test = pd.read_csv('data/tsd_trial.csv')
test

Unnamed: 0,spans,text
0,"[15, 16, 17, 18, 19, 27, 28, 29, 30, 31]",Because he's a moron and a bigot. It's not any...
1,"[29, 30, 31, 32, 33, 34]",How about we stop protecting idiots and let na...
2,"[166, 167, 168, 169, 170, 171]","If people were smart, they would Boycott th..."
3,"[87, 88, 89, 90, 91, 92]",Trump Claimed that Russia will never invade th...
4,[],As long as your willing to pay a lot more for ...
...,...,...
685,"[129, 130, 131, 132, 133, 134]",But ... Trump's not bluffing. He's prepared to...
686,"[126, 127, 128, 129, 130, 131]",Can't believe the limited knowledge of this Ar...
687,"[24, 25, 26, 27, 28, 29]",I think it conservative idiots who cannot reac...
688,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",You're an id*ot...Go away.


In [11]:
score = 0
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
fail=0
for idx, row in test.iterrows():
    try:
        spans = ast.literal_eval(row[0])
        text = row[1]

        tokenized = tokenizer(text.lower())
        _, lemmatized = lemmatizer(text.lower(), tokenized)

        numericalized = [vocab.stoi[word] for word in lemmatized]

        numericalized = torch.tensor(numericalized)
        batch = numericalized.view(1, -1)

        predicted_labels = model.predict(batch)

        predicted_spans = TU.spans_from_labels(text, predicted_labels, nlp)

        row_score = TU.f1(predicted_spans, spans)
        score += row_score
    except:
        fail += 1
score /= test.shape[0]
print(score)
print(fail)

0.2291812963836909
0
