In [1]:
import ast
import pandas as pd
import numpy as np
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

torch.manual_seed(2021)

import podium
from podium import BucketIterator
from podium.datasets import TabularDataset
from podium import Vocab
from podium.storage import Field
from podium.storage import LabelField
from podium.storage.vectorizers import GloVe

import toxic_util as TU

import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# inicijalizacija tokenizatora i lematizatora
tokenizer = podium.preproc.get_tokenizer('spacy')
lemmatizer = podium.preproc.SpacyLemmatizer()    

# inicijalizacija praznog vokabulara
vocab = Vocab()

# definicija polja, odnosno stupaca dataseta
fields = {
    'text': Field(
        'text', 
        pretokenize_hooks=[str.lower],   # prije tokeniziranja baci sve u lowercase
        tokenizer=tokenizer,             # tokeniziraj s definiranim tokenizatorom
        posttokenize_hooks=[lemmatizer], # nakon tokenizacije uzmi samo lemme riječi 
        numericalizer=vocab,             # veza dataseta i vokabulara, tj. embedding matrice
        keep_raw=True                    # debug svrhe
    ),
    'spans': Field(
        'spans', 
        tokenizer=ast.literal_eval,      # jer se čita iz csva, eval obavlja evaluaciju stringa kao da je kod
        disable_batch_matrix=True
    ),
    'labels': Field(
        'labels',
        tokenizer=ast.literal_eval,      # isto kao i iznad
        is_target=True,
        numericalizer=np.array,          # treba za padding
        padding_token=-100
    )
}

# učitaj dataset
original = TabularDataset(
    path = 'data/tsd_train_with_labels.csv', # prepravljen dataset s labelama već instaliranim
    format = 'csv', 
    fields = fields
)

# split na set za treniranje i set za validaciju
train, valid = original.split(0.8)

# inicijaliziraj embedding matricu
glove = GloVe()
embeddings = glove.load_vocab(vocab)

In [3]:
for t in train:
    assert len(t.text[1]) == len(t.labels[1]), f'pad'
    
torch.cuda.is_available()

True

In [4]:
# ako je ovo True, ispisuju se DEBUG izjave i model napravi samo jednu epohu s jednim retkom
dbg = False
def DEBUG(fstring):
    if(dbg): print(fstring)
        
vbs = False
def VERBOSE(fstring):
    if(vbs): print(fstring)

In [5]:
def seq_lens(batch):
    ret = []
    batch = batch.tolist()
    for seq in batch:
        seq_len = len(seq)
        for x in seq:
            if x == 1:
                seq_len -= 1
        ret.append(seq_len)
    return ret

class LSTMTagger(nn.Module):

    def __init__(self, base_model, word_embeds, vocab, hidden_dim):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.vocab = vocab
        self.vocab_size = word_embeds.shape[0]
        self.embeddings = word_embeds
        self.embed_dim = word_embeds.shape[1]
        self.lstm1 = base_model(word_embeds.shape[1], hidden_dim, batch_first = True)
        self.lstm2 = base_model(hidden_dim, hidden_dim, batch_first = True)
        self.hidden2tag = nn.Linear(hidden_dim, 3)

    def forward(self, text_batch):
        DEBUG('######################### FORWARD ############################')
        DEBUG(f'text_batch\n{text_batch}')
        
        input_lens = seq_lens(text_batch)
        
        DEBUG(f'input_lens\n{input_lens}')
        sent_vec = torch.from_numpy(embeddings[text_batch]).to(device)
        sent_len = len(text_batch[0])
        
        DEBUG(f'sent_vec\n{sent_vec.shape}\n{sent_vec}\n{sent_vec.device}')
        
        packed_sent_vec = torch.nn.utils.rnn.pack_padded_sequence(
            sent_vec.view(-1, sent_len, self.embed_dim),
            input_lens,
            batch_first=True,
            enforce_sorted=False
        ).to(device)
        
        # ova var se ne koristi, sluzi samo za debug purposes
        unpacked_sent_vec, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_sent_vec, batch_first=True)
        DEBUG(f'unpacked_sent_vec\n{unpacked_sent_vec.shape}\n{unpacked_sent_vec}')
        
        DEBUG(f'packed_sent_vec\n{packed_sent_vec}')
        
        packed_lstm_out, _ = self.lstm1(packed_sent_vec.float())
        packed_lstm_out, _ = self.lstm2(packed_lstm_out)
        
        DEBUG(f'packed_lstm_out\n{packed_lstm_out}\n')
        
        lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_lstm_out, batch_first=True)
    
        DEBUG(f'lstm_out\n{lstm_out.shape}\n{lstm_out}')
        
        tag_space = self.hidden2tag(lstm_out.view(-1, sent_len, self.hidden_dim))
        
        DEBUG(f'tag_space\n{tag_space.shape}\n{tag_space}')
        
        DEBUG('######################### END FORWARD ########################')
        return tag_space
    
    def predict(self, numericalized_sent):
        with torch.no_grad():
            tag_scores = self(numericalized_sent)
            labels = TU.labels_from_tensor(tag_scores.view(-1, 3))
            return labels

In [6]:
def instance_length(row):
    # row ... instanca objekta Example, i.e. redak u datasetu
    _, tokenized = row.text
    # row.text u sebi sadrži i raw i tokenized podatke, prvi se odbacuje
    return len(tokenized)

nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])

def fit(BSIZE, LR, NEPOCHS, HIDDEN_DIM, base_model, embeddings, vocab): 
    
    trainit = BucketIterator(train, batch_size=BSIZE, bucket_sort_key=instance_length)
    model = LSTMTagger(base_model, embeddings, vocab, HIDDEN_DIM).to(device)
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LR)
    
    best_score = 0
    best_epoch = 0
    
    print(f'BSIZE {BSIZE} LR {LR} HIDDEN_DIM {HIDDEN_DIM} NEPOCHS {NEPOCHS}')
    prev_score = 0
    for epoch in range(NEPOCHS):
        for input_batch, target_batch in trainit:
            if input_batch.text.shape[0] != BSIZE: continue
                
            DEBUG('############## FIT ###############')
            
            text_batch = input_batch.text
            label_batch = torch.tensor(target_batch.labels).to(device)
            text_batch = torch.tensor(text_batch)
            
            DEBUG(f'text_batch\n{text_batch}')
            DEBUG(f'label_batch\n{label_batch}')
            
            model.zero_grad()            
            label_scores = model(text_batch)
            
            DEBUG(f'label_scores\n{label_scores.shape}\n{label_scores}')
            DEBUG(f'label_batch\n{label_batch.shape}\n{label_batch}')
            # radi dobro
            loss = loss_function(label_scores.view(BSIZE, 3, -1), label_batch.view(BSIZE, -1))
            
            DEBUG(f'loss\n{type(loss)}\n{loss}\n{loss.grad_fn}')
            VERBOSE(f'loss {loss}')
            loss.backward()
        
            optimizer.step()
            if(dbg): return None
     
        score = 0
        failed = 0
        succeeded = 0
        for row in valid:
            spans = row.spans[1]
            text = row.text[1]
            labels = row.labels[1]
            numericalized = [vocab.stoi[word] for word in text]
            numericalized = torch.tensor(numericalized)

            try:
                predicted_labels = model.predict(numericalized.view(1, -1))
                predicted_spans = TU.spans_from_labels(row.text[0], predicted_labels, nlp)
                score += TU.f1(predicted_spans, spans)
                succeeded += 1
            except:
                failed += 1 # za svaki slucaj

        score = float(score) / succeeded
        print(f'F1 {score} EPOCH {epoch + 1}')
        if score > best_score:
            best_score = score
            best_epoch = epoch + 1
            torch.save(model, 'best_model.pt')
        if(failed != 0): print(f'FAILED {failed}')
        prev_score = score
        
    best_model = torch.load('best_model.pt')        
    
    return best_model, best_score, f'BSIZE {BSIZE} LR {LR} HIDDEN_DIM {HIDDEN_DIM} ON_EPOCH {best_epoch} FOR_MODEL {base_model}'

if dbg: fit(32, 0.001, 1, 512, embeddings, vocab)

In [7]:
def grid_search(batch_sizes, learning_rates, n_epochs, hidden_dims, base_models, embeddings, vocab):
    best_model = None
    best_score = 0.0
    best_desc = ''
    for bs in batch_sizes:
        for lr in learning_rates:
            for n in n_epochs:
                for hd in hidden_dims:
                    for bms in base_models:
                        model, score, model_desc = fit(bs, lr, n, hd, bms, embeddings, vocab)
                        print(f'F1 {score} {model_desc}')
                        if score > best_score:
                            best_model = model
                            best_score = score
                            best_desc = model_desc
    return best_model, best_score, best_desc

In [8]:
print(device)
# dobavi najbolji model
model, score, desc = grid_search(
    batch_sizes = [64], 
    learning_rates = [0.01], 
    n_epochs = [50], 
    hidden_dims = [50], 
    base_models = [nn.RNN, nn.GRU, nn.LSTM], 
    embeddings = embeddings, 
    vocab = vocab)
print(f'Best score > {score}')
print(f'Achieved for hyperparameters > {desc}')

cuda
BSIZE 32 LR 0.001 HIDDEN_DIM 64 NEPOCHS 50
F1 0.1754708274438634 EPOCH 1
F1 0.24750969405704468 EPOCH 2
F1 0.1903381399454217 EPOCH 3
F1 0.24750969405704468 BSIZE 32 LR 0.001 HIDDEN_DIM 64 ON_EPOCH 2 FOR_MODEL <class 'torch.nn.modules.rnn.RNN'>
BSIZE 32 LR 0.001 HIDDEN_DIM 64 NEPOCHS 50
F1 0.11601120945856792 EPOCH 1
F1 0.15867521679834448 EPOCH 2
F1 0.08074361981767984 EPOCH 3
F1 0.15867521679834448 BSIZE 32 LR 0.001 HIDDEN_DIM 64 ON_EPOCH 2 FOR_MODEL <class 'torch.nn.modules.rnn.GRU'>
BSIZE 32 LR 0.001 HIDDEN_DIM 64 NEPOCHS 50
F1 0.10148794044970955 EPOCH 1
F1 0.05014625449758408 EPOCH 2
FAILED 41
F1 0.10148794044970955 BSIZE 32 LR 0.001 HIDDEN_DIM 64 ON_EPOCH 1 FOR_MODEL <class 'torch.nn.modules.rnn.LSTM'>
BSIZE 32 LR 0.001 HIDDEN_DIM 256 NEPOCHS 50



KeyboardInterrupt



In [None]:
test = pd.read_csv('data/tsd_trial.csv')
test

In [None]:
score = 0
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
fail=0
model.to(device)
for idx, row in test.iterrows():
    try:
        spans = ast.literal_eval(row[0])
        text = row[1]

        tokenized = tokenizer(text.lower())
        _, lemmatized = lemmatizer(text.lower(), tokenized)

        numericalized = [vocab.stoi[word] for word in lemmatized]

        numericalized = torch.tensor(numericalized)
        batch = numericalized.view(1, -1)

        predicted_labels = model.predict(batch)

        predicted_spans = TU.spans_from_labels(text, predicted_labels, nlp)

        row_score = TU.f1(predicted_spans, spans)
        score += row_score
    except:
        fail += 1
score /= test.shape[0]
print(score)
print(fail)

In [None]:
torch.save(model, 'bestmodel.pth')