In [6]:
from transformers import *
import torch
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import re
import ast
import pandas as pd

# Tokenizer that belongs to the pretrained checkpoint used below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Three token classes: 0, 1 or 2, i.e. OUT, BEGIN and IN of a toxic span.
broj_klasa = 3

# Token-classification model with one output per class; hidden states and
# attention maps are not returned.
model = BertForTokenClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=broj_klasa,
    output_attentions=False,
    output_hidden_states=False,
)

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

cuda


In [7]:

data = pd.read_csv('data/tsd_train_with_labels.csv')
df = pd.DataFrame(data)

# Wrap every text in the special tokens: [CLS] at the start, [SEP] at the end.
inputs = ['[CLS] ' + text + ' [SEP]' for text in df.text]

# Per sentence we collect:
#   word_ranges  - (token_id, word_start_offset) pairs, framed by ([101],) and ([102],)
#   input_tokens - tokens of the wrapped text
#   input_ids    - vocabulary ids of those tokens
word_ranges = []
input_ids = []
input_tokens = []
for wrapped in inputs:
    # Drop the 6-character '[CLS] ' prefix and ' [SEP]' suffix so that offsets
    # are measured on the original text.
    raw_text = wrapped[6:-6]

    pieces = [([101],)]
    for match in re.finditer(r'\S+', raw_text):
        word_start = match.start()
        # Every sub-token of a whitespace-delimited word carries the start
        # offset of that word.
        pieces.extend(
            (piece_id, word_start)
            for piece_id in tokenizer.encode(match.group(0), add_special_tokens=False)
        )
    pieces.append(([102],))
    word_ranges.append(pieces)

    tokens = tokenizer.tokenize(wrapped)
    input_ids.append(tokenizer.convert_tokens_to_ids(tokens))
    input_tokens.append(tokens)


In [8]:

# Show every representation of the first example, one per line.
for first_item in (inputs[0], input_tokens[0], input_ids[0], df.spans[0], word_ranges[0]):
    print(first_item)

[CLS] Another violent and aggressive immigrant killing a innocent and intelligent US Citizen.... Sarcasm [SEP]
['[CLS]', 'another', 'violent', 'and', 'aggressive', 'immigrant', 'killing', 'a', 'innocent', 'and', 'intelligent', 'us', 'citizen', '.', '.', '.', '.', 'sarcasm', '[SEP]']
[101, 2178, 6355, 1998, 9376, 11560, 4288, 1037, 7036, 1998, 9414, 2149, 6926, 1012, 1012, 1012, 1012, 20954, 102]
[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
[([101],), (2178, 0), (6355, 8), (1998, 16), (9376, 20), (11560, 31), (4288, 41), (1037, 49), (7036, 51), (1998, 60), (9414, 64), (2149, 76), (6926, 79), (1012, 79), (1012, 79), (1012, 79), (1012, 79), (20954, 91), ([102],)]


In [9]:

def create_labels(span, sentence):
    """Build per-token labels for one sentence from its toxic character offsets.

    Labels are 0 (outside a toxic span), 1 (first token of a toxic run) and
    2 (continuation of a toxic run).

    Parameters
    ----------
    span : str
        Text of a Python list literal holding toxic character offsets,
        e.g. ``"[8, 9, 10]"``; parsed with ``ast.literal_eval``.
    sentence : iterable
        Items whose element ``[1]`` is the character offset at which the
        token's word starts (the special-token entries must already be
        stripped by the caller).

    Returns
    -------
    list of int
        A leading 0 for [CLS], one label per item of ``sentence``, and a
        trailing 0 for [SEP].
    """
    # A set gives O(1) membership tests instead of scanning the list per token.
    toxic_offsets = set(ast.literal_eval(span))

    labels = [0]  # for [CLS]
    for word in sentence:
        start = word[1]
        if start not in toxic_offsets:
            labels.append(0)
        elif labels[-1] != 0 and start - 1 in toxic_offsets:
            # The previous token is toxic and the character right before this
            # word belongs to THIS sentence's span, so the run continues.
            # (The original checked the global ``spans`` Series here, which
            # tests the DataFrame index rather than the toxic offsets.)
            labels.append(2)
        else:
            # Either the previous token was clean or the preceding character
            # is not toxic: a new toxic run begins.
            labels.append(1)
            # NOTE(review): tokens appear to share their word's start offset,
            # so a second sub-token of the first word of a run would also get
            # label 1 - confirm whether that is intended.
    labels.append(0)  # for [SEP]

    return labels

In [10]:
spans = df.spans

# One label list per sentence; the first and last word_ranges entries are the
# special-token placeholders, so they are sliced off before labelling.
labels = [
    create_labels(span, sentence[1:-1])
    for span, sentence in zip(spans, word_ranges)
]

print(input_tokens[0])
print(labels[0])

['[CLS]', 'another', 'violent', 'and', 'aggressive', 'immigrant', 'killing', 'a', 'innocent', 'and', 'intelligent', 'us', 'citizen', '.', '.', '.', '.', 'sarcasm', '[SEP]']
[0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [11]:
# Nije potrebno da znate detalje - samo kopirajte kod

def get_key_padding_mask(data):
    '''
    Return the element-wise result of ``data != 0``.

    Takes one pytorch tensor that represents one batch.
    '''
    return data != 0

def get_type_ids(data):
    '''
    Return the element-wise result of ``data == 0``.

    Takes one pytorch tensor that represents one batch.
    '''
    return data == 0


In [12]:
# Turn the id lists into long tensors of width 512: the leading part holds the
# sentence ids, the rest is filled with zeros (0). Longer sequences are cut at
# the end.
pad_options = dict(maxlen=512, dtype="long", truncating="post", padding="post")

input_ids = torch.tensor(pad_sequences(input_ids, **pad_options)).long()

# Labels get exactly the same padding so their shape matches input_ids.
labels = torch.tensor(pad_sequences(labels, **pad_options)).long()

# Attention mask and token-type ids derived from the padded ids, cast to long.
mask = get_key_padding_mask(input_ids).long()
type_ids = get_type_ids(input_ids).long()

# Inspect the first row of each tensor.
for first_row_source in (input_ids, mask, type_ids, labels):
    print(first_row_source[0])



tensor([  101,  2178,  6355,  1998,  9376, 11560,  4288,  1037,  7036,  1998,
         9414,  2149,  6926,  1012,  1012,  1012,  1012, 20954,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [13]:
# podjela train/test


# input_ids_train, input_ids_val, labels_train, labels_val, mask_train, mask_val, type_ids_train, type_ids_val = train_test_split(input_ids, 
#                                                   labels,
#                                                   mask,
#                                                   type_ids,
#                                                   test_size=0.15, 
#                                                   random_state=42)

# dataset_train = TensorDataset(input_ids_train, mask_train, labels_train, type_ids_train)
# dataset_val = TensorDataset(input_ids_val, mask_val, labels_val, type_ids_val)

# batch_size = 32

# dataloader_train = DataLoader(dataset_train, 
#                               sampler=RandomSampler(dataset_train), 
#                               batch_size=batch_size)

# dataloader_validation = DataLoader(dataset_val, 
#                                    sampler=SequentialSampler(dataset_val), 
#                                    batch_size=batch_size)


In [14]:
# if "treniranje modela":
#     model.train()
# elif "evaluacija modela":
#     model.eval()


cpu


In [17]:
model.to('cpu')

# Run the model over the data one example at a time and print diagnostics.
# Gradients are computed via backward(), but no optimizer step is taken in
# this cell.
for i in range(df.shape[0]):
    # Clear gradients left over from the previous example.
    model.zero_grad()

    # Each row is reshaped to a batch of one; passing labels makes the model
    # return a loss next to the logits.
    outputs = model(
        input_ids[i].view(1, -1),
        attention_mask=mask[i].view(1, -1),
        labels=labels[i].view(1, -1),
        token_type_ids=type_ids[i].view(1, -1)
    )

    outputs.loss.backward()

    # Number of leading non-padding positions. The cumulative product turns
    # every entry from the first 0 onward into 0, so the sum counts exactly
    # the leading run of ones. Deliberately NOT named `len`: rebinding the
    # builtin at notebook scope breaks every later `len(...)` call.
    seq_len = int(mask[i].cumprod(dim=0).sum())

    print(seq_len)
    print(outputs.loss)
    print(outputs.logits.shape)
    # Predicted class index per real (non-padding) token.
    print(torch.max(outputs.logits, dim=2)[1][0][0:seq_len])

19
tensor(1.0619, grad_fn=<NllLossBackward>)
torch.Size([1, 512, 3])
tensor([2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 0, 0, 0, 0, 0, 2])
162
tensor(1.2923, grad_fn=<NllLossBackward>)
torch.Size([1, 512, 3])
tensor([2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0,
        2, 2, 2, 1, 1, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 2, 2, 2, 0, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 0, 1,
        1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 0, 2, 0, 1, 2, 2, 2, 1, 0, 0, 2,
        2, 2, 2, 0, 2, 2, 0, 1, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1,
        2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 1, 2, 0, 2, 2])
11
tensor(1.1308, grad_fn=<NllLossBackward>)
torch.Size([1, 512, 3])
tensor([1, 1, 2, 0, 1, 2, 2, 0, 2, 0, 2])
20
tensor(1.0853, grad_fn=<NllLossBackward>)
torch.Size([1, 512, 3])
tensor([2, 1, 2, 1, 1, 2, 2, 2, 1, 2, 0, 2, 0, 1, 0, 2, 2, 1, 0, 2])
212

KeyboardInterrupt: 

In [None]:
# Illustrative values only - adjust for a real training run.
max_number_of_epochs = 20
learning_rate = 1e-5

# How to define the optimizer and scheduler for BERT.
# You do not need to know the details - just copy the code.
def lr_lambda(current_step, num_warmup_steps = 5, num_training_steps = 50):
    """Learning-rate multiplier with linear warm-up followed by linear decay.

    While ``current_step < num_warmup_steps`` the factor is
    ``current_step / max(1, num_warmup_steps)``. Afterwards it is
    ``(num_training_steps - current_step) / max(1, num_training_steps - num_warmup_steps)``,
    clamped so it never drops below 0.0.
    """
    still_warming_up = current_step < num_warmup_steps
    if still_warming_up:
        warmup_span = float(max(1, num_warmup_steps))
        return float(current_step) / warmup_span

    steps_left = float(num_training_steps - current_step)
    decay_span = float(max(1, num_training_steps - num_warmup_steps))
    return max(0.0, steps_left / decay_span)

# AdamW over all model parameters, with bias correction switched off.
optimizer = AdamW(
    model.parameters(),
    lr=learning_rate,
    correct_bias=False,
)

# The scheduler scales the base learning rate by lr_lambda, using 5 warm-up
# steps and max_number_of_epochs as the total number of steps.
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer,
    lambda step: lr_lambda(step, 5, max_number_of_epochs),
)