In [174]:
from transformers import *
import torch
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import re
import ast
import pandas as pd

# load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

broj_klasa = 7939 # broj outputa mislim

# load model
model = BertForTokenClassification.from_pretrained('bert-base-uncased', output_hidden_states=False, output_attentions=False, num_labels=broj_klasa )


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [175]:

data = pd.read_csv('data/tsd_train_with_labels.csv')
df = pd.DataFrame(data)

# za svaku sekvencu dodati specijalne znakove:
#   - na pocetak [CLS]
#   - na kraj [SEP]

inputs =  ['[CLS] ' + text + ' [SEP]' for text in df.text]

# tokenizacija
word_ranges = []
input_ids = []
input_tokens = []
for text in inputs:
    b = []
    b.append(([101],))
    for m in re.finditer(r'\S+', text[6:-6]):
        w = m.group(0)
        for idx in tokenizer.encode(w, add_special_tokens=False):
            #t = (idx, (m.start(), m.end()-1))
            t = (idx, m.start())
            b.append(t)
    b.append(([102],))
    word_ranges.append(b)
    sentence_tokens = tokenizer.tokenize( text )
    sentence_ids = tokenizer.convert_tokens_to_ids(sentence_tokens)
    input_ids.append(sentence_ids)
    input_tokens.append(sentence_tokens)


In [176]:

print(inputs[0])
print(input_tokens[0])
print(input_ids[0])
print(df.spans[0])
print(word_ranges[0])

[CLS] Another violent and aggressive immigrant killing a innocent and intelligent US Citizen.... Sarcasm [SEP]
['[CLS]', 'another', 'violent', 'and', 'aggressive', 'immigrant', 'killing', 'a', 'innocent', 'and', 'intelligent', 'us', 'citizen', '.', '.', '.', '.', 'sarcasm', '[SEP]']
[101, 2178, 6355, 1998, 9376, 11560, 4288, 1037, 7036, 1998, 9414, 2149, 6926, 1012, 1012, 1012, 1012, 20954, 102]
[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
[([101],), (2178, 0), (6355, 8), (1998, 16), (9376, 20), (11560, 31), (4288, 41), (1037, 49), (7036, 51), (1998, 60), (9414, 64), (2149, 76), (6926, 79), (1012, 79), (1012, 79), (1012, 79), (1012, 79), (20954, 91), ([102],)]


In [177]:

def create_labels(span, sentence):
    
    span = ast.literal_eval(span)
    labels = []
    labels.append(0) # za [CLS]
    for word in sentence:
        if word[1] in span:
            if len(labels) == 0:
                labels.append(1)
            else:
                if labels[-1] == 0:
                    labels.append(1)
                elif word[1] - 1 in spans:
                    labels.append(2)
                else:
                    labels.append(1)
        else:
            labels.append(0)
    labels.append(0) # za [SEP]
    
    return labels

In [178]:
spans = df.spans
labels = []
for span, sentence in zip(spans, word_ranges):
    #print(sentence[1:-1])
    labels.append(create_labels(span, sentence[1:-1]))

print(input_tokens[0])
print(labels[0])

['[CLS]', 'another', 'violent', 'and', 'aggressive', 'immigrant', 'killing', 'a', 'innocent', 'and', 'intelligent', 'us', 'citizen', '.', '.', '.', '.', 'sarcasm', '[SEP]']
[0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [179]:
# Nije potrebno da znate detalje - samo kopirajte kod

def get_key_padding_mask(data):
    '''
    Prima jedan pytorch tensor koji predstavlja jedan batch
    '''
    mask = (data != 0 )
    return mask

def get_type_ids(data):
    '''
    Prima jedan pytorch tensor koji predstavlja jedan batch.
    '''
    type_ids = (data == 0 )
    return type_ids


In [180]:
# pretvoriti sentence_ids u tenzor velicine 512 tipa long - prvi dio tensora popuniti vrijednostima iz sentence_ids a ostatak nulama (0)

input_ids = pad_sequences(input_ids, maxlen=512, dtype="long", truncating="post", padding="post")
input_ids = torch.tensor(input_ids)


# generiranje tipa tokena i maske - ne trebate znati detalje samo kopirajte
mask = get_key_padding_mask(input_ids)
type_ids = get_type_ids(input_ids)

# generirajte labele. Oblik tenzora mora biti isti kao sent_ids_pt

labels = pad_sequences(labels, maxlen=512, dtype="long", truncating="post", padding="post")
labels = torch.tensor(labels)

###########

# train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
#                                                             random_state=2018, test_size=0.2)
# train_masks, validation_masks, _, _ = train_test_split(mask, type_ids,
#                                              random_state=2018, test_size=0.2)
                                             
# # Convert all of our data into torch tensors, the required datatype for our model
# train_inputs = torch.tensor(train_inputs)
# validation_inputs = torch.tensor(validation_inputs)
# train_labels = torch.tensor(train_labels)
# validation_labels = torch.tensor(validation_labels)
# train_masks = torch.tensor(train_masks)
# validation_masks = torch.tensor(validation_masks)

########

# ako niste ranije sve pretvorite u long format
input_ids = input_ids.long()
type_ids = type_ids.long()
mask = mask.long()
labels = labels.long()


print(input_ids[0,0:20])
print(mask[0,0:20])
print(type_ids[0,0:20])
print(labels[0,0:20])



tensor([  101,  2178,  6355,  1998,  9376, 11560,  4288,  1037,  7036,  1998,
         9414,  2149,  6926,  1012,  1012,  1012,  1012, 20954,   102,     0])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])
tensor([0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


In [190]:
input_ids_train, input_ids_val, labels_train, labels_val, mask_train, mask_val, type_ids_train, type_ids_val = train_test_split(input_ids, 
                                                  labels,
                                                  mask,
                                                  type_ids,
                                                  test_size=0.15, 
                                                  random_state=42)

# print(X_train[0])
dataset_train = TensorDataset(input_ids_train, mask_train, labels_train, type_ids_train)
dataset_val = TensorDataset(input_ids_val, mask_val, labels_val, type_ids_val)

batch_size = 32

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)


In [191]:
optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)
                  
epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [193]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [198]:
import random
import numpy as np
from tqdm.notebook import tqdm

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals
    
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'data_volume/finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=211.0, style=ProgressStyle(description_widt…




RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 402653184 bytes. Buy new RAM!


In [181]:
# batch_size = 32

# # Create an iterator of our data with torch DataLoader 
# train_data = TensorDataset(train_inputs, train_masks, train_labels)
# train_sampler = RandomSampler(train_data)
# train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
# validation_sampler = SequentialSampler(validation_data)
# validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


In [182]:
if "treniranje modela":
    model.train()
elif "evaluacija modela":
    model.eval()

# prolaz modela nad podacima
# nad loss ce te onda zvati .backward() 
loss, scores = model( input_ids, attention_mask=mask, labels=labels, token_type_ids=type_ids )

# ako iz scores zelite dobiti predikciju
predictions = torch.max(scores,2)[1]

print(loss)
print(scores.shape)
print(predictions.shape)

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 12486967296 bytes. Buy new RAM!


In [None]:
learning_rate = 1e-5 # samo primjer
max_number_of_epochs = 20 # samo primjer

# Definicija ako definirati optimizer i scheduler za BERT
# Nije potrebno da znate detalje - samo kopirajte kod
def lr_lambda(current_step, num_warmup_steps = 5, num_training_steps = 50):
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))) 

optimizer = AdamW(model.parameters(), lr=learning_rate, correct_bias=False)  
scheduler = torch.optim.lr_scheduler.LambdaLR( optimizer, lambda x: lr_lambda(x, 5, max_number_of_epochs) )   