<a href="https://colab.research.google.com/github/dobrevajovana/Makedonizer/blob/main/NaturalLanguageinference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive (Colab only) so that the tokenizer, checkpoint and dataset
# paths under /content/drive used by later cells can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.5 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 44.2 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 33.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 43.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

In [None]:
import pandas as pd
import re
import torch
import time
import random

from torch.utils.data import Dataset, TensorDataset, DataLoader, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pad_sequence

from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup, XLMRobertaTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, matthews_corrcoef, f1_score

import pickle
import os
import numpy as np

from tqdm.notebook import tqdm

In [None]:
# TOKENIZER_PATH is handed to NLIDataset (tokenizer directory) and MODEL_PATH to
# RobertaForSequenceClassification.from_pretrained (checkpoint directory).
# Both point into the mounted Google Drive.
TOKENIZER_PATH = '/content/drive/MyDrive/Macedonizer-Evaluation/mk-roberta-tokenizer'
MODEL_PATH = '/content/drive/MyDrive/macedonizer.finki/mk-roberta-common-crawl/checkpoint-3800000'

In [None]:
# TOKENIZER_PATH = 'xlm-roberta-base'
# MODEL_PATH = 'xlm-roberta-base'

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [None]:
# Load the SNLI train/test splits from Drive. Later cells read the
# `premise_mk`, `hypothesis_mk` and `label` columns of these frames.
train_df = pd.read_csv('/content/drive/MyDrive/Macedonizer-Evaluation/NLI-Dataset/SNLI-Dataset/train.csv')
test_df   = pd.read_csv('/content/drive/MyDrive/Macedonizer-Evaluation/NLI-Dataset/SNLI-Dataset/test.csv')

In [None]:
train_df.label.unique()

array([ 1,  2,  0, -1])

In [None]:
# Keep only the rows whose label is not -1 in both splits.
train_df = train_df.loc[train_df['label'].ne(-1)]
test_df = test_df.loc[test_df['label'].ne(-1)]

In [None]:
train_df.label.unique()

array([1, 2, 0])

In [None]:
# Shuffle both splits once and renumber the rows. A fixed random_state makes the
# row order reproducible across kernel restarts; the notebook-wide seeds are
# only set in a later cell, so they do not cover this shuffle.
SHUFFLE_SEED = 17
train_df = train_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [None]:
class NLIDataset(Dataset):
  """Tokenizes premise/hypothesis pairs and serves them through DataLoaders.

  Both data frames must provide the columns `premise_mk`, `hypothesis_mk` and
  `label`. Each split is tokenized eagerly in the constructor and stored as a
  TensorDataset of (input_ids, attention_mask, label).
  """

  def __init__(self, train_df, test_df, tokenizer_path):
    """Load the tokenizer from `tokenizer_path` and tokenize both splits.

    Args:
      train_df: data frame holding the training examples.
      test_df: data frame holding the evaluation examples.
      tokenizer_path: name or directory passed to RobertaTokenizer.from_pretrained.
    """
    self.train_df = train_df
    self.test_df = test_df

    self.tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path, do_lower_case=True)
    # self.tokenizer = XLMRobertaTokenizer.from_pretrained(tokenizer_path, do_lower_case=True)

    self.train_data = None
    self.test_data = None
    self.init_data()

  def init_data(self):
    """Tokenize the stored train and test frames into TensorDatasets."""
    self.train_data = self.load_data(self.train_df)
    self.test_data = self.load_data(self.test_df)

  def load_data(self, df, max_len=250):
    """Encode every row of `df` as `[CLS] premise [SEP] hypothesis [SEP]`.

    Args:
      df: data frame with `premise_mk`, `hypothesis_mk` and `label` columns.
      max_len: per-sentence truncation length handed to the tokenizer (the
        premise and the hypothesis are truncated independently).

    Returns:
      TensorDataset of (input_ids, attention_mask, labels); the first two are
      right-padded to the longest pair in `df`. The dataset size is printed.
    """
    cls_id = self.tokenizer.cls_token_id
    sep_id = self.tokenizer.sep_token_id
    # Pad input ids with the tokenizer's own pad id. The previous implicit
    # padding value of 0 is a real vocabulary id (it equals the CLS id for this
    # vocabulary), so padded positions looked like genuine tokens to the model.
    pad_id = self.tokenizer.pad_token_id
    if pad_id is None:
      pad_id = 0  # no pad token configured: keep the historical zero padding

    token_ids = []
    mask_ids = []
    labels = []

    rows = zip(df['premise_mk'].to_list(), df['hypothesis_mk'].to_list(), df['label'].to_list())
    for premise, hypothesis, label in rows:
      premise_id = self.tokenizer.encode(premise, add_special_tokens=False, truncation=True, max_length=max_len)
      hypothesis_id = self.tokenizer.encode(hypothesis, add_special_tokens=False, truncation=True, max_length=max_len)
      pair_token_ids = [cls_id] + premise_id + [sep_id] + hypothesis_id + [sep_id]

      token_ids.append(torch.tensor(pair_token_ids))
      # One attention flag per real token (special tokens included).
      mask_ids.append(torch.ones(len(pair_token_ids), dtype=torch.long))
      labels.append(label)

    token_ids = pad_sequence(token_ids, batch_first=True, padding_value=pad_id)
    # Padded positions get attention 0 so the model ignores them.
    mask_ids = pad_sequence(mask_ids, batch_first=True, padding_value=0)
    y = torch.tensor(labels)

    dataset = TensorDataset(token_ids, mask_ids, y)
    print(len(dataset))
    return dataset

  def get_data_loaders(self, batch_size=32, shuffle=True):
    """Build the (train_loader, val_loader) pair.

    Args:
      batch_size: number of examples per batch for both loaders.
      shuffle: whether the training loader reshuffles its data each epoch.
        The evaluation loader always iterates in stored order: the metrics are
        computed over the whole split, so shuffling it only removed determinism.

    Returns:
      Tuple (train_loader, val_loader) of torch DataLoaders.
    """
    train_loader = DataLoader(
      self.train_data,
      shuffle=shuffle,
      batch_size=batch_size
    )

    val_loader = DataLoader(
      self.test_data,
      shuffle=False,
      batch_size=batch_size
    )

    return train_loader, val_loader

In [None]:
# Tokenize both splits up front; the constructor prints the number of examples
# in the train split and then in the test split.
dataset = NLIDataset(train_df, test_df, tokenizer_path=TOKENIZER_PATH)

50946
9824


In [None]:
# Wrap the tokenized splits in DataLoaders that yield 16 examples per batch.
train_loader, test_loader = dataset.get_data_loaders(batch_size=16)

In [None]:
# This cell is for XLM-RoBERTa-Base Fine Tuning
# uncomment if you want to fine-tune this model

# from transformers import XLMRobertaForSequenceClassification

# EPOCHS = 10

# model = XLMRobertaForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=3)
# model.to(device)

In [None]:
# This cell is for our RoBERTa-Base Fine Tuning.
# It is the active configuration: comment it out (and uncomment the
# XLM-RoBERTa cell above) to fine-tune XLM-RoBERTa-Base instead.

# Number of epochs run by train(); also used to size the learning-rate schedule.
EPOCHS = 10

# num_labels=3 sizes the classification head for the label values 0, 1 and 2.
model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=3)
model.to(device)

Some weights of the model checkpoint at /content/drive/MyDrive/macedonizer.finki/mk-roberta-common-crawl/checkpoint-3800000 were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/macedonizer.finki/mk

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [None]:
# Step size for fine-tuning; every model parameter is optimized with AdamW.
LEARNING_RATE = 1e-05

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8)



In [None]:
# One scheduler step is taken per training batch, so the schedule spans every
# batch of every epoch; no warm-up steps are used.
total_training_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [None]:
# Seed every random number generator the notebook touches with the same value:
# Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices).
seed_val = 17
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [None]:
# RoBERTa-Base-All
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Only parameters with `requires_grad` set are counted.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 124,647,939 trainable parameters


In [None]:
# XLM-RoBERTa-Base
def count_parameters(model):
    """Count the elements of every parameter of `model` that requires gradients.

    Re-declaration of the identical helper defined in the previous cell.
    """
    sizes = [p.numel() for p in model.parameters() if p.requires_grad]
    return sum(sizes)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 278,045,955 trainable parameters


In [None]:
def _flatten_predictions(preds, labels):
  """Convert per-class scores and gold labels into two flat label vectors.

  Args:
    preds: 2-D array of per-example class scores; the predicted class is the
      argmax over axis 1.
    labels: array of gold labels.

  Returns:
    Tuple (preds_flat, labels_flat) of flattened arrays.
  """
  preds_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  return preds_flat, labels_flat

def single_model_performance(preds, labels):
  """Print the confusion matrix, the per-class report and the MCC score."""
  preds_flat, labels_flat = _flatten_predictions(preds, labels)
  print("Confusion Matrix:")
  print(confusion_matrix(y_true=labels_flat, y_pred=preds_flat))
  print(classification_report(y_true=labels_flat, y_pred=preds_flat))
  print(f'MCC score: {matthews_corrcoef(y_true=labels_flat, y_pred=preds_flat)}')

def matthews_corrcoef_func(preds, labels):
  """Return the Matthews correlation coefficient of the argmax predictions."""
  preds_flat, labels_flat = _flatten_predictions(preds, labels)
  return matthews_corrcoef(labels_flat, preds_flat)

def recall_score_func(preds, labels):
  """Return the support-weighted recall of the argmax predictions."""
  preds_flat, labels_flat = _flatten_predictions(preds, labels)
  return recall_score(labels_flat, preds_flat, average='weighted')

def precision_score_func(preds, labels):
  """Return the support-weighted precision of the argmax predictions."""
  preds_flat, labels_flat = _flatten_predictions(preds, labels)
  return precision_score(labels_flat, preds_flat, average='weighted')

def accuracy_score_func(preds, labels):
  """Return the accuracy of the argmax predictions."""
  preds_flat, labels_flat = _flatten_predictions(preds, labels)
  return accuracy_score(labels_flat, preds_flat)

def f1_score_func(preds, labels):
  """Return the support-weighted F1 score of the argmax predictions."""
  preds_flat, labels_flat = _flatten_predictions(preds, labels)
  return f1_score(labels_flat, preds_flat, average='weighted')

In [None]:
def evaluate(test_loader):
  """Score the notebook-level `model` on every batch of `test_loader`.

  The model is switched to eval mode and each forward pass runs without
  gradient tracking. Each batch is expected to hold (input_ids,
  attention_mask, labels) in that order.

  Returns:
    Tuple (loss_val_avg, predictions, true_vals): the loss averaged over the
    number of batches, the logits of all batches concatenated along axis 0,
    and the matching gold labels, the last two as numpy arrays.
  """
  model.eval()

  running_loss = 0
  logit_chunks = []
  label_chunks = []

  for batch in tqdm(test_loader):
    batch = tuple(tensor.to(device) for tensor in batch)
    input_ids, attention_mask, labels = batch[0], batch[1], batch[2]

    with torch.no_grad():
      outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    # With labels supplied the model output starts with (loss, logits).
    running_loss += outputs[0].item()
    logit_chunks.append(outputs[1].detach().cpu().numpy())
    label_chunks.append(labels.cpu().numpy())

  loss_val_avg = running_loss / len(test_loader)
  predictions = np.concatenate(logit_chunks, axis=0)
  true_vals = np.concatenate(label_chunks, axis=0)

  return loss_val_avg, predictions, true_vals

In [None]:
def train(model, train_loader, test_loader, optimizer, scheduler):
  """Fine-tune `model` for EPOCHS epochs and report metrics after each epoch.

  Args:
    model: sequence-classification model; called with input_ids,
      attention_mask and labels, and expected to return the loss first.
    train_loader: loader yielding (input_ids, attention_mask, labels) batches.
    test_loader: loader passed to evaluate() at the end of every epoch.
    optimizer: optimizer stepped once per training batch.
    scheduler: learning-rate scheduler stepped once per training batch.

  Returns:
    None. Training loss and the validation metrics are printed per epoch.

  NOTE: evaluation is delegated to evaluate(test_loader), which reads the
  notebook-level `model` rather than the `model` argument of this function.
  """
  for epoch in range(EPOCHS):
    model.train()

    loss_train_total = 0

    progress_bar = tqdm(train_loader,
                      desc='Epoch {:1d}'.format(epoch),
                      leave=False,
                      disable=False)

    for batch in progress_bar:

      model.zero_grad()
      batch = tuple(b.to(device) for b in batch)
      inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }

      outputs = model(**inputs)
      loss = outputs[0]
      batch_loss = loss.item()
      loss_train_total += batch_loss
      loss.backward()

      # Cap the gradient norm at 1.0 before the parameter update.
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      optimizer.step()
      scheduler.step()

      # Show the batch loss as returned by the model. It used to be divided by
      # len(batch), which is the number of tensors in the batch tuple (3), not
      # the number of examples, so the displayed value was a third of the loss.
      progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

    loss_train_avg = loss_train_total/len(train_loader)
    print(f'Epoch: {epoch}')
    tqdm.write(f'Training loss: {loss_train_avg}')

    # Evaluation
    val_loss, predictions, true_vals = evaluate(test_loader)

    val_f1 = f1_score_func(predictions, true_vals)
    print(f'Validation loss: {val_loss}')
    print(f'F1 Score (weighted): {val_f1}')
    print(f'Accuracy Score: {accuracy_score_func(predictions, true_vals)}')
    print(f'Precision Score: {precision_score_func(predictions, true_vals)}')
    print(f'Recall Score: {recall_score_func(predictions, true_vals)}')
    print(f'MCC Score: {matthews_corrcoef_func(predictions, true_vals)}')
    single_model_performance(predictions, true_vals)
    print('')

## SNLI Dataset on RoBERTa-CommonCrawl

In [None]:
# SNLI on our RoBERTa-CommonCrawl model
# Update on 26.03.2022

train(model, train_loader, test_loader, optimizer, scheduler)

Epoch 0:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 0
Training loss: 0.7829976291212968


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.6153911024040819
F1 Score (weighted): 0.7544641627890181
Accuracy Score: 0.7543770358306189
Precision Score: 0.7570683160109847
Recall Score: 0.7543770358306189
MCC Score: 0.6323967544433321
Confusion Matrix:
[[2724  468  176]
 [ 429 2390  400]
 [ 359  581 2297]]
              precision    recall  f1-score   support

           0       0.78      0.81      0.79      3368
           1       0.69      0.74      0.72      3219
           2       0.80      0.71      0.75      3237

    accuracy                           0.75      9824
   macro avg       0.76      0.75      0.75      9824
weighted avg       0.76      0.75      0.75      9824

MCC score: 0.6323967544433321



Epoch 1:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 1
Training loss: 0.5831740974818519


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.5858827198852545
F1 Score (weighted): 0.7749533118832926
Accuracy Score: 0.774735342019544
Precision Score: 0.7756081991669037
Recall Score: 0.774735342019544
MCC Score: 0.6622091722608774
Confusion Matrix:
[[2740  431  197]
 [ 382 2407  430]
 [ 275  498 2464]]
              precision    recall  f1-score   support

           0       0.81      0.81      0.81      3368
           1       0.72      0.75      0.73      3219
           2       0.80      0.76      0.78      3237

    accuracy                           0.77      9824
   macro avg       0.78      0.77      0.77      9824
weighted avg       0.78      0.77      0.77      9824

MCC score: 0.6622091722608774



Epoch 2:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 2
Training loss: 0.4722646286035632


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.6083644823768822
F1 Score (weighted): 0.7749264959173399
Accuracy Score: 0.7754478827361564
Precision Score: 0.7758213641850966
Recall Score: 0.7754478827361564
MCC Score: 0.6637565985251389
Confusion Matrix:
[[2690  395  283]
 [ 375 2272  572]
 [ 229  352 2656]]
              precision    recall  f1-score   support

           0       0.82      0.80      0.81      3368
           1       0.75      0.71      0.73      3219
           2       0.76      0.82      0.79      3237

    accuracy                           0.78      9824
   macro avg       0.78      0.78      0.77      9824
weighted avg       0.78      0.78      0.77      9824

MCC score: 0.6637565985251389



Epoch 3:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 3
Training loss: 0.3866341752975337


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.6528111761223513
F1 Score (weighted): 0.7760021006557886
Accuracy Score: 0.7759568403908795
Precision Score: 0.776227244201663
Recall Score: 0.7759568403908795
MCC Score: 0.6639924959582829
Confusion Matrix:
[[2704  440  224]
 [ 362 2335  522]
 [ 226  427 2584]]
              precision    recall  f1-score   support

           0       0.82      0.80      0.81      3368
           1       0.73      0.73      0.73      3219
           2       0.78      0.80      0.79      3237

    accuracy                           0.78      9824
   macro avg       0.78      0.78      0.78      9824
weighted avg       0.78      0.78      0.78      9824

MCC score: 0.6639924959582829



Epoch 4:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 4
Training loss: 0.3155524391628322


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.7407329901203464
F1 Score (weighted): 0.7715829974780709
Accuracy Score: 0.7730048859934854
Precision Score: 0.7743892222428843
Recall Score: 0.7730048859934854
MCC Score: 0.6612473174390021
Confusion Matrix:
[[2725  347  296]
 [ 400 2157  662]
 [ 220  305 2712]]
              precision    recall  f1-score   support

           0       0.81      0.81      0.81      3368
           1       0.77      0.67      0.72      3219
           2       0.74      0.84      0.79      3237

    accuracy                           0.77      9824
   macro avg       0.77      0.77      0.77      9824
weighted avg       0.77      0.77      0.77      9824

MCC score: 0.6612473174390021



Epoch 5:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 5
Training loss: 0.26737043040788483


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.8357256421241424
F1 Score (weighted): 0.780677311118476
Accuracy Score: 0.7809446254071661
Precision Score: 0.7810596719077794
Recall Score: 0.7809446254071661
MCC Score: 0.6716797871211283
Confusion Matrix:
[[2709  417  242]
 [ 371 2313  535]
 [ 204  383 2650]]
              precision    recall  f1-score   support

           0       0.82      0.80      0.81      3368
           1       0.74      0.72      0.73      3219
           2       0.77      0.82      0.80      3237

    accuracy                           0.78      9824
   macro avg       0.78      0.78      0.78      9824
weighted avg       0.78      0.78      0.78      9824

MCC score: 0.6716797871211283



Epoch 6:   0%|          | 0/3185 [00:00<?, ?it/s]

## SNLI Dataset on XLM-RoBERTa-Model

In [None]:
# SNLI on XLM-RoBERTa-Base model
train(model, train_loader, test_loader, optimizer, scheduler)

Epoch 0:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 0
Training loss: 1.0068349694906262


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.9183981688480812
F1 Score (weighted): 0.5571119862642198
Confusion Matrix:
[[2289  357  722]
 [ 787 1052 1380]
 [ 572  400 2265]]
              precision    recall  f1-score   support

           0       0.63      0.68      0.65      3368
           1       0.58      0.33      0.42      3219
           2       0.52      0.70      0.60      3237

    accuracy                           0.57      9824
   macro avg       0.58      0.57      0.56      9824
weighted avg       0.58      0.57      0.56      9824

MCC score: 0.36483712893391657



Epoch 1:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 1
Training loss: 0.8939827317907073


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.8373152639850343
F1 Score (weighted): 0.6220215585893654
Confusion Matrix:
[[2280  684  404]
 [ 643 1974  602]
 [ 472  912 1853]]
              precision    recall  f1-score   support

           0       0.67      0.68      0.67      3368
           1       0.55      0.61      0.58      3219
           2       0.65      0.57      0.61      3237

    accuracy                           0.62      9824
   macro avg       0.62      0.62      0.62      9824
weighted avg       0.62      0.62      0.62      9824

MCC score: 0.4332685478609647



Epoch 2:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 2
Training loss: 0.8291568055912688


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.8178536672060187
F1 Score (weighted): 0.6432363490980482
Confusion Matrix:
[[2348  601  419]
 [ 621 1864  734]
 [ 436  691 2110]]
              precision    recall  f1-score   support

           0       0.69      0.70      0.69      3368
           1       0.59      0.58      0.58      3219
           2       0.65      0.65      0.65      3237

    accuracy                           0.64      9824
   macro avg       0.64      0.64      0.64      9824
weighted avg       0.64      0.64      0.64      9824

MCC score: 0.4651511461257843



Epoch 3:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 3
Training loss: 0.7764073398554905


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.8137963153057844
F1 Score (weighted): 0.6438967767017303
Confusion Matrix:
[[2484  625  259]
 [ 665 2113  441]
 [ 548  946 1743]]
              precision    recall  f1-score   support

           0       0.67      0.74      0.70      3368
           1       0.57      0.66      0.61      3219
           2       0.71      0.54      0.61      3237

    accuracy                           0.65      9824
   macro avg       0.65      0.64      0.64      9824
weighted avg       0.65      0.65      0.64      9824

MCC score: 0.47115460933978204



Epoch 4:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 4
Training loss: 0.7297686177716712


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.8238333021973165
F1 Score (weighted): 0.6477478043352991
Confusion Matrix:
[[2504  358  506]
 [ 721 1645  853]
 [ 522  462 2253]]
              precision    recall  f1-score   support

           0       0.67      0.74      0.70      3368
           1       0.67      0.51      0.58      3219
           2       0.62      0.70      0.66      3237

    accuracy                           0.65      9824
   macro avg       0.65      0.65      0.65      9824
weighted avg       0.65      0.65      0.65      9824

MCC score: 0.4800818574530782



Epoch 5:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 5
Training loss: 0.6864344215842206


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.8253627421170571
F1 Score (weighted): 0.6629592538362475
Confusion Matrix:
[[2448  534  386]
 [ 603 2002  614]
 [ 475  695 2067]]
              precision    recall  f1-score   support

           0       0.69      0.73      0.71      3368
           1       0.62      0.62      0.62      3219
           2       0.67      0.64      0.66      3237

    accuracy                           0.66      9824
   macro avg       0.66      0.66      0.66      9824
weighted avg       0.66      0.66      0.66      9824

MCC score: 0.49500641510185783



Epoch 6:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 6
Training loss: 0.6427226916587147


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.850992122331349
F1 Score (weighted): 0.660110814585215
Confusion Matrix:
[[2407  603  358]
 [ 570 2051  598]
 [ 463  748 2026]]
              precision    recall  f1-score   support

           0       0.70      0.71      0.71      3368
           1       0.60      0.64      0.62      3219
           2       0.68      0.63      0.65      3237

    accuracy                           0.66      9824
   macro avg       0.66      0.66      0.66      9824
weighted avg       0.66      0.66      0.66      9824

MCC score: 0.4902662801539418



Epoch 7:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 7
Training loss: 0.6049902169352611


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.8545984355945152
F1 Score (weighted): 0.6599431657370037
Confusion Matrix:
[[2396  503  469]
 [ 569 1882  768]
 [ 419  606 2212]]
              precision    recall  f1-score   support

           0       0.71      0.71      0.71      3368
           1       0.63      0.58      0.61      3219
           2       0.64      0.68      0.66      3237

    accuracy                           0.66      9824
   macro avg       0.66      0.66      0.66      9824
weighted avg       0.66      0.66      0.66      9824

MCC score: 0.49115801657648755



Epoch 8:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 8
Training loss: 0.5787533670523477


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.8952408880238036
F1 Score (weighted): 0.6589695636446548
Confusion Matrix:
[[2345  475  548]
 [ 532 1814  873]
 [ 395  516 2326]]
              precision    recall  f1-score   support

           0       0.72      0.70      0.71      3368
           1       0.65      0.56      0.60      3219
           2       0.62      0.72      0.67      3237

    accuracy                           0.66      9824
   macro avg       0.66      0.66      0.66      9824
weighted avg       0.66      0.66      0.66      9824

MCC score: 0.4918001970877914



Epoch 9:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 9
Training loss: 0.5573024438141466


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.9085395886329175
F1 Score (weighted): 0.6622434205984314
Confusion Matrix:
[[2387  537  444]
 [ 557 1938  724]
 [ 430  623 2184]]
              precision    recall  f1-score   support

           0       0.71      0.71      0.71      3368
           1       0.63      0.60      0.61      3219
           2       0.65      0.67      0.66      3237

    accuracy                           0.66      9824
   macro avg       0.66      0.66      0.66      9824
weighted avg       0.66      0.66      0.66      9824

MCC score: 0.49382131443163224



In [None]:
# Deliberate stop: this assertion always fails, so "Run all" halts at this cell.
# Presumably this keeps the archived training runs below from being re-executed
# unintentionally -- confirm before removing.
assert True == False

AssertionError: ignored

## SNLI Dataset On Our RoBERTa Model

In [None]:
# SNLI on our RoBERTa-Base model
# Update on 08.03.2022

# Run this cell!
train(model, train_loader, test_loader, optimizer, scheduler)

Epoch 0:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 0
Training loss: 0.7878371387860464


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.6369077361610502
F1 Score (weighted): 0.7363613026454001
Accuracy Score: 0.7375814332247557
Precision Score: 0.7394583806162941
Recall Score: 0.7375814332247557
MCC Score: 0.6076960516218738
Confusion Matrix:
[[2791  355  222]
 [ 560 2251  408]
 [ 475  558 2204]]
              precision    recall  f1-score   support

           0       0.73      0.83      0.78      3368
           1       0.71      0.70      0.71      3219
           2       0.78      0.68      0.73      3237

    accuracy                           0.74      9824
   macro avg       0.74      0.74      0.74      9824
weighted avg       0.74      0.74      0.74      9824

MCC score: 0.6076960516218738



Epoch 1:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 1
Training loss: 0.6045962669879339


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.616665524785507
F1 Score (weighted): 0.7596416618254588
Accuracy Score: 0.7592630293159609
Precision Score: 0.762382378532777
Recall Score: 0.7592630293159609
MCC Score: 0.639794929086689
Confusion Matrix:
[[2693  463  212]
 [ 407 2442  370]
 [ 308  605 2324]]
              precision    recall  f1-score   support

           0       0.79      0.80      0.79      3368
           1       0.70      0.76      0.73      3219
           2       0.80      0.72      0.76      3237

    accuracy                           0.76      9824
   macro avg       0.76      0.76      0.76      9824
weighted avg       0.76      0.76      0.76      9824

MCC score: 0.639794929086689



Epoch 2:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 2
Training loss: 0.4961018963677449


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.6329752365278887
F1 Score (weighted): 0.7724020083784263
Accuracy Score: 0.7728013029315961
Precision Score: 0.7722840130993939
Recall Score: 0.7728013029315961
MCC Score: 0.659197050106449
Confusion Matrix:
[[2746  371  251]
 [ 408 2304  507]
 [ 276  419 2542]]
              precision    recall  f1-score   support

           0       0.80      0.82      0.81      3368
           1       0.74      0.72      0.73      3219
           2       0.77      0.79      0.78      3237

    accuracy                           0.77      9824
   macro avg       0.77      0.77      0.77      9824
weighted avg       0.77      0.77      0.77      9824

MCC score: 0.659197050106449



Epoch 3:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 3
Training loss: 0.4062147360874027


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.6557362351069039
F1 Score (weighted): 0.7714805879940664
Accuracy Score: 0.7717833876221498
Precision Score: 0.7714313293584545
Recall Score: 0.7717833876221498
MCC Score: 0.6576265490429107
Confusion Matrix:
[[2768  371  229]
 [ 425 2336  458]
 [ 293  466 2478]]
              precision    recall  f1-score   support

           0       0.79      0.82      0.81      3368
           1       0.74      0.73      0.73      3219
           2       0.78      0.77      0.77      3237

    accuracy                           0.77      9824
   macro avg       0.77      0.77      0.77      9824
weighted avg       0.77      0.77      0.77      9824

MCC score: 0.6576265490429107



Epoch 4:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 4
Training loss: 0.33433700605723976


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.729596388347277
F1 Score (weighted): 0.7717279797098533
Accuracy Score: 0.7717833876221498
Precision Score: 0.7726391729467659
Recall Score: 0.7717833876221498
MCC Score: 0.6579192157290831
Confusion Matrix:
[[2769  383  216]
 [ 429 2404  386]
 [ 298  530 2409]]
              precision    recall  f1-score   support

           0       0.79      0.82      0.81      3368
           1       0.72      0.75      0.74      3219
           2       0.80      0.74      0.77      3237

    accuracy                           0.77      9824
   macro avg       0.77      0.77      0.77      9824
weighted avg       0.77      0.77      0.77      9824

MCC score: 0.6579192157290831



Epoch 5:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 5
Training loss: 0.2797709536061767


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.8225783225822381
F1 Score (weighted): 0.7698285166591693
Accuracy Score: 0.7702565146579805
Precision Score: 0.770034837289666
Recall Score: 0.7702565146579805
MCC Score: 0.6555684687853816
Confusion Matrix:
[[2716  374  278]
 [ 397 2276  546]
 [ 258  404 2575]]
              precision    recall  f1-score   support

           0       0.81      0.81      0.81      3368
           1       0.75      0.71      0.73      3219
           2       0.76      0.80      0.78      3237

    accuracy                           0.77      9824
   macro avg       0.77      0.77      0.77      9824
weighted avg       0.77      0.77      0.77      9824

MCC score: 0.6555684687853816



Epoch 6:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 6
Training loss: 0.23792305719019666


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.9196614110432808
F1 Score (weighted): 0.7717477492998851
Accuracy Score: 0.7716815960912052
Precision Score: 0.7720544819460122
Recall Score: 0.7716815960912052
MCC Score: 0.6575224485211508
Confusion Matrix:
[[2739  399  230]
 [ 420 2374  425]
 [ 260  509 2468]]
              precision    recall  f1-score   support

           0       0.80      0.81      0.81      3368
           1       0.72      0.74      0.73      3219
           2       0.79      0.76      0.78      3237

    accuracy                           0.77      9824
   macro avg       0.77      0.77      0.77      9824
weighted avg       0.77      0.77      0.77      9824

MCC score: 0.6575224485211508



Epoch 7:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 7
Training loss: 0.20731293063017203


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 1.0269291950799282
F1 Score (weighted): 0.7698217513212088
Accuracy Score: 0.769849348534202
Precision Score: 0.7698044022877338
Recall Score: 0.769849348534202
MCC Score: 0.6546924501451677
Confusion Matrix:
[[2709  419  240]
 [ 412 2342  465]
 [ 268  457 2512]]
              precision    recall  f1-score   support

           0       0.80      0.80      0.80      3368
           1       0.73      0.73      0.73      3219
           2       0.78      0.78      0.78      3237

    accuracy                           0.77      9824
   macro avg       0.77      0.77      0.77      9824
weighted avg       0.77      0.77      0.77      9824

MCC score: 0.6546924501451677



Epoch 8:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 8
Training loss: 0.18466261490434555


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 1.137490622596876
F1 Score (weighted): 0.7712251133345844
Accuracy Score: 0.7708672638436482
Precision Score: 0.7718358985586519
Recall Score: 0.7708672638436482
MCC Score: 0.6564242541438758
Confusion Matrix:
[[2660  449  259]
 [ 363 2384  472]
 [ 233  475 2529]]
              precision    recall  f1-score   support

           0       0.82      0.79      0.80      3368
           1       0.72      0.74      0.73      3219
           2       0.78      0.78      0.78      3237

    accuracy                           0.77      9824
   macro avg       0.77      0.77      0.77      9824
weighted avg       0.77      0.77      0.77      9824

MCC score: 0.6564242541438758



Epoch 9:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 9
Training loss: 0.1715386151632563


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 1.1757785232998552
F1 Score (weighted): 0.7719810410002043
Accuracy Score: 0.7717833876221498
Precision Score: 0.7722407014790428
Recall Score: 0.7717833876221498
MCC Score: 0.6576701889491979
Confusion Matrix:
[[2693  427  248]
 [ 383 2371  465]
 [ 244  475 2518]]
              precision    recall  f1-score   support

           0       0.81      0.80      0.81      3368
           1       0.72      0.74      0.73      3219
           2       0.78      0.78      0.78      3237

    accuracy                           0.77      9824
   macro avg       0.77      0.77      0.77      9824
weighted avg       0.77      0.77      0.77      9824

MCC score: 0.6576701889491979



In [None]:
# SNLI on our RoBERTa-Base model: run the fine-tuning loop. The per-epoch
# training loss and validation metrics are printed in this cell's output.
# NOTE(review): `model`, `train_loader`, `test_loader`, `optimizer` and
# `scheduler` are not defined in this cell; the same names are also passed by
# the other training cells in this notebook, so confirm that the setup cells
# for this model/dataset were the last ones executed before running this.
train(model, train_loader, test_loader, optimizer, scheduler)

Epoch 0:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 0
Training loss: 0.7894805244972792


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.6359457686446389
F1 Score (weighted): 0.7377085092055004
Confusion Matrix:
[[2793  325  250]
 [ 581 2106  532]
 [ 460  412 2365]]
              precision    recall  f1-score   support

           0       0.73      0.83      0.78      3368
           1       0.74      0.65      0.69      3219
           2       0.75      0.73      0.74      3237

    accuracy                           0.74      9824
   macro avg       0.74      0.74      0.74      9824
weighted avg       0.74      0.74      0.74      9824

MCC score: 0.6103734650769178



Epoch 1:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 1
Training loss: 0.609613647851128


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.6096735503949249
F1 Score (weighted): 0.7595516885059291
Confusion Matrix:
[[2619  471  278]
 [ 374 2391  454]
 [ 285  504 2448]]
              precision    recall  f1-score   support

           0       0.80      0.78      0.79      3368
           1       0.71      0.74      0.73      3219
           2       0.77      0.76      0.76      3237

    accuracy                           0.76      9824
   macro avg       0.76      0.76      0.76      9824
weighted avg       0.76      0.76      0.76      9824

MCC score: 0.6389122716358716



Epoch 2:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 2
Training loss: 0.4986506356619404


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.6255365789591878
F1 Score (weighted): 0.766518462410234
Confusion Matrix:
[[2644  448  276]
 [ 352 2339  528]
 [ 241  451 2545]]
              precision    recall  f1-score   support

           0       0.82      0.79      0.80      3368
           1       0.72      0.73      0.72      3219
           2       0.76      0.79      0.77      3237

    accuracy                           0.77      9824
   macro avg       0.77      0.77      0.77      9824
weighted avg       0.77      0.77      0.77      9824

MCC score: 0.6496050239102458



Epoch 3:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 3
Training loss: 0.40522705496959915


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.6710031564902599
F1 Score (weighted): 0.7711366633709529
Confusion Matrix:
[[2812  321  235]
 [ 482 2291  446]
 [ 311  446 2480]]
              precision    recall  f1-score   support

           0       0.78      0.83      0.81      3368
           1       0.75      0.71      0.73      3219
           2       0.78      0.77      0.78      3237

    accuracy                           0.77      9824
   macro avg       0.77      0.77      0.77      9824
weighted avg       0.77      0.77      0.77      9824

MCC score: 0.658026834524874



Epoch 4:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 4
Training loss: 0.32989423555180175


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.741452163282527
F1 Score (weighted): 0.7711017064204738
Confusion Matrix:
[[2716  389  263]
 [ 396 2339  484]
 [ 256  460 2521]]
              precision    recall  f1-score   support

           0       0.81      0.81      0.81      3368
           1       0.73      0.73      0.73      3219
           2       0.77      0.78      0.78      3237

    accuracy                           0.77      9824
   macro avg       0.77      0.77      0.77      9824
weighted avg       0.77      0.77      0.77      9824

MCC score: 0.6566952042908404



Epoch 5:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 5
Training loss: 0.2740058112728703


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.8499318361221676
F1 Score (weighted): 0.7698235266738883
Confusion Matrix:
[[2665  455  248]
 [ 369 2319  531]
 [ 228  431 2578]]
              precision    recall  f1-score   support

           0       0.82      0.79      0.80      3368
           1       0.72      0.72      0.72      3219
           2       0.77      0.80      0.78      3237

    accuracy                           0.77      9824
   macro avg       0.77      0.77      0.77      9824
weighted avg       0.77      0.77      0.77      9824

MCC score: 0.6547549486526985



Epoch 6:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 6
Training loss: 0.22873671522562858


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 0.9213605481275761
F1 Score (weighted): 0.7723677986106365
Confusion Matrix:
[[2704  444  220]
 [ 382 2413  424]
 [ 257  513 2467]]
              precision    recall  f1-score   support

           0       0.81      0.80      0.81      3368
           1       0.72      0.75      0.73      3219
           2       0.79      0.76      0.78      3237

    accuracy                           0.77      9824
   macro avg       0.77      0.77      0.77      9824
weighted avg       0.77      0.77      0.77      9824

MCC score: 0.6581430369161145



Epoch 7:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 7
Training loss: 0.19944221310754667


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 1.0826856297537903
F1 Score (weighted): 0.7694858450090646
Confusion Matrix:
[[2643  452  273]
 [ 365 2348  506]
 [ 219  452 2566]]
              precision    recall  f1-score   support

           0       0.82      0.78      0.80      3368
           1       0.72      0.73      0.73      3219
           2       0.77      0.79      0.78      3237

    accuracy                           0.77      9824
   macro avg       0.77      0.77      0.77      9824
weighted avg       0.77      0.77      0.77      9824

MCC score: 0.6540551507696727



Epoch 8:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 8
Training loss: 0.17506024529974692


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 1.1492493315486099
F1 Score (weighted): 0.7697040800788267
Confusion Matrix:
[[2681  413  274]
 [ 374 2326  519]
 [ 230  453 2554]]
              precision    recall  f1-score   support

           0       0.82      0.80      0.81      3368
           1       0.73      0.72      0.73      3219
           2       0.76      0.79      0.78      3237

    accuracy                           0.77      9824
   macro avg       0.77      0.77      0.77      9824
weighted avg       0.77      0.77      0.77      9824

MCC score: 0.6545536893592653



Epoch 9:   0%|          | 0/3185 [00:00<?, ?it/s]

Epoch: 9
Training loss: 0.16485650994642412


  0%|          | 0/614 [00:00<?, ?it/s]

Validation loss: 1.199278458062535
F1 Score (weighted): 0.7696899084257834
Confusion Matrix:
[[2672  448  248]
 [ 374 2354  491]
 [ 236  468 2533]]
              precision    recall  f1-score   support

           0       0.81      0.79      0.80      3368
           1       0.72      0.73      0.73      3219
           2       0.77      0.78      0.78      3237

    accuracy                           0.77      9824
   macro avg       0.77      0.77      0.77      9824
weighted avg       0.77      0.77      0.77      9824

MCC score: 0.6542147152477191



## Earlier Dataset on our RoBERTa Model

In [None]:
# Our RoBERTa-Base-All: run the fine-tuning loop. The per-epoch training loss
# and validation metrics are printed in this cell's output.
# NOTE(review): `model`, `train_loader`, `test_loader`, `optimizer` and
# `scheduler` are not defined in this cell; the same names are also passed by
# the other training cells in this notebook, so confirm that the setup cells
# for this model/dataset were the last ones executed before running this.
train(model, train_loader, test_loader, optimizer, scheduler)

Epoch 0:   0%|          | 0/1444 [00:00<?, ?it/s]

Epoch: 0
Training loss: 0.41131512911015106


  0%|          | 0/133 [00:00<?, ?it/s]

Validation loss: 0.4366325318701285
F1 Score (weighted): 0.8055018196043585
Confusion Matrix:
[[ 654  188]
 [ 227 1057]]
              precision    recall  f1-score   support

           0       0.74      0.78      0.76       842
           2       0.85      0.82      0.84      1284

    accuracy                           0.80      2126
   macro avg       0.80      0.80      0.80      2126
weighted avg       0.81      0.80      0.81      2126

MCC score: 0.595617015298504



Epoch 1:   0%|          | 0/1444 [00:00<?, ?it/s]

Epoch: 1
Training loss: 0.24042673267609854


  0%|          | 0/133 [00:00<?, ?it/s]

Validation loss: 0.5291104168260008
F1 Score (weighted): 0.820993749920834
Confusion Matrix:
[[ 624  218]
 [ 160 1124]]
              precision    recall  f1-score   support

           0       0.80      0.74      0.77       842
           2       0.84      0.88      0.86      1284

    accuracy                           0.82      2126
   macro avg       0.82      0.81      0.81      2126
weighted avg       0.82      0.82      0.82      2126

MCC score: 0.6249203976578552



Epoch 2:   0%|          | 0/1444 [00:00<?, ?it/s]

Epoch: 2
Training loss: 0.17198881463981633


  0%|          | 0/133 [00:00<?, ?it/s]

Validation loss: 0.7231472676367801
F1 Score (weighted): 0.8234196480069723
Confusion Matrix:
[[ 638  204]
 [ 170 1114]]
              precision    recall  f1-score   support

           0       0.79      0.76      0.77       842
           2       0.85      0.87      0.86      1284

    accuracy                           0.82      2126
   macro avg       0.82      0.81      0.81      2126
weighted avg       0.82      0.82      0.82      2126

MCC score: 0.6300545595447852



Epoch 3:   0%|          | 0/1444 [00:00<?, ?it/s]

Epoch: 3
Training loss: 0.13220505458307633


  0%|          | 0/133 [00:00<?, ?it/s]

Validation loss: 0.7807049735946426
F1 Score (weighted): 0.8164481495304896
Confusion Matrix:
[[ 623  219]
 [ 169 1115]]
              precision    recall  f1-score   support

           0       0.79      0.74      0.76       842
           2       0.84      0.87      0.85      1284

    accuracy                           0.82      2126
   macro avg       0.81      0.80      0.81      2126
weighted avg       0.82      0.82      0.82      2126

MCC score: 0.6153258993804965



Epoch 4:   0%|          | 0/1444 [00:00<?, ?it/s]

Epoch: 4
Training loss: 0.09441861466653297


  0%|          | 0/133 [00:00<?, ?it/s]

Validation loss: 0.9746032917304874
F1 Score (weighted): 0.8188901536839144
Confusion Matrix:
[[ 662  180]
 [ 206 1078]]
              precision    recall  f1-score   support

           0       0.76      0.79      0.77       842
           2       0.86      0.84      0.85      1284

    accuracy                           0.82      2126
   macro avg       0.81      0.81      0.81      2126
weighted avg       0.82      0.82      0.82      2126

MCC score: 0.622680132496728



## Earlier Dataset on XLM-RoBERTa-Base

In [None]:
# XLM-RoBERTa-Base: run the fine-tuning loop. The per-epoch training loss and
# validation metrics are printed in this cell's output.
# NOTE(review): `model`, `train_loader`, `test_loader`, `optimizer` and
# `scheduler` are not defined in this cell; the same names are also passed by
# the other training cells in this notebook, so confirm that the setup cells
# for this model/dataset were the last ones executed before running this.
train(model, train_loader, test_loader, optimizer, scheduler)

Epoch 0:   0%|          | 0/1444 [00:00<?, ?it/s]

Epoch: 0
Training loss: 0.3904455008508942


  0%|          | 0/133 [00:00<?, ?it/s]

Validation loss: 0.39346628102536935
F1 Score (weighted): 0.8567935939656646
Confusion Matrix:
[[ 617  225]
 [  73 1211]]
              precision    recall  f1-score   support

           0       0.89      0.73      0.81       842
           2       0.84      0.94      0.89      1284

    accuracy                           0.86      2126
   macro avg       0.87      0.84      0.85      2126
weighted avg       0.86      0.86      0.86      2126

MCC score: 0.7060502852050432



Epoch 1:   0%|          | 0/1444 [00:00<?, ?it/s]

Epoch: 1
Training loss: 0.2424309109092543


  0%|          | 0/133 [00:00<?, ?it/s]

In [None]:
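# Dataset: https://huggingface.co/datasets/newsph_nli
# Note: GOOGLETRANSLATE / Excel -- presumably the spreadsheet function used to
# translate the data; TODO confirm and document the exact translation step.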

In [None]:
# https://huggingface.co/datasets/snli

In [None]:
# NOTE: the entire `train` definition below is commented out, so running this
# cell defines nothing. It is kept only as a reference copy of a training loop.
# What the commented code does, per epoch:
#   1. puts the model in train mode and iterates over `train_loader`,
#      feeding batch[0]/batch[1]/batch[2] as input_ids/attention_mask/labels;
#   2. back-propagates outputs[0] as the loss, clips the gradient norm to 1.0,
#      then steps the optimizer and the scheduler;
#   3. prints the mean training loss, calls `evaluate(test_loader)` and prints
#      the validation loss, the value labelled "F1 Score (weighted)" and the
#      output of `single_model_performance`.
# NOTE(review): `total_step` and `start` are assigned but never used.
# NOTE(review): the progress-bar postfix divides the loss by len(batch); at
# that point `batch` is the tuple of tensors, so this is the number of tensors
# in the tuple, not the number of examples -- confirm this is intended.
# NOTE(review): `EPOCHS`, `device`, `evaluate`, `f1_score_func` and
# `single_model_performance` are not defined in this cell.
# def train(model, train_loader, test_loader, optimizer, scheduler):
#   total_step = len(train_loader)

#   for epoch in range(EPOCHS):
#     start = time.time()
#     model.train()

#     loss_train_total = 0

#     progress_bar = tqdm(train_loader,
#                       desc='Epoch {:1d}'.format(epoch),
#                       leave=False,
#                       disable=False)

#     for batch in progress_bar:

#       model.zero_grad()
#       batch = tuple(b.to(device) for b in batch)
#       inputs = {
#             'input_ids': batch[0],
#             'attention_mask': batch[1],
#             'labels': batch[2]
#         }

#       outputs = model(**inputs)
#       loss = outputs[0]
#       loss_train_total += loss.item()
#       loss.backward()

#       torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

#       optimizer.step()
#       scheduler.step()

#       progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

#     loss_train_avg = loss_train_total/len(train_loader)
#     print(f'Epoch: {epoch}')
#     tqdm.write(f'Training loss: {loss_train_avg}')

#     # Evaluation
#     val_loss, predictions, true_vals = evaluate(test_loader)

#     val_f1 = f1_score_func(predictions, true_vals)
#     print(f'Validation loss: {val_loss}')
#     print(f'F1 Score (weighted): {val_f1}')
#     single_model_performance(predictions, true_vals)
#     print('')

def token_accuracy(true, predicted):
  """Return the fraction of positions where the predicted tag equals the true tag.

  Args:
    true: iterable of tag sequences (e.g. the column of true tags), one
      sequence per example.
    predicted: iterable of tag sequences (e.g. the column of predicted tags),
      aligned example-by-example with ``true``.

  Returns:
    Number of matching tags divided by the number of tags compared, as a
    float; 0.0 when nothing was compared.

  Sequences are paired with ``zip``, so extra examples, or extra tags within
  an example, on the longer side are ignored.
  """
  count = 0  # tags where the prediction equals the true tag
  suma = 0   # total number of tags compared
  for t, p in zip(true, predicted):
    for i, j in zip(t, p):
      if i == j:
        count += 1
      suma += 1
  # Guard the empty case so the function never raises ZeroDivisionError.
  return count / suma if suma else 0.0


# Usage (run in its own cell once the true/predicted columns are available):
#   token_accuracy(true, predicted)