In [1]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

import evaluate
from datasets import load_dataset

from tqdm.notebook import tqdm

In [2]:
import warnings
warnings.filterwarnings('ignore', category = FutureWarning)

In [3]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
dataset = load_dataset('multi_nli')

In [5]:
def filter_labels(example):
    return example['label'] != -1

dataset['validation_matched'] = dataset['validation_matched'].filter(filter_labels)

In [6]:
max_length = 128
def tokenize_fn(example):
    
    enc = tokenizer(
        example['premise'],
        example['hypothesis'],
        padding  ='max_length',
        truncation = True,
        max_length = max_length
    )
    
    return {
        'input_ids' : enc['input_ids'],
        'attention_mask' : enc['attention_mask'],
        'label' : example['label']
    }

In [7]:
dataset = dataset.map(tokenize_fn, batched = True, batch_size = 2000)
dataset.set_format(type = 'torch', columns = ['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

In [8]:
train_data = dataset['train'].shuffle(seed = 42).select(range(100000))
val_data = dataset['validation_matched']

In [9]:
accuracy = evaluate.load('accuracy')
f1 = evaluate.load('f1')

In [10]:
def compute_metrics(eval_pred):
    
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis = -1)
    
    acc_result = accuracy.compute(predictions = predictions, references = labels)
    f1_result = f1.compute(predictions = predictions, references = labels, average = 'weighted')

    return {**acc_result, **f1_result}

In [11]:
training_args = TrainingArguments(
    output_dir = './results',
    eval_strategy = 'epoch',
    save_strategy = 'epoch',
    num_train_epochs = 2,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    learning_rate = 0.00001,
    weight_decay = 0.08,
    warmup_ratio = 0.2,
    logging_dir = './logs',
    logging_strategy = 'steps',
    logging_steps = 20,
    report_to = 'none',
    load_best_model_at_end = True,
    metric_for_best_model = 'accuracy',
    save_total_limit = 1,
    fp16 = True
)

In [12]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_data,
    eval_dataset = val_data,
    compute_metrics = compute_metrics,
    processing_class = tokenizer
)

In [13]:
trainer.train()

final_eval = trainer.evaluate()
print('Final Evaluation : \n')
print(f"Eval Loss: {final_eval['eval_loss']:.4f} | Eval Accuracy: {final_eval['eval_accuracy']:.4f} | Eval F1 Score: {final_eval['eval_f1']:.4f}")

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6972,0.555298,0.777585,0.777764
2,0.4173,0.552073,0.80326,0.803676


Final Evaluation : 

Eval Loss: 0.5521 | Eval Accuracy: 0.8033 | Eval F1 Score: 0.8037
