In [12]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

import evaluate
from datasets import load_dataset

from tqdm.notebook import tqdm

In [None]:
import warnings
warnings.filterwarnings("ignore", category = FutureWarning)

In [2]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
dataset = load_dataset('multi_nli')

In [4]:
def filter_labels(example):
    return example['label'] != -1

dataset['validation_matched'] = dataset['validation_matched'].filter(filter_labels)

In [5]:
max_length = 256
def tokenize_fn(example):
    
    enc = tokenizer(
        example['premise'],
        example['hypothesis'],
        padding  ='max_length',
        truncation = True,
        max_length = max_length
    )
    
    return {
        'input_ids' : enc['input_ids'],
        'attention_mask' : enc['attention_mask'],
        'label' : example['label']
    }

In [6]:
dataset = dataset.map(tokenize_fn, batched = False)
dataset.set_format(type = 'torch', columns = ['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/9832 [00:00<?, ? examples/s]

In [7]:
train_data = dataset['train'].shuffle(seed = 42).select(range(5000))
val_data = dataset['validation_matched'].select(range(1000))

In [8]:
accuracy = evaluate.load('accuracy')
f1 = evaluate.load('f1')

In [9]:
def compute_metrics(eval_pred):
    
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis = -1)
    
    acc_result = accuracy.compute(predictions = predictions, references = labels)
    f1_result = f1.compute(predictions = predictions, references = labels, average = 'weighted')

    return {**acc_result, **f1_result}

In [10]:
training_args = TrainingArguments(
    output_dir = './results-bert-mnli',
    eval_strategy = 'epoch',
    save_strategy = 'epoch',
    num_train_epochs = 5,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 64,
    learning_rate = 0.0002,
    weight_decay = 0.01,
    logging_dir = './logs',
    report_to = 'none',
    load_best_model_at_end = True,
    metric_for_best_model = 'accuracy',
    save_total_limit = 1 
)

In [11]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_data,
    eval_dataset = val_data,
    compute_metrics = compute_metrics,
    processing_class = tokenizer
)

In [13]:
trainer.train()

final_eval = trainer.evaluate()
print('Final Evaluation : \n')
print(f'Eval Loss: {final_eval['eval_loss']:.4f} | Eval Accuracy: {final_eval['eval_accuracy']:.4f} | Eval F1 Score: {final_eval['eval_f1']:4f}')

  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.81564,0.643,0.642096
2,0.881300,0.794645,0.66,0.660677
3,0.881300,0.822424,0.668,0.668414


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


Final Evaluation : {'eval_loss': 0.8224238157272339, 'eval_accuracy': 0.668, 'eval_f1': 0.668413636750832, 'eval_runtime': 9.3947, 'eval_samples_per_second': 106.443, 'eval_steps_per_second': 3.406, 'epoch': 3.0}
