# NLI Example

Based on https://talman.io/nli/pytorch/demo/2020/12/11/natural-language-inference-with-pytorch-and-transformers.html

In [20]:
!export CUDA_VISIBLE_DEVICES=""

In [21]:
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, AdamW, logging
import datasets
from tqdm import tqdm
import numpy as np

In [22]:
nli_data = datasets.load_dataset("multi_nli")

train_dataset = nli_data['train'].select(range(20000)) 
# limiting the training set size to 20,000 for demo purposes
dev_dataset = nli_data['validation_matched']
test_dataset = nli_data[]

Reusing dataset multi_nli (/home/jmperez/.cache/huggingface/datasets/multi_nli/plain_text/1.0.0/9969e1448f410fe7c6c688a84bfcb61312d0a3f2741d57341c26ef99f28a5451)


In [23]:
num = 0

# Clearer if it is a dict
label_name = {
    0: "entailment",
    1: "neutral", 
    2: "contradiction",
}

for num in range(170, 185):
    print("="*80, "\n")
    print("Hypothesis: ", train_dataset["hypothesis"][num])
    print("Premise   : ", train_dataset["premise"][num])
    print("Label     : ", label_name[train_dataset["label"][num]])


Hypothesis:  Workers carve sculptures and paint scrolls with great enthusiasm.
Premise   :  The individual artisans' shops are no longer here, but you can visit a silk-weaving factory, a ceramics plant, and the Foshan Folk Art Studio, where you can observe workers making Chinese lanterns, carving sculptures, painting scrolls, and cutting intricate designs in paper.
Label     :  neutral

Hypothesis:  Sir Ernest bent his head slightly, and continued.
Premise   :  Really, Sir Ernest, protested the judge, "these questions are not relevant." Sir Ernest bowed, and having shot his arrow proceeded. 
Label     :  entailment

Hypothesis:  The house is very large and boasts over ten bedrooms, a huge kitchen, and a full sized olympic pool.
Premise   :  The house is surprisingly small and simple, with one bedroom, a tiny kitchen, and a couple of social rooms.
Label     :  contradiction

Hypothesis:  Hiding things is just dirty, whereas there is glory in fiction
Premise   :  Fiction has its glories

In [25]:
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

tokenizer.model_max_length = 256

def tokenize(batch):
    return tokenizer(batch['premise'], batch['hypothesis'], padding='max_length', truncation=True)

batch_size = 16
eval_batch_size = 8

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=batch_size)
dev_dataset = dev_dataset.map(tokenize, batched=True, batch_size=eval_batch_size)



HBox(children=(FloatProgress(value=0.0, max=1250.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1227.0), HTML(value='')))




You can check the tokenizer has added a `[SEP]` token



In [26]:
lens = {sum(example["attention_mask"]) for example in train_dataset}



In [27]:
max(lens)

256

In [28]:
len([l for l in lens if l >= 256])

1

Uso 256!

In [29]:
example = train_dataset[0]

print(example.keys())
print("Premise    :", example["premise"])
print("Hypothesis :", example["hypothesis"])

tokenizer.decode(example["input_ids"])

dict_keys(['attention_mask', 'hypothesis', 'input_ids', 'label', 'premise'])
Premise    : Conceptually cream skimming has two basic dimensions - product and geography.
Hypothesis : Product and geography are what make cream skimming work. 


'[CLS] conceptually cream skimming has two basic dimensions - product and geography. [SEP] product and geography are what make cream skimming work. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [30]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr=5e-5)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [31]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """
    Compute metrics for Trainer
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    #_, _, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro")
    
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        #'macro f1': macro_f1,
        'precision': precision,
        'recall': recall
    }

In [32]:
from transformers import Trainer, TrainingArguments
epochs = 5

total_steps = (epochs * len(train_dataset)) // batch_size
warmup_steps = total_steps // 10
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=eval_batch_size,
    warmup_steps=warmup_steps,
    evaluation_strategy="epoch",
    do_eval=True,
    weight_decay=0.01,
    logging_dir='./logs',
)

results = []

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
1,0.7966,0.719308,0.68813,0.686414,0.694824,0.686998,40.6425,241.496
2,0.5491,0.695121,0.70759,0.707682,0.710948,0.708051,40.6544,241.425
3,0.2715,0.871913,0.719715,0.717914,0.718131,0.717857,40.4743,242.499
4,0.1401,1.372733,0.722364,0.719126,0.72144,0.720013,40.5743,241.902
5,0.0738,1.580839,0.724401,0.723247,0.72333,0.723287,40.8692,240.156


TrainOutput(global_step=6250, training_loss=0.38396575485229495, metrics={'train_runtime': 1529.4957, 'train_samples_per_second': 4.086, 'total_flos': 10284407654400000, 'epoch': 5.0})

In [35]:
trainer.evaluate(dev_dataset)

{'eval_loss': 1.580838918685913,
 'eval_accuracy': 0.7244014263881814,
 'eval_f1': 0.7232469122677775,
 'eval_precision': 0.7233302559520883,
 'eval_recall': 0.723286715463998,
 'eval_runtime': 36.6277,
 'eval_samples_per_second': 267.966,
 'epoch': 5.0}