In [None]:
!pip install datasets

In [2]:
# Make necessary imports

# for array operations
import numpy as np
# PyTorch framework
import torch
# plotting
from matplotlib import pyplot as plt
# reproducibility
import random
# to watch progress
from tqdm.auto import tqdm

# HuggingFace ecosystem
# tokenizer
from transformers import AutoTokenizer, DataCollatorWithPadding
# model
from transformers import AutoModelForSequenceClassification
# optimizer, lr-scheduler
from transformers import AdamW, get_scheduler
# dataset
from datasets import load_dataset, load_metric

In [3]:
# a seed for reproducibility
SEED = 42
# set seed
np.random.seed(SEED)
torch.manual_seed(SEED)
random.seed(SEED)

# check for GPU device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Device available:', device)

Device available: cuda:0


In [4]:
raw_data = load_dataset("glue", "wnli")

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/635 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/71 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/146 [00:00<?, ? examples/s]

In [5]:
# how does it look like?
raw_data

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 635
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 71
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 146
    })
})

In [6]:
# Sample a data
raw_data["train"][0]

{'sentence1': 'I stuck a pin through a carrot. When I pulled the pin out, it had a hole.',
 'sentence2': 'The carrot had a hole.',
 'label': 1,
 'idx': 0}

In [7]:
# what features are there in data?
# What are the label names?
raw_data["train"].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_entailment', 'entailment'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [8]:
checkpoint = 'bert-base-uncased'
# bert tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# data collator for dynamic padding as per batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
# define a tokenize function
def Tokenize_function(example):
    return tokenizer(example['sentence1'], example['sentence2'], truncation=True)

In [10]:
# tokenize entire data
tokenized_data = raw_data.map(Tokenize_function, batched=True)

Map:   0%|          | 0/635 [00:00<?, ? examples/s]

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

In [11]:
tokenized_data = tokenized_data.remove_columns(['idx','sentence1','sentence2'])
tokenized_data = tokenized_data.rename_column('label','labels')
tokenized_data.set_format('pt')
tokenized_data["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [12]:
train_data = torch.utils.data.DataLoader(tokenized_data["train"],
                                         shuffle=True,
                                         batch_size=8,
                                         collate_fn=data_collator
                                        )
val_data = torch.utils.data.DataLoader(tokenized_data["validation"],
                                       batch_size=8,
                                       collate_fn=data_collator
                                      )
test_data = torch.utils.data.DataLoader(tokenized_data["test"],
                                        batch_size=8,
                                        collate_fn=data_collator
                                       )

In [13]:
# do a chekck for proper data preprocessing
for batch in train_data:
    [print('{:>20} : {}'.format(k,v.shape)) for k,v in batch.items()]
    break

              labels : torch.Size([8])
           input_ids : torch.Size([8, 49])
      token_type_ids : torch.Size([8, 49])
      attention_mask : torch.Size([8, 49])


In [14]:
# cache a pre-trained BERT model for two-class classification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.6737, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [16]:
EPOCHS = 3
NUM_TRAINING_STEPS = EPOCHS * len(train_data)
print(NUM_TRAINING_STEPS)

optimizer = AdamW(model.parameters(), lr=5e-5)
lr_scheduler = get_scheduler("linear",
                             optimizer=optimizer,
                             num_warmup_steps=0,
                             num_training_steps=NUM_TRAINING_STEPS
                            )

240




In [17]:
model.to(device)
device

device(type='cuda', index=0)

In [18]:
progress_bar = tqdm(range(NUM_TRAINING_STEPS))

model.train()
for epoch in range(EPOCHS):
    for batch in train_data:
        batch = {k:v.to(device) for k,v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/240 [00:00<?, ?it/s]

In [19]:
metric = load_metric("glue","wnli")

model.eval()
for batch in val_data:
    batch = {k:v.to(device) for k,v in batch.items()}
    print(batch['labels'], batch['labels'].shape)
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=preds,references=batch['labels'])
metric.compute()

  metric = load_metric("glue","wnli")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

tensor([0, 1, 0, 1, 1, 0, 1, 1], device='cuda:0') torch.Size([8])
tensor([0, 0, 0, 1, 0, 0, 0, 0], device='cuda:0') torch.Size([8])
tensor([1, 0, 0, 0, 0, 0, 0, 1], device='cuda:0') torch.Size([8])
tensor([0, 1, 0, 1, 1, 1, 1, 0], device='cuda:0') torch.Size([8])
tensor([1, 1, 0, 1, 0, 0, 1, 1], device='cuda:0') torch.Size([8])
tensor([0, 0, 0, 1, 0, 0, 1, 0], device='cuda:0') torch.Size([8])
tensor([1, 0, 0, 1, 0, 0, 1, 0], device='cuda:0') torch.Size([8])
tensor([1, 0, 1, 1, 0, 0, 1, 1], device='cuda:0') torch.Size([8])
tensor([0, 1, 1, 0, 1, 0, 0], device='cuda:0') torch.Size([7])


{'accuracy': 0.4084507042253521}

In [20]:
# make predictions
preds = []
model.eval()
for batch in test_data:
    batch['labels'] = torch.ones(len(batch['labels'])).type(torch.int64)
    batch = {k:v.to(device) for k,v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    yhat = torch.argmax(logits, dim=-1)
    preds.append(yhat)

In [21]:
preds

[tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0'),
 tensor([1, 1], device='cuda:0')]