# Fine-tuning BERT on the glue_sst2_10k dataset

In [1]:
# Load the tokenizer

from transformers import AutoTokenizer

# Name of the pretrained checkpoint; the same value is reused further down
# when the classification model is loaded.
checkpoint = "bert-base-uncased"
# Tokenizer belonging to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [2]:
# Load the dataset
from datasets import load_dataset

# Load the dataset identified by "Tohrumi/glue_sst2_10k". The result is indexed
# by split name in the cells below ("train" and "eval").
raw_dataset = load_dataset("Tohrumi/glue_sst2_10k")

In [3]:
# Peek at the first training example to see which fields a raw record carries.
raw_dataset["train"][0]

{'sentence': 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ',
 'label': 0,
 'idx': 4}

In [4]:
# Function for tokenizing the dataset
def tokenize_fn(example):
    """Run the notebook's tokenizer over the "sentence" field.

    Args:
        example: Mapping with a "sentence" entry. The dataset is mapped with
            ``batched=True`` below, so this is presumably a batch holding a
            list of sentences -- TODO confirm.

    Returns:
        The tokenizer's output, passed through unchanged.
    """
    sentences = example["sentence"]
    # Truncation is enabled; no padding is requested at this stage.
    encoded = tokenizer(sentences, truncation=True)
    return encoded

#### Testing tokenization

In [5]:
# Apply tokenize_fn to the whole dataset in batches; the tokenizer's outputs
# show up as extra fields next to the original ones (see the output below).
tokenized_dataset = raw_dataset.map(tokenize_fn, batched=True)
# Display the first training example to check the newly added fields.
tokenized_dataset["train"][0]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

{'sentence': 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ',
 'label': 0,
 'idx': 4,
 'input_ids': [101,
  2006,
  1996,
  5409,
  7195,
  1011,
  1997,
  1011,
  1996,
  1011,
  11265,
  17811,
  18856,
  17322,
  2015,
  1996,
  16587,
  2071,
  2852,
  24225,
  2039,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [6]:
# data collator
from transformers import DataCollatorWithPadding
# tokenize_fn only truncates and never pads, so padding is left to this
# collator (presumably applied per batch when examples are collated -- confirm).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
# Load the model
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a sequence-classification head sized for two labels.
# The head's weights are not part of the checkpoint and are newly initialized
# (see the warning printed below), so the model must be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Prepare training arguments
from transformers import TrainingArguments
# "sst-trainer" is the first positional argument (presumably the output
# directory -- confirm); eval_strategy="epoch" asks for an evaluation per epoch.
training_args = TrainingArguments("sst-trainer", eval_strategy="epoch")

In [12]:
# Prepare trainer
from transformers import Trainer
# Baseline run: no compute_metrics function is supplied to this Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],
    processing_class=tokenizer,
)

In [13]:
# Start training; the value returned by train() (a TrainOutput, shown below)
# is displayed as the cell output.
trainer.train()

Step,Training Loss
500,0.4404
1000,0.351
1500,0.2424
2000,0.1747
2500,0.1855
3000,0.0675
3500,0.0678


TrainOutput(global_step=3750, training_loss=0.20730536804199218, metrics={'train_runtime': 152.3342, 'train_samples_per_second': 196.935, 'train_steps_per_second': 24.617, 'total_flos': 454335237063360.0, 'train_loss': 0.20730536804199218, 'epoch': 3.0})

#### Train again, this time with a `compute_metrics` function

In [11]:
import evaluate
import numpy as np

# Compute metrics
def compute_metrics(eval_preds):
    """Turn the Trainer's evaluation predictions into metric scores.

    Args:
        eval_preds: Pair ``(logits, labels)``, unpacked below. The logits are
            assumed to carry the per-class scores on their last axis.

    Returns:
        The result of ``metric.compute`` for the GLUE "mrpc" configuration;
        the training log in this notebook shows accuracy and F1.
    """
    # Load the metric on the first call only and reuse it afterwards: this
    # function is invoked at every evaluation, and reloading the metric each
    # time repeats identical work.
    metric = getattr(compute_metrics, "_metric", None)
    if metric is None:
        # NOTE(review): the dataset is SST-2, yet the "mrpc" configuration is
        # used. It is kept so the reported keys (accuracy and F1) stay the
        # same; "sst2" would presumably be the matching configuration --
        # confirm whether F1 is still wanted before switching.
        metric = evaluate.load("glue", "mrpc")
        compute_metrics._metric = metric

    logits, labels = eval_preds
    # The predicted class is the index of the largest score on the last axis.
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [12]:
from transformers import Trainer
# Start again from the pretrained checkpoint: `model` has already been
# fine-tuned by the training run earlier in this notebook, so reusing it would
# continue that training instead of repeating the experiment from the same
# starting point.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Same setup as the earlier Trainer, plus compute_metrics so that every
# evaluation reports metric scores next to the validation loss.
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [13]:
# Run training with the metrics-enabled Trainer; the per-epoch table below
# lists the validation loss together with the computed metric scores.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.346,0.32257,0.906,0.91439
2,0.1785,0.352947,0.916,0.925
3,0.0737,0.477322,0.914,0.923077


TrainOutput(global_step=3750, training_loss=0.20792645874023438, metrics={'train_runtime': 151.3669, 'train_samples_per_second': 198.194, 'train_steps_per_second': 24.774, 'total_flos': 454335237063360.0, 'train_loss': 0.20792645874023438, 'epoch': 3.0})