# Task 2.
Review Chapter 16 of the Raschka book and submit one Jupyter notebook as a tutorial on fine-tuning a BERT model to predict the sentiment of IMDb movie reviews (HW10a.ipynb). (25 points) Your tutorial notebook should have the following sections and corresponding code examples:
* Loading the IMDb movie review dataset
* Tokenizing the dataset
* Loading and fine-tuning a pre-trained BERT model
* Fine-tuning a transformer more conveniently using the Trainer API

### Loading the IMDb movie review dataset

All the packages needed to prepare data and fine-tune the DistilBERT model:

In [None]:
import gzip
import shutil
import time

In [None]:
import pandas as pd
import requests

In [None]:
import torch
import torch.nn.functional as F
import torchtext

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m102.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m110.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [None]:
import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification

General settings:

In [None]:
# Reproducibility: fix the PyTorch seed and request deterministic cuDNN.
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

# Number of passes over the training set in the manual training loop.
NUM_EPOCHS = 3

Fetch the compressed data and unzip it:

In [None]:
url = ("https://github.com/rasbt/"
       "machine-learning-book/raw/"
       "main/ch08/movie_data.csv.gz")
# The archive is stored locally under the last path component of the URL.
filename = url.split("/")[-1]

# Download before touching the disk and fail loudly on an HTTP error, so an
# error page is never saved and later mistaken for the gzip archive.
# The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=60)
r.raise_for_status()
with open(filename, "wb") as f:
    f.write(r.content)

# Decompress the archive that was just written into a plain CSV file.
with gzip.open(filename, 'rb') as f_in:
    with open('movie_data.csv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

Load the data into a pandas DataFrame and make sure it looks all right:

In [None]:
# Read the decompressed CSV and preview its first three rows.
df = pd.read_csv('movie_data.csv')
df.head(n=3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


Split the dataset into separate training, validation, and test sets:

In [None]:
def _split(rows):
    """Return the (review texts, sentiment labels) arrays for a row slice."""
    part = df.iloc[rows]
    return part['review'].values, part['sentiment'].values


# 70% training
train_texts, train_labels = _split(slice(None, 35000))
# 10% validation
valid_texts, valid_labels = _split(slice(35000, 40000))
# 20% testing
test_texts, test_labels = _split(slice(40000, None))

### Tokenizing the dataset

Tokenize the texts using the tokenizer implementation inherited from the pre-trained model class:

In [None]:
# Tokenizer matching the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode the three splits in the same way (truncation and padding enabled),
# in the order train -> valid -> test.
train_encodings, valid_encodings, test_encodings = (
    tokenizer(list(texts), truncation=True, padding=True)
    for texts in (train_texts, valid_texts, test_texts)
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Pack everything into a class called IMDbDataset and create the corresponding dataloaders:

In [None]:
class IMDbDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    ``encodings`` must expose ``.items()`` and each of its values must be
    indexable by example position; ``labels`` must be indexable and support
    ``len()``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label under 'labels'.
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [None]:
# Pair every split's encodings with its labels.
train_dataset = IMDbDataset(encodings=train_encodings, labels=train_labels)
valid_dataset = IMDbDataset(encodings=valid_encodings, labels=valid_labels)
test_dataset = IMDbDataset(encodings=test_encodings, labels=test_labels)


def _make_loader(dataset, shuffle):
    """Build a DataLoader over ``dataset`` with the shared batch size of 16."""
    return torch.utils.data.DataLoader(
        dataset, batch_size=16, shuffle=shuffle)


# Only the training data is shuffled.
train_loader = _make_loader(train_dataset, shuffle=True)
valid_loader = _make_loader(valid_dataset, shuffle=False)
test_loader = _make_loader(test_dataset, shuffle=False)

### Loading and fine-tuning a pre-trained BERT model

Load the pre-trained model:

In [None]:
# Load the pre-trained checkpoint and move it to the selected device.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased'
).to(DEVICE)
# Put the model in training mode before fine-tuning.
model.train()
# Adam over all model parameters.
optim = torch.optim.Adam(params=model.parameters(), lr=5e-5)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

Define an accuracy function to evaluate the model performance:
    
* Note that we load the dataset batch by batch to work around RAM or GPU memory limitations when working with large deep learning models.

In [None]:
def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of ``model`` in percent.

    Args:
        model: Callable invoked as ``model(input_ids, attention_mask=...)``
            whose result provides the class scores under ``'logits'``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A scalar float tensor holding the accuracy in percent, or NaN when
        ``data_loader`` yields no examples (accuracy is undefined then).

    Note:
        Gradients are disabled here, but the model's train/eval mode is not
        changed; callers should call ``model.eval()`` beforehand.
    """
    correct_pred, num_examples = 0, 0
    with torch.no_grad():
        for batch in data_loader:
            ### Prepare data
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            # The predicted class is the index of the largest logit.
            predicted_labels = torch.argmax(outputs['logits'], 1)
            num_examples += labels.size(0)
            correct_pred += (predicted_labels == labels).sum()

    # With no examples, correct_pred is still the int 0 (which has no
    # .float()) and the ratio would be 0/0, so report NaN explicitly.
    if num_examples == 0:
        return torch.tensor(float('nan'), device=device)
    return correct_pred.float() / num_examples * 100

Train the model:
* Load the input into the device that we work on.
* Compute the model output and loss.
* Adjust the weight parameters by backpropagating the loss.
* Evaluate the model performance on both the training and validation set.

In [None]:
start_time = time.time()
for epoch in range(NUM_EPOCHS):

    # Training mode while the weights are being updated.
    model.train()

    for batch_idx, batch in enumerate(train_loader):

        ### Prepare data: move the batch tensors to the working device
        input_ids, attention_mask, labels = (
            batch[key].to(DEVICE)
            for key in ('input_ids', 'attention_mask', 'labels'))

        ### Forward pass: passing the labels makes the model return a loss
        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        labels=labels)
        loss = outputs['loss']
        logits = outputs['logits']

        ### Backward pass: clear old gradients, backpropagate, update
        optim.zero_grad()
        loss.backward()
        optim.step()

        ### Logging: report the current loss every 250 batches
        if batch_idx % 250 == 0:
            print(f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d}'
                  f' | Batch'
                  f'{batch_idx:04d}/'
                  f'{len(train_loader):04d} | '
                  f'Loss: {loss:.4f}')

    # End of epoch: evaluate on the training and validation sets.
    model.eval()
    with torch.no_grad():
        train_acc = compute_accuracy(model, train_loader, DEVICE)
        valid_acc = compute_accuracy(model, valid_loader, DEVICE)
    print(f'Training accuracy: '
          f'{train_acc:.2f}%'
          f'\nValid accuracy: '
          f'{valid_acc:.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Epoch: 0001/0003 | Batch0000/2188 | Loss: 0.6649
Epoch: 0001/0003 | Batch0250/2188 | Loss: 0.3041
Epoch: 0001/0003 | Batch0500/2188 | Loss: 0.3358
Epoch: 0001/0003 | Batch0750/2188 | Loss: 0.0951
Epoch: 0001/0003 | Batch1000/2188 | Loss: 0.4039
Epoch: 0001/0003 | Batch1250/2188 | Loss: 0.3294
Epoch: 0001/0003 | Batch1500/2188 | Loss: 0.4290
Epoch: 0001/0003 | Batch1750/2188 | Loss: 0.3063
Epoch: 0001/0003 | Batch2000/2188 | Loss: 0.2363
Training accuracy: 95.87%
Valid accuracy: 91.54%
Time elapsed: 38.32 min
Epoch: 0002/0003 | Batch0000/2188 | Loss: 0.0914
Epoch: 0002/0003 | Batch0250/2188 | Loss: 0.3176
Epoch: 0002/0003 | Batch0500/2188 | Loss: 0.1145
Epoch: 0002/0003 | Batch0750/2188 | Loss: 0.0062
Epoch: 0002/0003 | Batch1000/2188 | Loss: 0.2026
Epoch: 0002/0003 | Batch1250/2188 | Loss: 0.1100
Epoch: 0002/0003 | Batch1500/2188 | Loss: 0.0558
Epoch: 0002/0003 | Batch1750/2188 | Loss: 0.0213
Epoch: 0002/0003 | Batch2000/2188 | Loss: 0.0576
Training accuracy: 98.69%
Valid accuracy: 92.

### Fine-tuning a transformer more conveniently using the Trainer API

Using the Trainer API means that training the model can be relegated to a simple function call.

Load the pre-trained model:

In [None]:
# Start again from the pre-trained checkpoint, on the selected device.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased'
).to(DEVICE)
# Last expression of the cell: switches to training mode and echoes the model.
model.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

Train the model:

In [None]:
from transformers import Trainer, TrainingArguments

# Same optimizer settings as in the manual training loop.
optim = torch.optim.Adam(params=model.parameters(), lr=5e-5)

# Output/logging locations, epoch count, batch sizes and logging frequency.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    optimizers=(optim, None),  # (optimizer, learning rate scheduler)
)

There are many available training argument settings; documentation is available at https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments.

Note that the Trainer API only shows the training loss by default and does not provide model evaluation.

Display final model performance:
* Method 1: define an evaluation function as the ```compute_metrics``` argument for another ```Trainer``` instance.

In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute the accuracy for a ``Trainer`` evaluation pass.

    ``datasets.load_metric`` is deprecated and no longer available in recent
    releases of the ``datasets`` library, so the accuracy is computed
    directly with NumPy instead of through a loaded metric object.

    Args:
        eval_pred: A ``(logits, labels)`` pair of array-likes; ``logits`` has
            the class scores along its last axis.

    Returns:
        A dict ``{"accuracy": float}`` with the fraction of predictions that
        equal their label.
    """
    logits, labels = eval_pred
    # note: logits are a numpy array, not a pytorch tensor
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
# Re-create the Trainer, now with an evaluation set and a metrics callback
# so that evaluation reports the accuracy.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optim, None),  # (optimizer, learning rate scheduler)
)

In [None]:
# Train the model.
# This code takes a while to run. But it works as intended.
start_time = time.time()
trainer.train()

elapsed_min = (time.time() - start_time) / 60
print(f'Total Training Time: {elapsed_min:.2f} min')

Step,Training Loss
10,0.7065
20,0.6774
30,0.6172
40,0.462
50,0.2725
60,0.4082
70,0.5769
80,0.3453
90,0.3849
100,0.2892


In [None]:
# Evaluate on the Trainer's eval_dataset and show the resulting metrics.
eval_results = trainer.evaluate()
print(eval_results)

* Method 2: re-use the previously defined ```compute_accuracy``` function to directly evaluate the performance of the fine-tuned model on the test dataset.

In [None]:
# Make sure the model is on the working device and in evaluation mode
# before measuring the accuracy on the test set.
model.to(DEVICE)
model.eval()
test_acc = compute_accuracy(model, test_loader, DEVICE)
print(f'Test accuracy: {test_acc:.2f}%')

Note that if you want to check the model's performance regularly during training, you can require the trainer to print the model evaluation after each epoch, for example by passing ```evaluation_strategy="epoch"``` to ```TrainingArguments```.

But for optimizing hyperparameters and viewing the performance on the validation set, we can instantiate the ```Trainer``` using ```valid_dataset```:

In [None]:
# Evaluate on the validation split, keeping the test set untouched while
# hyperparameters are being tuned.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)