### IMDB Movie Reviews Classification using DistilBERT

### Import Libraries

In [1]:
import numpy as np
import torch
import evaluate
import warnings
warnings.filterwarnings("ignore")

# hugging face libs
from datasets import load_dataset
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import get_scheduler
from transformers import AutoModelForSequenceClassification
from tqdm.auto import tqdm

from torch.optim import AdamW
from torch.utils.data import DataLoader

### Load Dataset

In [4]:
# Load the "imdb" dataset; later cells index it by split name ("train", "test").
imdb = load_dataset("imdb")

#### Example text with label

In [7]:
# Display the first record of the test split to see what a raw example looks like.
imdb["test"][0]

{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as 

### Preprocessing

#### Tokenize the text field using a DistilBERT tokenizer

In [11]:
# Load the tokenizer that belongs to the DistilBERT checkpoint used in this notebook.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="distilbert/distilbert-base-uncased"
)

#### Create a preprocessing function

In [19]:
def preprocess_function(examples):
    """Tokenize a batch of IMDB reviews with the DistilBERT tokenizer.

    Intended to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        examples: A batch of dataset rows; ``examples["text"]`` holds the raw
            review strings.

    Returns:
        The tokenizer output for the batch (it supplies the ``input_ids`` and
        ``attention_mask`` columns of the tokenized dataset).
    """
    # Pad to the model's maximum length instead of the longest review in this
    # particular map batch. "Longest in batch" padding only produces equal
    # lengths across the whole dataset if every map batch happens to contain a
    # review that hits the truncation limit; rows of unequal length cannot be
    # stacked by the default DataLoader collate function.
    # No `return_tensors` here: `map` stores plain columns, and the dataset is
    # switched to PyTorch tensors later via `set_format("torch")`.
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [21]:
# Tokenize every split; batched=True hands preprocess_function many rows at once.
tokenized_imdb = imdb.map(
    function=preprocess_function,
    batched=True,
)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

#### Create a data collator that dynamically pads the tokenized examples of each batch to the length of the longest example in that batch

In [24]:
# Collator that pads a list of tokenized examples to a common length using the
# tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Compute Metrics

#### Load accuracy metric 

In [32]:
# Accuracy metric object; compute_metrics() calls its .compute() method.
accuracy = evaluate.load("accuracy")

#### Create a function that computes the metrics, to be passed into the training loop (while fine-tuning DistilBERT)

In [35]:
def compute_metrics(eval_pred):
    """Compute accuracy for one set of model outputs.

    Args:
        eval_pred: A pair ``(predictions, labels)``. The predictions are
            reduced with ``argmax`` over axis 1, so they are presumably
            per-class scores of shape (n_examples, n_classes) - confirm
            against the caller.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids and
        the reference labels.
    """
    class_scores, references = eval_pred
    return accuracy.compute(
        predictions=np.argmax(class_scores, axis=1),
        references=references,
    )

### Training

In [38]:
# Ask PyTorch to release cached GPU memory it is not currently using before training starts.
torch.cuda.empty_cache()

In [40]:
# Drop the raw review strings: only the tokenized columns and the label are
# kept for training. Rebinds `tokenized_imdb`, so run this once per fresh
# tokenization - the "text" column no longer exists afterwards.
tokenized_imdb = tokenized_imdb.remove_columns(column_names=["text"])

#### Rename the label column to "labels" because the model expects the argument to be named labels:

In [43]:
# The model's forward pass takes the targets under the keyword `labels`, so
# the dataset column has to carry that name. Rebinds `tokenized_imdb`; run once.
tokenized_imdb = tokenized_imdb.rename_column(
    original_column_name="label",
    new_column_name="labels",
)

#### Set the format of the dataset to return PyTorch tensors instead of lists:

In [46]:
# Have the dataset hand back PyTorch tensors when indexed (modifies it in place).
tokenized_imdb.set_format(type="torch")

In [48]:
# Inspect the splits and their remaining columns after preprocessing.
tokenized_imdb

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})

#### Then create smaller subsets of the train and test splits (1,000 examples each) to speed up fine-tuning and evaluation

In [51]:
# Shuffle the training split with a fixed seed, then keep the first 1,000 rows.
train_shuffled = tokenized_imdb["train"].shuffle(seed=42)
small_train_dataset = train_shuffled.select(range(1000))

In [53]:
# Shuffle the test split with a fixed seed, then keep the first 1,000 rows for evaluation.
test_shuffled = tokenized_imdb["test"].shuffle(seed=42)
small_eval_dataset = test_shuffled.select(range(1000))

In [55]:
# Confirm the training subset's columns and row count.
small_train_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 1000
})

In [57]:
# Confirm the evaluation subset's columns and row count.
small_eval_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 1000
})

### Data Loader

#### Create a DataLoader for your training and test datasets so you can iterate over batches of data:

In [61]:
# Training batches of 8, reshuffled every epoch. Collate with the
# DataCollatorWithPadding built earlier so that each batch is padded to a
# common length at collate time, rather than relying on every stored example
# already having exactly the same length (which the default collate requires).
train_dataloader = DataLoader(
    small_train_dataset,
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator,
)

In [63]:
# Evaluation batches of 8 in fixed order. Collate with the
# DataCollatorWithPadding built earlier so that each batch is padded to a
# common length at collate time, rather than relying on every stored example
# already having exactly the same length (which the default collate requires).
eval_dataloader = DataLoader(
    small_eval_dataset,
    batch_size=8,
    collate_fn=data_collator,
)

#### Load your model with the number of expected labels:

#### Since each review is labelled either Positive or Negative, we also give the model the ID-to-label and label-to-ID mappings

In [67]:
# Class id -> human-readable label, and the inverse mapping derived from it
# so the two dictionaries cannot drift apart.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {label: class_id for class_id, label in id2label.items()}

In [69]:
# Seed PyTorch's global RNG before building the model. The checkpoint does not
# contain the classification head (classifier / pre_classifier), so those
# weights are newly initialised here; without a seed they, the DataLoader
# shuffling, and dropout differ on every run, which makes the reported
# accuracy hard to reproduce.
torch.manual_seed(42)

# DistilBERT encoder with a 2-class sequence-classification head; the label
# mappings are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Optimizer and learning rate scheduler

#### Create an optimizer and learning rate scheduler to fine-tune the model. Let’s use the AdamW optimizer from PyTorch:

In [73]:
# AdamW over all model parameters with a learning rate of 5e-5.
optimizer = AdamW(params=model.parameters(), lr=5e-5)

#### Create the default learning rate scheduler from Trainer:

In [76]:
# The scheduler is stepped once per batch, so the total number of training
# steps is (batches per epoch) x (epochs).
num_epochs = 2
steps_per_epoch = len(train_dataloader)
num_training_steps = steps_per_epoch * num_epochs

# "linear" schedule with no warm-up steps, spanning all training steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [78]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [80]:
# Move the model's parameters to the selected device; the cell displays the returned module.
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [82]:
# Fine-tuning loop: one optimizer step and one scheduler step per batch.
model.train()  # training mode (enables dropout)
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Put every tensor of the batch on the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # The batch includes `labels`, so the model output carries the loss.
        loss = model(**batch).loss
        loss.backward()

        # Apply the update, advance the learning-rate schedule, then clear the
        # gradients so they do not accumulate into the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

  0%|          | 0/250 [00:00<?, ?it/s]

In [83]:
# Evaluate on the held-out subset, accumulating accuracy batch by batch.
metric = evaluate.load("accuracy")
model.eval()  # inference mode (disables dropout)

# No gradients are needed for evaluation.
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Predicted class = index of the largest logit for each example.
        predictions = model(**batch).logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

In [84]:
# Final accuracy over all batches added in the evaluation loop.
metric.compute()

{'accuracy': 0.872}