In [2]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Introduction
Transformers provide an accurate and fast way to test NLP problem where context is needed. Here we will use a pre-trained model to re-train for the IMDB movie review datasets. 
Libraries used:
- PyTorch
- Transformers
- Numpy
- Scikit-learn


Sample architecture of a ![Transformers]("assets\images\Transformer_full_architecture.png")

## Load the IMBD dataset

In [None]:
print("Loading IMDB dataset...")
imdb = load_dataset("imdb")

# For faster development and testing, you can use a smaller subset of the data.
# We'll create a smaller training and test set by shuffling and selecting a subset.
small_train_dataset = imdb["train"].shuffle(seed=42).select(range(1000))
small_test_dataset = imdb["test"].shuffle(seed=42).select(range(1000))

print(f"Using {len(small_train_dataset)} training examples and {len(small_test_dataset)} testing examples.")

Loading IMDB dataset...


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Using 1000 training examples and 1000 testing examples.


## Load a pre-trained tokenizer
Every Transformer model has a corresponding tokenizer that converts text into a format the model can understand (input IDs, attention mask, etc.).
We'll use the tokenizer for 'distilbert-base-uncased', a smaller and faster version of BERT.

In [None]:
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Tokenize the dataset
We'll create a function to tokenize the text in our dataset.  

`truncation=True` ensures that long reviews are cut to the model's max length.

`padding=True` adds padding to shorter reviews to make all inputs the same length.

In [None]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

print("Tokenizing datasets...")
tokenized_train_dataset = small_train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = small_test_dataset.map(tokenize_function, batched=True)

Tokenizing datasets...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

## Load a pre-trained model

We'll load 'distilbert-base-uncased' with a sequence classification head.

`num_labels=2` specifies that this is a binary classification problem (positive/negative).

In [None]:
print("Loading pre-trained model...")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Loading pre-trained model...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Define training arguments

`TrainingArguments` is a class that contains all the hyperparameters for training.

This includes settings like learning rate, number of epochs, batch size, etc.

In [None]:

training_args = TrainingArguments(
    output_dir="./results",          # Directory to save the model and results
    num_train_epochs=3,              # Total number of training epochs
    per_device_train_batch_size=16,  # Batch size per device during training
    per_device_eval_batch_size=64,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10
)

  "class": algorithms.Blowfish,


## Define evaluation metrics

We need a function to compute metrics during evaluation.

This function will be called by the Trainer at each evaluation step.

In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

## Create a Trainer instance

The `Trainer` class provides a high-level API for training and evaluating

Hugging Face Transformers models.

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
)

#  Train the model
Use transfer learning to train the model.

Calling `train()` on the Trainer instance will start the fine-tuning process.

In [None]:
print("Starting model training...")
trainer.train()
print("Training finished.")

Starting model training...


Step,Training Loss
10,0.6917
20,0.7026
30,0.6983
40,0.6957
50,0.6952
60,0.6895
70,0.6821
80,0.6741
90,0.6687
100,0.629


Attempted to log scalar metric loss:
0.6917
Attempted to log scalar metric grad_norm:
0.9115180969238281
Attempted to log scalar metric learning_rate:
9e-07
Attempted to log scalar metric epoch:
0.15873015873015872
Attempted to log scalar metric loss:
0.7026
Attempted to log scalar metric grad_norm:
0.9072819352149963
Attempted to log scalar metric learning_rate:
1.9e-06
Attempted to log scalar metric epoch:
0.31746031746031744
Attempted to log scalar metric loss:
0.6983
Attempted to log scalar metric grad_norm:
1.3646469116210938
Attempted to log scalar metric learning_rate:
2.9e-06
Attempted to log scalar metric epoch:
0.47619047619047616
Attempted to log scalar metric loss:
0.6957
Attempted to log scalar metric grad_norm:
0.9887279272079468
Attempted to log scalar metric learning_rate:
3.9e-06
Attempted to log scalar metric epoch:
0.6349206349206349
Attempted to log scalar metric loss:
0.6952
Attempted to log scalar metric grad_norm:
1.1940356492996216
Attempted to log scalar metric

## Evaluate the model

After training, you can evaluate your model on the test set.

In [None]:
print("Evaluating the model on the test set...")
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

Evaluating the model on the test set...


Attempted to log scalar metric eval_loss:
0.3231147527694702
Attempted to log scalar metric eval_accuracy:
0.858
Attempted to log scalar metric eval_f1:
0.8453159041394336
Attempted to log scalar metric eval_precision:
0.9023255813953488
Attempted to log scalar metric eval_recall:
0.7950819672131147
Attempted to log scalar metric eval_runtime:
4.4084
Attempted to log scalar metric eval_samples_per_second:
226.841
Attempted to log scalar metric eval_steps_per_second:
3.629
Attempted to log scalar metric epoch:
3.0
Evaluation results: {'eval_loss': 0.3231147527694702, 'eval_accuracy': 0.858, 'eval_f1': 0.8453159041394336, 'eval_precision': 0.9023255813953488, 'eval_recall': 0.7950819672131147, 'eval_runtime': 4.4084, 'eval_samples_per_second': 226.841, 'eval_steps_per_second': 3.629, 'epoch': 3.0}


In [18]:
print("Overall results: ", 100*eval_results["eval_accuracy"])

Overall results:  85.8


## Make predictions on new text
You can now use your fine-tuned model to predict the sentiment of new sentences.

In [None]:
def predict_sentiment(text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Move tensors to the same device as the model
    device = model.device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Get model predictions
    with torch.no_grad():
        logits = model(**inputs).logits

    # Get the predicted class (0 for negative, 1 for positive)
    predicted_class_id = torch.argmax(logits, dim=1).item()
    return "Positive" if predicted_class_id == 1 else "Negative"

# Evaluate our model
Let's see how well the transfer learning did.

In [14]:
# Example predictions
review1 = "This movie was fantastic! I really enjoyed the acting and the plot."
review2 = "It was a complete waste of time. The story was boring and predictable."

print(f"Review: '{review1}'")
print(f"Predicted sentiment: {predict_sentiment(review1)}")

print(f"Review: '{review2}'")
print(f"Predicted sentiment: {predict_sentiment(review2)}")

Review: 'This movie was fantastic! I really enjoyed the acting and the plot.'
Predicted sentiment: Positive
Review: 'It was a complete waste of time. The story was boring and predictable.'
Predicted sentiment: Negative


## Results
It appears using a Transformer yield good results with very little specific model selection and re-training.
This is a great approach with starting from 0 to 1.