In [1]:
import os
import torch
import numpy as np

import evaluate
from datasets import load_dataset
from peft import PeftModel, LoraConfig, get_peft_model
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def print_model_size(path):
    size = 0
    for f in os.scandir(path):
        size += os.path.getsize(f)
    print(f"Model size: {(size / 1e6):.2} MB")

def print_trainable_parameters(model, label):
    parameters, trainable = 0, 0
    for _, p in model.named_parameters():
        parameters += p.numel()
        trainable += p.numel() if p.requires_grad else 0
    print(f"{label} trainable parameters: {trainable:,}/{parameters:,} ({100 * trainable / parameters:.2f}%)")

In [21]:
model_checkpoint = "distilbert-base-uncased"

def build_lora_model(num_labels):
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=num_labels
    )
    print_trainable_parameters(model, label = "Base Model")
    
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_lin", "v_lin", "k_lin", "out_lin"],
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS"
    )
    
    lora_model = get_peft_model(model, lora_config)
    print_trainable_parameters(model, label = "LoRA Model")

    return lora_model

In [6]:
def preprocess_function(examples, tokenizer):
    # Process text
    texts = [str(text).lower().strip() for text in examples["text"]]
    
    # Tokenize
    result = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors=None
    )
    
    # Add labels
    result["labels"] = examples["labels"]
    
    return result

### Test

In [34]:
# Model
print(f"Using model: {model_checkpoint}")

# Load dataset
print("Loading dataset...")
dataset1 = load_dataset("imdb", split="train[:1000]")
dataset2 = load_dataset("ag_news", split="train[:1000]")

print(f"Dataset1 size: {len(dataset1)} examples")
print(f"Dataset2 size: {len(dataset2)} examples")

# Prepare datasets
dataset1 = dataset1.rename_column("label", "labels")
dataset2 = dataset2.rename_column("label", "labels")

# Split datasets
train_size = int(0.8 * len(dataset1))
dataset1_train = dataset1.select(range(train_size))
dataset1_test = dataset1.select(range(train_size, len(dataset1)))
dataset2_train = dataset2.select(range(train_size))
dataset2_test = dataset2.select(range(train_size, len(dataset2)))

Using model: distilbert-base-uncased
Loading dataset...
Dataset1 size: 1000 examples
Dataset2 size: 1000 examples


In [35]:
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Automatically pad the received input
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

config = {
    "sentiment": {
        "train_data": dataset1_train,
        "test_data": dataset1_test,
        "num_labels": 2,
        "epochs": 5,
        "path": "./lora-sentiment"
    },
    "topic": {
        "train_data": dataset2_train,
        "test_data": dataset2_test,
        "num_labels": 4,
        "epochs": 5,
        "path": "./lora-topic"
    }
}

Loading tokenizer...


In [36]:
# Preprocess datasets
print("Preprocessing datasets...")
for cfg in config.values():
    cfg["train_data"] = cfg["train_data"].map(
        lambda x: preprocess_function(x, tokenizer),
        batched=True,
        remove_columns=["text"]
    )
    cfg["test_data"] = cfg["test_data"].map(
        lambda x: preprocess_function(x, tokenizer),
        batched=True,
        remove_columns=["text"]
    )
    # Set format for torch
    cfg["train_data"].set_format("torch")
    cfg["test_data"].set_format("torch")

Preprocessing datasets...


In [37]:
training_arguments = TrainingArguments(
    output_dir="./checkpoints",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    logging_steps=10,
    warmup_steps=100,
    seed=42,
)

metric = evaluate.load("accuracy")

In [38]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [39]:
for name, cfg in config.items():
    print(f"\nTraining {name} classifier...")

    model = build_lora_model(cfg["num_labels"])

    trainer = Trainer(
        model=model,
        args=training_arguments,
        train_dataset=cfg["train_data"],
        eval_dataset=cfg["test_data"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    eval_results = trainer.evaluate()
    print(f"Evaluation accuracy: {eval_results['eval_accuracy']:.4f}")

    trainer.save_model(cfg["path"])
    print_model_size(cfg["path"])


Training sentiment classifier...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Base Model trainable parameters: 66,955,010/66,955,010 (100.00%)
LoRA Model trainable parameters: 1,181,954/68,136,964 (1.73%)


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0122,0.000995,1.0
2,0.0,1e-05,1.0
3,0.0,3e-06,1.0
4,0.0,1e-06,1.0
5,0.0,1e-06,1.0




Evaluation accuracy: 1.0000
Model size: 5.7 MB

Training topic classifier...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Base Model trainable parameters: 66,956,548/66,956,548 (100.00%)
LoRA Model trainable parameters: 1,183,492/68,140,040 (1.74%)


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0673,1.218049,0.345
2,0.4602,0.601183,0.8
3,0.3504,0.449098,0.82
4,0.2837,0.44814,0.82
5,0.2201,0.497212,0.815




Evaluation accuracy: 0.8200
Model size: 5.7 MB


In [40]:
# Prediction function
def predict_text(text, model_path, num_labels, task_type):
    base_model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=num_labels
    )
    model = PeftModel.from_pretrained(base_model, model_path)
    model.eval()

    inputs = tokenizer(
        text.lower().strip(), return_tensors="pt", truncation=True, max_length=128
    )

    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=-1).item()
        confidence = probabilities[0][predicted_class].item()

    if task_type == "sentiment":
        label_map = {0: "Negative", 1: "Positive"}
    else:
        label_map = {
            0: "World",
            1: "Sports",
            2: "Business",
            3: "Science/Technology",
        }

    return label_map[predicted_class], confidence

In [41]:
# Test examples
test_texts = [
    {
        "text": "This movie was absolutely fantastic! The acting was superb.",
        "model": "sentiment",
        "num_labels": 2,
        "task_type": "sentiment",
        "expected": "Positive",
    },
    {
        "text": "The worst film I've ever seen. Complete waste of time.",
        "model": "sentiment",
        "num_labels": 2,
        "task_type": "sentiment",
        "expected": "Negative",
    },
    {
        "text": "Tesla stock surges 20 percent after strong quarterly earnings report.",
        "model": "topic",
        "num_labels": 4,
        "task_type": "topic",
        "expected": "Business",
    },
    {
        "text": "New AI model achieves breakthrough in protein folding.",
        "model": "topic",
        "num_labels": 4,
        "task_type": "topic",
        "expected": "Science/Technology",
    },
]

print("\nRunning predictions on test examples:")
for test in test_texts:
    prediction, confidence = predict_text(
        test["text"],
        config[test["model"]]["path"],
        test["num_labels"],
        test["task_type"],
    )
    print(f"\nText: {test['text']}")
    print(f"Expected: {test['expected']}")
    print(f"Predicted: {prediction}")
    print(f"Confidence: {confidence:.2%}")


Running predictions on test examples:


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Text: This movie was absolutely fantastic! The acting was superb.
Expected: Positive
Predicted: Negative
Confidence: 99.75%


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Text: The worst film I've ever seen. Complete waste of time.
Expected: Negative
Predicted: Negative
Confidence: 99.80%


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Text: Tesla stock surges 20 percent after strong quarterly earnings report.
Expected: Business
Predicted: Business
Confidence: 92.22%


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Text: New AI model achieves breakthrough in protein folding.
Expected: Science/Technology
Predicted: Science/Technology
Confidence: 98.03%
