<a href="https://colab.research.google.com/github/dev-aryaman/dl-demos/blob/main/M4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets accelerate peft bitsandbytes scikit-learn

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_c

In [14]:
import os
import torch
import numpy as np
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model
from datasets import Dataset, load_dataset
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import WeightedRandomSampler
from peft.tuners.lora import LoraLayer
import torch.nn.functional as F

In [15]:
# constants and config
MODEL_NAME = "roberta-base"
NUM_LABELS = 4
MAX_LENGTH = 256
OUTPUT_DIR = "lora_finetuning_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)
RANDOM_SEED = 37

In [16]:
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [18]:
# Dataset preprocessing
def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length"
    )

# Load and prepare dataset
dataset = load_dataset("ag_news")
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["text"]
)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

# Stratified train-test split
train_df = tokenized_dataset["train"].to_pandas()

# Remove auto-generated index column if present
if "__index_level_0__" in train_df.columns:
    train_df = train_df.drop(columns=["__index_level_0__"])

train_df, eval_df = train_test_split(
    train_df,
    test_size=0.02,
    random_state=RANDOM_SEED,
    stratify=train_df["labels"]
)

# Convert back to Dataset format
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
eval_dataset = Dataset.from_pandas(eval_df.reset_index(drop=True))

In [19]:
# Initialize base model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Optimized LoRA Configuration
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS",
    init_lora_weights="gaussian"
)

peft_model = get_peft_model(model, peft_config)

# Training Arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="steps",
    eval_steps=200,
    logging_steps=50,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    fp16=True,
    gradient_accumulation_steps=2,
    learning_rate=3e-5,
    weight_decay=0.01,
    adam_epsilon=1e-6,
    max_grad_norm=1.0,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    num_train_epochs=5,
    save_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    gradient_checkpointing=True
)

In [21]:
print('PEFT Model')
peft_model.print_trainable_parameters()

PEFT Model
trainable params: 888,580 || all params: 125,537,288 || trainable%: 0.7078


In [None]:
# Create trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, p.predictions.argmax(-1))},
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [None]:
trainer.train()  # Typically stops after 2-3 epochs
trainer.save_model(os.path.join(OUTPUT_DIR, "best_model"))

In [64]:
# Add device detection at the beginning
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move both models to the same device
peft_model = peft_model.to(device)

# Start training with mixed precision
with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
    trainer.train()

Step,Training Loss,Validation Loss,Accuracy
200,0.9758,0.974049,0.2475
400,0.9813,0.974049,0.2475
600,0.9735,0.974049,0.2475
800,0.9767,0.974049,0.2475




In [None]:
# Generate submission
def generate_submission():
    test_data = pd.read_pickle("test_unlabelled.pkl")
    test_df = test_data.to_pandas() if isinstance(test_data, Dataset) else test_data.copy()

    if "ID" not in test_df.columns:
        test_df["ID"] = range(len(test_df))

    test_dataset = Dataset.from_pandas(test_df).map(
        preprocess_function,
        batched=True,
        remove_columns=["text"]
    )

    predictions = trainer.predict(test_dataset)
    pred_labels = np.argmax(predictions.predictions, axis=-1)

    submission_df = pd.DataFrame({
        "ID": test_df["ID"],
        "Label": pred_labels
    })

    submission_path = os.path.join(OUTPUT_DIR, "submission.csv")
    submission_df.to_csv(submission_path, index=False)
    return submission_df

final_submission = generate_submission()
print("\nFirst 5 predictions:")
print(final_submission.head())