In [1]:
# 0) Install dependencies
!pip install -qU transformers datasets evaluate accelerate peft trl bitsandbytes wandb
!pip install -qU nvidia-ml-py3

# 1) Imports & W&B login
import os
import pandas as pd
import torch
import numpy as np
import wandb
from datasets import load_dataset, Dataset
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
)
from peft import LoraConfig, get_peft_model, TaskType
import evaluate
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# Authenticate this session with Weights & Biases
wandb.login()

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("🚀 Using device:", device)

# 2) Load & tokenize AG News
# `base_model` names the checkpoint used for both the tokenizer and (later) the model.
base_model = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(base_model)

# Only the "train" split is loaded here; an evaluation subset is carved out of it later.
raw = load_dataset("ag_news", split="train")

def preprocess(examples):
    """Tokenize the ``"text"`` entry of a batch with the notebook-level ``tokenizer``.

    Truncation is enabled with a 128-token limit; no padding argument is passed,
    so padding is left to whatever consumes the tokenized examples.

    Args:
        examples: Mapping with a ``"text"`` entry (this function is used with
            ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever ``tokenizer`` returns for ``examples["text"]``.
    """
    texts = examples["text"]
    tokenizer_options = {
        "truncation": True,
        "max_length": 128,  # reduced from 256 to 128
    }
    return tokenizer(texts, **tokenizer_options)


# Tokenize every example, drop the raw text, and expose the target column as "labels"
tok = raw.map(preprocess, batched=True, remove_columns=["text"]).rename_column(
    "label", "labels"
)
tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Hold out 640 examples (fixed seed) to evaluate on during training
split = tok.train_test_split(test_size=640, seed=42)
train_ds = split["train"]
eval_ds = split["test"]

# 3) PEFT / LoRA configuration
# Derive the label set from the dataset's "label" feature and build BOTH direction
# mappings. Passing only id2label would leave the model config's label2id out of
# sync with the human-readable class names.
label_feature = raw.features["label"]
num_labels = label_feature.num_classes
id2label = dict(enumerate(label_feature.names))
label2id = {name: idx for idx, name in id2label.items()}

peft_config = LoraConfig(
    r=4,                     # reduced from 8
    lora_alpha=8,            # reduced from 16
    lora_dropout=0.05,
    bias="none",
    target_modules=["query", "value"],  # LoRA adapters on the attention query/value projections
    # No extra modules are requested here. NOTE: the classification head is
    # nevertheless still trained -- the recorded trainable-parameter count
    # (741,124) equals the q/v LoRA weights (147,456) plus the full classifier
    # head (593,668).
    modules_to_save=[],
    task_type=TaskType.SEQ_CLS,
)


base = RobertaForSequenceClassification.from_pretrained(
    base_model,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
model = get_peft_model(base, peft_config)
model.to(device)

# 3a) Verify trainable params < 1M
# Single pass over the parameters: tally everything, and separately the subset
# that will receive gradients.
trainable = 0
total = 0
for param in model.parameters():
    n_elements = param.numel()
    total += n_elements
    if param.requires_grad:
        trainable += n_elements

print(f"🔢 Trainable params: {trainable} ({trainable/1e6:.3f}M)")
# Hard budget check: stop the notebook here if the adapter is too large.
assert trainable < 1_000_000, "❌ Trainable parameters exceed 1M—lower LoRA rank!"

# 4) Metrics
accuracy = evaluate.load("accuracy")


def compute_metrics(p):
    """Score a batch of predictions with the ``accuracy`` metric loaded above.

    Args:
        p: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (reference labels).

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row.
    """
    scores = np.asarray(p.predictions)
    predicted_classes = scores.argmax(axis=1)
    return accuracy.compute(predictions=predicted_classes, references=p.label_ids)





[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m358.4/491.2 kB[0m [31m11.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/354.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.7/354.7 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.1/411.1 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgreeshmahedvikar18[0m ([33mgreeshmahedvikar18-new-york-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


🚀 Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔢 Trainable params: 741124 (0.741M)


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [2]:
# 5) TrainingArguments with W&B reporting
training_args = TrainingArguments(
    output_dir="results",
    # Evaluation and checkpointing use the same "steps" strategy and interval;
    # load_best_model_at_end requires the two to line up.
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=200,
    eval_steps=200,
    # Checkpoints are written every 200 steps for 8 epochs; without a cap they
    # pile up in output_dir. Keep only the most recent ones (the best checkpoint
    # is retained as well because load_best_model_at_end=True).
    save_total_limit=2,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    warmup_steps=500,                   # fixed warmup instead of a ratio
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,      # effective train batch = 16 * 2 per device
    per_device_eval_batch_size=64,
    num_train_epochs=8,
    weight_decay=0.01,
    label_smoothing_factor=0.1,
    optim="adamw_torch",
    gradient_checkpointing=True,
    fp16=True,
    group_by_length=True,               # batches similar-length samples
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=200,
    report_to="wandb",
    run_name="agnews-lora-roberta",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # The PEFT wrapper hides the base model's forward signature, so Trainer cannot
    # infer the label column on its own (it warns and falls back to an empty list).
    # The dataset's target column was renamed to "labels", so state it explicitly.
    label_names=["labels"],
    seed=42,
)


# 6) Trainer
# Collator built from the tokenizer; it pads the examples of each batch
# (the tokenization step itself applied truncation only).
collator = DataCollatorWithPadding(tokenizer)

trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)


# 7) Train & Evaluate
trainer.train()
res = trainer.evaluate()
val_accuracy = res["eval_accuracy"]
print("✅ Validation Accuracy:", val_accuracy)

# 8) Save fine-tuned model & tokenizer side by side in one directory
final_dir = "./lora_roberta_agnews_final"
model.save_pretrained(final_dir)
tokenizer.save_pretrained(final_dir)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.




Step,Training Loss,Validation Loss,Accuracy
200,1.399,1.392456,0.257812
400,1.3815,1.373982,0.34375
600,1.3649,1.35646,0.46875
800,1.346,1.338629,0.60625
1000,1.3287,1.319826,0.757812
1200,1.3095,1.303007,0.751563
1400,1.2915,1.283929,0.789062
1600,1.2734,1.264245,0.803125
1800,1.2512,1.24158,0.832812
2000,1.229,1.220911,0.83125




✅ Validation Accuracy: 0.88125


('./lora_roberta_agnews_final/tokenizer_config.json',
 './lora_roberta_agnews_final/special_tokens_map.json',
 './lora_roberta_agnews_final/vocab.json',
 './lora_roberta_agnews_final/merges.txt',
 './lora_roberta_agnews_final/added_tokens.json')

In [4]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# ─── 1) Load your pickled test set ───────────────────────────
# NOTE(security): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
# A dedicated name is used so the AG News training dataset bound to `raw`
# earlier in the notebook is not overwritten by the test data.
test_raw = pd.read_pickle("test_unlabelled.pkl")

if isinstance(test_raw, Dataset):
    # Already a Hugging Face Dataset: use it as-is
    hf_test = test_raw
elif isinstance(test_raw, pd.DataFrame):
    # pandas DataFrame: drop the original index so it does not become a column
    hf_test = Dataset.from_pandas(test_raw.reset_index(drop=True))
else:
    raise TypeError(
        f"test_unlabelled.pkl contains a {type(test_raw).__name__}; "
        "expected a datasets.Dataset or a pandas DataFrame"
    )

print("✅ test set rows:", len(hf_test), "columns:", hf_test.column_names)

# ─── 2) Tokenize (drop the original text column) ──────────────
def tokenize_fn(ex):
    """Tokenize a batch of test texts the same way the training data was tokenized.

    The training tokenization truncates at 128 tokens, so the same limit is
    used here; feeding the model 256-token inputs it never saw during
    fine-tuning would create a train/inference mismatch. Every example is padded
    to that fixed length so the default DataLoader collation can stack them.

    Args:
        ex: Mapping with a ``"text"`` entry (used with ``Dataset.map(batched=True)``).

    Returns:
        Whatever ``tokenizer`` returns for ``ex["text"]``.
    """
    return tokenizer(
        ex["text"],
        padding="max_length",
        truncation=True,
        max_length=128,  # keep in sync with the max_length used for training
    )

hf_test = hf_test.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"],
)
hf_test.set_format("torch", columns=["input_ids", "attention_mask"])

# ─── 3) DataLoader & Inference ────────────────────────────────
loader = DataLoader(hf_test, batch_size=64)

# Place the model on the target device and switch it to evaluation mode
model.to(device)
model.eval()

all_preds = []
with torch.no_grad():  # gradients are not needed for prediction
    for batch in tqdm(loader, desc="Inference"):
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}
        batch_logits = model(**inputs).logits
        batch_preds = batch_logits.argmax(dim=-1)
        # Move predictions back to host memory before collecting them
        all_preds.extend(batch_preds.cpu().numpy())

# ─── 4) Build & Save submission.csv ──────────────────────────
# IDs are simply the 0-based position of each prediction.
n_preds = len(all_preds)
submission_columns = {
    "ID": np.arange(n_preds),
    "Label": all_preds,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)
print("📝 submission.csv rows:", len(submission))

✅ test set rows: 8000 columns: ['text']


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Inference:   0%|          | 0/125 [00:00<?, ?it/s]

📝 submission.csv rows: 8000
