<a href="https://colab.research.google.com/github/eneskosar/paper1/blob/main/1encoders_ft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount("/content/drive")


In [None]:
# Install or upgrade (-U) the Hugging Face libraries used by this notebook; -q keeps pip quiet.
# Each quoted spec is a minimum version.
!pip -q install -U \
  "transformers>=4.41.0" \
  "datasets>=2.19.0" \
  "accelerate>=0.30.0" \
  "peft>=0.11.0" \
  "evaluate>=0.4.2"


# Multi-model experiment runner (no changes to training logic)
This cell runs the same FOLIO LoRA fine-tuning pipeline across multiple **compatible** encoder checkpoints (RoBERTa, BERT, ELECTRA, DeBERTa-v3, ALBERT, and XLM-RoBERTa families) and collects the results in one table.


In [None]:
import os, gc
import numpy as np
import pandas as pd

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    set_seed,
)
import evaluate
from peft import LoraConfig, get_peft_model, TaskType

# Fix the random seeds (transformers.set_seed) once, before any model is created.
set_seed(42)

# === Models to compare (a smaller and a larger checkpoint for each family) ===
# Goal: try *different* pretrained backbones without changing the rest of your pipeline.
# NOTE: different architectures have different module names, so LoRA `target_modules`
# must be selected per-model (handled in run_one).
MODEL_LIST = [
    # RoBERTa family (strong general encoder baseline)
    "roberta-base",
    "roberta-large",

    # BERT family (classic baseline)
    "bert-base-uncased",
    "bert-large-uncased",

    # ELECTRA family (often strong/faster discriminative encoder)
    "google/electra-small-discriminator",
    "google/electra-large-discriminator",

    # DeBERTa-v3 family (very strong NLU / reasoning encoder)
    "microsoft/deberta-v3-base",
    "microsoft/deberta-v3-large",

    # ALBERT family (parameter sharing; very large variant exists)
    "albert-base-v2",
    "albert-xxlarge-v2",

    # XLM-RoBERTa family (different backbone; strong transformer encoder)
    "xlm-roberta-base",
    "xlm-roberta-large",
]

# Root folder for per-model outputs.
# Every run_* function creates one sub-folder here, named after the model id
# with "/" replaced by "__".
OUT_ROOT = "/content/drive/MyDrive/folio_lora_experiments"
os.makedirs(OUT_ROOT, exist_ok=True)
print("OUT_ROOT:", OUT_ROOT)

# === Shared dataset load (do once) ===
# Loaded a single time at module level and reused by every run_* function.
ds = load_dataset("tasksource/folio")  # train + validation available
# Label <-> id mapping built from the sorted distinct label values of the
# training split, so the ids are the same for every model in the sweep.
label_list = sorted(set(ds["train"]["label"]))
label2id = {lab: i for i, lab in enumerate(label_list)}
id2label = {i: lab for lab, i in label2id.items()}
num_labels = len(label_list)

print("Labels:", label_list)

# Metrics (kept as in your working code)
# Metric objects are loaded once here and used by compute_metrics().
acc = evaluate.load("accuracy")
f1  = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` evaluation pair into accuracy and macro-F1.

    The predicted class of each example is the arg-max over the last axis of
    the logits. The returned dict is the accuracy metric's output merged with
    the macro-averaged F1 metric's output (both come from the module-level
    ``acc`` and ``f1`` metric objects).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy_scores = acc.compute(predictions=predictions, references=labels)
    f1_scores = f1.compute(predictions=predictions, references=labels, average="macro")
    return {**accuracy_scores, **f1_scores}

# === Core run function (same logic, parameterized by BASE_MODEL/OUT_DIR) ===
# Maximum tokenized length; every run_* function passes it to its tokenizer as max_length.
MAX_LEN = 1024  # If you hit OOM or shape errors, drop to 512/384.

# Global training parameters for FLAN-T5 and LLaMA runs.
# These are read by run_one_llama() and run_one_flan_t5() only; run_one() passes
# its own literal values to TrainingArguments (same numbers except
# save_total_limit, which is 2 there).
TRAIN_BSZ = 8
EVAL_BSZ = 16
LR = 2e-4
EPOCHS = 30
WD = 0.0
WARMUP_RATIO = 0.06
LOGGING_STEPS = 25
EVAL_STRATEGY = "steps"
EVAL_STEPS = 100
SAVE_STRATEGY = "steps"
SAVE_STEPS = 200 # Checkpoint interval in optimizer steps for the LLaMA / FLAN-T5 runs.
SAVE_TOTAL_LIMIT = 1  # Upper bound on checkpoints kept on disk, to save storage.
LOAD_BEST_AT_END = True
BEST_METRIC = "f1" # Used by run_one_llama; run_one_flan_t5 passes "accuracy" instead.
REPORT_TO = "none"
FP16 = True

# Global LoRA parameters (read by run_one_llama; run_one uses the same literal values).
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

# Define train_ds and dev_ds globally for run_one_llama and run_one_flan_t5
train_ds = ds["train"]
dev_ds = ds["validation"]

def cleanup():
    """Run a garbage-collection pass and release cached CUDA memory.

    This function cannot delete the caller's ``trainer`` / ``model`` / ``base``
    variables: a ``del name`` statement inside a function makes ``name`` local
    to that function, so such a statement here would only ever raise
    ``UnboundLocalError``. Callers must drop their own references (for example
    by rebinding them to ``None``) *before* calling ``cleanup()``; only objects
    that are no longer referenced can be reclaimed here.

    Returns:
        None.
    """
    gc.collect()
    try:
        import torch

        torch.cuda.empty_cache()
    except (ImportError, RuntimeError):
        # Best effort only: torch may be missing, or the CUDA runtime may be
        # in an error state. Neither should abort the model sweep.
        pass

def run_one(BASE_MODEL: str):
    """Fine-tune one encoder checkpoint on FOLIO with LoRA and return a result row.

    The shared dataset ``ds`` is tokenized with the checkpoint's own tokenizer
    (premises and conclusion as a sentence pair), a sequence-classification
    model is wrapped in a LoRA adapter, trained and evaluated with ``Trainer``,
    and the adapter plus tokenizer are saved to a per-model folder under
    ``OUT_ROOT``.

    Args:
        BASE_MODEL: Hugging Face model id (or path) of the checkpoint to tune.

    Returns:
        A dict with the keys ``model``, ``out_dir``, ``eval_loss``,
        ``eval_accuracy``, ``eval_f1``, ``best_metric`` and
        ``best_checkpoint``. If training or evaluation raises
        ``RuntimeError``, the metric values are NaN and an extra ``error`` key
        holds the truncated message.

    Raises:
        Exceptions raised while loading the tokenizer/model or building the
        trainer propagate, as does anything other than ``RuntimeError`` raised
        during training/evaluation.
    """
    OUT_DIR = os.path.join(
        OUT_ROOT,
        BASE_MODEL.replace("/", "__")
    )
    os.makedirs(OUT_DIR, exist_ok=True)
    print("\n" + "="*90)
    print("MODEL:", BASE_MODEL)
    print("Saving to:", OUT_DIR)

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)

    # Never tokenize past what the checkpoint supports. Several encoders in the
    # sweep report a 512-token tokenizer limit; feeding them longer sequences
    # indexes past the position-embedding table and fails at train time.
    # Tokenizers without a real limit report a huge sentinel value, in which
    # case MAX_LEN wins.
    max_len = min(MAX_LEN, tokenizer.model_max_length)
    if max_len < MAX_LEN:
        print(f"MAX_LEN={MAX_LEN} exceeds the tokenizer limit; truncating to {max_len} tokens.")

    def preprocess(examples):
        """Tokenize a batch as (premises, conclusion) pairs and attach label ids."""
        tok = tokenizer(
            examples["premises"],
            examples["conclusion"],
            truncation=True,
            max_length=max_len,
        )
        tok["labels"] = [label2id[x] for x in examples["label"]]
        return tok

    # Drop the raw text columns so only tokenizer outputs and labels remain.
    tokenized = ds.map(preprocess, batched=True, remove_columns=ds["train"].column_names)

    base = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

    def _pick_lora_targets(model):
        """Return LoRA target module name patterns that match this model family."""
        mt = getattr(getattr(model, "config", None), "model_type", None)

        # DeBERTa names its attention projections *_proj.
        if mt in {"deberta", "deberta-v2", "deberta-v3"}:
            return ["query_proj", "key_proj", "value_proj", "intermediate.dense", "output.dense"]

        # RoBERTa / BERT / ALBERT / XLM-R / ELECTRA share the query/key/value naming.
        if mt in {"roberta", "bert", "albert", "xlm-roberta", "electra"}:
            return ["query", "key", "value", "intermediate.dense", "output.dense"]

        # Fallback: minimal, widely-matching attention projections (safer than 'dense' wildcard)
        return ["query", "key", "value"]

    # LoRA config; target modules are chosen per model family by the helper above.
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        target_modules=_pick_lora_targets(base),
    )

    model = get_peft_model(base, lora_config)
    model.print_trainable_parameters()

    data_collator = DataCollatorWithPadding(tokenizer)

    args = TrainingArguments(
        output_dir=OUT_DIR,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=1,
        learning_rate=2e-4,
        num_train_epochs=30,
        max_grad_norm=1.0,
        weight_decay=0.0,
        warmup_ratio=0.06,
        eval_strategy="steps",
        eval_steps=100,
        logging_strategy="steps",
        logging_steps=25,
        save_strategy="steps",
        save_steps=200,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        fp16=True,
        report_to="none",
    )

    # NOTE(review): newer transformers releases deprecate `tokenizer=` in favor
    # of `processing_class=`; confirm against the installed version.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    try:
        trainer.train()
        eval_out = trainer.evaluate()

        # Save LoRA adapter + tokenizer
        trainer.model.save_pretrained(OUT_DIR)
        tokenizer.save_pretrained(OUT_DIR)

        # Pull "best" info if available
        best_metric = getattr(trainer.state, "best_metric", None)
        best_ckpt   = getattr(trainer.state, "best_model_checkpoint", None)

        row = {
            "model": BASE_MODEL,
            "out_dir": OUT_DIR,
            "eval_loss": float(eval_out.get("eval_loss", np.nan)),
            "eval_accuracy": float(eval_out.get("eval_accuracy", np.nan)),
            "eval_f1": float(eval_out.get("eval_f1", np.nan)),
            "best_metric": float(best_metric) if best_metric is not None else np.nan,
            "best_checkpoint": best_ckpt if best_ckpt is not None else "",
        }
        print("Final eval:", row)
        return row

    except RuntimeError as e:
        # Commonly OOM; keep the loop running
        msg = str(e)
        print("RuntimeError (likely OOM). Skipping this model.\n", msg[:600])
        return {
            "model": BASE_MODEL,
            "out_dir": OUT_DIR,
            "eval_loss": np.nan,
            "eval_accuracy": np.nan,
            "eval_f1": np.nan,
            "best_metric": np.nan,
            "best_checkpoint": "",
            "error": msg[:300],
        }
    finally:
        # Free VRAM between models. All three names are bound by the time the
        # try block starts, and they must be released *before* the collector
        # and the CUDA cache flush run, or the model stays alive through both.
        del trainer, model, base
        gc.collect()
        try:
            import torch
            torch.cuda.empty_cache()
        except Exception:
            # Best effort only; never let cleanup mask the result row.
            pass

# === Run the sweep ===
# Models run one after another; each run_one() call returns one result-row dict.
results = []
for m in MODEL_LIST:
    results.append(run_one(m))


# === Optional: add FLAN-T5 (Seq2Seq) and LLaMA-style (decoder) models ===
# Notes:
# - FLAN-T5 is encoder-decoder (text-to-text). It needs a separate Seq2SeqTrainer path.
# - LLaMA-style models can be run as sequence classifiers via AutoModelForSequenceClassification,
#   but Meta's official Llama checkpoints on HF are often *gated* (you may need to accept the license
#   and use an HF token). We catch and record failures so the sweep continues.
# Both lists are currently empty, so the loops that consume them run nothing;
# add model ids to a list to include that family in the sweep.

FLAN_T5_LIST = []  # removed (use encoder-only models)

# Prefer Meta Llama if you have access; otherwise TinyLlama is a good "LLaMA-like" drop-in for experimentation.
LLAMA_LIST = []  # removed (use encoder-only models)

def run_one_llama(model_name: str):
    """Fine-tune a LLaMA-style checkpoint as a LoRA sequence classifier on FOLIO.

    Same pipeline as the encoder runs, with LLaMA attention-projection LoRA
    targets and a padding-token fix. Every failure is caught and recorded in
    the returned row so a sweep over several models keeps going.

    Args:
        model_name: Hugging Face model id (or path) of the checkpoint.

    Returns:
        A dict with ``model``, ``family``, ``status``, ``eval_accuracy``,
        ``eval_f1``, ``eval_loss``, ``best_metric``, ``seconds`` and
        ``out_dir``. On failure the metrics are NaN, ``status`` starts with
        ``"failed: "`` and an ``error`` key holds the truncated message.
    """
    import time
    t0 = time.time()
    out_dir = os.path.join(OUT_ROOT, model_name.replace("/", "__"))
    os.makedirs(out_dir, exist_ok=True)

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        # LLaMA tokenizers often have no pad token by default
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"

        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(label_list),
            id2label=id2label,
            label2id=label2id,
        )
        # The classification head uses config.pad_token_id to find the last
        # non-pad token of each sequence; without it, batches larger than one
        # are rejected. Setting it on the tokenizer alone is not enough.
        if model.config.pad_token_id is None:
            model.config.pad_token_id = tokenizer.pad_token_id

        # LoRA targets for LLaMA / decoder-only attention proj names
        lora_cfg = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=LORA_R,
            lora_alpha=LORA_ALPHA,
            lora_dropout=LORA_DROPOUT,
            target_modules=["q_proj", "v_proj"],
        )
        model = get_peft_model(model, lora_cfg)

        def preprocess(examples):
            """Tokenize a batch as (premises, conclusion) pairs and attach label ids."""
            tok = tokenizer(
                examples["premises"],
                examples["conclusion"],
                truncation=True,
                max_length=MAX_LEN,
            )
            tok["labels"] = [label2id[x] for x in examples["label"]]
            return tok

        tok_train = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
        tok_dev    = dev_ds.map(preprocess, batched=True, remove_columns=dev_ds.column_names)

        collator = DataCollatorWithPadding(tokenizer=tokenizer)

        args = TrainingArguments(
            output_dir=out_dir,
            per_device_train_batch_size=TRAIN_BSZ,
            per_device_eval_batch_size=EVAL_BSZ,
            learning_rate=LR,
            num_train_epochs=EPOCHS,
            weight_decay=WD,
            warmup_ratio=WARMUP_RATIO,
            logging_steps=LOGGING_STEPS,
            # `eval_strategy` is the name run_one() already uses; the older
            # `evaluation_strategy` spelling is rejected by current releases.
            eval_strategy=EVAL_STRATEGY,
            eval_steps=EVAL_STEPS,
            save_strategy=SAVE_STRATEGY,
            save_steps=SAVE_STEPS,
            save_total_limit=SAVE_TOTAL_LIMIT,
            load_best_model_at_end=LOAD_BEST_AT_END,
            metric_for_best_model=BEST_METRIC,
            greater_is_better=True,
            report_to=REPORT_TO,
            fp16=FP16,
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=tok_train,
            eval_dataset=tok_dev,
            tokenizer=tokenizer,
            data_collator=collator,
            compute_metrics=compute_metrics,
        )

        trainer.train()
        metrics = trainer.evaluate()
        elapsed = time.time() - t0
        best_metric = getattr(trainer.state, "best_metric", None)

        return {
            "model": model_name,
            "family": "llama_seqcls",
            "status": "ok",
            "eval_accuracy": float(metrics.get("eval_accuracy", np.nan)),
            "eval_f1": float(metrics.get("eval_f1", np.nan)),
            "eval_loss": float(metrics.get("eval_loss", np.nan)),
            "best_metric": float(best_metric) if best_metric is not None else np.nan,
            "seconds": elapsed,
            "out_dir": out_dir,
        }

    except Exception as e:
        # Deliberate catch-all boundary: record the failure and keep the sweep running.
        elapsed = time.time() - t0
        return {
            "model": model_name,
            "family": "llama_seqcls",
            "status": f"failed: {type(e).__name__}",
            "eval_accuracy": np.nan,
            "eval_f1": np.nan,
            "eval_loss": np.nan,
            "best_metric": np.nan,
            "seconds": elapsed,
            "out_dir": out_dir,
            "error": str(e)[:300],
        }
    finally:
        # cleanup() cannot reach this frame's locals, so drop the references
        # here first (rebinding also works if a name was never assigned).
        trainer = None
        model = None
        cleanup()

def run_one_flan_t5(model_name: str):
    """Fine-tune a FLAN-T5 style seq2seq checkpoint to generate the FOLIO label.

    Each example becomes the prompt ``"premise: ... conclusion: ... label:"``
    and the target is the label rendered as a string. Accuracy is exact string
    match between the decoded generation and the decoded target. Every failure
    is caught and recorded in the returned row so a sweep keeps going.

    Args:
        model_name: Hugging Face model id (or path) of the seq2seq checkpoint.

    Returns:
        A dict with ``model``, ``family``, ``status``, ``eval_accuracy``,
        ``eval_loss``, ``seconds`` and ``out_dir``. On failure the metrics are
        NaN, ``status`` starts with ``"failed: "`` and an ``error`` key holds
        the truncated message.
    """
    import time
    from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

    t0 = time.time()
    out_dir = os.path.join(OUT_ROOT, model_name.replace("/", "__"))
    os.makedirs(out_dir, exist_ok=True)

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

        # Prompt format (kept simple and deterministic)
        def preprocess(examples):
            """Build prompts and tokenized label strings for one batch."""
            inputs = [
                f"premise: {p} conclusion: {c} label:"
                for p, c in zip(examples["premises"], examples["conclusion"])
            ]
            targets = [str(x) for x in examples["label"]]

            model_inputs = tokenizer(
                inputs,
                truncation=True,
                max_length=MAX_LEN,
            )
            # `text_target=` tokenizes the targets; it replaces the deprecated
            # `as_target_tokenizer()` context manager.
            labels = tokenizer(
                text_target=targets,
                truncation=True,
                max_length=16,
            )
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        tok_train = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
        tok_dev   = dev_ds.map(preprocess, batched=True, remove_columns=dev_ds.column_names)

        data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

        def compute_metrics_t5(eval_pred):
            """Exact-match accuracy between decoded generations and decoded targets."""
            preds, label_ids = eval_pred
            # Some trainer versions return a tuple
            if isinstance(preds, tuple):
                preds = preds[0]
            # -100 is the ignore-index used to pad labels (and gathered
            # predictions); it is not a valid token id, so map it to the pad
            # token before decoding.
            pad_id = tokenizer.pad_token_id
            preds = np.where(preds != -100, preds, pad_id)
            label_ids = np.where(label_ids != -100, label_ids, pad_id)

            pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True)
            label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

            pred_str = [p.strip() for p in pred_str]
            label_str = [l.strip() for l in label_str]

            accuracy = sum(int(p == l) for p, l in zip(pred_str, label_str)) / max(1, len(label_str))
            return {"accuracy": accuracy}

        args = Seq2SeqTrainingArguments(
            output_dir=out_dir,
            per_device_train_batch_size=TRAIN_BSZ,
            per_device_eval_batch_size=EVAL_BSZ,
            learning_rate=LR,
            num_train_epochs=EPOCHS,
            weight_decay=WD,
            warmup_ratio=WARMUP_RATIO,
            logging_steps=LOGGING_STEPS,
            # load_best_model_at_end needs periodic evaluation whose strategy
            # matches save_strategy. `eval_strategy` is the current name of
            # the old `evaluation_strategy` argument.
            eval_strategy=EVAL_STRATEGY,
            eval_steps=EVAL_STEPS,
            save_strategy=SAVE_STRATEGY,
            save_steps=SAVE_STEPS,
            save_total_limit=SAVE_TOTAL_LIMIT,
            load_best_model_at_end=LOAD_BEST_AT_END,
            metric_for_best_model="accuracy",  # compute_metrics_t5 reports accuracy only
            greater_is_better=True,
            report_to=REPORT_TO,
            fp16=FP16,
            predict_with_generate=True,
            generation_max_length=16,
        )

        trainer = Seq2SeqTrainer(
            model=model,
            args=args,
            train_dataset=tok_train,
            eval_dataset=tok_dev,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics_t5,
        )

        trainer.train()
        metrics = trainer.evaluate()
        elapsed = time.time() - t0

        return {
            "model": model_name,
            "family": "flan_t5_seq2seq",
            "status": "ok",
            "eval_accuracy": float(metrics.get("eval_accuracy", np.nan)),
            "eval_loss": float(metrics.get("eval_loss", np.nan)),
            "seconds": elapsed,
            "out_dir": out_dir,
        }

    except Exception as e:
        # Deliberate catch-all boundary: record the failure and keep the sweep running.
        elapsed = time.time() - t0
        return {
            "model": model_name,
            "family": "flan_t5_seq2seq",
            "status": f"failed: {type(e).__name__}",
            "eval_accuracy": np.nan,
            "eval_loss": np.nan,
            "seconds": elapsed,
            "out_dir": out_dir,
            "error": str(e)[:300],
        }
    finally:
        # cleanup() cannot reach this frame's locals, so drop the references
        # here first (rebinding also works if a name was never assigned).
        trainer = None
        model = None
        cleanup()

# --- Run everything and collect results ---
# One row per encoder run (the dicts returned by run_one).
df_enc = pd.DataFrame(results)

# Optional seq2seq / decoder runs; both loops do nothing while their lists are empty.
extra_results = []
for m in FLAN_T5_LIST:
    extra_results.append(run_one_flan_t5(m))

for m in LLAMA_LIST:
    extra_results.append(run_one_llama(m))

df_extra = pd.DataFrame(extra_results)

# Stack both tables into one with a fresh 0..n-1 index; the bare name on the
# last line displays the frame as the cell output.
df_all = pd.concat([df_enc, df_extra], ignore_index=True)
df_all


In [None]:
# Keep only the required columns in the final results table
FINAL_COLS = [
    "model",
    "eval_loss",
    "eval_accuracy",
    "eval_f1",
    "best_metric",
]

# .copy() so later edits to df_final do not touch df_all.
df_final = df_all[FINAL_COLS].copy()

# Optional: sort by best metric, highest first. For run_one rows this is the
# best macro-F1 seen during training (metric_for_best_model="f1").
df_final = df_final.sort_values(
    by="best_metric",
    ascending=False
).reset_index(drop=True)

df_final
