In [None]:
# CELL 0 — Mount Google Drive (for saving LoRA adapters & results)
from google.colab import drive
drive.mount("/content/drive")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# CELL 1 — Install dependencies (Colab)
!pip -q install -U transformers datasets accelerate peft evaluate bitsandbytes


In [None]:
# CELL 2 — (Required) Hugging Face login for gated FOLIO dataset
# Run this BEFORE load_dataset("yale-nlp/FOLIO")
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# CELL 3 — Imports + experiment config
import os, gc, random
import numpy as np
import pandas as pd
import torch

from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
    TrainerCallback,
)
from peft import LoraConfig, get_peft_model, TaskType

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("gpu:", torch.cuda.get_device_name(0))

# ---- Encoder–Decoder model list (large, Colab A100-friendly with LoRA) ----
MODEL_LIST = [
    "google/flan-t5-xl",
    "google/flan-t5-xxl",
    "google/t5-v1_1-xl",
    "google/t5-v1_1-xxl",
    "google/mt5-xl",
]

# ---- Data / prompt ----
MAX_SOURCE_LEN = 1024
MAX_TARGET_LEN = 4   # output is just 'A'/'B'/'C'
BATCH = 2  # base; may be overridden per model for big checkpoints

# ---- Output ----
OUT_ROOT = "/content/drive/MyDrive/logic/folio_seq2seq_lora"
os.makedirs(OUT_ROOT, exist_ok=True)
print("OUT_ROOT:", OUT_ROOT)

def cleanup():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


torch: 2.9.0+cu126
cuda available: True
gpu: NVIDIA A100-SXM4-40GB
OUT_ROOT: /content/drive/MyDrive/logic/folio_seq2seq_lora


In [None]:
# CELL 4 — Load FOLIO + build prompts (same format as your decoder notebook)
from collections import Counter

ds = load_dataset("yale-nlp/FOLIO")
print(ds)
print("Train/Val sizes:", len(ds["train"]), len(ds["validation"]))
print("Columns:", ds["train"].column_names)

LABEL_TO_LETTER = {"True":"A", "False":"B", "Unknown":"C"}
ALT_LABELS = {
    "Uncertain":"Unknown", "uncertain":"Unknown",
    "true":"True", "false":"False", "unknown":"Unknown"
}

def normalize_label(lbl: str) -> str:
    s = str(lbl).strip()
    s = ALT_LABELS.get(s, s)
    if s not in LABEL_TO_LETTER:
        raise ValueError(f"Unexpected label: {lbl!r}")
    return s

def build_user_text(premises, conclusion):
    # premises can be list[str] or a single string; handle both
    if isinstance(premises, (list, tuple)):
        prem = "\n".join([f"- {p}" for p in premises])
    else:
        prem = f"- {premises}"
    return (
        "Task: Determine whether the conclusion is entailed, contradicted, or unknown given the premises.\n"
        "Premises:\n"
        f"{prem}\n\n"
        "Conclusion:\n"
        f"{conclusion}\n\n"
        "Output format: Answer: A (entailed), B (contradicted), or C (unknown).\n"
        "Answer:"
    )

def map_ex(ex):
    label = normalize_label(ex["label"])
    return {
        "user_text": build_user_text(ex["premises"], ex["conclusion"]),
        "label": label,
        "label_letter": LABEL_TO_LETTER[label],
    }

ds2 = DatasetDict({k: ds[k].map(map_ex, remove_columns=ds[k].column_names) for k in ds})
print("Val label dist:", Counter(ds2["validation"]["label"]))
print("\n--- sample prompt ---\n")
print(ds2["train"][0]["user_text"])
print("gold:", ds2["train"][0]["label_letter"])


folio_v2_train.jsonl:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

folio_v2_validation.jsonl:   0%|          | 0.00/208k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1001 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/203 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['story_id', 'premises', 'premises-FOL', 'conclusion', 'conclusion-FOL', 'label', 'example_id'],
        num_rows: 1001
    })
    validation: Dataset({
        features: ['story_id', 'premises', 'premises-FOL', 'conclusion', 'conclusion-FOL', 'label', 'example_id'],
        num_rows: 203
    })
})
Train/Val sizes: 1001 203
Columns: ['story_id', 'premises', 'premises-FOL', 'conclusion', 'conclusion-FOL', 'label', 'example_id']


Map:   0%|          | 0/1001 [00:00<?, ? examples/s]

Map:   0%|          | 0/203 [00:00<?, ? examples/s]

Val label dist: Counter({'True': 72, 'Unknown': 69, 'False': 62})

--- sample prompt ---

Task: Determine whether the conclusion is entailed, contradicted, or unknown given the premises.
Premises:
- All people who regularly drink coffee are dependent on caffeine.
People regularly drink coffee, or they don't want to be addicted to caffeine, or both.
No one who doesn't want to be addicted to caffeine is unaware that caffeine is a drug.
Rina is either a student who is unaware that caffeine is a drug, or she is not a student and is she aware that caffeine is a drug.
Rina  is either a student who is dependent on caffeine, or she is not a student and not dependent on caffeine.

Conclusion:
Rina doesn't want to be addicted to caffeine or is unaware that caffeine is a drug.

Output format: Answer: A (entailed), B (contradicted), or C (unknown).
Answer:
gold: A


In [None]:
# CELL 5 — LoRA target selection (T5 vs BART) + metrics helpers

def pick_lora_targets(model_name: str):
    # T5 uses different module naming than BART
    name = model_name.lower()
    if "t5" in name:
        # T5 attention: q, k, v, o (works for flan-t5)
        return ["q", "k", "v", "o"]
    else:
        # BART attention projections
        return ["q_proj", "k_proj", "v_proj", "out_proj"]

def normalize_pred_letter(s: str) -> str:
    if s is None:
        return ""
    s = s.strip()
    if not s:
        return ""
    # take first non-space character
    c = s[0].upper()
    return c if c in {"A","B","C"} else ""

def compute_accuracy(pred_texts, gold_texts):
    preds = [normalize_pred_letter(t) for t in pred_texts]
    golds = [normalize_pred_letter(t) for t in gold_texts]
    invalid = sum(p == "" for p in preds)
    acc = sum(p == g for p, g in zip(preds, golds)) / max(1, len(golds))
    return acc, invalid / max(1, len(golds)), preds


In [None]:
# CELL 6 — Tokenization for Seq2Seq (fixes eval_loss NaN) + custom callback for clean logging
from dataclasses import dataclass

@dataclass
class RunningLog:
    last_train_loss: float = float("nan")

running = RunningLog()

class TableLoggerCallback(TrainerCallback):
    """Keeps the last seen training loss, and prints a compact table at eval steps."""
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "loss" in logs:
            running.last_train_loss = float(logs["loss"])

def make_tokenize_fn(tokenizer):
    def tokenize_batch(batch):
        model_inputs = tokenizer(
            batch["user_text"],
            max_length=MAX_SOURCE_LEN,
            truncation=True,
        )
        # tokenize targets explicitly
        with tokenizer.as_target_tokenizer():
            lab = tokenizer(
                batch["label_letter"],
                max_length=MAX_TARGET_LEN,
                truncation=True,
            )
        labels = lab["input_ids"]
        # mask pad tokens to -100 so loss ignores them
        pad_id = tokenizer.pad_token_id
        labels = [[(t if t != pad_id else -100) for t in seq] for seq in labels]
        model_inputs["labels"] = labels
        return model_inputs
    return tokenize_batch

def sanity_check_labels(tokenized_ds, tokenizer, split="validation", n=50):
    # ensure every example has at least one non -100 label token
    bad = 0
    for i in range(min(n, len(tokenized_ds[split]))):
        labs = tokenized_ds[split][i]["labels"]
        if sum(t != -100 for t in labs) == 0:
            bad += 1
    print(f"Label sanity check ({split}, first {min(n, len(tokenized_ds[split]))}): bad={bad}")


In [13]:
# CELL 7 — Train/eval loop over encoder–decoder models (LoRA + EarlyStopping) + results table after each model
RESULTS = []

for model_name in MODEL_LIST:
    print("\n" + "="*100)
    print("MODEL:", model_name)

    out_dir = os.path.join(OUT_ROOT, model_name.replace("/", "__"))
    os.makedirs(out_dir, exist_ok=True)

    # ---- per-model memory-safe settings for Colab A100 (40GB) ----
    # XXL models (11B) are tight: use micro-batch 1 + grad accumulation.
    lname = model_name.lower()
    if 'xxl' in lname:
        per_device_bs = 1
        grad_accum = 16
        use_grad_ckpt = True
    elif 'xl' in lname or '3b' in lname:
        per_device_bs = 2
        grad_accum = 8
        use_grad_ckpt = True
    else:
        per_device_bs = BATCH
        grad_accum = 1
        use_grad_ckpt = False

    # tokenizer / model
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    base = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        torch_dtype=(torch.bfloat16 if torch.cuda.is_available() else None),
        device_map="auto" if torch.cuda.is_available() else None,
    )

    if use_grad_ckpt:
        base.gradient_checkpointing_enable()
        # required for some models when using gradient checkpointing
        if hasattr(base, 'enable_input_require_grads'):
            base.enable_input_require_grads()

    # LoRA
    targets = pick_lora_targets(model_name)
    lora_cfg = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.SEQ_2_SEQ_LM,
        target_modules=targets,
    )
    model = get_peft_model(base, lora_cfg)

    # report trainable params
    model.print_trainable_parameters()

    # tokenize dataset for this tokenizer/model
    tok_fn = make_tokenize_fn(tokenizer)
    tokenized = ds2.map(tok_fn, batched=True, remove_columns=ds2["train"].column_names)
    sanity_check_labels(tokenized, tokenizer)

    collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    # accuracy from generation
    def compute_metrics(eval_pred):
        pred_ids, label_ids = eval_pred
        # decode predictions
        pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        # labels: replace -100 with pad to decode
        label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
        gold_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

        acc, invalid_rate, _ = compute_accuracy(pred_texts, gold_texts)
        return {"accuracy": acc, "invalid_rate": invalid_rate}

    from transformers import IntervalStrategy # Added import

    # training args (bf16 preferred on A100; fall back otherwise)
    use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
    args = Seq2SeqTrainingArguments(
        output_dir=out_dir,
        per_device_train_batch_size=per_device_bs,
        per_device_eval_batch_size=per_device_bs,
        learning_rate=2e-4,
        gradient_accumulation_steps=grad_accum,
        num_train_epochs=50,
        warmup_ratio=0.03,
        lr_scheduler_type="cosine",
        weight_decay=0.0,
        eval_strategy=IntervalStrategy.STEPS, # Modified line
        eval_steps=200,
        save_steps=200,
        logging_steps=50,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_accuracy",
        greater_is_better=True,
        predict_with_generate=True,
        generation_max_length=MAX_TARGET_LEN, # Changed from generation_max_new_tokens
        generation_num_beams=1,
        report_to="none",
        fp16=False,
        bf16=bool(use_bf16),
        dataloader_num_workers=2,
        seed=SEED,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["validation"],
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
        callbacks=[
            TableLoggerCallback(),
            EarlyStoppingCallback(early_stopping_patience=3),
        ],
    )

    # train
    train_toggle = trainer.train()
    metrics = trainer.evaluate()

    # grab best metrics
    eval_acc = float(metrics.get("eval_accuracy", float("nan")))
    eval_loss = float(metrics.get("eval_loss", float("nan")))
    invalid = float(metrics.get("eval_invalid_rate", float("nan")))

    # save LoRA adapter + tokenizer
    trainer.model.save_pretrained(os.path.join(out_dir, "lora_adapter"))
    tokenizer.save_pretrained(os.path.join(out_dir, "tokenizer"))

    RESULTS.append({
        "model": model_name,
        "trainable_params": int(sum(p.numel() for p in model.parameters() if p.requires_grad)),
        "eval_accuracy": eval_acc,
        "eval_loss": eval_loss,
        "invalid_rate": invalid,
        "best_checkpoint": getattr(trainer.state, "best_model_checkpoint", None),
    })

    # print results table after each model
    df = pd.DataFrame(RESULTS).sort_values("eval_accuracy", ascending=False)
    print("\n--- RESULTS SO FAR ---")
    display(df)

    # cleanup
    del trainer, model, base, tokenizer, tokenized
    cleanup()

# final save
final_df = pd.DataFrame(RESULTS).sort_values("eval_accuracy", ascending=False)
final_path = os.path.join(OUT_ROOT, "results_seq2seq.csv")
final_df.to_csv(final_path, index=False)
print("\nSaved:", final_path)
display(final_df)


MODEL: google/flan-t5-xl


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 9,437,184 || all params: 2,859,194,368 || trainable%: 0.3301


Map:   0%|          | 0/1001 [00:00<?, ? examples/s]



Map:   0%|          | 0/203 [00:00<?, ? examples/s]

Label sanity check (validation, first 50): bad=0


  trainer = Seq2SeqTrainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss,Accuracy,Invalid Rate
200,0.4155,0.386717,0.640394,0.0
400,0.2668,0.424855,0.699507,0.0
600,0.1724,0.478124,0.719212,0.0
800,0.1173,0.615679,0.724138,0.0
1000,0.094,0.69785,0.729064,0.0
1200,0.0689,0.778977,0.748768,0.0
1400,0.0673,0.74251,0.743842,0.0
1600,0.0454,0.857821,0.763547,0.0
1800,0.0208,0.854191,0.763547,0.0
2000,0.035,0.962941,0.763547,0.0



--- RESULTS SO FAR ---


Unnamed: 0,model,trainable_params,eval_accuracy,eval_loss,invalid_rate,best_checkpoint
0,google/flan-t5-xl,9437184,0.778325,1.066797,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...



MODEL: google/flan-t5-xxl


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/9.60G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/6.06G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 18,874,368 || all params: 11,154,206,720 || trainable%: 0.1692


Map:   0%|          | 0/1001 [00:00<?, ? examples/s]



Map:   0%|          | 0/203 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


Label sanity check (validation, first 50): bad=0


Step,Training Loss,Validation Loss,Accuracy,Invalid Rate
200,0.3405,0.323271,0.724138,0.0
400,0.2062,0.368282,0.743842,0.0
600,0.1141,0.510341,0.783251,0.0
800,0.0616,0.829311,0.743842,0.0
1000,0.0431,0.93052,0.714286,0.0
1200,0.0377,0.889429,0.748768,0.0



--- RESULTS SO FAR ---


Unnamed: 0,model,trainable_params,eval_accuracy,eval_loss,invalid_rate,best_checkpoint
1,google/flan-t5-xxl,18874368,0.783251,0.510341,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...
0,google/flan-t5-xl,9437184,0.778325,1.066797,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...



MODEL: google/t5-v1_1-xl


tokenizer_config.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/591 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/11.4G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/11.4G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 9,437,184 || all params: 2,859,194,368 || trainable%: 0.3301


Map:   0%|          | 0/1001 [00:00<?, ? examples/s]



Map:   0%|          | 0/203 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


Label sanity check (validation, first 50): bad=0


Step,Training Loss,Validation Loss,Accuracy,Invalid Rate
200,1.2339,0.722254,0.339901,0.0
400,0.6577,0.614679,0.374384,0.0
600,0.7308,0.564884,0.344828,0.0
800,0.5757,0.565554,0.344828,0.0
1000,0.5727,0.569259,0.35468,0.0



--- RESULTS SO FAR ---


Unnamed: 0,model,trainable_params,eval_accuracy,eval_loss,invalid_rate,best_checkpoint
1,google/flan-t5-xxl,18874368,0.783251,0.510341,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...
0,google/flan-t5-xl,9437184,0.778325,1.066797,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...
2,google/t5-v1_1-xl,9437184,0.374384,0.614679,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...



MODEL: google/t5-v1_1-xxl


tokenizer_config.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/593 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/44.5G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/44.5G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 18,874,368 || all params: 11,154,206,720 || trainable%: 0.1692


Map:   0%|          | 0/1001 [00:00<?, ? examples/s]



Map:   0%|          | 0/203 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


Label sanity check (validation, first 50): bad=0


Step,Training Loss,Validation Loss,Accuracy,Invalid Rate
200,0.5717,0.57393,0.35468,0.0
400,0.5607,0.600264,0.35468,0.0
600,0.5337,0.525434,0.403941,0.0
800,0.4634,0.500088,0.482759,0.0
1000,0.3907,0.438808,0.630542,0.0
1200,0.3464,0.529827,0.596059,0.0
1400,0.3024,0.451096,0.694581,0.0
1600,0.221,0.525541,0.669951,0.0
1800,0.1955,0.551036,0.655172,0.0


Step,Training Loss,Validation Loss,Accuracy,Invalid Rate
200,0.5717,0.57393,0.35468,0.0
400,0.5607,0.600264,0.35468,0.0
600,0.5337,0.525434,0.403941,0.0
800,0.4634,0.500088,0.482759,0.0
1000,0.3907,0.438808,0.630542,0.0
1200,0.3464,0.529827,0.596059,0.0
1400,0.3024,0.451096,0.694581,0.0
1600,0.221,0.525541,0.669951,0.0
1800,0.1955,0.551036,0.655172,0.0
2000,0.165,0.619738,0.674877,0.0



--- RESULTS SO FAR ---


Unnamed: 0,model,trainable_params,eval_accuracy,eval_loss,invalid_rate,best_checkpoint
1,google/flan-t5-xxl,18874368,0.783251,0.510341,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...
0,google/flan-t5-xl,9437184,0.778325,1.066797,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...
3,google/t5-v1_1-xxl,18874368,0.694581,0.451096,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...
2,google/t5-v1_1-xl,9437184,0.374384,0.614679,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...



MODEL: google/mt5-xl


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/15.0G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/15.0G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 9,437,184 || all params: 3,752,056,832 || trainable%: 0.2515


Map:   0%|          | 0/1001 [00:00<?, ? examples/s]



Map:   0%|          | 0/203 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


Label sanity check (validation, first 50): bad=0


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss,Accuracy,Invalid Rate
200,0.6007,0.555894,0.35468,0.0
400,0.5804,0.598011,0.35468,0.0
600,0.5681,0.542538,0.374384,0.0
800,0.5478,0.525984,0.418719,0.0
1000,0.5116,0.542452,0.403941,0.0
1200,0.4647,0.471144,0.546798,0.0
1400,0.4434,0.45028,0.551724,0.0
1600,0.4048,0.460438,0.576355,0.0
1800,0.3854,0.472222,0.576355,0.0
2000,0.3527,0.476821,0.571429,0.0


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 2b7d8df8-1307-40b7-8dc8-295e0f9820f9)')' thrown while requesting HEAD https://huggingface.co/google/mt5-xl/resolve/main/config.json
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 6d2e7b62-1094-4794-840e-7bdd919f1743)')' thrown while requesting HEAD https://huggingface.co/google/mt5-xl/resolve/main/config.json
Retrying in 2s [Retry 2/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 51c96f90-8151-4a60-a6bb-1bac15d30b61)')' thrown while requesting HEAD https://huggingface.co/google/mt5-xl/resolve/main/config.json
Retrying in 4s [Retry 3/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 21abdb9e-e3b9-4ae7-b


--- RESULTS SO FAR ---


Unnamed: 0,model,trainable_params,eval_accuracy,eval_loss,invalid_rate,best_checkpoint
1,google/flan-t5-xxl,18874368,0.783251,0.510341,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...
0,google/flan-t5-xl,9437184,0.778325,1.066797,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...
3,google/t5-v1_1-xxl,18874368,0.694581,0.451096,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...
4,google/mt5-xl,9437184,0.596059,0.492822,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...
2,google/t5-v1_1-xl,9437184,0.374384,0.614679,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...



Saved: /content/drive/MyDrive/logic/folio_seq2seq_lora/results_seq2seq.csv


Unnamed: 0,model,trainable_params,eval_accuracy,eval_loss,invalid_rate,best_checkpoint
1,google/flan-t5-xxl,18874368,0.783251,0.510341,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...
0,google/flan-t5-xl,9437184,0.778325,1.066797,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...
3,google/t5-v1_1-xxl,18874368,0.694581,0.451096,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...
4,google/mt5-xl,9437184,0.596059,0.492822,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...
2,google/t5-v1_1-xl,9437184,0.374384,0.614679,0.0,/content/drive/MyDrive/logic/folio_seq2seq_lor...
