## Task 2: Finetuning Notebook
Goal: Use the authors' training script to create our two fine-tuned models:
- Model A: flan-t5-base finetuned on train_context_only.jsonl.
- Model B: flan-t5-base finetuned on train_exp_ans.jsonl.

In [1]:
# Mount Google Drive at /content/drive; the project data used by the later
# cells lives under /content/drive/MyDrive/reproducing_project.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install autotrain-advanced
!pip install transformers peft
!pip install accelerate bitsandbytes

Collecting autotrain-advanced
  Downloading autotrain_advanced-0.8.36-py3-none-any.whl.metadata (21 kB)
Collecting albumentations==1.4.23 (from autotrain-advanced)
  Downloading albumentations-1.4.23-py3-none-any.whl.metadata (36 kB)
Collecting datasets~=3.2.0 (from datasets[vision]~=3.2.0->autotrain-advanced)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate==0.4.3 (from autotrain-advanced)
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting ipadic==1.0.0 (from autotrain-advanced)
  Downloading ipadic-1.0.0.tar.gz (13.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m139.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jiwer==3.0.5 (from autotrain-advanced)
  Downloading jiwer-3.0.5-py3-none-any.whl.metadata (2.7 kB)
Collecting joblib==1.4.2 (from autotrain-advanced)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collectin



In [3]:
# Create the directories for our two experiments
!mkdir -p /content/drive/MyDrive/reproducing_project/data_context_only
!mkdir -p /content/drive/MyDrive/reproducing_project/data_exp_ans

# Copy and RENAME our prepared files into the structures autotrain expects
print("--- Preparing data for Experiment A (Context-Only) ---")
!cp /content/drive/MyDrive/reproducing_project/data/splits/train_context_only.jsonl /content/drive/MyDrive/reproducing_project/data_context_only/train.jsonl
!cp /content/drive/MyDrive/reproducing_project/data/splits/dev_context_only.jsonl /content/drive/MyDrive/reproducing_project/data_context_only/valid.jsonl
!ls -lh /content/drive/MyDrive/reproducing_project/data_context_only/

print("\n--- Preparing data for Experiment B (Explain-and-Answer) ---")
!cp /content/drive/MyDrive/reproducing_project/data/splits/train_exp_ans.jsonl /content/drive/MyDrive/reproducing_project/data_exp_ans/train.jsonl
!cp /content/drive/MyDrive/reproducing_project/data/splits/dev_exp_ans.jsonl /content/drive/MyDrive/reproducing_project/data_exp_ans/valid.jsonl
!ls -lh /content/drive/MyDrive/reproducing_project/data_exp_ans/

--- Preparing data for Experiment A (Context-Only) ---
total 1.2M
-rw------- 1 root root 592K Nov  8 01:10 train.jsonl
-rw------- 1 root root 625K Nov  8 01:10 valid.jsonl

--- Preparing data for Experiment B (Explain-and-Answer) ---
total 1.4M
-rw------- 1 root root 643K Nov  8 01:10 train.jsonl
-rw------- 1 root root 700K Nov  8 01:10 valid.jsonl


In [4]:
!pip install huggingface_hub



In [5]:
# Log in to the Hugging Face Hub interactively (renders a token-entry widget).
# The training cells push the finetuned model to the Hub, so this must succeed
# before they are run. No token is stored in the notebook itself.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
!pip install transformers datasets peft accelerate bitsandbytes
!pip install huggingface_hub



In [None]:
import os
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType

# --- 1. CONFIGURATION ---
# !! CHANGE THIS !!
# Set this to your Hugging Face username
YOUR_USERNAME = "YOUR_HF_USERNAME"

# --- Model & Data ---
base_model_id = "google/flan-t5-base"
# Absolute path to the folder populated by the data-preparation cell. A path
# relative to the working directory would not find the files staged on Drive.
data_path = "/content/drive/MyDrive/reproducing_project/data_context_only/"
new_model_repo = f"{YOUR_USERNAME}/flan-t5-context-only"  # The name for your new model

# Truncation limits (in tokens) for the source text and the target text.
MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 256

# --- LoRA Config (from the paper) ---
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,          # This was lora_r = 16
    lora_alpha=32, # This was lora_alpha = 32
    lora_dropout=0.05
)

# --- Training Args (from the paper) ---
training_args = TrainingArguments(
    output_dir=new_model_repo,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    push_to_hub=True,
    logging_steps=10,
    # save and eval strategies must match for load_best_model_at_end.
    save_strategy="epoch",
    eval_strategy="epoch",  # current name of the former `evaluation_strategy`
    load_best_model_at_end=True,
    # Stated explicitly so the eval loss (used to pick the best checkpoint) is
    # computed even though the model is wrapped by PEFT.
    label_names=["labels"],
)

# --- 2. LOAD DATASET ---
# Load 'train' and 'valid' from our folder
raw_datasets = load_dataset('json', data_files={
    'train': os.path.join(data_path, 'train.jsonl'),
    'validation': os.path.join(data_path, 'valid.jsonl')
})

# --- 3. LOAD MODEL & TOKENIZER ---
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(base_model_id)

# --- 4. PREPROCESS (TOKENIZE) DATA ---
def preprocess_function(examples):
    """Tokenize one batch of examples for seq2seq training.

    Args:
        examples: A batch from the dataset with an 'input' column (source
            texts) and an 'output' column (target texts).

    Returns:
        The tokenized inputs, with the tokenized targets' ids stored under
        "labels". Sequences are truncated but not padded; padding is done per
        batch by the data collator.
    """
    # Tokenize inputs
    model_inputs = tokenizer(
        examples['input'], max_length=MAX_INPUT_LENGTH, truncation=True
    )

    # Tokenize labels (outputs); `text_target` replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples['output'], max_length=MAX_TARGET_LENGTH, truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Drop the raw string columns so only tensor-convertible features reach the
# collator (assumes both splits share the same columns).
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

# --- 5. APPLY LoRA ---
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# --- 6. TRAIN ---
# Pads inputs AND labels per batch; label padding uses -100 so padded positions
# are ignored by the loss. The default collator pads only the inputs, which
# fails as soon as a batch contains targets of different lengths.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,  # current name of the former `tokenizer` argument
)

print("--- STARTING TRAINING FOR EXPERIMENT A (Context-Only) ---")
trainer.train()

# --- 7. SAVE & PUSH TO HUB ---
print("--- SAVING MODEL A TO HUB ---")
trainer.push_to_hub()
print("--- EXPERIMENT A COMPLETE ---")

In [7]:
# Run the training for Experiment A
!autotrain llm --train \
  --project-name "flan-t5-context-only" \
  --model "google/flan-t5-base" \
  --data-path ./data_context_only/ \
  --use-peft \
  --learning-rate 2e-4 \
  --train_batch_size 4 \
  --num_train_epochs 3 \
  --trainer sft \
  --model_max_length 1024 \
  --text-column "input" \
  --target-column "output" \
  --push-to-hub

usage: autotrain <command> [<args>]
AutoTrain advanced CLI: error: unrecognized arguments: --use-peft --learning-rate 2e-4 --train_batch_size 4 --num_train_epochs 3 --target-column output
