In [None]:
# Python 3.10+
# %pip (rather than !pip) installs into the environment of the running kernel.
# The training cell below uses SFTConfig(max_length=...) and
# SFTTrainer(processing_class=...) and states it targets TRL 0.21.0, so the TRL
# lower bound is set to that release instead of the much older 0.9.6.
%pip install -U "transformers>=4.42.0" "datasets>=2.20.0" "trl>=0.21.0" \
  "peft>=0.17.0" bitsandbytes accelerate einops

# Optional logging
%pip install wandb



In [None]:
# run_before.py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

PROMPT = """Instruct: Turn the following note into a concise, professional email (≤100 words) that
1) accepts responsibility for a missed deadline,
2) proposes a new deadline (Friday),
3) lists 3 quick next steps as bullet points,
4) ends with a courteous sign-off.

Note: "Sorry, I didn't finish the report. Can we push it to Friday?"
Output:"""

# generate() below samples (do_sample=True); a fixed seed makes the printed
# completion repeatable when the cell is re-run.
torch.manual_seed(0)

# 4-bit NF4 quantization with double quantization; matmuls are computed in bfloat16.
bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
                         bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True)

tok = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2", quantization_config=bnb, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True
)

inputs = tok(PROMPT, return_tensors="pt").to(model.device)
out = model.generate(
    **inputs,
    max_new_tokens=180,
    temperature=0.2,
    top_p=0.9,
    do_sample=True,
    # Passing the pad id explicitly avoids the "Setting `pad_token_id` to
    # `eos_token_id`" notice that generate() otherwise logs on every call.
    pad_token_id=tok.eos_token_id,
)
# The decoded text contains the prompt followed by the model's completion.
print(tok.decode(out[0], skip_special_tokens=True))


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Instruct: Turn the following note into a concise, professional email (≤100 words) that
1) accepts responsibility for a missed deadline,
2) proposes a new deadline (Friday),
3) lists 3 quick next steps as bullet points,
4) ends with a courteous sign-off.

Note: "Sorry, I didn't finish the report. Can we push it to Friday?"
Output: Dear [Name],

I apologize for missing the deadline for the report. I understand that this may have caused inconvenience and I take full responsibility for it.

To rectify the situation, I propose that we push the deadline to Friday. This will allow us to complete the report and ensure that it meets the required standards.

In the meantime, I would like to outline three quick next steps:
1. Review the existing data and gather any additional information that may be needed.
2. Analyze the data and identify key insights.
3. Prepare a draft of the report and incorporate the findings.

I appreciate your understanding and cooperation. If you have any questions or req

In [None]:
# data_prep_oasst1.py
from datasets import load_dataset, DatasetDict
import pandas as pd

def build_pairs():
    """Build single-turn "Instruct/Output" training texts from OpenAssistant/oasst1.

    Messages are filtered to English, non-deleted rows whose ``review_result``
    is not ``False`` (a missing review result is kept). Every remaining
    assistant message whose parent is a remaining prompter message becomes one
    row of the form ``"Instruct: <prompt>\\nOutput: <answer>"``.

    Returns:
        DatasetDict with "train" and "validation" splits, each holding a
        single ``text`` column.
    """
    # Imported locally so the function does not depend on the `import datasets`
    # that only runs under the `__main__` guard of this file.
    from datasets import Dataset

    ds = load_dataset("OpenAssistant/oasst1")

    # Use the flat tables (already depth-first). Keep english, reviewed, not deleted.
    def to_df(split):
        df = pd.DataFrame(split)
        # `.ne(False)` is True for True and for missing values, i.e. the same
        # rows `.fillna(True)` kept, but it yields a real boolean mask and
        # skips the fillna call that triggered a warning in the recorded run
        # (presumably pandas' object-dtype downcasting FutureWarning).
        keep = (df["lang"] == "en") & ~df["deleted"] & df["review_result"].ne(False)
        return df[keep]

    # Index messages for fast parent lookup (message_id -> row dict).
    def index_by_id(df):
        return dict(zip(df["message_id"], df.to_dict("records")))

    def make_pairs(df, table):
        rows = []
        for r in df.itertuples(index=False):
            if r.role != "assistant":
                continue
            # The parent must have survived filtering and be a user prompt.
            p = table.get(r.parent_id)
            if not p or p["role"] != "prompter":
                continue
            prompt = p["text"].strip()
            answer = r.text.strip()
            rows.append({"text": f"Instruct: {prompt}\nOutput: {answer}"})
        # Naming the column keeps the `text` feature even when no pair matched.
        return pd.DataFrame(rows, columns=["text"])

    train_df = to_df(ds["train"])
    val_df = to_df(ds["validation"])

    train_pairs = make_pairs(train_df, index_by_id(train_df))
    val_pairs = make_pairs(val_df, index_by_id(val_df))

    return DatasetDict({
        "train": Dataset.from_pandas(train_pairs, preserve_index=False),
        "validation": Dataset.from_pandas(val_pairs, preserve_index=False),
    })

if __name__ == "__main__":
    import datasets, os

    # Build both splits, then write each one as JSON Lines under ./data.
    pairs = build_pairs()
    os.makedirs("data", exist_ok=True)
    for split_name, out_path in (("train", "data/train.jsonl"), ("validation", "data/val.jsonl")):
        pairs[split_name].to_json(out_path, lines=True)
    print(pairs)


  df = df[(df["lang"]=="en") & (~df["deleted"]) & (df["review_result"].fillna(True))]


Creating json from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 23085
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1185
    })
})


In [None]:
# train_qlora_fast.py — TRL 0.21.0, Phi-2, QLoRA, T4-safe + periodic previews
import os, logging, torch, math
from datasets import load_dataset
from transformers import AutoTokenizer, BitsAndBytesConfig, TrainerCallback  # <-- fixed import
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig

# ----------------- knobs you care about -----------------
# Hub id of the base model; also used to load the tokenizer.
MODEL_ID   = "microsoft/phi-2"
# Directory the trained adapter and the tokenizer are saved to after training.
OUT_DIR    = "phi2-oasst1-qlora"
TRAIN_JSON = "data/train.jsonl"   # {"text": "..."} lines
# Loaded together with the train file, but no eval dataset is passed to the trainer.
VAL_JSON   = "data/val.jsonl"     # unused during training (eval disabled)
# Passed to SFTConfig(max_length=...).
MAX_LENGTH = 256                  # sequence length (good for T4 VRAM)
# Only the first MAX_TRAIN_EXAMPLES rows are kept when the filtered train split is larger.
# The recorded run had 23,058 train examples, so this cap did not trigger there.
MAX_TRAIN_EXAMPLES = 32000         # cap dataset size to finish in ~3–5h
PREVIEW_EVERY = 100               # print a generation every N optimizer steps

# Same prompt before/after to see improvements
# It ends with "Output:" so the model's completion is the email itself; the
# periodic preview splits the decoded text on that marker.
PREVIEW_PROMPT = """Instruct: Turn the following note into a concise, professional email (≤100 words) that
1) accepts responsibility for a missed deadline,
2) proposes a new deadline (Friday),
3) lists 3 quick next steps as bullet points,
4) ends with a courteous sign-off.

Note: "Sorry, I didn't finish the report. Can we push it to Friday?"
Output:"""

# ----------------- logging -----------------
# Root logger at INFO with timestamped records; `log` is this script's logger.
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
log = logging.getLogger("phi2_qlora_fast")

# Report the active GPU and its total memory, or warn when there is none.
if not torch.cuda.is_available():
    log.warning("CUDA not available; training on CPU will be very slow.")
else:
    gpu = torch.cuda.get_device_properties(torch.cuda.current_device())
    log.info(f"CUDA device: {gpu.name} | VRAM {(gpu.total_memory/1e9):.2f} GB")

# ----------------- tokenizer -----------------
tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Reuse EOS as the padding token and pad on the right.
tok.pad_token = tok.eos_token
tok.padding_side = "right"

# ----------------- data -----------------
log.info("Loading datasets…")
ds = load_dataset("json", data_files={"train": TRAIN_JSON, "validation": VAL_JSON})

def within_ctx(e):
    """Return True when the example's tokenized text is at most 2047 tokens."""
    # keep a 1-token cushion under Phi-2 context (2048) to avoid EOS overflow warnings
    n_tokens = len(tok(e["text"]).input_ids)
    return n_tokens <= 2047

log.info("Filtering rare over-length samples…")
train_split = ds["train"].filter(within_ctx)

# Subsample to control wall-time: keep only the first MAX_TRAIN_EXAMPLES rows.
if len(train_split) > MAX_TRAIN_EXAMPLES:
    train_split = train_split.select(range(MAX_TRAIN_EXAMPLES))
ds["train"] = train_split
log.info(f"Train examples: {len(ds['train'])}")

# Sanity: show one truncated example, with newlines escaped so it stays on one log line.
first_text = ds["train"][0]["text"]
log.info("Sample (truncated): " + first_text[:180].replace("\n", "\\n") + "...")

# ----------------- 4-bit quant + LoRA -----------------
# bitsandbytes settings for loading the base model: 4-bit NF4 weights with
# double quantization enabled; compute dtype is float16.
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,  # fp16 compute on T4
)

# LoRA adapter settings: rank 32, alpha 16, 5% adapter dropout, no bias training.
# "all-linear" targets every linear layer rather than a hand-picked list; the
# recorded run reported 47,185,920 trainable parameters with this configuration.
peft_cfg = LoraConfig(
    r=32, lora_alpha=16, lora_dropout=0.05,
    bias="none", task_type="CAUSAL_LM",
    target_modules="all-linear",
)

# ----------------- SFT config (speed-biased) -----------------
# One epoch, no evaluation and no intermediate checkpoints: nothing is written
# to OUT_DIR by the trainer itself until the explicit save after training.
cfg = SFTConfig(
    output_dir=OUT_DIR,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,        # effective batch = 16 per step (single GPU)
    learning_rate=2e-4,
    num_train_epochs=1,                   # single pass
    # cut overhead:
    eval_strategy="no",                   # no dev pass during train
    save_strategy="no",                   # no mid-run checkpoints
    logging_steps=25,
    log_level="info",
    dataloader_num_workers=2,
    dataloader_pin_memory=True,

    # precision & memory:
    fp16=True,                            # faster on T4
    bf16=False,
    gradient_checkpointing=False,         # speed > memory; set True if you OOM

    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,

    # sequence handling:
    max_length=MAX_LENGTH,
    packing=False,                        # avoid FA2 requirement on T4

    report_to="none",
    # Load options for the base model, which is passed to the trainer by id
    # (a string) below rather than as an already-loaded model object.
    model_init_kwargs={
        "quantization_config": bnb,
        "device_map": "auto",
        "torch_dtype": torch.float16,
        "trust_remote_code": True,
        "attn_implementation": "sdpa",    # T4-friendly attention
    },
)

# The trainer receives the model id, the SFT config, the train split, the
# tokenizer and the LoRA config; no eval dataset is supplied.
trainer = SFTTrainer(
    model=MODEL_ID,
    args=cfg,
    train_dataset=ds["train"],
    eval_dataset=None,                    # eval disabled above
    processing_class=tok,                 # tokenizer
    peft_config=peft_cfg,
)

# ----------------- preflight sanity (dry forward + "before" generation) -----------------
def preflight_check():
    """Run one training batch through the model and log a step-0 generation.

    The forward pass (under ``torch.no_grad``) checks that a batch from the
    trainer's dataloader is accepted by the model. The generation logs what the
    model produces for ``PREVIEW_PROMPT`` before any training step.

    The model is put in eval mode for both checks and is always switched back
    to train mode afterwards, even if one of the checks raises, so a failed
    preflight does not leave the live model in eval mode.
    """
    model = trainer.model
    model.eval()
    try:
        log.info("Preflight: taking one batch through the model…")
        batch = next(iter(trainer.get_train_dataloader()))
        # Move every tensor-like entry to the model's device; leave the rest as is.
        batch = {k: (v.to(model.device) if hasattr(v, "to") else v) for k, v in batch.items()}
        with torch.no_grad():
            _ = model(**batch)  # ensure loss can be computed
        log.info("Preflight forward pass OK.")

        # "Before" generation — LoRA init is near-zero; outputs approximate base model
        log.info("=== BEFORE (step 0) preview ===")
        inputs = tok(PREVIEW_PROMPT, return_tensors="pt").to(model.device)
        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=150,
                temperature=0.2,
                top_p=0.9,
                do_sample=True,
                # Explicit pad id avoids the "Setting `pad_token_id` to
                # `eos_token_id`" notice logged by generate().
                pad_token_id=tok.pad_token_id,
            )
        log.info(tok.decode(out[0], skip_special_tokens=True))
    finally:
        model.train()

preflight_check()

# ----------------- periodic generation callback -----------------
class GenerationPreviewCallback(TrainerCallback):
    """Log a sampled completion for a fixed prompt every ``every`` optimizer steps.

    Args:
        tok: tokenizer used to encode the prompt and decode the generation.
        prompt: prompt text; only what follows the first "Output:" in the
            decoded text is logged.
        every: preview interval in optimizer steps. A falsy or negative value
            disables previews instead of raising on the modulo.
        max_new: maximum number of new tokens to generate per preview.
    """

    def __init__(self, tok, prompt, every=100, max_new=150):
        self.tok, self.prompt, self.every, self.max_new = tok, prompt, every, max_new

    def on_step_end(self, args, state, control, **kwargs):
        """Generate and log a preview when ``state.global_step`` hits the interval."""
        step = state.global_step
        if not self.every or self.every < 0 or step == 0 or step % self.every != 0:
            return

        model = kwargs["model"]
        config = model.config
        # Remember whether use_cache existed so it can be put back exactly,
        # including a previous value of None.
        had_use_cache = hasattr(config, "use_cache")
        prev_use_cache = getattr(config, "use_cache", None)

        model.eval()
        try:
            config.use_cache = True
            inputs = self.tok(self.prompt, return_tensors="pt").to(model.device)
            with torch.no_grad():
                out = model.generate(
                    **inputs,
                    max_new_tokens=self.max_new,
                    temperature=0.2,
                    top_p=0.9,
                    do_sample=True,
                    # Explicit pad id avoids one "Setting `pad_token_id` to
                    # `eos_token_id`" notice per preview in the training log.
                    pad_token_id=self.tok.pad_token_id,
                )
            text = self.tok.decode(out[0], skip_special_tokens=True)
            snippet = text.split("Output:", 1)[-1].strip()
            log.info(f"=== PREVIEW @ step {state.global_step} ===\n{snippet}\n")
        finally:
            # Always hand the model back to the trainer in its training state.
            if had_use_cache:
                config.use_cache = prev_use_cache
            model.train()

trainer.add_callback(GenerationPreviewCallback(tok, PREVIEW_PROMPT, every=PREVIEW_EVERY))

# ----------------- train -----------------
log.info("Starting training…")
trainer.train()

# ----------------- save + final preview -----------------
# Saves the trained model via save_pretrained and the tokenizer into OUT_DIR.
log.info("Saving adapter and tokenizer…")
trainer.model.save_pretrained(OUT_DIR)
tok.save_pretrained(OUT_DIR)
log.info(f"Saved to: {OUT_DIR}")

log.info("=== AFTER (final) preview ===")
inputs = tok(PREVIEW_PROMPT, return_tensors="pt").to(trainer.model.device)
trainer.model.eval()
with torch.no_grad():
    out = trainer.model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.2,
        top_p=0.9,
        do_sample=True,
        # Explicit pad id avoids the "Setting `pad_token_id` to `eos_token_id`"
        # notice logged by generate().
        pad_token_id=tok.pad_token_id,
    )
log.info(tok.decode(out[0], skip_special_tokens=True))


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/23085 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2851 > 2048). Running this sequence through the model will result in indexing errors


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Adding EOS to train dataset:   0%|          | 0/23058 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/23058 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/23058 [00:00<?, ? examples/s]

Using auto half precision backend
The following columns in the Training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text. If text are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The following columns in the Training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text. If text are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
skipped Embedding(51200, 2560): 125.0M params
skipped: 125.0M params
***** Running training *****
  Num examples = 23,058
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 8
  Total optimization steps = 1,442
  Number of trainable parameters = 47,185,920


Step,Training Loss
25,1.8693
50,1.7451
75,1.7207
100,1.6957
125,1.6001
150,1.7207
175,1.6511
200,1.6299
225,1.6091
250,1.6414


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

In [None]:
# run_after.py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

PROMPT = """Instruct: Turn the following note into a concise, professional email (≤100 words) that
1) accepts responsibility for a missed deadline,
2) proposes a new deadline (Friday),
3) lists 3 quick next steps as bullet points,
4) ends with a courteous sign-off.

Note: "Sorry, I didn't finish the report. Can we push it to Friday?"
Output:"""

# Directory holding the saved LoRA adapter.
ADAPTER_DIR = "phi2-oasst1-qlora"

# generate() below samples (do_sample=True); a fixed seed makes the printed
# completion repeatable when the cell is re-run.
torch.manual_seed(0)

# 4-bit NF4 quantization with double quantization; matmuls are computed in bfloat16.
bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
                         bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True)

tok = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
base = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2", quantization_config=bnb, device_map="auto",
    torch_dtype=torch.bfloat16, trust_remote_code=True
)
# Attach the trained adapter on top of the quantized base model.
model = PeftModel.from_pretrained(base, ADAPTER_DIR)
# Make sure every module, including the freshly attached adapter layers, is in
# eval mode so dropout cannot perturb generation.
model.eval()

inputs = tok(PROMPT, return_tensors="pt").to(model.device)
out = model.generate(
    **inputs,
    max_new_tokens=180,
    temperature=0.2,
    top_p=0.9,
    do_sample=True,
    # Passing the pad id explicitly avoids the "Setting `pad_token_id` to
    # `eos_token_id`" notice that generate() otherwise logs on every call.
    pad_token_id=tok.eos_token_id,
)
# The decoded text contains the prompt followed by the model's completion.
print(tok.decode(out[0], skip_special_tokens=True))


loading file vocab.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-2/snapshots/ef382358ec9e382308935a992d908de099b64c23/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--microsoft--phi-2/snapshots/ef382358ec9e382308935a992d908de099b64c23/merges.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-2/snapshots/ef382358ec9e382308935a992d908de099b64c23/tokenizer.json
loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-2/snapshots/ef382358ec9e382308935a992d908de099b64c23/added_tokens.json
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-2/snapshots/ef382358ec9e382308935a992d908de099b64c23/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-2/snapshots/ef382358ec9e382308935a992d908de099b64c23/tokenizer_config.json
loadin

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing PhiForCausalLM.

All the weights of PhiForCausalLM were initialized from the model checkpoint at microsoft/phi-2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use PhiForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-2/snapshots/ef382358ec9e382308935a992d908de099b64c23/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256
}

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Instruct: Turn the following note into a concise, professional email (≤100 words) that
1) accepts responsibility for a missed deadline,
2) proposes a new deadline (Friday),
3) lists 3 quick next steps as bullet points,
4) ends with a courteous sign-off.

Note: "Sorry, I didn't finish the report. Can we push it to Friday?"
Output: Dear [Recipient],

I am writing to apologize for missing the deadline for the report. I understand that this is an important project and I take full responsibility for the delay.

To make up for the delay, I propose that we push the deadline to Friday. I have already taken the following steps to ensure that the report is completed on time:

1. I have reviewed the report and made the necessary revisions.
2. I have reached out to the necessary stakeholders to gather any additional information.
3. I have scheduled a meeting with the team to discuss the report and ensure that everyone is on the same page.

I understand that this delay may have caused some inconven