In [None]:
import torch, platform, sys, subprocess

# Report the interpreter, OS and PyTorch build this notebook is running on.
for label, value in (
    ("Python:", sys.version),
    ("OS:", platform.platform()),
    ("Torch:", torch.__version__),
):
    print(label, value)

cuda_ok = torch.cuda.is_available()
print("CUDA?", cuda_ok)
if cuda_ok:
    # Only device 0 is described.
    print("GPU:", torch.cuda.get_device_name(0))
    print("CUDA capability:", torch.cuda.get_device_capability(0))

# Best-effort driver report: any failure (missing binary, non-zero exit, ...)
# is printed instead of stopping the notebook.
try:
    smi_report = subprocess.run(
        ["nvidia-smi"], stdout=subprocess.PIPE, text=True, check=True
    ).stdout
    print("\n=== nvidia-smi ===\n", smi_report)
except Exception as err:
    print("nvidia-smi not available:", err)

In [None]:
import json, pathlib, random, uuid

# Filesystem layout, relative to the working directory:
#   data/raw/         one JSON file per source exam
#   data/train.jsonl  generated training records
DATA_DIR = pathlib.Path("data")
RAW = DATA_DIR.joinpath("raw")
OUT = DATA_DIR.joinpath("train.jsonl")

# Creating RAW with parents also creates DATA_DIR; a no-op if both already exist.
RAW.mkdir(exist_ok=True, parents=True)

def controls_to_prompt(ctrl):
    """Serialize a controls mapping into a single ``key=value; ...`` string.

    Args:
        ctrl: Mapping with the keys ``topics`` (iterable of str, joined with
            ``,``), ``difficulty``, ``length`` and ``format`` (iterable of
            str, joined with ``+``).

    Returns:
        The four fields rendered as ``key=value`` and joined with ``"; "``,
        in the order topics, difficulty, length, format.

    Raises:
        KeyError: If one of the four keys is missing.
    """
    fields = (
        ("topics", ",".join(ctrl["topics"])),
        ("difficulty", ctrl["difficulty"]),
        ("length", ctrl["length"]),
        ("format", "+".join(ctrl["format"])),
    )
    return "; ".join(f"{name}={value}" for name, value in fields)

def load_api_like_examples(raw_dir=None):
    """Load every ``*.json`` file of a directory as one exam each.

    Files are read in sorted path order. ``Path.glob`` yields entries in an
    arbitrary, filesystem-dependent order, so without sorting the sequence of
    exams (and anything derived from their position, such as seeded random
    draws made per exam) could differ between runs and machines.

    Args:
        raw_dir: Directory to scan. Defaults to the module-level ``RAW``
            directory when omitted.

    Returns:
        A list with the parsed JSON content of each file, ordered by path.

    Raises:
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    source_dir = RAW if raw_dir is None else pathlib.Path(raw_dir)
    exams = []
    for path in sorted(source_dir.glob("*.json")):
        with open(path, "r", encoding="utf-8") as fh:
            exams.append(json.load(fh))
    return exams

def normalize_exam(exam_items):
    """Render a list of question dicts as plain text, one question per line.

    Each question becomes ``"<label>. <text>"`` followed by one ``"- <option>"``
    line per option.

    The label is the first of ``id``, ``qid``, ``number`` that is present
    (neither ``None`` nor an empty string); ``"?"`` is used when none is. A
    numeric id of ``0`` counts as present, so zero-based numbering is kept
    instead of being replaced by ``"?"``.

    The text is the first truthy value of ``text`` / ``question``, else empty.

    Args:
        exam_items: Iterable of dicts describing questions.

    Returns:
        The rendered lines joined with newlines, stripped of outer whitespace.
        An empty iterable yields ``""``.
    """
    lines = []
    for item in exam_items:
        label = "?"
        for key in ("id", "qid", "number"):
            candidate = item.get(key)
            # Explicit presence test: plain truthiness would discard id 0.
            if candidate is not None and candidate != "":
                label = candidate
                break
        text = item.get("text") or item.get("question") or ""
        # strip() drops the trailing space left behind when text is empty.
        lines.append(f"{label}. {text}".strip())
        lines.extend(f"- {opt}" for opt in item.get("options") or ())
    return "\n".join(lines).strip()

# One hand-written exam (a list holding a single question) that is written
# into RAW on every run.
# NOTE(review): "type" is "open_answer" although "options" are supplied —
# confirm which of the two is intended.
placeholder_exam = [
    {
        "id": 1,
        "text": "Solve for x: 2x + 3 = 11",
        "type": "open_answer",
        "options": ["x=3", "x=4", "x=5", "x=2"],
        "answer": "x=4",
        "subquestions": None,
    }
]
placeholder_path = RAW / "placeholder.json"
placeholder_json = json.dumps(placeholder_exam, indent=2, ensure_ascii=False)
placeholder_path.write_text(placeholder_json, encoding="utf-8")

# Fixed seed so the control values drawn below repeat from run to run.
random.seed(7)
exams = load_api_like_examples()

# Candidate values for the control fields; one of each is drawn per exam.
topic_pools = [
    ["algebra", "linear-equations"],
    ["calculus", "derivatives"],
    ["physics", "kinematics"],
    ["biology", "photosynthesis"],
]
difficulties = ["easy", "medium", "hard"]
formats = [
    ["multiple_choice"],
    ["multiple_choice", "short"],
    ["multiple_choice", "long"],
    ["open_answer"],
]

records = []
for ex in exams:
    # The draw order (topics, difficulty, length, format) fixes the seeded
    # sequence, so it must stay as is.
    ctrl = {}
    ctrl["topics"] = random.choice(topic_pools)
    ctrl["difficulty"] = random.choice(difficulties)
    ctrl["length"] = random.choice([8, 10, 12])
    ctrl["format"] = random.choice(formats)
    prompt = controls_to_prompt(ctrl)

    # NOTE(review): "input" and "output" carry the same normalized text (the
    # input only adds a header line), so the target mirrors the input —
    # confirm this is intended.
    exam_text = normalize_exam(ex)
    records.append(
        {
            "id": str(uuid.uuid4()),
            "prompt": prompt,
            "input": "Exam format:\n" + exam_text,
            "output": exam_text,
        }
    )

# One JSON document per line, newline-separated.
jsonl_lines = [json.dumps(rec, ensure_ascii=False) for rec in records]
OUT.write_text("\n".join(jsonl_lines), encoding="utf-8")
len(records), str(OUT)

In [None]:
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5TokenizerFast, DataCollatorForSeq2Seq, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
import torch

# Identifier of the pretrained checkpoint to fine-tune.
MODEL_NAME: str = "t5-small"
# JSONL file holding the training records.
DATA_PATH: str = "data/train.jsonl"
# Directory that receives training output and the saved model/tokenizer.
OUT_DIR: str = "out-t5-lora"

def format_example(ex):
    """Build the source/target text pair for one training record.

    Args:
        ex: Record with the keys ``prompt``, ``input`` and ``output``.

    Returns:
        ``{"src": ..., "tgt": ...}`` where ``src`` is the controls line, the
        exemplar section and the task line separated by blank lines, and
        ``tgt`` is the record's ``output`` unchanged.
    """
    source_text = (
        f"controls: {ex['prompt']}"
        "\n\n"
        f"exemplars:\n{ex['input']}"
        "\n\n"
        "# task: generate new exam as JSON"
    )
    return dict(src=source_text, tgt=ex["output"])

# Tokenizer and base model both come from the same pretrained checkpoint.
# NOTE(review): the stock T5 SentencePiece vocabulary reportedly lacks pieces
# for "{", "}" and newlines — verify with tok.tokenize("{\n}") before
# expecting the model to emit multi-line JSON.
tok = T5TokenizerFast.from_pretrained(MODEL_NAME)
base = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# LoRA adapter configuration, applied to the modules named "q" and "v".
lora = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q", "v"],
)
model = get_peft_model(base, lora)

# Load the JSONL records and add the "src"/"tgt" text columns to each one.
ds = load_dataset("json", data_files=DATA_PATH, split="train").map(format_example)

# Truncation limits, in tokens, for source and target sequences.
# NOTE(review): 1024 may exceed the tokenizer's advertised model_max_length —
# confirm this is intended.
max_src_len = max_tgt_len = 1024

def tok_map(batch):
    """Tokenize a batch of ``src``/``tgt`` texts into model inputs and labels.

    Args:
        batch: Mapping with ``src`` and ``tgt`` entries, as passed by a
            batched ``Dataset.map`` call.

    Returns:
        The tokenizer output for ``src`` (truncated to ``max_src_len``) with an
        added ``labels`` entry holding the ``input_ids`` of ``tgt`` (truncated
        to ``max_tgt_len``).
    """
    encoded = tok(batch["src"], truncation=True, max_length=max_src_len)
    target_ids = tok(batch["tgt"], truncation=True, max_length=max_tgt_len)["input_ids"]
    encoded["labels"] = target_ids
    return encoded

# Replace the text columns with token ids; every pre-existing column is dropped.
ds = ds.map(tok_map, batched=True, remove_columns=ds.column_names)
collator = DataCollatorForSeq2Seq(tokenizer=tok, model=model)

# Having a CUDA device does not imply bf16 support, and requesting bf16 on a
# device without it makes TrainingArguments reject the configuration. Enable
# bf16 only when the device reports support; otherwise train in full precision
# (fp16 stays off, as before).
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# Effective batch size per device: 2 samples x 8 accumulation steps = 16.
args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    logging_steps=10,
    save_strategy="epoch",
    bf16=use_bf16,
    fp16=False,
    optim="adamw_torch",
    report_to="none",
)

trainer = Trainer(model=model, args=args, train_dataset=ds, data_collator=collator)

trainer.train()

# Save the trained model and its tokenizer side by side in OUT_DIR.
model.save_pretrained(OUT_DIR)
tok.save_pretrained(OUT_DIR)
print("Dumped", OUT_DIR)

In [None]:
import json, re
from transformers import T5ForConditionalGeneration, T5TokenizerFast
from jsonschema import Draft7Validator

# Directory holding the saved model and tokenizer to load.
CKPT = "out-t5-lora"

# Schema of the exam "metadata" object: all four fields are mandatory.
_metadata_schema = {
    "type": "object",
    "properties": {
        "topics": {"type": "array", "items": {"type": "string"}},
        "difficulty": {"type": "string"},
        "length": {"type": "integer"},
        "format": {"type": "array", "items": {"type": "string"}},
    },
    "required": ["topics", "difficulty", "length", "format"],
}

# Schema of a single question: "options" is the only optional field.
_question_schema = {
    "type": "object",
    "properties": {
        "id": {"type": "integer"},
        "text": {"type": "string"},
        "type": {"type": "string"},
        "options": {"type": "array", "items": {"type": "string"}},
    },
    "required": ["id", "text", "type"],
}

# JSON Schema for a complete exam: a metadata object plus a list of questions.
exam_schema = {
    "type": "object",
    "properties": {
        "metadata": _metadata_schema,
        "questions": {"type": "array", "items": _question_schema},
    },
    "required": ["metadata", "questions"],
}

# Load the tokenizer and model saved under CKPT for inference.
# NOTE(review): if CKPT holds only LoRA adapter weights (as written by a PEFT
# model's save_pretrained), loading it through the plain T5 class presumably
# relies on the transformers/PEFT integration — confirm with the installed
# versions, or load the adapter explicitly.
tok = T5TokenizerFast.from_pretrained(CKPT)
model = T5ForConditionalGeneration.from_pretrained(CKPT)

def generate_json_exam(topics, difficulty, length, fmt, retries=2):
    """Generate an exam with the loaded model and return it as validated JSON.

    Builds an instruction prompt from the controls, runs beam-search
    generation with the module-level ``tok`` and ``model``, extracts a JSON
    object from the decoded text and validates it against ``exam_schema``.
    After each failed attempt a reminder is appended to the prompt before
    retrying.

    Args:
        topics: Iterable of topic strings, joined with ``,`` in the prompt.
        difficulty: Difficulty label inserted into the prompt.
        length: Requested number of questions, inserted into the prompt.
        fmt: Iterable of format strings, joined with ``+`` in the prompt.
        retries: Number of extra attempts after the first one.

    Returns:
        ``(obj, errs, text)``: the parsed exam, the list of validation errors
        (always empty on success) and the raw decoded model output.

    Raises:
        ValueError: If no attempt yields schema-valid JSON with at least one
            question. The message ends with the reason the last attempt failed.
    """
    prompt = f"topics={','.join(topics)}; difficulty={difficulty}; length={length}; format={'+'.join(fmt)}"
    exemplar = """\
    ```json
    {
      "metadata": {
        "topics": ["algebra","linear-equations"],
        "difficulty": "medium",
        "length": 2,
        "format": ["mcq"]
      },
      "questions": [
        {"id": 1, "text": "Solve for x: 3x - 5 = 16.", "type": "mcq",
         "options": ["x = 7", "x = 6", "x = 5", "x = 4"]},
        {"id": 2, "text": "Which is linear?", "type": "mcq",
         "options": ["y = 2x + 1", "y = x^2", "y = sin x", "y = 2^x"]}
      ]
    }
    ```"""
    src = (
        "controls: " + prompt + "\n\n"
        "You are an exam generator. Output MUST be a single fenced JSON block and NOTHING else.\n"
        "Keys: metadata{topics,difficulty,length,format}, questions[list of {id,text,type,options?}].\n"
        "No prose, no headings, no backticks outside the JSON fence. Do not echo the prompt.\n\n"
        "Example format:\n" + exemplar + "\n\n"
        "# task: generate a new exam in JSON for the given controls"
    )

    # The schema is fixed, so one validator serves every attempt.
    validator = Draft7Validator(exam_schema)
    last_failure = "no generation attempt was made"

    for _ in range(retries + 1):
        inp = tok(src, return_tensors="pt").to(model.device)
        out = model.generate(
            **inp, max_new_tokens=900,
            do_sample=False, num_beams=4, length_penalty=0.8, early_stopping=True
        )
        text = tok.decode(out[0], skip_special_tokens=True).strip()

        # Prefer a fenced ```json block; otherwise accept output that is a
        # bare JSON object from first to last character.
        m = re.search(r"```json\s*(\{[\s\S]*?\})\s*```", text, flags=re.IGNORECASE)
        if m:
            json_text = m.group(1)
        elif text.startswith("{") and text.endswith("}"):
            json_text = text
        else:
            json_text = ""

        if not json_text:
            last_failure = "output contained no JSON object"
        else:
            try:
                obj = json.loads(json_text)
            except (ValueError, RecursionError) as exc:
                # Malformed JSON is an expected model failure: record and retry.
                last_failure = f"output was not parseable JSON ({exc})"
            else:
                # Error paths mix str keys and int indexes; compare them as
                # strings so sorting cannot raise TypeError.
                errs = sorted(
                    validator.iter_errors(obj),
                    key=lambda e: [str(part) for part in e.path],
                )
                if errs:
                    last_failure = f"schema validation failed: {errs[0].message}"
                elif not obj.get("questions"):
                    last_failure = "exam contained no questions"
                else:
                    return obj, errs, text
        src += "\n\nRemember: Output only fenced JSON; do not echo the controls."

    raise ValueError(
        "Model did not return valid fenced JSON. Try lowering creativity or improving training data."
        f" Last failure: {last_failure}"
    )

# Demo: request a hard, 16-question algebra exam and show what came back.
obj, errs, raw_text = generate_json_exam(
    topics=["algebra", "linear-equations"],
    difficulty="hard",
    length=16,
    fmt=["mcq", "short"],
)

print("Raw out:", raw_text, sep="\n")

print("\nValidation errors:", len(errs))
# Show at most the first five errors, each with its path inside the document.
for err in errs[:5]:
    location = "/".join(str(part) for part in err.path)
    print("-", location, ":", err.message)

print("\nJSON:", json.dumps(obj, indent=2, ensure_ascii=False), sep="\n")