In [3]:


import time
start_time = time.time()

def print_time(message):
    elapsed = time.time() - start_time
    print(f"[{elapsed//60:.0f}m{elapsed%60:.0f}s] {message}")

# 1. INSTALLATION
!pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
print_time("Installation complete")

from unsloth import FastLanguageModel
import torch

print_time(f"torch: {torch.__version__}, CUDA: {torch.cuda.is_available()}")

# 2. Model
max_seq_length = 2048
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
print_time(f"Model loaded (seq_len={max_seq_length})")

# 3. LOADING DATASET
from datasets import load_dataset

full_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="train")
print_time(f"Dataset loaded: {len(full_dataset)} samples")

shuffled = full_dataset.shuffle(seed=42)
train_size = 50000
val_size = 5000

train_dataset = shuffled.select(range(train_size))
validation_dataset = shuffled.select(range(train_size, train_size + val_size))

print_time(f"Split - Train: {len(train_dataset)}, Val: {len(validation_dataset)}")

true_count = sum(1 for ex in train_dataset if ex["is_correct"])
false_count = len(train_dataset) - true_count
print(f"Data balance - True: {true_count} ({100*true_count/len(train_dataset):.1f}%), False: {false_count}")

# 4. FORMATTING DATA

training_prompt = """Check if the solution is correct.

Question: {}
Solution: {}
Answer: {}

Correct?
Output:"""

EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    texts = []
    for q, s, a, o in zip(examples["question"], examples["solution"],
                          examples["answer"], examples["is_correct"]):
        output_str = "True" if o else "False"
        text = training_prompt.format(str(q).strip(), str(s).strip(), str(a).strip())
        texts.append(text + f" {output_str}{EOS_TOKEN}")
    return {"text": texts}

formatted_train = train_dataset.map(formatting_prompts_func, batched=True,
                                     remove_columns=train_dataset.column_names)
formatted_val = validation_dataset.map(formatting_prompts_func, batched=True,
                                        remove_columns=validation_dataset.column_names)

print_time("Data formatted")
print(f"Sample prompt length: {len(formatted_train[0]['text'])} chars")

# ==================== 5. CONFIGURING LORA ====================

model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 32,
    lora_dropout = 0.05,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 42,
)

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print_time(f"LoRA configured - Trainable: {trainable:,} ({100*trainable/total:.2f}%)")

# 6. TRAINING SETUP
from trl import SFTTrainer
from transformers import TrainingArguments
import os

os.makedirs("outputs", exist_ok=True)

batch_size = 2
grad_accum = 4
effective_batch = batch_size * grad_accum
max_steps = 1000

estimated_time = max_steps * 0.5 / 60
print_time(f"Training config:")
print(f"  Batch size: {batch_size}")
print(f"  Grad accumulation: {grad_accum}")
print(f"  Effective batch: {effective_batch}")
print(f"  Max steps: {max_steps}")
print(f"  Estimated time: {estimated_time:.1f} minutes")

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = formatted_train,
    eval_dataset = formatted_val,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    args = TrainingArguments(
        per_device_train_batch_size = batch_size,
        gradient_accumulation_steps = grad_accum,
        warmup_steps = 50,
        max_steps = max_steps,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 100,
        eval_strategy = "steps",
        eval_steps = 200,
        save_strategy = "steps",
        save_steps = 500,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 42,
        output_dir = "outputs",
        report_to = "none",
    ),
)

# 7. Starting training

trainer_stats = trainer.train()

train_time = trainer_stats.metrics['train_runtime']
print_time(f"Training completed in {train_time:.0f}s ({train_time/60:.1f}min)")
print(f"Final train loss: {trainer_stats.metrics.get('train_loss', 'N/A')}")

# 8. VALIDATION

FastLanguageModel.for_inference(model)

inference_prompt = """Check if the solution is correct.

Question: {}
Solution: {}
Answer: {}

Correct?
Output:"""

def parse_output(text):
    try:
        part = text.split("Output:")[-1].strip().lower()
        if "true" in part[:100]:
            return True
        elif "false" in part[:100]:
            return False
        return "true" in part
    except:
        return False

print_time("Quick test on 10 samples...")
for i in range(10):
    ex = validation_dataset[i]
    prompt = inference_prompt.format(ex["question"], str(ex["solution"]), str(ex["answer"]))
    inputs = tokenizer([prompt], return_tensors="pt", truncation=True, max_length=max_seq_length).to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=10, use_cache=True, temperature=0.1)
    pred = parse_output(tokenizer.decode(outputs[0]))
    match = "OK" if pred == ex["is_correct"] else "MISS"
    print(f"  Sample {i+1}: {match} Pred={pred}, Actual={ex['is_correct']}")

print_time("Full validation on 500 samples...")
correct = 0
total = 500
predictions_val = []
actuals_val = []

for i in range(total):
    ex = validation_dataset[i]
    prompt = inference_prompt.format(ex["question"], str(ex["solution"]), str(ex["answer"]))
    inputs = tokenizer([prompt], return_tensors="pt", truncation=True, max_length=max_seq_length).to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=10, use_cache=True, temperature=0.1)
    pred = parse_output(tokenizer.decode(outputs[0]))
    predictions_val.append(pred)
    actuals_val.append(ex["is_correct"])

    if pred == ex["is_correct"]:
        correct += 1

accuracy = correct / total
pred_true_ratio = sum(predictions_val) / len(predictions_val)
actual_true_ratio = sum(actuals_val) / len(actuals_val)

print_time("Validation complete")
print(f"\n{'='*60}")
print(f"Validation Accuracy: {accuracy:.4f} ({correct}/{total})")
print(f"Prediction distribution - True: {100*pred_true_ratio:.1f}%, False: {100*(1-pred_true_ratio):.1f}%")
print(f"Actual distribution     - True: {100*actual_true_ratio:.1f}%, False: {100*(1-actual_true_ratio):.1f}%")
print(f"{'='*60}")

if accuracy < 0.55:
    print("\nWARNING: ACCURACY < 0.55")
elif accuracy < 0.726:
    print("\nWARNING: ACCURACY < 0.726 (baseline)")
elif accuracy < 0.80:
    print("\nGOOD: ABOVE BASELINE")
else:
    print("\nEXCELLENT PERFORMANCE")


try:
    from google.colab import drive
    drive.mount('/content/drive')
    save_path = "/content/drive/MyDrive/llama3_math_verifier_fast"
except:
    save_path = "./saved_model_fast"

os.makedirs(save_path, exist_ok=True)
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print_time(f"Model saved to: {save_path}")

import pandas as pd
from tqdm import tqdm

test_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="test")
print_time(f"Test samples: {len(test_dataset)}")

predictions = []
failed_count = 0

for i in tqdm(range(len(test_dataset)), desc="Predicting"):
    ex = test_dataset[i]

    prompt = inference_prompt.format(ex["question"], str(ex["solution"]), str(ex["answer"]))

    try:
        inputs = tokenizer(
            [prompt],
            return_tensors="pt",
            truncation=True,
            max_length=max_seq_length,
            padding=False
        ).to("cuda")

        outputs = model.generate(
            **inputs,
            max_new_tokens=10,
            use_cache=True,
            temperature=0.1
        )
        pred = parse_output(tokenizer.decode(outputs[0]))
    except Exception as e:
        failed_count += 1
        if failed_count <= 3:
            print(f"\nWarning: Sample {i} failed: {str(e)[:80]}")
        pred = False

    predictions.append(pred)

if failed_count > 0:
    print(f"\nTotal failed samples: {failed_count} (defaulted to False)")

submission = pd.DataFrame({'ID': range(len(predictions)), 'is_correct': predictions})
submission.to_csv('submission.csv', index=False)

true_count = sum(predictions)
true_ratio = true_count / len(predictions)

print(f"Submission Statistics:")
print(f"  Total: {len(predictions)}")
print(f"  True: {true_count} ({100*true_ratio:.1f}%)")
print(f"  False: {len(predictions)-true_count} ({100*(1-true_ratio):.1f}%)")

if true_ratio > 0.95 or true_ratio < 0.05:
    print(f"\nWARNING: Heavily skewed to {'True' if true_ratio > 0.5 else 'False'}")
else:
    print(f"\nDistribution looks reasonable")

total_time = time.time() - start_time
print(f"Total runtime: {total_time/60:.1f} minutes")
print(f"Validation accuracy: {accuracy:.4f}")
print(f"Model saved: {save_path}")
print(f"Submission file: submission.csv")
print(f"\nYour Results Summary:")
print(f"  Validation: {accuracy:.4f} {'(EXCELLENT!)' if accuracy >= 0.80 else ''}")
print(f"  Baseline: 0.726")
print(f"  Status: {'BEAT BASELINE ' if accuracy >= 0.726 else 'BELOW BASELINE'}")

INSTALLATION
[0m0s] Installing Unsloth...
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m273.6/273.6 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m149.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/



🦥 Unsloth Zoo will now patch everything to make training faster!
[0m59s] torch: 2.8.0+cu126, CUDA: True

LOADING MODEL
[0m59s] Loading model...
==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

[1m39s] Model loaded (seq_len=2048)

LOADING DATASET
[1m39s] Loading dataset...


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/3.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

[2m12s] Dataset loaded: 1000000 samples
[2m13s] Split - Train: 50000, Val: 5000
Data balance - True: 19967 (39.9%), False: 30033

FORMATTING DATA
[2m16s] Formatting dataset...


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


[2m17s] Data formatted
Sample prompt length: 999 chars

CONFIGURING LORA
[2m17s] Setting up LoRA...


Unsloth 2025.10.12 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


[2m23s] LoRA configured - Trainable: 41,943,040 (0.92%)

TRAINING SETUP
[2m23s] Training config:
  Batch size: 2
  Grad accumulation: 4
  Effective batch: 8
  Max steps: 1000
  Estimated time: 8.3 minutes


Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/50000 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/5000 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.



TRAINING
[2m38s] Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1 | Total steps = 1,000
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
200,0.8157,0.787072
400,0.7932,0.769358
600,0.7637,0.750478
800,0.7388,0.738202
1000,0.7127,0.734961


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


[159m37s] Training completed in 9417s (157.0min)
Final train loss: 0.7795703811645508

VALIDATION
[159m37s] Preparing for inference...
[159m37s] Quick test on 10 samples...
  Sample 1: OK Pred=False, Actual=False
  Sample 2: OK Pred=False, Actual=False
  Sample 3: OK Pred=False, Actual=False
  Sample 4: OK Pred=False, Actual=False
  Sample 5: OK Pred=False, Actual=False
  Sample 6: OK Pred=False, Actual=False
  Sample 7: OK Pred=False, Actual=False
  Sample 8: MISS Pred=True, Actual=False
  Sample 9: OK Pred=True, Actual=True
  Sample 10: OK Pred=False, Actual=False
[159m41s] Full validation on 500 samples...
[162m18s] Validation complete

Validation Accuracy: 0.8440 (422/500)
Prediction distribution - True: 40.8%, False: 59.2%
Actual distribution     - True: 35.6%, False: 64.4%

EXCELLENT PERFORMANCE

SAVING MODEL
[162m18s] Saving model...
[164m20s] Model saved to: ./saved_model_fast

GENERATING SUBMISSION - FIXED VERSION
[164m20s] Loading test dataset...
[164m21s] Test samples: 10000

Predicting: 100%|██████████| 10000/10000 [52:41<00:00,  3.16it/s]

[217m3s] Submission file created

Submission Statistics:
  Total: 10000
  True: 4112 (41.1%)
  False: 5888 (58.9%)

Distribution looks reasonable

COMPLETE
Total runtime: 217.1 minutes
Validation accuracy: 0.8440
Model saved: ./saved_model_fast
Submission file: submission.csv

Your Results Summary:
  Validation: 0.8440 (EXCELLENT!)
  Baseline: 0.726
  Status: BEAT BASELINE ✓



