In [1]:
"""
Modern LLM Fine-Tuning for Physics Articles
============================================

This notebook fine-tunes a small LLM on Q&As generated from 25 physics papers.

Key Features:
- Uses Llama 3.2 3B or Qwen 2.5 3B (modern, efficient models)
- Unsloth for 2-5x faster training
- QLoRA for memory-efficient training
- Works on Google Colab (Free tier with T4 GPU)
- Can also run on Mac M-series chips with MLX

Setup Instructions:
1. Upload your 25 PDF physics articles to Colab or have them locally
2. Run all cells in order
3. Training takes ~30-60 minutes on free Colab
"""



In [2]:
# ==========================================
# STEP 1: Install Dependencies
# ==========================================

# Fix PyArrow first
!pip install --upgrade pyarrow>=14.0.0

# Then install Unsloth and dependencies
!pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install -q --no-deps "xformers<0.0.27" "trl<0.9.0" # peft accelerate bitsandbytes
!pip install -q pypdf2 pandas datasets

# For Mac M-series (comment out Colab section, use this instead)
# !pip install mlx-lm transformers datasets peft

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for xformers (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for xformers[0m[31m
[0m[31mERROR: ERROR: Failed to build installable wheels for some pyproject.toml based projects (xformers)[0m[31m
[0m[?25h

In [3]:
# ==========================================
# STEP 2: Import Libraries
# ==========================================
import torch
import pandas as pd
from datasets import Dataset, load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel
import json
import re
from pathlib import Path


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [5]:
# ==========================================
# STEP 3: Configuration
# ==========================================

# Model Configuration
MAX_SEQ_LENGTH = 2048  # Can handle longer context
MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct"  # Modern, fast model
# Alternative: "unsloth/Qwen2.5-3B-Instruct"

# LoRA Configuration
LORA_R = 16  # Rank (higher = more capacity but slower)
LORA_ALPHA = 16  # Alpha scaling
LORA_DROPOUT = 0.05
TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj",
                  "gate_proj", "up_proj", "down_proj"]

# Training Configuration
OUTPUT_DIR = "./physics-llm-finetuned"
NUM_EPOCHS = 3
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4  # Effective batch = 8
LEARNING_RATE = 2e-4
WARMUP_STEPS = 5
MAX_GRAD_NORM = 0.3

In [9]:
# ==========================================
# STEP 4: Generate Q&A Dataset from Articles
# ==========================================

import json

# Sample Q&As based on physics articles
# (Upload using the folder icon on the left sidebar)
with open('QnA.json', 'r', encoding='utf-8') as f:
    PHYSICS_QA_DATASET = json.load(f)

print(f"Loaded {len(PHYSICS_QA_DATASET)} Q&A pairs")
print(f"\nFirst example:")
print(f"Q: {PHYSICS_QA_DATASET[0]['question'][:100]}...")
print(f"A: {PHYSICS_QA_DATASET[0]['answer'][:100]}...")

Loaded 50 Q&A pairs

First example:
Q: What are the main components of the D0 detector at the Tevatron?...
A: The D0 detector consists of three primary components: (1) A central tracking system comprising a sil...


In [10]:
def format_dataset_for_training(qa_list, system_prompt):
    """Convert Q&A pairs to training format"""
    formatted_data = []

    for qa in qa_list:
        text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

{qa['question']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{qa['answer']}<|eot_id|>"""

        formatted_data.append({"text": text})

    return formatted_data

# System prompt for your physics expert
SYSTEM_PROMPT = """You are an expert in particle physics, specializing in experimental techniques at collider experiments like the Tevatron and LHC. You have deep knowledge of neural networks for particle identification, jet physics, calorimetry, and data analysis methods. Provide accurate, detailed responses citing experimental methods and results when relevant."""

# Format the dataset
formatted_train_data = format_dataset_for_training(PHYSICS_QA_DATASET, SYSTEM_PROMPT)

# Create HuggingFace Dataset
train_dataset = Dataset.from_list(formatted_train_data)

# Split into train/validation (90/10)
train_test_split = train_dataset.train_test_split(test_size=0.1, seed=42)
train_data = train_test_split['train']
val_data = train_test_split['test']

print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"\nSample training example:\n{train_data[0]['text'][:500]}...")

Training samples: 45
Validation samples: 5

Sample training example:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert in particle physics, specializing in experimental techniques at collider experiments like the Tevatron and LHC. You have deep knowledge of neural networks for particle identification, jet physics, calorimetry, and data analysis methods. Provide accurate, detailed responses citing experimental methods and results when relevant.<|eot_id|><|start_header_id|>user<|end_header_id|>

What were the key findings from diphoton ...


In [11]:
# ==========================================
# STEP 5: Load Model with QLoRA
# ==========================================

print("\n" + "="*50)
print("Loading model with 4-bit quantization...")
print("="*50)

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,  # Auto-detect best dtype
    load_in_4bit=True,  # Use 4-bit quantization
)

# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_R,
    target_modules=TARGET_MODULES,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    use_gradient_checkpointing="unsloth",  # Unsloth's optimized checkpointing
    random_state=42,
)

print(f"\nTrainable parameters: {model.print_trainable_parameters()}")


Loading model with 4-bit quantization...
==((====))==  Unsloth 2025.9.11: Fast Llama patching. Transformers: 4.56.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.9.11 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


trainable params: 24,313,856 || all params: 3,237,063,680 || trainable%: 0.7511

Trainable parameters: None


In [15]:
# ==========================================
# STEP 6: Configure Training
# ==========================================

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=WARMUP_STEPS,
    learning_rate=LEARNING_RATE,
    max_grad_norm=MAX_GRAD_NORM,

    # Optimization
    optim="adamw_8bit",  # 8-bit optimizer for memory efficiency
    weight_decay=0.01,
    lr_scheduler_type="cosine",

    # Logging & Evaluation
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=2,

    # Performance
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    group_by_length=True,
    report_to="none",  # Change to "wandb" if you want tracking
)

In [18]:
# Initialize trainer
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_args,
    # Removed: dataset_text_field - no longer needed
    # Removed: max_seq_length - now set in model initialization
    # Removed: packing - deprecated
)

Adding EOS to train dataset (num_proc=6):   0%|          | 0/45 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=6):   0%|          | 0/45 [00:00<?, ? examples/s]

Truncating train dataset (num_proc=6):   0%|          | 0/45 [00:00<?, ? examples/s]

num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.


Adding EOS to eval dataset (num_proc=5):   0%|          | 0/5 [00:00<?, ? examples/s]

num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.


Tokenizing eval dataset (num_proc=5):   0%|          | 0/5 [00:00<?, ? examples/s]

num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.


Truncating eval dataset (num_proc=5):   0%|          | 0/5 [00:00<?, ? examples/s]

In [19]:
#tokenizer

In [20]:
# ==========================================
# STEP 7: Train the Model
# ==========================================

print("\n" + "="*50)
print("Starting training...")
print("="*50)

# Train
trainer_stats = trainer.train()

print("\nTraining completed!")
print(f"Training time: {trainer_stats.metrics['train_runtime']:.2f} seconds")
print(f"Training loss: {trainer_stats.metrics['train_loss']:.4f}")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 45 | Num Epochs = 3 | Total steps = 18
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 0 of 3,237,063,680 (0.00% trained)



Starting training...
Unsloth: Will smartly offload gradients to save VRAM!


AssertionError: No inf checks were recorded for this optimizer.

In [None]:
# ==========================================
# STEP 8: Save the Model
# ==========================================

print("\n" + "="*50)
print("Saving model...")
print("="*50)

# Save LoRA adapters
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Optional: Save merged model (base + LoRA)
model.save_pretrained_merged(
    f"{OUTPUT_DIR}/merged_16bit",
    tokenizer,
    save_method="merged_16bit",
)

# Optional: Save for GGUF (llama.cpp format)
# model.save_pretrained_gguf(f"{OUTPUT_DIR}", tokenizer)

print(f"Model saved to {OUTPUT_DIR}")

In [None]:
# ==========================================
# STEP 9: Test the Fine-tuned Model
# ==========================================

print("\n" + "="*50)
print("Testing fine-tuned model...")
print("="*50)

# Enable inference mode
FastLanguageModel.for_inference(model)

test_questions = [
    "What is the Manhattan algorithm and why is it effective for neural network training?",
    "Explain the role of electromagnetic calorimeters in particle detection.",
    "What are the key variables used to separate HW signal from DP background?"
]

for question in test_questions:
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": question}
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")

    outputs = model.generate(
        inputs,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract just the assistant's response
    response = response.split("assistant\n\n")[-1]

    print(f"\n{'='*50}")
    print(f"Q: {question}")
    print(f"\nA: {response}")

In [None]:
# ==========================================
# STEP 10: Optional - Push to Hugging Face Hub
# ==========================================

"""
# Uncomment to push to Hub
from huggingface_hub import login

login()  # You'll need your HF token

model.push_to_hub(
    "your-username/physics-llm-3b",
    token="your_hf_token"
)

tokenizer.push_to_hub(
    "your-username/physics-llm-3b",
    token="your_hf_token"
)
"""

print("\n" + "="*50)
print("Fine-tuning complete! 🎉")
print("="*50)
print(f"\nModel location: {OUTPUT_DIR}")
print("\nNext steps:")
print("1. Test with more physics questions")
print("2. Generate more Q&As from your 25 articles (aim for 200-500)")
print("3. Iterate on training hyperparameters")
print("4. Deploy using vLLM or Ollama for fast inference")