In [None]:
"""
Modern LLM Fine-Tuning for Physics Articles
============================================

This notebook fine-tunes a small LLM on Q&As generated from 25 physics papers.

Key Features:
- Uses Llama 3.2 3B or Qwen 2.5 3B (modern, efficient models)
- Unsloth for 2-5x faster training
- QLoRA for memory-efficient training
- Works on Google Colab (Free tier with T4 GPU)
- Can also run on Mac M-series chips with MLX

Setup Instructions:
1. Upload your 25 PDF physics articles to Colab or have them locally
2. Run all cells in order
3. Training takes ~30 minutes on free Colab
"""



In [23]:
# ==========================================
# STEP 1: Install Dependencies
# ==========================================

!pip uninstall -y torch torchvision torchaudio
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

!pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps trl transformers accelerate peft bitsandbytes
!pip install pypdf2 pandas datasets

Found existing installation: torch 2.8.0+cu126
Uninstalling torch-2.8.0+cu126:
  Successfully uninstalled torch-2.8.0+cu126
Found existing installation: torchvision 0.23.0+cu126
Uninstalling torchvision-0.23.0+cu126:
  Successfully uninstalled torchvision-0.23.0+cu126
Found existing installation: torchaudio 2.8.0+cu126
Uninstalling torchaudio-2.8.0+cu126:
  Successfully uninstalled torchaudio-2.8.0+cu126
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.7.1%2Bcu118-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.22.1%2Bcu118-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.7.1%2Bcu118-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.8.89 (from torch)
  Downloading htt

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-htqfaq8d/unsloth_721c5cf9185f4c0fb5dc44a8973d0198
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-htqfaq8d/unsloth_721c5cf9185f4c0fb5dc44a8973d0198
  Resolved https://github.com/unslothai/unsloth.git to commit 855c1b632a31b9d624bc7c4e2a2d9dced190530c
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [25]:
# ==========================================
# STEP 2: Import Libraries
# ==========================================
import torch
import pandas as pd
from datasets import Dataset, load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel
import json
import re
from pathlib import Path

In [26]:
# ==========================================
# STEP 3: Configuration
# ==========================================

# Model Configuration
MAX_SEQ_LENGTH = 2048  # Can handle longer context
MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct"  # Modern, fast model
# Alternative: "unsloth/Qwen2.5-3B-Instruct"

# LoRA Configuration
LORA_R = 16  # Rank (higher = more capacity but slower)
LORA_ALPHA = 16  # Alpha scaling
LORA_DROPOUT = 0.05
TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj",
                  "gate_proj", "up_proj", "down_proj"]

# Training Configuration
OUTPUT_DIR = "./physics-llm-finetuned"
NUM_EPOCHS = 3
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4  # Effective batch = 8
LEARNING_RATE = 2e-4
WARMUP_STEPS = 5
MAX_GRAD_NORM = 0.3

In [27]:
# ==========================================
# STEP 4: Generate Q&A Dataset from Articles
# ==========================================

import json

# Sample Q&As based on physics articles
# (Upload using the folder icon on the left sidebar)
with open('QnA.json', 'r', encoding='utf-8') as f:
    PHYSICS_QA_DATASET = json.load(f)

print(f"Loaded {len(PHYSICS_QA_DATASET)} Q&A pairs")
print(f"\nFirst example:")
print(f"Q: {PHYSICS_QA_DATASET[0]['question'][:100]}...")
print(f"A: {PHYSICS_QA_DATASET[0]['answer'][:100]}...")

Loaded 50 Q&A pairs

First example:
Q: What are the main components of the D0 detector at the Tevatron?...
A: The D0 detector consists of three primary components: (1) A central tracking system comprising a sil...


In [28]:
def format_dataset_for_training(qa_list, system_prompt):
    """Convert Q&A pairs to training format"""
    formatted_data = []

    for qa in qa_list:
        text = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

{qa['question']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{qa['answer']}<|eot_id|>"""

        formatted_data.append({"text": text})

    return formatted_data

# System prompt for your physics expert
SYSTEM_PROMPT = """You are an expert in particle physics, specializing in experimental techniques at collider experiments like the Tevatron and LHC. You have deep knowledge of neural networks for particle identification, jet physics, calorimetry, and data analysis methods. Provide accurate, detailed responses citing experimental methods and results when relevant."""

# Format the dataset
formatted_train_data = format_dataset_for_training(PHYSICS_QA_DATASET, SYSTEM_PROMPT)

# Create HuggingFace Dataset
train_dataset = Dataset.from_list(formatted_train_data)

# Split into train/validation (90/10)
train_test_split = train_dataset.train_test_split(test_size=0.1, seed=42)
train_data = train_test_split['train']
val_data = train_test_split['test']

print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"\nSample training example:\n{train_data[0]['text'][:500]}...")

Training samples: 45
Validation samples: 5

Sample training example:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert in particle physics, specializing in experimental techniques at collider experiments like the Tevatron and LHC. You have deep knowledge of neural networks for particle identification, jet physics, calorimetry, and data analysis methods. Provide accurate, detailed responses citing experimental methods and results when relevant.<|eot_id|><|start_header_id|>user<|end_header_id|>

What were the key findings from diphoton ...


In [29]:
# ==========================================
# STEP 5: Load Model with QLoRA
# ==========================================

print("\n" + "="*50)
print("Loading model with 4-bit quantization...")
print("="*50)

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,  # Auto-detect best dtype
    load_in_4bit=True,  # Use 4-bit quantization
)

# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_R,
    target_modules=TARGET_MODULES,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    use_gradient_checkpointing="unsloth",  # Unsloth's optimized checkpointing
    random_state=42,
)

print(f"\nTrainable parameters: {model.print_trainable_parameters()}")


Loading model with 4-bit quantization...
==((====))==  Unsloth 2025.9.11: Fast Llama patching. Transformers: 4.56.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
trainable params: 24,313,856 || all params: 3,237,063,680 || trainable%: 0.7511

Trainable parameters: None


In [30]:
# ==========================================
# STEP 6: Train with Unsloth's Trainer
# ==========================================

from trl import SFTTrainer, SFTConfig

training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=WARMUP_STEPS,
    learning_rate=LEARNING_RATE,
    max_grad_norm=MAX_GRAD_NORM,

    # Dataset formatting
    max_seq_length=MAX_SEQ_LENGTH,  # <-- NOW goes in SFTConfig
    dataset_text_field="text",
    packing=False,

    # Optimization
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",

    # Logging & Evaluation
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=2,

    # Performance
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    group_by_length=True,
    report_to="none",
)

# Initialize trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,  # Use 'tokenizer' not 'processing_class'
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_args,
)

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/45 [00:00<?, ? examples/s]

num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.


Unsloth: Tokenizing ["text"] (num_proc=5):   0%|          | 0/5 [00:00<?, ? examples/s]

In [31]:
# Train
print("\n" + "="*50)
print("Starting training...")
print("="*50)

trainer_stats = trainer.train()

print("\nTraining completed!")
print(f"Training loss: {trainer_stats.metrics.get('train_loss', 'N/A')}")


Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 45 | Num Epochs = 3 | Total steps = 18
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Step,Training Loss,Validation Loss



Training completed!
Training loss: 2.245257271660699


In [None]:
# ==========================================
# STEP 7: Save the Model
# ==========================================

print("\n" + "="*50)
print("Saving model...")
print("="*50)

# Save LoRA adapters
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Optional: Save merged model (base + LoRA)
model.save_pretrained_merged(
    f"{OUTPUT_DIR}/merged_16bit",
    tokenizer,
    save_method="merged_16bit",
)

# Optional: Save for GGUF (llama.cpp format)
# model.save_pretrained_gguf(f"{OUTPUT_DIR}", tokenizer)

print(f"Model saved to {OUTPUT_DIR}")


Saving model...


config.json:   0%|          | 0.00/890 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00002.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.



Unsloth: Preparing safetensor model files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]


Unsloth: Preparing safetensor model files:  50%|█████     | 1/2 [04:36<04:36, 276.32s/it]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 2/2 [05:54<00:00, 177.48s/it]
Unsloth: Merging weights into 16bit:   0%|          | 0/2 [00:00<?, ?it/s]

In [34]:
# ==========================================
# STEP 8: Test the Fine-tuned Model
# ==========================================

print("\n" + "="*50)
print("Testing fine-tuned model...")
print("="*50)

# Enable inference mode
FastLanguageModel.for_inference(model)

test_questions = [
    "What is the Manhattan algorithm and why is it effective for neural network training?",
    "Explain the role of electromagnetic calorimeters in particle detection.",
    "What are the key variables used to separate HW signal from DP background?"
]

for question in test_questions:
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": question}
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")

    outputs = model.generate(
        inputs,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract just the assistant's response
    response = response.split("assistant\n\n")[-1]

    print(f"\n{'='*50}")
    print(f"Q: {question}")
    print(f"\nA: {response}")


Testing fine-tuned model...

Q: What is the Manhattan algorithm and why is it effective for neural network training?

A: The Manhattan algorithm is an optimization technique used to train neural networks, particularly for particle identification tasks. It was introduced by Lee et al. in 2016 [1] for deep neural networks used in calorimetry at the Tevatron. The Manhattan algorithm is effective because it combines two key aspects of neural network training:

1. **Batch normalization**: This technique normalizes the input to each layer by subtracting the mean and dividing by the standard deviation for each feature. This helps stabilize the training process and reduces the effect of internal covariate shift.
2. **Gradient clipping**: This technique limits the magnitude of gradients during backpropagation to prevent exploding gradients, which can cause the network to diverge or become unstable.

The Manhattan algorithm combines batch normalization and gradient clipping in the following way

In [35]:
# ==========================================
# STEP 9: Optional - Push to Hugging Face Hub
# ==========================================

!pip install huggingface_hub





In [36]:
from huggingface_hub import login

# You'll be prompted to enter your access token
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [37]:
print(f"Model saved to {OUTPUT_DIR}")

Model saved to ./physics-llm-finetuned


In [38]:
from huggingface_hub import HfApi

# Choose a name for your model repository
model_name = "bandurin/hep-physics-llm-3b-finetuned"

# Push model
print("Pushing model to Hugging Face Hub...")
model.push_to_hub(
    model_name,
    use_auth_token=True  # Uses the token from login()
)

# Push tokenizer
print("Pushing tokenizer...")
tokenizer.push_to_hub(
    model_name,
    use_auth_token=True
)

print(f"\nModel successfully pushed to: https://huggingface.co/{model_name}")

Pushing model to Hugging Face Hub...


README.md:   0%|          | 0.00/613 [00:00<?, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   0%|          | 45.8kB / 97.3MB            

Saved model to https://huggingface.co/bandurin/hep-physics-llm-3b-finetuned
Pushing tokenizer...


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...mpg2hfgh1s/tokenizer.json: 100%|##########| 17.2MB / 17.2MB            


Model successfully pushed to: https://huggingface.co/bandurin/hep-physics-llm-3b-finetuned


In [12]:
import json

# Sample Q&As based on physics articles
# (Upload using the folder icon on the left sidebar)
with open('QnA.json', 'r', encoding='utf-8') as f:
    PHYSICS_QA_DATASET = json.load(f)

print(f"Loaded {len(PHYSICS_QA_DATASET)} Q&A pairs")
print(f"\nFirst example:")
print(f"Q: {PHYSICS_QA_DATASET[0]['question'][:100]}...")
print(f"A: {PHYSICS_QA_DATASET[0]['answer'][:100]}...")

Loaded 84 Q&A pairs

First example:
Q: What are the main components of the D0 detector at the Tevatron?...
A: The D0 detector consists of three primary components: (1) A central tracking system comprising a sil...
