In [5]:
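# One-time setup: uncomment the next line to install the dependencies into this kernel's environment.
# %pip install unsloth torch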

In [9]:
from unsloth import FastLanguageModel
import torch

# Identifier of the merged model to load from HuggingFace.
model_name = "bandurin/hep-physics-llm-3b-finetuned"

print("Loading model from HuggingFace...")

# Collect the loader arguments in one place so they are easy to scan and tweak.
# load_in_4bit=True: use 4-bit loading for efficiency.
# dtype=None: presumably leaves the precision choice to the library — TODO confirm.
load_options = dict(
    model_name=model_name,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

# Enable inference mode on the loaded model before any generation is attempted.
FastLanguageModel.for_inference(model)
print("Model loaded successfully!")

Loading model from HuggingFace...
==((====))==  Unsloth 2025.9.11: Fast Llama patching. Transformers: 4.56.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.9.11 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Model loaded successfully!


In [10]:
# System prompt
# Used as the content of the "system" message in every chat built by ask_question().
# This is a plain triple-quoted literal: the line breaks and the leading spaces on
# the continuation lines are part of the prompt text exactly as written below.
# NOTE(review): if the model was fine-tuned with this exact text, the whitespace
# should stay byte-identical — confirm before reformatting.
SYSTEM_PROMPT = """You are an expert in particle physics, specializing in experimental techniques
                   at collider experiments like the Tevatron and LHC.
                   You have deep knowledge of neural networks for particle identification, jet physics, calorimetry,
                   and data analysis methods. Provide accurate, detailed responses citing experimental
                   methods and results when relevant."""

def ask_question(question, max_new_tokens=512, temperature=0.7, top_p=0.9):
    """Ask the loaded model a single question and return only its answer text.

    A two-message chat (system prompt + user question) is rendered with the
    tokenizer's chat template, the model samples a continuation, and only the
    newly generated tokens are decoded.

    Args:
        question: The user's question, as a string.
        max_new_tokens: Upper bound on the number of tokens to generate.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.

    Returns:
        The decoded answer with special tokens removed. The prompt
        (system and user text) is never included.

    Note:
        Sampling is enabled (``do_sample=True``), so repeated calls with the
        same question can return different answers.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": question},
    ]

    # Place the prompt on whatever device the model lives on instead of
    # assuming a CUDA device is present.
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    outputs = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )

    # The returned sequence starts with the prompt tokens. Slice them off by
    # length and decode only the continuation, rather than decoding everything
    # and splitting on a marker string: a text split would cut off any answer
    # that happens to contain the marker itself.
    prompt_length = input_ids.shape[-1]
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)

In [11]:
# Smoke test: send a few sample physics questions through ask_question()
# and print each question/answer pair followed by a separator line.
questions = [
    "What are the main components of the D0 detector?",
    "How do electromagnetic calorimeters work?",
    "What is jet physics in particle colliders?",
]

for question in questions:
    # Show the question first so it is visible while the answer is generated.
    print(f"\nQuestion: {question}")
    answer = ask_question(question)
    print(f"Answer: {answer}")
    print("-" * 70)


Question: What are the main components of the D0 detector?
Answer: The D0 detector at Fermilab consists of several key components:

1. Central Tracker: A silicon-based tracker that provides precise position and momentum measurements for charged particles. It's divided into layers with varying thickness and material content for better energy resolution and particle identification.
2. Muon System: A hybrid system combining gas-based muon chambers with drift chambers to detect and measure muons. It provides excellent tracking and momentum measurements for muons.
3. Electromagnetic Calorimeter (EMC): A scintillator-based calorimeter that measures electromagnetic energy with high precision. It's divided into barrel and endcap sections with varying thickness and material content.
4. Hadronic Calorimeter (HCal): A scintillator-based calorimeter that measures hadronic energy with moderate precision. It's divided into barrel and endcap sections with varying thickness and material content.
5. O