## Example: Using an OnnxRuntime-GenAI model

Requirements:
```
pip install -e ".[onnxruntime_genai]"
pip install huggingface_hub
```

In [None]:
import os

from huggingface_hub import snapshot_download
from pydantic import BaseModel
from transformers import AutoTokenizer

import guidance
from guidance.chat import Phi4MiniChatTemplate

In [None]:
# Download the Phi-4-mini ONNX model from the Hugging Face Hub.
# allow_patterns restricts the download to the files under model_sub_dir.
model_sub_dir = "gpu/gpu-int4-rtn-block-32"
base_model_path = snapshot_download(
    repo_id="microsoft/Phi-4-mini-instruct-onnx",
    allow_patterns=f"{model_sub_dir}/*",
)
# Directory of the selected variant inside the downloaded snapshot.
model_path = os.path.join(base_model_path, model_sub_dir)

In [None]:
# Build the guidance model on top of the downloaded ONNX files.
# The tokenizer is loaded from the "microsoft/Phi-4-mini-instruct" repo (not the "-onnx"
# repo the model files came from), and Phi4MiniChatTemplate is passed as the chat template.
base_lm = guidance.models.OnnxRuntimeGenAI(
    model=model_path,
    transformers_tokenizer=AutoTokenizer.from_pretrained("microsoft/Phi-4-mini-instruct"),
    chat_template=Phi4MiniChatTemplate(),
    # execution_provider="cuda", # Uncomment to use GPU
    # NOTE(review): model_path points at the "gpu/" sub-directory of the repo — confirm these
    # files also run when execution_provider is left at its default.
)

In [None]:
def run_gen_test(lm):
    """Ask a free-form question and print the sampled answer.

    Args:
        lm: A guidance model to append the user/assistant turns to.

    The generated text is captured under the name "answer" and printed;
    nothing is returned.
    """
    with guidance.user():
        # The trailing newline separates the two instructions; without it the
        # prompt reads "...its population?Format your answer...".
        lm += "What is the capital of France? and its population?\n"
        lm += "Format your answer as follows: Capital: <capital>, Population: <population>"

    with guidance.assistant():
        # Unconstrained generation, capped at 1024 tokens.
        lm += guidance.gen(max_tokens=1024, temperature=0.7, name="answer")
        print(lm["answer"])

run_gen_test(base_lm)

In [None]:
def run_gen_stop_test(lm):
    """Ask a free-form question and print the answer, generated with a stop string.

    Args:
        lm: A guidance model to append the user/assistant turns to.

    The user turn asks the model to finish with 'STOP RIGHT THERE', and the
    generation call is given stop=["STOP"]. The captured "answer" is printed;
    nothing is returned.
    """
    with guidance.user():
        # Trailing newlines separate the instructions; without them the prompt
        # reads "...population?Format..." and "...<population>Say 'STOP...".
        lm += "What is the capital of France? and its population?\n"
        lm += "Format your answer as follows: Capital: <capital>, Population: <population>\n"
        lm += "Say 'STOP RIGHT THERE' when you are done."

    with guidance.assistant():
        lm += guidance.gen(max_tokens=1024, temperature=0.7, name="answer", stop=["STOP"])
        print(lm["answer"])

run_gen_stop_test(base_lm)

In [None]:
def run_json_test(lm):
    """Ask for a JSON answer constrained by a pydantic schema and print it.

    Args:
        lm: A guidance model to append the user/assistant turns to.

    The captured "answer" is printed; nothing is returned.
    """

    # Schema handed to guidance.json for the reply.
    class CityInfo(BaseModel):
        capital: str
        population: int

    question = "What is the capital of France? and its population? Output as JSON."

    with guidance.user():
        lm += question

    with guidance.assistant():
        lm += guidance.json(name="answer", schema=CityInfo)
        answer = lm["answer"]
        print(answer)

run_json_test(base_lm)

In [None]:
def run_json_object_test(lm):
    """Ask for a JSON answer without supplying a schema and print it.

    Args:
        lm: A guidance model to append the user/assistant turns to.

    guidance.json is called with schema=None, so no pydantic model is defined
    here. The captured "answer" is printed; nothing is returned.
    """
    with guidance.user():
        lm += "What is the capital of France? and its population? output json"

    with guidance.assistant():
        lm += guidance.json(schema=None, name="answer")  # No schema, just output JSON
        print(lm["answer"])

run_json_object_test(base_lm)

In [None]:
def run_lark_grammar(lm):
    """Ask a question and print an answer generated under a Lark grammar.

    Args:
        lm: A guidance model to append the user/assistant turns to.

    The grammar describes text of the form
    "Capital: <Capitalised word>, Population: <digits>". The captured
    "answer" is printed; nothing is returned.
    """
    # Leading and trailing newlines are part of the grammar text.
    grammar_source = (
        "\n"
        'start: "Capital: " CAPITAL ", Population: " INT\n'
        "CAPITAL: /[A-Z][a-z]+/\n"
        "INT: /[0-9]+/\n"
    )

    with guidance.user():
        lm += "What is the capital of France? and its population?"

    with guidance.assistant():
        lm += guidance.lark(name="answer", lark_grammar=grammar_source)
        answer = lm["answer"]
        print(answer)

run_lark_grammar(base_lm)