## Example: Using the LiteLLM model to access a vLLM server

Requirements:
- An installed vLLM instance: follow the [installation instructions](https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html#nvidia-cuda)

Launch a vLLM instance:
```bash
vllm serve Qwen/Qwen3-1.7B --host 0.0.0.0 \
--port 8000 \
--reasoning-parser deepseek_r1 \
--enable-prefix-caching \
--guided-decoding-backend guidance \
--max-model-len 16384
```

In [None]:
import os
from pydantic import BaseModel
import guidance

In [None]:
# Model description handed to guidance's LiteLLM wrapper; "litellm_params"
# holds the params for the litellm completion/embedding call.
litellm_desc = {
    "model_name": "Qwen/Qwen3-1.7B",
    "litellm_params": {
        "model": "hosted_vllm/Qwen/Qwen3-1.7B",
        # Set the VLLM_API_KEY environment variable if your server needs a key.
        "api_key": os.getenv("VLLM_API_KEY", "NO_KEY"),
        # Change to your vLLM API base URL.
        "api_base": "http://localhost:8000/v1",
    },
}
base_lm = guidance.models.experimental.LiteLLM(
    model_description=litellm_desc,
    echo=False,
)

In [None]:
def run_gen_test(lm):
    """Ask a free-form question and print the generated answer.

    Args:
        lm: Guidance model that the user and assistant turns are appended to.
    """
    with guidance.user():
        # Appended strings are joined verbatim, so the trailing newline keeps
        # the question and the format instruction from running together.
        lm += "What is the capital of France? and its population?\n"
        lm += "Format your answer as follows: Capital: <capital>, Population: <population>"

    with guidance.assistant():
        lm += guidance.gen(max_tokens=1024, temperature=0.7, name="answer")
        print(lm["answer"])

run_gen_test(base_lm)

In [None]:
def run_gen_stop_test(lm):
    """Ask a free-form question with a stop sequence and print the answer.

    The prompt asks the model to say 'STOP RIGHT THERE' when finished, and
    "STOP" is passed to ``guidance.gen`` as a stop sequence.

    Args:
        lm: Guidance model that the user and assistant turns are appended to.
    """
    with guidance.user():
        # Appended strings are joined verbatim, so each instruction ends with
        # a newline to keep it from running into the next one.
        lm += "What is the capital of France? and its population?\n"
        lm += "Format your answer as follows: Capital: <capital>, Population: <population>\n"
        lm += "Say 'STOP RIGHT THERE' when you are done."

    with guidance.assistant():
        lm += guidance.gen(max_tokens=1024, temperature=0.7, name="answer", stop=["STOP"])
        print(lm["answer"])

run_gen_stop_test(base_lm)

In [None]:
def run_json_test(lm):
    """Request an answer constrained to a Pydantic schema and print it.

    Args:
        lm: Guidance model that the user and assistant turns are appended to.
    """

    # Schema for the constrained answer. Deliberately no docstring here:
    # pydantic would add it to the generated JSON schema as a description.
    class CityInfo(BaseModel):
        capital: str
        population: int

    question = "What is the capital of France? and its population? Output as JSON."
    with guidance.user():
        lm += question

    with guidance.assistant():
        constrained_answer = guidance.json(schema=CityInfo, name="answer")
        lm += constrained_answer
        answer = lm["answer"]
        print(answer)

run_json_test(base_lm)

In [None]:
def run_json_object_test(lm):
    """Request schema-less JSON output and print the captured answer.

    Args:
        lm: Guidance model that the user and assistant turns are appended to.
    """
    with guidance.user():
        lm += "What is the capital of France? and its population? output json"

    with guidance.assistant():
        # No schema is passed, so no model class is defined in this test.
        lm += guidance.json(schema=None, name="answer")
        print(lm["answer"])

run_json_object_test(base_lm)

In [None]:
def run_lark_grammar(lm):
    """Constrain the answer with a Lark grammar and print the result.

    Args:
        lm: Guidance model that the user and assistant turns are appended to.
    """
    # Grammar text: "Capital: <Word>, Population: <digits>".
    grammar_src = (
        "\n"
        'start: "Capital: " CAPITAL ", Population: " INT\n'
        "CAPITAL: /[A-Z][a-z]+/\n"
        "INT: /[0-9]+/\n"
    )

    question = "What is the capital of France? and its population?"
    with guidance.user():
        lm += question

    with guidance.assistant():
        constrained_answer = guidance.lark(lark_grammar=grammar_src, name="answer")
        lm += constrained_answer
        answer = lm["answer"]
        print(answer)

run_lark_grammar(base_lm)