# Direct Engine Access

This notebook experiments with an entry point in which users manage the generation state themselves, by calling the model's underlying engine directly.

In [None]:
import guidance
from huggingface_hub import hf_hub_download

# Select the local backend used for the rest of the notebook.
# The GGUF weights are only needed by the LlamaCpp backend, so they are
# downloaded only when that backend is selected, instead of unconditionally.
USE_LLAMA_CPP = False

if USE_LLAMA_CPP:
    gguf = hf_hub_download(
        repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
        filename="Phi-3-mini-4k-instruct-q4.gguf",
    )
    lm = guidance.models.LlamaCpp(gguf, n_gpu_layers=-1)
else:
    lm = guidance.models.Transformers("microsoft/Phi-3-mini-4k-instruct")

In [None]:
# Conversation passed to the chat template: a single user turn.
messages = [dict(role="user", content="What is your name?")]

In [None]:
# Grammar text handed to the engine, assembled line by line.
# The trailing empty entry keeps the final newline of the grammar string.
grammar = "\n".join(
    [
        "%llguidance {}",
        "",
        "start: START",
        'START: "Aa"{2,} "Bb"{1,3}',
        "",
    ]
)

Grab the `engine` object out of the Model. Note that this only works for local models: every Model contains an Interpreter, but only the local Interpreters contain an engine.

In [None]:
# The engine hangs off the model's interpreter; only local interpreters carry one.
# Fail with an explicit message (same exception type as a plain attribute
# lookup would raise) when the chosen model has no engine to hand out.
engine = getattr(lm._interpreter, "engine", None)
if engine is None:
    raise AttributeError(
        f"{type(lm._interpreter).__name__} does not expose an `engine`; "
        "direct engine access requires a local model."
    )

There should be a chat template available:

In [None]:
# Ask the engine for its chat template object, then keep the template string it carries.
chat_template_obj = engine.get_chat_template()
chat_template = chat_template_obj.template_str

Render the template:

In [None]:
from jinja2 import BaseLoader, Environment

# `loader` takes a loader instance, not the class itself; the template is
# built from a string here, so no template lookup by name is involved.
jinja_env = Environment(loader=BaseLoader())
rtemplate = jinja_env.from_string(chat_template)

# The tokenizer's eos_token is decoded to text before being substituted into
# the template (presumably it is stored as bytes -- confirm against the tokenizer).
rendered_prompt = rtemplate.render(
    messages=messages,
    eos_token=engine.tokenizer.eos_token.decode("utf-8"),
)

# Append the assistant role-start text returned by the chat template object.
rendered_prompt += engine.get_chat_template().get_role_start("assistant")

# Use a single formatted string: passing the prompt as a second print()
# argument would insert a separator space in front of it, misrepresenting
# the prompt's leading whitespace.
print(f"Rendered Prompt:\n{rendered_prompt}")

In [None]:
# Build an empty engine state and attach the rendered prompt to it.
engine_state_cls = guidance.models._engine.EngineState
state = engine_state_cls()
state.prompt = rendered_prompt

Run the grammar:

In [None]:
# Collect the generated output as raw bytes and decode to text only once at
# the end, after every chunk has been appended.
full_response = bytearray()
for chunk in engine(state, grammar):
    # Each yielded chunk exposes `.tokens`; pull the ids and decode them together.
    token_ids = [tok.token_id for tok in chunk.tokens]
    full_response.extend(engine.tokenizer.decode(token_ids))
print(full_response.decode("utf-8"))