# Imports

In [None]:
import packages
from configs import settings, const, components
from configs.settings import logger
import asyncio, os, time, yaml, json, datetime, copy, random
from typing import Any, AsyncGenerator, Generator, Callable, Literal, Optional, TypeAlias, Union
from tqdm import tqdm
from pprint import pprint

from toolkit.llm.llama_index import (
	agents, cores, deploys as dpls, evaluation, messages, models, 
	observability, types, utils as utils_llama_index, workflows as wfs
)
from toolkit.llm.llama_index.data import loading, querying, storing

from features.agents.car.tools import VehicleDB
from features.agents.tools import map

from toolkit.utils import utils, typer as t
from toolkit.utils.llm import measure_performance, main as utils_llm
from toolkit.utils.utils import rp_print

## Test Model

In [None]:
# Sample user prompts used to exercise the model in the cells below
# (both the streaming cell and the chat cell send queries[2]).
# NOTE(review): the last two entries look like a multi-turn memory probe —
# confirm whether any cell actually sends them within the same conversation.
queries = [
	"Hello",
	"Tell me a joke",
	"Tell me a long joke",
	"Tell me a super long joke",
	"My name is John",
	"What is my name",
]

In [None]:
def messages_to_prompt(messages):
    """
    Build a single-turn Mistral instruction prompt from a sequence of messages.

    Each message is converted with ``str()``, stripped of any literal
    ``[INST]`` / ``[/INST]`` markers (so text that already contains tags
    cannot open or close an instruction block of its own), and trimmed of
    surrounding whitespace. The cleaned pieces are joined with a single space
    and wrapped in exactly one ``<s>[INST] ... [/INST]`` envelope.

    The prompt deliberately does not end with ``</s>``: in the Mistral
    instruct format that token terminates the assistant's answer, so
    appending it to the prompt marks the sequence as finished before the
    model has generated anything.

    Args:
        messages: Iterable of message objects; only their ``str()`` form is
            used, so roles are not rendered as separate turns.

    Returns:
        The formatted prompt string. An empty ``messages`` yields an empty
        instruction body (``"<s>[INST]  [/INST]"``).
    """
    # Strip pre-existing instruction tags so the wrapper below is the only one.
    cleaned_messages = [
        str(msg).replace("[INST]", "").replace("[/INST]", "").strip()
        for msg in messages
    ]

    # Collapse everything into one instruction body.
    prompt = " ".join(cleaned_messages)

    # Leave the sequence open after [/INST] so the model writes the answer.
    return f"<s>[INST] {prompt} [/INST]"

def completion_to_prompt(completion):
    """
    Wrap a plain completion request in the Mistral instruction format.

    The input is converted with ``str()``, stripped of any literal
    ``[INST]`` / ``[/INST]`` markers so embedded tags cannot interfere with
    the wrapper, trimmed of surrounding whitespace, and placed inside a
    single ``<s>[INST] ... [/INST]`` envelope.

    The prompt deliberately does not end with ``</s>``: in the Mistral
    instruct format that token terminates the assistant's answer, so
    appending it to the prompt marks the sequence as finished before the
    model has generated anything.

    Args:
        completion: The text to send to the model (any object; its ``str()``
            form is used).

    Returns:
        The formatted prompt string.
    """
    # Remove pre-existing instruction tags so only one wrapper is present.
    cleaned_completion = (
        str(completion).replace("[INST]", "").replace("[/INST]", "").strip()
    )
    # Leave the sequence open after [/INST] so the model writes the answer.
    return f"<s>[INST] {cleaned_completion} [/INST]"

# Client for the locally served vLLM endpoint.
# NOTE(review): the "/v1" base path suggests an OpenAI-compatible API — confirm
# against the models.OpenLLM wrapper.
# NOTE(review): os.getenv() returns None when PORT_SVC_LLM_VLLM is unset, which
# would produce the URL "http://localhost:None/v1" — make sure the variable is
# exported before running this cell.
llm_vllm = models.OpenLLM(
	api_base=f'http://localhost:{os.getenv("PORT_SVC_LLM_VLLM")}/v1',
	# model="Qwen/Qwen2.5-0.5B-Instruct",
	model=const.Model.Vllm.LLAMA_3_2_1B_INST,
	max_tokens=512,
	temperature=0.0,
	# Extra sampling parameters; the commented-out entries are alternatives
	# that are currently disabled.
	additional_kwargs={
		"frequency_penalty": 0.5,
		# "repetition_penalty": 1.2,
		# "length_penalty": 1.1,
	},
	is_chat_model=True,
	is_function_calling_model=True,
	strict=True,
	# The custom Mistral-style formatters defined above are not wired in.
	# messages_to_prompt=messages_to_prompt,
	# completion_to_prompt=completion_to_prompt,
)

# Second client, pointed at the service listening on PORT_SVC_LLM_NANO; only
# the base URL is set here, every other option is left to the wrapper.
# NOTE(review): same None-port caveat as above if the variable is unset.
llm_nano = models.OpenAI(
	api_base=f'http://localhost:{os.getenv("PORT_SVC_LLM_NANO")}/v1',
)

# Model used by the interaction cells below; rebind to switch backends.
llm = llm_vllm

In [None]:
# Streaming smoke test: send one sample prompt and echo tokens as they arrive.
user_query = queries[2]

# interact_model() is called without `await`, so `token_generator` holds an
# awaitable at this point; it is awaited in the `async for` statement below.
# NOTE(review): the prompt text is passed via `prompt=` while `user_query=None`
# is passed explicitly, whereas the chat cell passes the text via
# `user_query=` — confirm which parameter interact_model expects per mode.
token_generator = utils_llama_index.interact_model(
  prompt=user_query, mode="astream", user_query=None,
  measure_performance=True,
  llm=llm,
	# llm=None, # Use cores.Settings.llm
)

# Top-level `await` / `async for` only work in a notebook/IPython session.
# Print each token without a newline and flush so output appears incrementally.
async for token in await token_generator:
  print(token, end="", flush=True)

In [None]:
# Chat-mode smoke test: send the same sample prompt (queries[2]) through
# interact_model with mode="chat" and wait for the complete result.
# Top-level `await` requires a notebook/IPython session.
result = await utils_llama_index.interact_model(
	llm=llm,
	user_query=queries[2],
	mode="chat",
)

# Display the returned object with the project's print helper.
rp_print(result)