In [2]:
import os  
from pprint import pprint
from huggingface_hub import InferenceClient

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
# --- Model identifiers --------------------------------------------------------
# Hugging Face Hub repo id of the hosted model.
MODEL_NAME_HF = "meta-llama/Llama-3.3-70B-Instruct"
# Bare Ollama model tag; this is the single source of truth for the local model.
MODEL_NAME_OLLAMA_SHORT = "qwen2:7b"
# Provider-prefixed form of the same tag, derived from the short tag so the two
# names cannot drift apart when the model is changed.
MODEL_NAME_OLLAMA = f"ollama_chat/{MODEL_NAME_OLLAMA_SHORT}"

# --- Credentials --------------------------------------------------------------
# Name of the environment variable that holds the Hugging Face token.
HF_TOKEN_ENV_VAR = "HF_TOKEN"

# --- Prompts ------------------------------------------------------------------
# Open-ended sentence for the model to complete.
MAIN_QUESTION_PRUSSIA = "The capital of the old and already non-existent Prussian region was"
# User question for the tool-use example; the "Question:" prefix matches the
# format required by the system prompt.
MAIN_QUESTION_WEATHER = "Question: what is the weather in Amsterdam now?"

In [None]:
#  ## Run with HF 
# def read_hf_token(path: str) -> str:
#     file = os.open(path, os.O_RDONLY)
#     buffer = b""
#     while chunk := os.read(file, 2048):
#         buffer += chunk
#     content_str = buffer.decode("utf-8")
   
#     start = content_str.find(HF_TOKEN_ENV_VAR) + len(HF_TOKEN_ENV_VAR)+1
#     end = content_str[start:].find("\n")

#     return content_str[start:start+end]

# os.environ[HF_TOKEN_ENV_VAR] = read_hf_token(".env")

# MODEL_NAME_HF = "meta-llama/Llama-3.3-70B-Instruct"

# client = InferenceClient(MODEL_NAME_HF, provider="hf-inference")
# client.chat.completions.create(
#     messages=[{"role": "user", "content": MAIN_QUESTION_PRUSSIA }],
#     max_tokens=100,
# )

# ## INSPECT 

# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_HF)

# raw_prompt = tokenizer.apply_chat_template(
#     [{"role": "user", "content": MAIN_QUESTION_PRUSSIA}],
#     tokenize=False  # to get a raw string instead of token IDs
# )
# print(raw_prompt)

In [11]:
# Thought/Action/Observation system prompt with the tool description already
# inlined (we assume the textual description of the tools was appended by hand).
#
# NOTE: this is a plain string literal, not a str.format()/f-string template, so
# braces are written singly. Doubled braces would be sent to the model verbatim
# and would turn the example below into invalid JSON.

SYSTEM_PROMPT = """Answer the following questions as best you can. You have access to the following tools:

get_weather: Get the current weather in a given location

The way you use the tools is by specifying a json blob.
Specifically, this json should have an `action` key (with the name of the tool to use) and an `action_input` key (with the input to the tool going here).

The only values that should be in the "action" field are:
get_weather: Get the current weather in a given location, args: {"location": {"type": "string"}}
example use :

{
  "action": "get_weather",
  "action_input": {"location": "New York"}
}


ALWAYS use the following format:

Question: the input question you must answer
Thought: you should always think about one action to take. Only one action at a time in this format:
Action:

$JSON_BLOB (inside markdown cell)

Observation: the result of the action. This Observation is unique, complete, and the source of truth.
... (this Thought/Action/Observation can repeat N times, you should take several steps when needed. The $JSON_BLOB must be formatted as markdown and only use a SINGLE action at a time.)

You must always end your output with the following format:

Thought: I now know the final answer
Final Answer: the final answer to the original input question

Now begin! Reminder to ALWAYS use the exact characters `Final Answer:` when you provide a definitive answer. """

# Initial conversation: the system instructions followed by the user's question.
messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": MAIN_QUESTION_WEATHER},
]

In [None]:
## OPTION: use litellm.
## Open question: why do we need litellm if Ollama already exposes an OpenAI-compatible API?

# from litellm import completion
# response = completion(
#             model="ollama/"+MODEL_NAME_OLLAMA_SHORT,
#             messages = messages,
#             api_base="http://localhost:11434",
# )

In [None]:
from openai import OpenAI

# Point the OpenAI SDK at the local Ollama server's /v1/ endpoint.
# The SDK insists on an api_key, but the value is ignored here.
client = OpenAI(base_url="http://localhost:11434/v1/", api_key="ollama")

# First turn: send the system + user messages and show the raw response object.
chat_completion = client.chat.completions.create(
    model=MODEL_NAME_OLLAMA_SHORT,
    messages=messages,
)
print(chat_completion)

In [47]:
# Same request as before, but generation halts as soon as the model emits
# "Observation:", so the reply ends right after the Action blob.
stop_sequences = ["Observation:"]

out = client.chat.completions.create(
    model=MODEL_NAME_OLLAMA_SHORT,
    messages=messages,
    stop=stop_sequences,
)
print(out)

ChatCompletion(id='chatcmpl-395', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Thought: I need to get the current weather condition for Amsterdam, as per the user\'s query.\n\nAction:\n```\n{   "action": "get_weather",\n    "action_input": {  "location": {  "type": "string" } },\n}\n```\n\n\n', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None))], created=1747745662, model='qwen2:7b', object='chat.completion', service_tier=None, system_fingerprint='fp_ollama', usage=CompletionUsage(completion_tokens=59, prompt_tokens=335, total_tokens=394, completion_tokens_details=None, prompt_tokens_details=None))


In [51]:
# Dummy tool: this is what we call once the Action blob in the model's first
# reply has been parsed.
def get_weather(location: str) -> str:
    """Return a canned weather report for ``location``.

    This is a stub standing in for a real weather lookup: the wording is fixed
    and only the location name is interpolated.

    Args:
        location: Name of the place to mention in the report.

    Returns:
        The report sentence, terminated by a space and a newline.
    """
    report = f"the weather in {location} is sunny with low temperatures. \n"
    return report

# Second turn: replay the conversation with the assistant's first reply (cut off
# at the stop sequence) followed by the real tool result, then let the model
# continue to its final answer.
#
# NOTE: this cell reads `out` from the previous request and then rebinds it, so
# re-running it without re-running the previous cell would feed the *second*
# reply back in as if it were the first.

# Call the tool once and reuse the result.
observation = get_weather('Amsterdam')

messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    # Same question constant as in the first turn, instead of a re-typed literal.
    {"role": "user", "content": MAIN_QUESTION_WEATHER},
    {"role": "assistant", "content": out.choices[0].message.content + "Observation: " + observation},
]

out = client.chat.completions.create(
    messages=messages,
    model=MODEL_NAME_OLLAMA_SHORT,
    # max_completion_tokens=200,
)
print(out.choices[0])

Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Thought: With this result confirmed, I can infer that Amsterdam currently experiences a delightful climate.\n\nFinal Answer:\nThe current weather conditions in Amsterdam are sunny with cool low temperatures.', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None))


In [54]:
# Show only the assistant's text from the most recent completion.
out.choices[0].message.content


'Thought: With this result confirmed, I can infer that Amsterdam currently experiences a delightful climate.\n\nFinal Answer:\nThe current weather conditions in Amsterdam are sunny with cool low temperatures.'