In [1]:
# Choose one or the other:
# from openai import AsyncOpenAI
from langfuse.openai import AsyncOpenAI

## Setup

In [2]:
# OpenAI-compatible async client pointed at the local Ollama server.
client = AsyncOpenAI(
    api_key="ollama",  # placeholder value: a key is required, but Ollama does not use it
    base_url="http://localhost:11434/v1",  # default local Ollama endpoint
)

def _msg(role, content):
    # simple helper function to create a message object
    return {"role": role, "content": content}

def system(content):
    """Return a chat message with role "system" carrying `content`."""
    return _msg(role="system", content=content)

def user(content):
    """Return a chat message with role "user" carrying `content`."""
    return _msg(role="user", content=content)

def assistant(content):
    """Return a chat message with role "assistant" carrying `content`."""
    return _msg(role="assistant", content=content)

# sanity check:
completion = await client.chat.completions.create(
    messages=[system("You are a helpful assistant."), user("Tell me a joke")],
    model="llama3.2:3b",
)

print(completion.choices[0].message.content)

Here's one:

What do you call a fake noodle?

(wait for it...)

An impasta!

I hope that made you smile! Do you want to hear another one?


# Caching

In [3]:
from diskcache_decorator import cached

In [4]:
# cache = Cache() # temporary cache (as long as the kernel is running)
# cache = Cache(directory=".cache") # persistent cache

# async def set_async(key, value, **kwargs):
#     return await asyncio.to_thread(cache.set, key, value, **kwargs)

In [5]:
def cache_to_disk(_func=None, *, cache_path='.cache', **kwargs):
    """Wrap a function with `diskcache_decorator.cached`.

    Works both bare (`@cache_to_disk`) and with arguments
    (`@cache_to_disk(cache_path=..., ...)`).

    Args:
        _func: The function being decorated when used bare; left as None when
            the decorator is called with arguments.
        cache_path: Passed to `cached` as `cache_path`; defaults to '.cache'.
        **kwargs: Any further keyword arguments, forwarded unchanged to `cached`.

    Returns:
        The wrapped function when used bare, otherwise a decorator that
        performs the wrapping.
    """
    def _wrap(target):
        disk_cached = cached(cache_path=cache_path, **kwargs)
        return disk_cached(target)

    if _func is None:
        # Called as @cache_to_disk(...): hand back the decorator itself.
        return _wrap
    # Called as @cache_to_disk: decorate the function right away.
    return _wrap(_func)


In [6]:
@cache_to_disk
async def get_completion(model, messages):
    """Request a chat completion for `messages` from `model` via the shared `client`.

    The function is wrapped with `cache_to_disk`.
    """
    response = await client.chat.completions.create(model=model, messages=messages)
    return response


In [7]:
completion = await get_completion(model="llama3.2:3b", messages=[system("You are a helpful assistant."), user("Write a random sentence.")])

print(completion.choices[0].message.content)


The old bookstore creaked as the librarian pulled open the worn leather cover of the rarest novel in her collection.


# Tracing

(This section uses the LLM client and helper functions from the Setup section, and the `get_completion` wrapper from the Caching section above.)

In [8]:
from dotenv import load_dotenv
load_dotenv()  # presumably supplies the Langfuse settings from a local .env file — TODO confirm

# Same import as in the notebook's first cell (In [1]).
from langfuse.openai import AsyncOpenAI

In [9]:
# Run the setup cell above first: it defines `client` and the `system` / `user` / `assistant` helpers.

In [10]:
for _ in range(2):
    completion = await client.chat.completions.create(
        messages=[system("You are a helpful assistant."), user("Tell me a joke")],
        model="llama3.2:3b",
        temperature=0.5,
    )

In [11]:
# Print the text of the first choice in `completion`.
first_choice = completion.choices[0]
print(first_choice.message.content)

Here's one:

What do you call a fake noodle?

An impasta!


In [20]:
# Testing if caching does not show up in langfuse. witch earlier wrapper function:

completion = await get_completion(model="llama3.2:3b", messages=[system("You are a helpful assistant."), user("Write a random sentence. lololol")])


In [13]:
# Print the text of the first choice in `completion`.
# NOTE(review): this cell's execution count (13) is lower than that of the cell
# before it (20), so the saved output may come from an earlier `completion` — re-run to confirm.
first_choice = completion.choices[0]
print(first_choice.message.content)


The old bookstore creaked as the librarian pulled open the worn leather cover of the rarest novel in her collection.
