# Get API Keys

In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
from openai import AsyncOpenAI

In [3]:
client = AsyncOpenAI()

In [4]:
GPT4O_MINI = "o4-mini-2025-04-16"

In [5]:
def _msg(role, content):
    return {'role': role, 'content': content}

def system(content):
    return _msg('system', content)

def user(content):
    return _msg('user', content)

def assistant(content):
    return _msg('assistant', content)

# Not Cached

In [6]:
# sanity check

completion = await client.chat.completions.create(
    messages=[user("What is caching in software engineering?")],
    model=GPT4O_MINI,
    max_completion_tokens=1000
)

In [7]:
from pprint import pprint

pprint(completion.choices[0].message.content)

('Caching in software engineering is the practice of storing copies of data or '
 'computations in a “fast‐access” storage layer so that future requests for '
 'the same data can be served more quickly, avoid repeated work, and reduce '
 'load on slower or more expensive resources (databases, file systems, remote '
 'services, etc.).\n'
 '\n'
 'Key aspects of caching:\n'
 '\n'
 '1. What gets cached  \n'
 '   • Computed results (e.g., function return values, rendered templates)  \n'
 '   • Database query results or ORM objects  \n'
 '   • HTTP responses or fragments (pages, images, JSON)  \n'
 '   • Files or blobs (on-disk cache for downloads)  \n'
 '\n'
 '2. Where caches live  \n'
 '   • In-process memory (application-level caches)  \n'
 '   • Dedicated cache servers (Redis, Memcached)  \n'
 '   • Content Delivery Networks (CDNs) and reverse proxies (e.g., Varnish, '
 'Nginx)  \n'
 '   • Browser cache or operating‐system file cache  \n'
 '\n'
 '3. Cache policies and mechanics  \n'
 '  

# Cache to disk

In [8]:
from diskcache import Cache

In [9]:
# cache = Cache() # temporary cache
cache = Cache(directory=".cache_course")

In [10]:
#cache.set("Hello", "blabla")
#cache.get("Hello")

In [11]:
import asyncio

In [12]:
async def set_async(key, val, **kwargs):
    return await asyncio.to_thread(cache.set, key, val, **kwargs)

async def get_async(key, default=None, **kwargs):
    return await asyncio.to_thread(cache.get, key, default, **kwargs)

In [18]:
# sanity checks
(
    await get_async("key_that_does_not_exist", default="NOT FOUND"),
    await get_async("Hello"),
    await set_async("key_exist", "yes"),
    await get_async("key_exist"),
)

('NOT FOUND', 'blabla', True, 'yes')

# LLMs calls are cached

<h2>Concept</h2>

In [19]:
from hashlib import md5

md5(b"dqsdsqdqs").hexdigest()

'48818a53e154bcdfc356cff1fbab9ae1'

In [24]:
from hashlib import md5

args = ["1", "2"]
kwargs = dict(a=2, b=4)

dirty = str(args) + str(kwargs)

print(dirty)

md5(dirty.encode('utf-8')).hexdigest()

['1', '2']{'a': 2, 'b': 4}


'8fe21d329868d5b6aef0d2d118a97c62'

<h2>Why is it dirty?</h2>

In [30]:
from hashlib import md5

args = [1, 2]
kwargs = dict(a=2, b=4)

dirty = str(args) + str(kwargs)

print(dirty)

md5(dirty.encode('utf-8')).hexdigest()

[1, 2]{'a': 2, 'b': 4}


'61b079025bfa60a4e712ebef7d78bd0e'

In [32]:
from hashlib import md5

args = [1, 2, 2, 4]
kwargs = dict()

dirty = str(args) + str(kwargs)

print(dirty)

md5(dirty.encode('utf-8')).hexdigest()

[1, 2, 2, 4]{}


'0c6d1d9f15baeb020e7541e36250390d'

In [33]:
from hashlib import md5

args = [1, 2]
kwargs = dict(b=4, a=2)

dirty = str(args) + str(kwargs)

print(dirty)

md5(dirty.encode('utf-8')).hexdigest()

[1, 2]{'b': 4, 'a': 2}


'7d78d95310185eb7a6a2805760b58e56'

In [26]:
def func(arg1, arg2, a, b):
    pass

func(1, 2, 3, 4)
func(1, 2, a=1, b=2)
func(1, 2, b=2, a=1)

<h3>are equivalent</h3>

<h3>BUT there are different hashes for the same arguments!</h3>

# Clean way

In [43]:
import json

def make_cache_key(key_name, **kwargs):
    kwargs_string = json.dumps(kwargs, sort_keys=True)
    kwargs_hash = md5(kwargs_string.encode('utf-8')).hexdigest()
    cache_key = f"{key_name}__{kwargs_hash}"
    
    return cache_key

In [47]:
def _make_cache_key_for_chat_completion(
    *,
    model,
    messages,
    **kwargs,
):
    return make_cache_key(
        "openai_chat_completion",
        model=model,
        messages=messages,
        **kwargs
    )

In [48]:
_make_cache_key_for_chat_completion(
    messages=[1, 2, 3],
    model=GPT4O_MINI
)

'openai_chat_completion__45981e772eec9f1d6b6e8c28511644ae'

In [52]:
from openai.types.chat import ChatCompletion

CACHE_MISS_SENTINEL = object()

async def cached_chat_completion(
    *,
    model,
    messages,
    **kwargs,
) -> ChatCompletion:
    # CREATE CAHE KEY
    cache_key = _make_cache_key_for_chat_completion(
        model=model,
        messages=messages,
        **kwargs
    )
    cached_value = await get_async(cache_key, default=CACHE_MISS_SENTINEL)
    # CACHE MISS
    if cached_value is CACHE_MISS_SENTINEL:
        completion = await client.chat.completions.create(
            model=model,
            messages=messages,
            **kwargs
        )
        await set_async(cache_key, completion.json())
        return completion
    # CACHE HIT
    else:
        return ChatCompletion.validate(json.loads(cached_value))

    

In [49]:
from openai.types.chat import ChatCompletion

In [50]:
ChatCompletion.validate(json.loads(completion.json()))

/tmp/ipykernel_3987418/815953895.py:1: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  ChatCompletion.validate(json.loads(completion.json()))
/tmp/ipykernel_3987418/815953895.py:1: PydanticDeprecatedSince20: The `validate` method is deprecated; use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  ChatCompletion.validate(json.loads(completion.json()))


ChatCompletion(id='chatcmpl-C1cG0k055MRTSG639cUcx9EWWSUTK', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Caching in software engineering is the practice of storing copies of data or computations in a “fast‐access” storage layer so that future requests for the same data can be served more quickly, avoid repeated work, and reduce load on slower or more expensive resources (databases, file systems, remote services, etc.).\n\nKey aspects of caching:\n\n1. What gets cached  \n   • Computed results (e.g., function return values, rendered templates)  \n   • Database query results or ORM objects  \n   • HTTP responses or fragments (pages, images, JSON)  \n   • Files or blobs (on-disk cache for downloads)  \n\n2. Where caches live  \n   • In-process memory (application-level caches)  \n   • Dedicated cache servers (Redis, Memcached)  \n   • Content Delivery Networks (CDNs) and reverse proxies (e.g., Varnish, Nginx)  \n   • Browser cache or

In [53]:
completion = await cached_chat_completion(
    messages=[user("What is caching in software engineering?")],
    model=GPT4O_MINI,
    max_completion_tokens=1000
)

/tmp/ipykernel_3987418/3530422529.py:25: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  await set_async(cache_key, completion.json())


In [54]:
completion

ChatCompletion(id='chatcmpl-C1eYDHrAK4n3R9YIbJYYgRUZXnzgA', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Caching in software engineering is the practice of storing copies of data or computation results in a faster‐access storage layer (the “cache”) so that future requests for that data can be served more quickly than by going back to the original, usually slower, source (disk, database, remote service, etc.).\n\n1. Purpose and Benefits  \n  • Reduced latency: fetching data from memory or a nearby cache is much faster than from a database or over the network.  \n  • Lower backend load: repeated requests hit the cache instead of overwhelming the primary data store.  \n  • Increased throughput and scalability: systems can handle more requests per second when much of the work is offloaded to caches.\n\n2. Common Cache Levels and Types  \n  • CPU caches (L1, L2, L3): hardware‐level caches that store recent instructions and data.  \n  •

In [55]:
completion = await cached_chat_completion(
    messages=[user("What is caching in software engineering?")],
    model=GPT4O_MINI,
    max_completion_tokens=1000
)

completion

/tmp/ipykernel_3987418/3530422529.py:29: PydanticDeprecatedSince20: The `validate` method is deprecated; use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return ChatCompletion.validate(json.loads(cached_value))


ChatCompletion(id='chatcmpl-C1eYDHrAK4n3R9YIbJYYgRUZXnzgA', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Caching in software engineering is the practice of storing copies of data or computation results in a faster‐access storage layer (the “cache”) so that future requests for that data can be served more quickly than by going back to the original, usually slower, source (disk, database, remote service, etc.).\n\n1. Purpose and Benefits  \n  • Reduced latency: fetching data from memory or a nearby cache is much faster than from a database or over the network.  \n  • Lower backend load: repeated requests hit the cache instead of overwhelming the primary data store.  \n  • Increased throughput and scalability: systems can handle more requests per second when much of the work is offloaded to caches.\n\n2. Common Cache Levels and Types  \n  • CPU caches (L1, L2, L3): hardware‐level caches that store recent instructions and data.  \n  •