# Setup

## Load API Keys

In [1]:
from dotenv import load_dotenv

load_dotenv()

True

## Tracing

In [2]:
from langfuse.openai import AsyncOpenAI  # autoinstrmenttion

## Setup Embedding Call Helpers

In [3]:
client = AsyncOpenAI()

In [4]:
EMBED_MODEL = "text-embedding-3-large"

## Demo Embedding Call

In [5]:
embedding = await client.embeddings.create(
    input='Union[str, List[str], Iterable[int], Iterable[Iterable[int]]]',
    model=EMBED_MODEL
)

## Embedding Call Helpers

In [6]:
def get_embedding(e) -> list[float]:
    return e.data[0].embedding

In [7]:
this_embedding = get_embedding(embedding)

## Compute Cosine Similarity

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from openai.types.create_embedding_response import CreateEmbeddingResponse

In [9]:
def embedding_cosine_sim(e1: CreateEmbeddingResponse, e2: CreateEmbeddingResponse) -> float:
    e1, e2 = get_embedding(e1), get_embedding(e2)
    to_np = lambda e: np.array(e).reshape(1, -1)
    e1, e2 = to_np(e1), to_np(e2)
    _cos_sim = cosine_similarity(e1, e2)
    return _cos_sim[0][0]

In [10]:
embedding_cosine_sim(embedding, embedding)

np.float64(0.9999999999999976)

## Cache System 

In [11]:
from diskcache import Cache

In [12]:
cache = Cache(directory=".cache_course")

In [13]:
import asyncio

In [14]:
async def set_async(key, val, **kwargs):
    return await asyncio.to_thread(cache.set, key, val, **kwargs)

async def get_async(key, default=None, **kwargs):
    return await asyncio.to_thread(cache.get, key, default, **kwargs)

In [15]:
import json
from hashlib import md5

def make_cache_key(key_name, **kwargs):
    kwargs_string = json.dumps(kwargs, sort_keys=True)
    kwargs_hash = md5(kwargs_string.encode('utf-8')).hexdigest()
    cache_key = f"{key_name}__{kwargs_hash}"
    return cache_key

## [EMBEDDING] Cached and Retried Calls

In [16]:
from pydantic import BaseModel

def _make_key_for_cached_embedding_with_retry(
    *,
    model,
    input,
    **kwargs,
):
    return make_cache_key(
        "openai_parsed_chat",
        model=model,
        input=input,
        **kwargs
    )

In [17]:
from openai.types.create_embedding_response import CreateEmbeddingResponse
from functools import wraps
from openai import APITimeoutError, RateLimitError
from pydantic import BaseModel
import backoff


CACHE_MISS_SENTINEL = object()


@wraps(client.embeddings.create)
async def cached_embedding_with_retry(
    *,
    model,
    input,
    **kwargs,
) -> CreateEmbeddingResponse:
    # CREATE CACHE KEY
    cache_key = _make_key_for_cached_embedding_with_retry(
        model=model,
        input=input,
        **kwargs
    )

    cached_value = await get_async(cache_key, default=CACHE_MISS_SENTINEL)
    # CACHE MISS
    if cached_value is CACHE_MISS_SENTINEL:
        @backoff.on_exception(
            backoff.expo,
            (APITimeoutError, RateLimitError)
        )
        async def do_call():
            return await client.embeddings.create(
                model=model,
                input=input,
                **kwargs
            )
        embedding = await do_call()
        await set_async(cache_key, embedding.model_dump_json())
        return embedding
    # CACHE HIT
    else:
        embedding = CreateEmbeddingResponse.model_validate(json.loads(cached_value))
        return embedding
        
        

In [18]:
embedding = await cached_embedding_with_retry(
    input='Union[str, List[str], Iterable[int], Iterable[Iterable[int]]]',
    model=EMBED_MODEL
)

In [19]:
get_embedding(embedding)[0]

0.010675324127078056

## [LLM] Cached, Retried, and Traced Calls

In [20]:
from pydantic import BaseModel

def _make_key_for_cached_chat_completion_parsed_with_retry(
    *,
    model,
    messages,
    response_format: BaseModel,
    **kwargs,
):
    return make_cache_key(
        "openai_parsed_chat",
        model=model,
        messages=messages,
        response_format=response_format.model_json_schema(),
        **kwargs
    )

In [21]:
from openai.types.chat import ParsedChatCompletion
from functools import wraps
from openai import APITimeoutError, RateLimitError
from pydantic import BaseModel
from typing_extensions import TypeVar
import backoff

ResponseFormatT = TypeVar("ResponseFormatT", bound=BaseModel)

CACHE_MISS_SENTINEL = object()


@wraps(client.chat.completions.parse)
async def cached_chat_completion_parsed_with_retry(
    *,
    model,
    messages,
    response_format: ResponseFormatT,
    **kwargs,
) -> ParsedChatCompletion[ResponseFormatT]:
    # CREATE CACHE KEY
    cache_key = _make_key_for_cached_chat_completion_parsed_with_retry(
        model=model,
        messages=messages,
        response_format=response_format,
        **kwargs
    )

    cached_value = await get_async(cache_key, default=CACHE_MISS_SENTINEL)
    # CACHE MISS
    if cached_value is CACHE_MISS_SENTINEL:
        @backoff.on_exception(
            backoff.expo,
            (APITimeoutError, RateLimitError)
        )
        async def do_call():
            return await client.chat.completions.parse(
                model=model,
                messages=messages,
                response_format=response_format,
                **kwargs
            )
        completion = await do_call()
        await set_async(cache_key, completion.model_dump_json())
        return completion
    # CACHE HIT
    else:
        # TODO: Tracing Code (next section)
        # return 
        completion = ParsedChatCompletion.model_validate(json.loads(cached_value))
        for choice in completion.choices:
            if not choice.message.refusal:
                choice.message.parsed = response_format.model_validate(
                    choice.message.parsed
                )
        return completion
        
        

## Sanity Checks

In [22]:
# sanity check
embedding = await cached_embedding_with_retry(
    input="input: 'Union[str, List[str], Iterable[int], Iterable[Iterable[int]]]'",
    model=EMBED_MODEL
)
embedding_cosine_sim(embedding, embedding)

np.float64(0.9999999999999976)

In [23]:
# sanity check
from pydantic import BaseModel

class CalendarEvent(BaseModel):
    name: str
    date: str
    participants: list[str]

completion = await cached_chat_completion_parsed_with_retry(
    model="gpt-4o-2024-08-06",
    messages=[
        {"role": "system", "content": "Extract the event information."},
        {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."},
    ],
    response_format=CalendarEvent,
)

event = completion.choices[0].message.parsed
event

CalendarEvent(name='Science Fair', date='Friday', participants=['Alice', 'Bob'])

---

# Synthetic Data Filtering

## Get the embedding for each questions

In [24]:
import pandas as pd
import ast
emails = pd.read_csv('paul_allen_sent_email_with_questions_v1.csv')
emails['questions'] = emails['questions'].apply(ast.literal_eval)

## Create a data structure that will hold the Message ID, Question ID and the embedding of the question

In [25]:
def _make_ds(id_message, id_question, question, embedding=None):
    return {
        "id_message": id_message,
        "id_question": id_question,
        "question": question,
        "embedding": embedding
    }

In [26]:
questions_with_embeddings = []
index_for_questions_with_embeddings = {}

In [27]:
for e_idx, email in emails.iterrows():
    index_for_questions_with_embeddings[e_idx] = {}
    for q_idx, question in enumerate(emails.questions):
        _ds = _make_ds(e_idx, q_idx, question)
        index_for_questions_with_embeddings[e_idx][q_idx] = _ds
        questions_with_embeddings.append(_ds)

In [37]:
questions_with_embeddings[1]

{'id_message': 0, 'id_question': 1, 'question': [], 'embedding': None}

In [33]:
import traceback
tasks = []

for ds in questions_with_embeddings[1:41]:
    async def do_task(ds):
        try:
            ds['embedding'] = await cached_embedding_with_retry(input=ds['question'], model=EMBED_MODEL)
        except Exception as e:
            return (ds, e, traceback.format_exc())
    tasks.append(do_task(ds))

In [34]:
len(tasks)

40

In [35]:
from tqdm.asyncio import tqdm_asyncio

results = await tqdm_asyncio.gather(*tasks)

100%|██████████| 40/40 [00:00<00:00, 50.78it/s]


In [36]:
results

[({'id_message': 0, 'id_question': 1, 'question': [], 'embedding': None},
  openai.BadRequestError('Error code: 400 - {\'error\': {\'message\': "\'$.input\' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", \'type\': \'invalid_request_error\', \'param\': None, \'code\': None}}'),
  'Traceback (most recent call last):\n  File "/tmp/ipykernel_3566510/2241576899.py", line 7, in do_task\n    ds[\'embedding\'] = await cached_embedding_with_retry(input=ds[\'question\'], model=EMBED_MODEL)\n                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/tmp/ipykernel_3566510/3686650940.py", line 38, in cached_embedding_with_retry\n    embedding = await do_call()\n                ^^^^^^^^^^^^^^^\n  File "/home/admin/LIVE/course/.venv/lib64/python3.12/site-packages/backoff/_async.py", line 151, in retry\n    ret = await target(*args, **kwargs)\n          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/tmp/ipykernel_3