# Load API Keys

In [1]:
from dotenv import load_dotenv

load_dotenv()

True

# Tracing

In [2]:
from langfuse.openai import AsyncOpenAI

# Setup Embedding Call Helpers

In [3]:
client = AsyncOpenAI()

In [4]:
EMBED_MODEL = "text-embedding-3-large"

## Demo Embedding Call

In [5]:
client.embeddings.create?

[31mSignature:[39m
client.embeddings.create(
    *,
    input: [33m'Union[str, List[str], Iterable[int], Iterable[Iterable[int]]]'[39m,
    model: [33m'Union[str, EmbeddingModel]'[39m,
    dimensions: [33m'int | NotGiven'[39m = NOT_GIVEN,
    encoding_format: [33m"Literal['float', 'base64'] | NotGiven"[39m = NOT_GIVEN,
    user: [33m'str | NotGiven'[39m = NOT_GIVEN,
    extra_headers: [33m'Headers | None'[39m = [38;5;28;01mNone[39;00m,
    extra_query: [33m'Query | None'[39m = [38;5;28;01mNone[39;00m,
    extra_body: [33m'Body | None'[39m = [38;5;28;01mNone[39;00m,
    timeout: [33m'float | httpx.Timeout | None | NotGiven'[39m = NOT_GIVEN,
) -> [33m'CreateEmbeddingResponse'[39m
[31mDocstring:[39m
Creates an embedding vector representing the input text.

Args:
  input: Input text to embed, encoded as a string or array of tokens. To embed multiple
      inputs in a single request, pass an array of strings or array of token arrays.
      The input must not ex

In [6]:
# Smoke-test the embeddings endpoint. The input is just an arbitrary piece of
# text (here, the type-hint string shown in the signature above), embedded
# with the model named by EMBED_MODEL.
embedding = await client.embeddings.create(
    input='Union[str, List[str], Iterable[int], Iterable[Iterable[int]]]',
    model=EMBED_MODEL
)

In [7]:
dir(embedding.data[0])

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__class_vars__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__fields__',
 '__fields_set__',
 '__format__',
 '__ge__',
 '__get_pydantic_core_schema__',
 '__get_pydantic_json_schema__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__pretty__',
 '__private_attributes__',
 '__pydantic_complete__',
 '__pydantic_computed_fields__',
 '__pydantic_core_schema__',
 '__pydantic_custom_init__',
 '__pydantic_decorators__',
 '__pydantic_extra__',
 '__pydantic_fields__',
 '__pydantic_fields_set__',
 '__pydantic_generic_metadata__',
 '__pydantic_init_subclass__',
 '__pydantic_parent_namespace__',
 '__pydantic_post_init__',
 '__pydantic_private__',
 '__pydantic_root_model__',
 '__pydantic_serializer__',
 '__pydantic_setattr_handl

In [8]:
type(embedding.data[0].embedding[0])

float

## Embedding Call Helpers

In [25]:
def get_embedding(e) -> list[float]:
    """Return the embedding vector of the first item in an embeddings response.

    Args:
        e: An object exposing ``data``, an indexable collection whose items
            have an ``embedding`` attribute.

    Returns:
        The ``embedding`` attribute of ``e.data[0]``.
    """
    first_item = e.data[0]
    return first_item.embedding

In [26]:
this_embedding = get_embedding(embedding)

# Compute Cosine Similarity

In [27]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from openai.types.create_embedding_response import CreateEmbeddingResponse

In [28]:
def embedding_cosine_sim(e1: CreateEmbeddingResponse, e2: CreateEmbeddingResponse) -> np.ndarray:
    """Compute the cosine similarity between two embedding responses.

    Only the first embedding of each response (as returned by
    ``get_embedding``) is compared.

    Args:
        e1: First embeddings response.
        e2: Second embeddings response.

    Returns:
        The array produced by ``cosine_similarity`` for the two single-row
        inputs, i.e. a 2-D array holding one value. Index it with ``[0][0]``
        to obtain the scalar similarity.
    """
    # cosine_similarity works on 2-D inputs, so each vector is reshaped into
    # a single row of shape (1, n_features).
    row1 = np.array(get_embedding(e1)).reshape(1, -1)
    row2 = np.array(get_embedding(e2)).reshape(1, -1)
    return cosine_similarity(row1, row2)

In [31]:
embedding_cosine_sim(embedding, embedding)[0][0]

np.float64(0.9999999999999976)

# Cache System

In [11]:
from diskcache import Cache

In [12]:
cache = Cache(directory=".cache_course")

In [13]:
import asyncio

In [14]:
async def set_async(key, val, **kwargs):
    """Call ``cache.set(key, val, **kwargs)`` in a worker thread.

    The blocking cache call is handed to ``asyncio.to_thread`` so that it can
    be awaited; whatever ``cache.set`` returns is passed back to the caller.
    """
    result = await asyncio.to_thread(cache.set, key, val, **kwargs)
    return result

async def get_async(key, default=None, **kwargs):
    """Call ``cache.get(key, default, **kwargs)`` in a worker thread.

    The blocking cache lookup is handed to ``asyncio.to_thread`` so that it
    can be awaited; whatever ``cache.get`` returns is passed back to the
    caller. ``default`` is forwarded positionally as the second argument.
    """
    result = await asyncio.to_thread(cache.get, key, default, **kwargs)
    return result

# Implementing Cached, Retried and Traced Embedding Calls

In [15]:
import json
from hashlib import md5

def make_cache_key(key_name, **kwargs):
    """Build a cache key of the form ``<key_name>__<md5 hex digest>``.

    The digest is the MD5 of the keyword arguments serialised as JSON with
    sorted keys, so the order in which the keyword arguments are passed does
    not affect the resulting key.

    Args:
        key_name: Prefix placed before the double underscore.
        **kwargs: Values to hash; they must be JSON-serialisable by
            ``json.dumps``.

    Returns:
        The cache key string.
    """
    serialized = json.dumps(kwargs, sort_keys=True).encode('utf-8')
    digest = md5(serialized).hexdigest()
    return f"{key_name}__{digest}"

In [16]:
from pydantic import BaseModel

def _make_key_for_cached_embedding_with_retry(
    *,
    model,
    input,
    **kwargs,
):
    """Derive the cache key for an embedding request.

    All request arguments (``model``, ``input`` and any extra keyword
    arguments) are forwarded to ``make_cache_key`` and take part in the key.
    """
    # NOTE(review): the prefix says "chat" although this key is built for
    # embedding requests; it is kept unchanged here, since altering it would
    # change every generated key. Confirm whether the name is intentional.
    request_args = {"model": model, "input": input, **kwargs}
    return make_cache_key("openai_parsed_chat", **request_args)

In [19]:
from openai.types.create_embedding_response import CreateEmbeddingResponse
from functools import wraps
from openai import APITimeoutError, RateLimitError
from pydantic import BaseModel
import backoff


CACHE_MISS_SENTINEL = object()


# Cached + retried wrapper around client.embeddings.create.
#
# Flow:
#   1. Build a cache key from all request arguments.
#   2. On a cache hit, rebuild the CreateEmbeddingResponse from the stored JSON.
#   3. On a miss, call the API (retrying with exponential backoff on
#      APITimeoutError / RateLimitError), store the response as JSON, and
#      return it.
#
# Documentation lives in comments rather than a docstring because @wraps
# replaces this function's __doc__ with the wrapped method's.
@wraps(client.embeddings.create)
async def cached_embedding_with_retry(
    *,
    model,
    input,
    **kwargs,
) -> CreateEmbeddingResponse:
    # The same arguments feed both the cache key and the API call.
    request_args = dict(model=model, input=input, **kwargs)
    cache_key = _make_key_for_cached_embedding_with_retry(**request_args)

    # A dedicated sentinel is used as the default so a miss can be told apart
    # from any value actually stored in the cache.
    cached_json = await get_async(cache_key, default=CACHE_MISS_SENTINEL)

    # CACHE HIT: deserialize the stored JSON back into a response object.
    if cached_json is not CACHE_MISS_SENTINEL:
        return CreateEmbeddingResponse.model_validate(json.loads(cached_json))

    # CACHE MISS: call the API; no max_tries/max_time is passed to backoff.
    @backoff.on_exception(
        backoff.expo,
        (APITimeoutError, RateLimitError)
    )
    async def do_call():
        return await client.embeddings.create(**request_args)

    response = await do_call()
    # Persist as a JSON string so the hit path can rebuild the model from it.
    await set_async(cache_key, response.model_dump_json())
    return response

In [20]:
embedding = await cached_embedding_with_retry(
    input='Union[str, List[str], Iterable[int], Iterable[Iterable[int]]]',
    model=EMBED_MODEL
)

In [22]:
get_embedding(embedding)[0]

0.010675324127078056

# Deduplication

In [None]:
import pandas as pd
import ast

In [None]:
# Load the sent-email dataset from a CSV file in the working directory.
emails = pd.read_csv('paul_allen_sent_email_with_questions_v1.csv')
# Parse each cell of the 'questions' column from its string form into a Python
# literal with ast.literal_eval (presumably a list of question strings —
# confirm against the CSV).
emails['questions'] = emails['questions'].apply(ast.literal_eval)

# Counting how many questions we have

In [None]:
# Collect the questions of every email into one flat list, in row order.
flat_questions = []

for _, row in emails.iterrows():
    flat_questions.extend(row.questions)

## Find the easy duplicates

In [None]:
# questions_set: question text -> index of the first email where it appears.
# duplicates:    repeated question text -> index of that first email.
questions_set = {}
duplicates = {}

for i, row in emails.iterrows():
    for question in row.questions:
        if question in questions_set:
            # Exact repeat: report the current email index and the question,
            # and remember which email had it first.
            print(i, question, '\n')
            duplicates[question] = questions_set[question]
            continue
        questions_set[question] = i

In [None]:
emails.iloc[341], emails.iloc[342]

### Remove duplicate email 341

## Find likely duplicates