# Load API Keys

In [1]:
from dotenv import load_dotenv

load_dotenv()

True

# Tracing

In [2]:
from langfuse.openai import AsyncOpenAI  # autoinstrmenttion

# Setup LLM Call Helpers

In [3]:
client = AsyncOpenAI()

In [4]:
GPT4O_MINI = "gpt-4o-mini-2024-07-18"

In [5]:
def _msg(role, content):
    return {'role': role, 'content': content}

def system(content):
    return _msg('system', content)

def user(content):
    return _msg('user', content)

def assistant(content):
    return _msg('assistant', content)

# Cache System 

In [6]:
from diskcache import Cache

In [7]:
cache = Cache(directory=".cache_course")

In [8]:
import asyncio

In [9]:
async def set_async(key, val, **kwargs):
    return await asyncio.to_thread(cache.set, key, val, **kwargs)

async def get_async(key, default=None, **kwargs):
    return await asyncio.to_thread(cache.get, key, default, **kwargs)

# Implementing Cached, Retried and Traced Structured Ouputs completion

In [10]:
import json
from hashlib import md5

def make_cache_key(key_name, **kwargs):
    kwargs_string = json.dumps(kwargs, sort_keys=True)
    kwargs_hash = md5(kwargs_string.encode('utf-8')).hexdigest()
    cache_key = f"{key_name}__{kwargs_hash}"
    return cache_key

In [11]:
from pydantic import BaseModel

def _make_key_for_cached_chat_completion_parsed_with_retry(
    *,
    model,
    messages,
    response_format: BaseModel,
    **kwargs,
):
    return make_cache_key(
        "openai_parsed_chat",
        model=model,
        messages=messages,
        response_format=response_format.model_json_schema(),
        **kwargs
    )

In [12]:
from openai.types.chat import ParsedChatCompletion
from functools import wraps
from openai import APITimeoutError, RateLimitError
from typing_extensions import TypeVar
import backoff

ResponseFormatT = TypeVar("ResponseFormatT", bound=BaseModel)

CACHE_MISS_SENTINEL = object()


@wraps(client.chat.completions.parse)
async def cached_chat_completion_parsed_with_retry(
    *,
    model,
    messages,
    response_format: ResponseFormatT,
    **kwargs,
) -> ParsedChatCompletion[ResponseFormatT]:
    # CREATE CACHE KEY
    cache_key = _make_key_for_cached_chat_completion_parsed_with_retry(
        model=model,
        messages=messages,
        response_format=response_format,
        **kwargs
    )

    cached_value = await get_async(cache_key, default=CACHE_MISS_SENTINEL)
    # CACHE MISS
    if cached_value is CACHE_MISS_SENTINEL:
        @backoff.on_exception(
            backoff.expo,
            (APITimeoutError, RateLimitError)
        )
        async def do_call():
            return await client.chat.completions.parse(
                model=model,
                messages=messages,
                response_format=response_format,
                **kwargs
            )
        completion = await do_call()
        await set_async(cache_key, completion.model_dump_json())
        return completion
    # CACHE HIT
    else:
        # TODO: Tracing Code (next section)
        # return 
        completion = ParsedChatCompletion.model_validate(json.loads(cached_value))
        for choice in completion.choices:
            if not choice.message.refusal:
                choice.message.parsed = response_format.model_validate(
                    choice.message.parsed
                )
        return completion
        
        

# DATA_GENERATION

## Load the dataset

In [13]:
import pandas as pd

In [14]:
emails = pd.read_csv('paul_allen_sent_emails.csv')

In [15]:
emails.head(n=3)

Unnamed: 0,content,Message-ID,Date,Correspondants
0,"jeff, i want to bid $2.8 for sagewood with a r...",<13537630.1075855669909.JavaMail.evans@thyme,"Mon, 31 Dec 1979 16:00:00 -0800 (PST",Jeff Smith
1,attached is the systems wish list for the gas ...,<27903020.1075855669931.JavaMail.evans@thyme,"Mon, 31 Dec 1979 16:00:00 -0800 (PST","John J Lavorato, Beth Perlman, Hunter S Shivel..."
2,how is your racing going? what category are yo...,<12929996.1075855668941.JavaMail.evans@thyme,"Mon, 31 Dec 1979 16:00:00 -0800 (PST",muller@thedoghousemail.co


In [16]:
print(f"{emails.shape[0]} lines x {emails.shape[0]} rows")

396 lines x 396 rows


## Prompt Engineering

In [17]:
from jinja2 import Template # To take better into account the spcecial characters that print(f"{user_name} {prompt}\n\nThis is a test for dedent...")
from textwrap import dedent # To remove all the characters that we don't want

## Test jinja2 and textwrap.dedent

In [18]:
# Without dedent
prompt_template = Template(
    """\
    {{ user_name }} {{ prompt }}

    This is a test for dedent.

    I want to check.
    """
)

In [19]:
print(prompt_template.render(user_name="Paul Allen", prompt="Generate questions from emails"))

    Paul Allen Generate questions from emails

    This is a test for dedent.

    I want to check.
    


In [20]:
prompt_template = Template(
    dedent(
        """\
        {{ user_name }} {{ prompt }}

        This is a test for dedent.

        I want to check.
        """
    )
)

In [21]:
print(prompt_template.render(user_name="Paul Allen", prompt="Generate questions from emails"))

Paul Allen Generate questions from emails

This is a test for dedent.

I want to check.


## Apply the prompt to real data

In [22]:
from pydantic import BaseModel

class GeneratedQuestions(BaseModel):
    useless_to_recall: bool
    questions: list[str]

In [62]:
prompt_template = Template(
    dedent(
        """\
        You are Paul Allen, also known as Phillip Allen, and you sent this email.
        
        <email_content>
        {{ email_content }}
        </email_content>
        
        Return JSON only.

        
        CLASSIFY
        Set "useless_to_recall" = true **unless** the email contains at least one
        **professional, content-bearing fact** (clear business action/decision/commitment,
        deliverable, schedule with date/time, contract/price/volume/rate/term, project or
        asset details). Personal/household, HR/tax chat, gossip, or fragmentary jargon
        lists are **not** professional.
        
        USELESS CATEGORIES (choose one for "useless_reason"):
        - personal       (household approvals, errands)
        - hr_tax         (W-2, payroll, benefits chatter)
        - gossip         (rumors, who said what)
        - fragment       (token/jargon lists without a predicate/action)
        - attachment_only(link/attachment with no in-body summary)
        - chitchat       (greetings, small talk)
        - boilerplate    (auto-replies, disclaimers)
        - vague          (no concrete, answerable facts)
        
        IF USELESS:
        {
          "useless_to_recall": true,
          "useless_reason": "<one category above>",
          "questions": []
        }
        
        IF USEFUL:
        Generate up to {{ n_questions|default(6) }} questions that are:
        - directly answerable **verbatim** from the email (no outside info)
        - **self-contained**
        - include ≥1 exact token from the email (proper noun OR numeral/unit)
        - NOT clarifying/meta/recipient/formatting questions
        
        STRICT CHECK (for each question):
        If it cannot be answered exactly from the email text or lacks an exact token, drop it.
        If no questions survive, set the email to USELESS ("vague").
        
        OUTPUT (JSON only):
        {
          "useless_to_recall": boolean,
          "useless_reason": "personal | hr_tax | gossip | fragment | attachment_only | chitchat | boilerplate | vague | n/a",
          "questions": ["string", "..."]
        }
        
        Classifier anchors (do not echo in output):
        - "mary, it is ok to buy a carpet shampooer. about the w-2’s, how would you" → USELESS: personal or hr_tax
        - "what did mary write? stage misses you? i sent 2 emails. maybe mary is stalking gary" → USELESS: gossip
        - "eol report for tv in conference on 33 cash -hehub -chicago -pepl -katy -waha prompt month nymex" → USELESS: fragment
        - Positive example (useful): "Approve purchase of 4 servers for Project Orion at $6.2k each, delivery by Oct 15."
        """
    )
)

## Iterate on the prompt

In [63]:
email = emails.iloc[0]

In [64]:
email.content

'jeff, i want to bid $2.8 for sagewood with a rate 8.5% or less and dependent on 30 year term'

In [65]:
prompt = prompt_template.render(
    email_content=email.content
)

In [66]:
print(prompt)

You are Paul Allen, also known as Phillip Allen, and you sent this email.

<email_content>
jeff, i want to bid $2.8 for sagewood with a rate 8.5% or less and dependent on 30 year term
</email_content>

Return JSON only.


CLASSIFY
Set "useless_to_recall" = true **unless** the email contains at least one
**professional, content-bearing fact** (clear business action/decision/commitment,
deliverable, schedule with date/time, contract/price/volume/rate/term, project or
asset details). Personal/household, HR/tax chat, gossip, or fragmentary jargon
lists are **not** professional.

USELESS CATEGORIES (choose one for "useless_reason"):
- personal       (household approvals, errands)
- hr_tax         (W-2, payroll, benefits chatter)
- gossip         (rumors, who said what)
- fragment       (token/jargon lists without a predicate/action)
- attachment_only(link/attachment with no in-body summary)
- chitchat       (greetings, small talk)
- boilerplate    (auto-replies, disclaimers)
- vague        

In [67]:
MODEL = GPT4O_MINI

In [68]:
completion = await cached_chat_completion_parsed_with_retry(
    model=MODEL,
    messages=[user(prompt)],
    response_format=GeneratedQuestions
)

In [69]:
completion.choices[0].message.parsed.questions

['What is the bid amount for Sagewood?',
 'What rate is being requested for Sagewood?',
 'What is the term dependent on for the Sagewood bid?',
 'What is the maximum rate mentioned for the Sagewood bid?',
 'Who is the recipient of the email?',
 'What asset is being referenced in the bid?']

## Bad questions category

'Am I certain that the rate should be 8.5% or less?' --> clarifying - not useful to retrieve the email
'What type of deal or project is Sagewood related to?' --> clarifying question - does not help to retrieve the email
'Did I address the recipient by name in the email?' --> does not relate to the content
'What type of deal or project is Sagewood related to?' --> clarifying question. Does not help to retrieve the email

In [70]:
async def try_the_prompt(i):
    email = emails.iloc[i]
    prompt = prompt_template.render(
        email_content=email.content
    )
    completion = await cached_chat_completion_parsed_with_retry(
        model=MODEL,
        messages=[user(prompt)],
        response_format=GeneratedQuestions
    )
    return completion.choices[0].message.parsed, email.content

In [71]:
r, e = await try_the_prompt(2)

In [72]:
e

'how is your racing going? what category are you up to? i'

In [73]:
print(f"List of questions: {r.questions}\n\nUseless to recall? {r.useless_to_recall}")

List of questions: []

Useless to recall? True


### notes on what we don't like in the inputs

'attached is the systems wish list for the gas basis and physical trading' -> useless because we don't have access to the attachment
'how is your racing going? what category are you up to? i' -> generated ['How is your racing going?', 'What category are you up to in racing?'] : bad questions (it is useless to recall this)

## Batching Calls

The goal is to iterate much faster to identify the useless to recall emails.

### Create the task function

In [74]:
import traceback

async def try_the_prompt(i):
    email = emails.iloc[i]
    prompt = prompt_template.render(
        email_content=email.content
    )
    completion = await cached_chat_completion_parsed_with_retry(
        model=MODEL,
        messages=[user(prompt)],
        response_format=GeneratedQuestions
    )
    return completion.choices[0].message.parsed, email.content

async def maybe_try_the_prompt(i):
    try:
        return await try_the_prompt(i)
    except Exception as e:
        return (i, e, traceback.format_exc())

In [75]:
tasks = [maybe_try_the_prompt(i) for i in range(10)]

results = await asyncio.gather(*tasks)

In [76]:
failed = [r for r in results if isinstance(r[0], int)]
f"{len(failed)=}, {len(results)=}"

'len(failed)=0, len(results)=10'

In [77]:
from pprint import pprint

In [78]:
pprint([(r[1], r[0].model_dump()) for r in results])

[('jeff, i want to bid $2.8 for sagewood with a rate 8.5% or less and '
  'dependent on 30 year term',
  {'questions': ['What is the bid amount for Sagewood?',
                 'What rate is being requested for Sagewood?',
                 'What is the term dependent on for the Sagewood bid?',
                 'What is the maximum rate mentioned for the Sagewood bid?',
                 'Who is the recipient of the email?',
                 'What asset is being referenced in the bid?'],
   'useless_to_recall': False}),
 ('attached is the systems wish list for the gas basis and physical trading',
  {'questions': [], 'useless_to_recall': True}),
 ('how is your racing going? what category are you up to? i',
  {'questions': [], 'useless_to_recall': True}),
 ('eol report for tv in conference on 33 cash -hehub -chicago -pepl -katy '
  '-waha prompt month nymex',
  {'questions': [], 'useless_to_recall': True}),
 ("mary, it is ok to buy a carpet shampooer. about the w-2's, how would you",
  {'q

## Useless to recall:

'eol report for tv in conference on 33 cash -hehub -chicago -pepl -katy '
  '-waha prompt month nymex' --> not any professional informations

mary, it is ok to buy a carpet shampooer. about the w-2's, how would you --> Not any professional informations
"'what did mary write? stage misses you? i sent 2 emails. maybe mary is stalking gary" --> There are not any professional informations.

# Run against the full dataset

In [87]:
import asyncio
from tqdm.notebook import tqdm
from tqdm.asyncio import tqdm_asyncio

In [88]:
#tasks = [maybe_try_the_prompt(i) for i in range(len(emails))]
#results = await tqdm_asyncio.gather(*tasks)


semaphore = asyncio.Semaphore(5)  # max 5 requêtes simultanées

async def limited_call(i):
    async with semaphore:
        return await maybe_try_the_prompt(i)

tasks = [limited_call(i) for i in range(len(emails))]
results = await tqdm_asyncio.gather(*tasks)

100%|██████████| 396/396 [00:00<00:00, 962.87it/s]


In [89]:
failed = [r for r in results if isinstance(r[0], int)]
f"{len(failed)=}, {len(results)=}"

'len(failed)=0, len(results)=396'

In [91]:
results[0]

(GeneratedQuestions(useless_to_recall=False, questions=['What is the bid amount for Sagewood?', 'What rate is being requested for Sagewood?', 'What is the term dependent on for the Sagewood bid?', 'What is the maximum rate mentioned for the Sagewood bid?', 'Who is the recipient of the email?', 'What asset is being referenced in the bid?']),
 'jeff, i want to bid $2.8 for sagewood with a rate 8.5% or less and dependent on 30 year term')

### Join the questions with email and save

In [99]:
col_useless_to_recall = []
col_questions = []

for (generated_questions, _) in results:
    col_useless_to_recall.append(generated_questions.useless_to_recall)
    col_questions.append(generated_questions.questions)

In [100]:
len(col_questions)

396

In [101]:
len(col_useless_to_recall)

396

In [102]:
len(emails)

396

In [103]:
emails['useless_to_recall'] = col_useless_to_recall

In [104]:
emails['questions'] = col_questions

In [107]:
emails.to_csv('paul_allen_sent_email_with_questions_v1.csv', index=False)