In [1]:
import pandas as pd

In [2]:
# Alternative input (currently disabled):
# emails = pd.read_csv("data/paul_allen_sent_emails.csv")

# Active input: practice with my own data instead.
emails = pd.read_csv(filepath_or_buffer="data/satoshi.csv")

In [3]:
# Preview the first five rows to check the columns that were loaded.
emails.head(n=5)

Unnamed: 0,content,chunk_index,collection,content_type,filename,sequence_number,source,source_file,title,total_chunks
0,"Emails\nBitcoin P2P e-cash paperOct 31, 2008, ...",0,emails,email,email_002_Cryptography_Mailing_List,2,nakamotoinstitute_files\emails\email_002_Crypt...,nakamotoinstitute_files\emails\email_002_Crypt...,Cryptography Mailing List,1
1,Emails\n[p2p-research] Bitcoin open source imp...,0,emails,email,email_004_P2P_Research_List,4,nakamotoinstitute_files\emails\email_004_P2P_R...,nakamotoinstitute_files\emails\email_004_P2P_R...,P2P Research List,1
2,From: | Satoshi Nakamoto | Subject: | Bitcoin ...,0,emails,email,email_010_Bitcoin_P2P_e-cash_paper,10,nakamotoinstitute_files\emails\email_010_Bitco...,nakamotoinstitute_files\emails\email_010_Bitco...,Bitcoin P2P e-cash paper,1
3,From: | Satoshi Nakamoto | Subject: | [bitcoin...,0,emails,email,email_022_[bitcoin-list]_Welcome,22,nakamotoinstitute_files\emails\email_022_[bitc...,nakamotoinstitute_files\emails\email_022_[bitc...,[bitcoin-list] Welcome,1
4,From: | Satoshi Nakamoto | Subject: | [bitcoin...,0,emails,email,email_024_[bitcoin-list]_Bitcoin_v0.1.2_now_av...,24,nakamotoinstitute_files\emails\email_024_[bitc...,nakamotoinstitute_files\emails\email_024_[bitc...,[bitcoin-list] Bitcoin v0.1.2 now available,1


## Prompt Engineering

In [4]:
from jinja2 import Template
from textwrap import dedent
from pydantic import BaseModel

In [5]:
# prompt_template = Template(dedent(
#     """
#     You are Paul Allen, also known as Phillip Allen, and you sent this email:
    
#     <email content>
#     {{ email }}
#     </email content>

#     Your task is to generate questions that Paul Allen could ask about the email at a later date when trying to remember it.

#     Guidelines:
#     - Only ask questions that can be answered with the content of the email. Do not speculate, imagine responses, or make assumptions.

#     Your output should be in JSON format, using the following schema:
#     ```
#     {
#         "questions": ["string 1", "string 2", ...]
#     }
#     ```
#     """
# ))


# Active prompt. It asks the model to first judge whether a document is worth
# indexing ('useless_to_recall') and, if so, to write 3-7 retrieval-style
# questions about it. The only template variable is `email`; the JSON layout
# spelled out at the end of the prompt has the same two fields as the
# GeneratedQuestions model. dedent() removes the common four-space indentation
# of the literal, so the rendered prompt starts at column 0.
prompt_template = Template(dedent(
"""
    The following is a quote, forum post, or email from Satoshi Nakamoto, the creator of Bitcoin. Use it as the sole basis for generating questions.
    
    <document content>
    {{ email }}
    </document content>

    First, evaluate if the document is useful for a retrieval system (RAG) where users ask questions about Satoshi Nakamoto, his ideas, or Bitcoin's development. A document is 'useless_to_recall' if it contains no meaningful information, such as: only dates, links, HTML tags, metadata; single-word or trivial replies (e.g., 'Yes', 'Thanks'); empty content; or nothing specific/tied to Satoshi's thoughts, opinions, or technical details. If the document has any substantive content (e.g., explanations, opinions, mentions of concepts), it is useful.

    Your task is to:
    - If the document is useless, set 'useless_to_recall' to true and generate an empty list for 'questions'.
    - If the document is useful, set 'useless_to_recall' to false and generate 3-7 diverse questions that a curious user might naturally ask about Satoshi Nakamoto, his ideas, or Bitcoin's development. These questions should be phrased such that they could be used to retrieve this specific document in a search system (e.g., via semantic matching), and the document's content would directly provide the answer or key evidence.

    Guidelines for questions (only apply if useful):
    - Questions must be standalone and natural, as if asked by someone researching Satoshi without any prior context or reference to 'the document,' 'this quote,' or 'the email.' For example, avoid: 'What is discussed in the document?' Instead, use: 'What did Satoshi Nakamoto say about peer-to-peer networks?'
    - Only base questions on explicit information in the document. Do not speculate, add external knowledge, or assume additional details about Satoshi or Bitcoin.
    - Ensure questions are specific to unique aspects of the document (e.g., opinions, mentions, technical details, timelines) to test precise retrieval, but keep them broad enough to sound like real user queries.
    - Avoid vague, overly broad, or clarifying questions (e.g., no 'Can you explain more about...?'). Focus on factual, opinion-based, or existence questions like 'What was Satoshi's view on...?' or 'Did Satoshi mention...?'
    - Promote diversity: Include a mix of question types (e.g., 2-3 what/why/how, 1-2 yes/no, 1-2 timeline-based) and cover different parts of the document without repetition.
    - Do not generate questions that could be answered by many documents; tie them subtly to this one's content for evaluation purposes.

    Your output should be in JSON format, using the following schema:
    ```
    {
        "useless_to_recall": boolean,
        "questions": ["string 1", "string 2", ...]
    }
    ```
    """
))

In [6]:
# Render the template with placeholder text to eyeball the final prompt string.
prompt_template.render({"email": "blabla"})

'\nThe following is a quote, forum post, or email from Satoshi Nakamoto, the creator of Bitcoin. Use it as the sole basis for generating questions.\n\n<document content>\nblabla\n</document content>\n\nFirst, evaluate if the document is useful for a retrieval system (RAG) where users ask questions about Satoshi Nakamoto, his ideas, or Bitcoin\'s development. A document is \'useless_to_recall\' if it contains no meaningful information, such as: only dates, links, HTML tags, metadata; single-word or trivial replies (e.g., \'Yes\', \'Thanks\'); empty content; or nothing specific/tied to Satoshi\'s thoughts, opinions, or technical details. If the document has any substantive content (e.g., explanations, opinions, mentions of concepts), it is useful.\n\nYour task is to:\n- If the document is useless, set \'useless_to_recall\' to true and generate an empty list for \'questions\'.\n- If the document is useful, set \'useless_to_recall\' to false and generate 3-7 diverse questions that a curiou

In [7]:
# Structured-output schema for one document. It is passed as `response_format`
# to get_completion and mirrors the JSON layout described in the prompt.
# NOTE(review): documented with comments rather than a class docstring because
# pydantic appears to copy a model docstring into the generated JSON schema
# description, which would change what is sent to the model — confirm before
# converting these comments into a docstring.
class GeneratedQuestions(BaseModel):
    # True when the model judged the document to hold nothing worth retrieving.
    useless_to_recall: bool
    # Generated questions; the prompt asks for an empty list when the flag is true.
    questions: list[str]

# LLM Setup
This section sets up tracing, async calls, caching, etc.

In [8]:
from dotenv import load_dotenv
load_dotenv()

from langfuse.openai import AsyncOpenAI

from diskcache_decorator import cached

In [9]:
def _msg(role, content):
    """Build one chat message dict with the keys ``role`` and ``content``."""
    message = dict(role=role, content=content)
    return message


def system(content):
    """Wrap ``content`` as a message with role ``"system"``."""
    return _msg(role="system", content=content)


def user(content):
    """Wrap ``content`` as a message with role ``"user"``."""
    return _msg(role="user", content=content)


def assistant(content):
    """Wrap ``content`` as a message with role ``"assistant"``."""
    return _msg(role="assistant", content=content)

In [10]:
# OpenAI-compatible async client pointed at a local Ollama server.
client = AsyncOpenAI(
    api_key="ollama",  # Dummy key, required but not used by Ollama
    base_url="http://localhost:11434/v1",  # Ollama's default local endpoint
)


In [11]:
@cached()
async def get_completion(messages, response_format):
    """Request a structured chat completion and return the parsed result.

    Sends ``messages`` to the ``llama3.2:3b`` model through
    ``client.chat.completions.parse`` with ``response_format`` as the target
    schema, then returns the ``parsed`` attribute of the first choice's message.

    Args:
        messages: List of chat message dicts (see ``system`` / ``user``).
        response_format: Schema class the response is parsed into.

    Returns:
        The parsed object taken from the first choice.
    """
    # NOTE(review): `cached` comes from diskcache_decorator; presumably results
    # are memoised per argument set — confirm how the cache key is derived.
    response = await client.chat.completions.parse(
        response_format=response_format,
        messages=messages,
        model="llama3.2:3b",
    )
    first_choice = response.choices[0]
    return first_choice.message.parsed


# sanity check:

# Small schema used by the sanity check in this cell to confirm that
# structured output parsing works end to end.
# NOTE(review): comments are used instead of a class docstring because pydantic
# appears to copy a model docstring into the JSON schema description — confirm.
class CalendarEvent(BaseModel):
    # Name of the event.
    name: str
    # Event date as free text; no date type is enforced.
    date: str
    # Names of the people taking part.
    participants: list[str]

messages = [
            system("Extract the event information."),
            user("Alice and Bob are going to a science fair on Friday.")
        ]

parsed = await get_completion(messages, CalendarEvent)

print(parsed)

name='Science Fair' date='Friday' participants=['Alice', 'Bob']


# Iterate on the Prompt

In [12]:
# Row 173 is the document used while iterating on the prompt wording.
email = emails.iloc[173]["content"]
print(email)

# Fill the template with that document; the next cell sends `prompt` to the model.
prompt = prompt_template.render({"email": email})

Quote from: bdonlan on July 15, 2010, 11:27:14 PMin 120DPI mode.What is "120DPI mode"?Â  Is that an actual setting somewhere?Â  Sounds like an obscure enough candidate.Â  I suppose it needs twice the resolution icon to fill the size of the upper left corner icon.Â  Only one size is provided.
Re: "SetIcons(): icon bundle doesn't contain any suitable icon"
July 15, 2010 at 23:41:23 UTC
BitcoinTalk


In [13]:
completion = await get_completion(
    messages=[user(prompt)],
    response_format=GeneratedQuestions
)

print(completion.useless_to_recall)

completion.questions

False


['What is the purpose of setting a specific DPI mode for icons in Bitcoin software?',
 'Did bdonlan mention a particular reason why he was unable to find suitable icons for the Bitcoin bundle?',
 'In what year did Satoshi Nakamoto first post about using icons in Bitcoin.',
 "What does '120DPI mode' refer to specifically?",
 'Is there any direct relationship between DPI mode and the development of Bitcoin?',
 'Can bitcon software handle different DPI modes?',
 'Was bdonlan able to resolve his issue with setting icon bundles?']

In [14]:
# Look at the raw text of another document (row 8) for comparison.
emails.iloc[8]["content"]

'From: | Satoshi Nakamoto | Subject: | [bitcoin-list] Bitcoin 0.2 released | Date: | December 17, 2009 at 06:52:09 UTC\nBitcoin 0.2 is here!\nDownload (Windows, and now Linux version available)\nhttp://sourceforge.net/projects/bitcoin/files/\nNew Features\nMartti Malmi\n- Minimize to system tray option\n- Autostart on boot option so you can keep it running in the\nbackground automatically\n- New options dialog layout for future expansion\n- Setup program for Windows\n- Linux version (tested on Ubuntu)\nSatoshi Nakamoto\n- Multi-processor support for coin generation\n- Proxy support for use with TOR\n- Fixed some slowdowns in the initial block download\nWe also have a new forum at http://www.bitcoin.org/smf/\nMany thanks to Martti (sirius-m) for all his development work, and to\nNew Liberty Standard for his help with testing the Linux version.\nSatoshi Nakamoto\n[bitcoin-list] Bitcoin 0.2 released\nbitcoin-list'

## Batching Calls

With the code above, we would have to manually change the document index and re-run the cells over and over...

In [15]:
import traceback
import asyncio

async def try_the_prompt(i):
    """Generate questions for the document at row position ``i`` of ``emails``.

    Renders the prompt with that row's ``content`` and sends it as a single
    user message through ``get_completion``.

    Returns:
        A tuple ``(useless_to_recall, questions, document_text)``.
    """
    document = emails.iloc[i].content
    rendered = prompt_template.render(email=document)
    generated = await get_completion(
        messages=[user(rendered)],
        response_format=GeneratedQuestions,
    )
    return generated.useless_to_recall, generated.questions, document


async def maybe_try_the_prompt(i):
    """Run ``try_the_prompt(i)`` and return any error instead of raising it.

    Returns:
        The value of ``try_the_prompt(i)`` on success. If an ``Exception`` is
        raised, the tuple ``(i, exception, formatted_traceback)`` instead, so
        that one bad document does not abort a whole batch.
    """
    try:
        outcome = await try_the_prompt(i)
    except Exception as error:
        # The traceback is formatted here, while the exception is being handled.
        outcome = (i, error, traceback.format_exc())
    return outcome

In [19]:
# Limit concurrency to avoid overwhelming file descriptors (Windows limit ~512)
SEM_LIMIT = 20  # Adjust based on your system; start low and test

async def bounded_try(i, sem):
    async with sem:
        return await maybe_try_the_prompt(i)

sem = asyncio.Semaphore(SEM_LIMIT)
tasks = [bounded_try(i, sem) for i in range(len(emails))]

results = await asyncio.gather(*tasks)

failed = [r for r in results if isinstance(r, tuple) and len(r) == 3 and isinstance(r[1], Exception)]
print(f"{len(failed)=}")

1 validation error for GeneratedQuestions
  Invalid JSON: EOF while parsing a string at line 9 column 8 [type=json_invalid, input_value='{\n    "useless_to_recal...n 0.3.17?",\n\n    "]}}', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid
1 validation error for GeneratedQuestions
  Invalid JSON: EOF while parsing a string at line 11 column 7 [type=json_invalid, input_value='{\n\n    "useless_to_rec...put style?",\n\n    "]}', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid


len(failed)=2


## Create Final Data Set

In [None]:
# `results` holds one 3-tuple per row, in row order. A successful call yields
# (useless_to_recall, questions, email); a failed call yields
# (row index, exception, traceback). Reading r[0] / r[1] from a failed tuple
# would store the row index as the flag and the exception object as the
# questions, so failed rows are detected and written as missing values.
def _call_failed(result):
    """Return True when ``result`` is a failure tuple (second item is an exception)."""
    return isinstance(result[1], Exception)


# None marks rows whose generation failed, so they can be retried or dropped
# later without being mistaken for real model output.
emails['useless_to_recall'] = [None if _call_failed(r) else r[0] for r in results]
emails['questions'] = [None if _call_failed(r) else r[1] for r in results]
emails.to_csv("data/satoshi_with_questions.csv", index=False)

