In [30]:
import json
import os
import re
from datetime import datetime
from pathlib import Path
from typing import List, Optional

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel

## Helper Functions

In [2]:
def save_df_to_csv(samples_df: pd.DataFrame, name: str = 'samples', directory: str = './data/') -> Path:
    """Write ``samples_df`` to ``<directory>/<name>_<YYYYmmddHHMMSS>.csv``.

    Args:
        samples_df: Frame to persist; its index is written as the first CSV column.
        name: File-name prefix placed before the timestamp.
        directory: Target directory. A trailing slash is optional, ``~`` is
            expanded, and the directory is created if it does not exist yet.

    Returns:
        Path of the CSV file that was written.
    """
    # Build the path with pathlib instead of string concatenation so that
    # "./data" and "./data/" both resolve to a file *inside* the directory.
    target_dir = Path(directory).expanduser()
    target_dir.mkdir(parents=True, exist_ok=True)

    timestamp_str = datetime.now().strftime("%Y%m%d%H%M%S")
    csv_path = target_dir / f"{name}_{timestamp_str}.csv"
    samples_df.to_csv(csv_path)
    return csv_path

In [3]:
def list_files(directory: str) -> List[str]:
    """List the names of the regular files directly inside ``directory``.

    ``~`` in ``directory`` is expanded. Sub-directories are left out and the
    listing is not recursive.
    """
    root = Path(directory).expanduser()
    file_names: List[str] = []
    for entry in root.iterdir():
        if not entry.is_file():
            continue
        file_names.append(entry.name)
    return file_names

In [4]:
def get_last_saved_timestamp(directory: str = './data/', name: Optional[str] = None) -> str:
    """Return the newest ``YYYYmmddHHMMSS`` stamp among the CSV files in ``directory``.

    Args:
        directory: Directory to scan (passed to ``list_files``).
        name: When given, only files named exactly ``<name>_<timestamp>.csv``
            are considered. When ``None`` (the default), any file whose name
            ends in 14 digits followed by ``.csv`` counts.

    Returns:
        The 14-digit timestamp as a string, exactly as it appears in the file name.

    Raises:
        ValueError: If no matching file exists in ``directory``.
    """
    if name is None:
        pattern = re.compile(r'.*?(\d{14})\.csv$')
    else:
        pattern = re.compile(rf'^{re.escape(name)}_(\d{{14}})\.csv$')

    matches = (pattern.search(fn) for fn in list_files(directory))
    # Keep the stamps as strings: fixed-width digit strings sort chronologically,
    # and an int round-trip would strip leading zeros so the returned value would
    # no longer match the file name.
    timestamps = [m.group(1) for m in matches if m is not None]

    if not timestamps:
        wanted = "timestamped CSV files" if name is None else f"'{name}_<timestamp>.csv' files"
        raise ValueError(f"No {wanted} found in {directory!r}")
    return max(timestamps)

# get_last_saved_timestamp()

In [5]:
def load_df(timestamp: str, name: str = 'samples', directory: str = './data/') -> pd.DataFrame:
    """Load ``<directory>/<name>_<timestamp>.csv`` into a DataFrame.

    Args:
        timestamp: The ``YYYYmmddHHMMSS`` stamp embedded in the file name.
        name: File-name prefix placed before the timestamp.
        directory: Directory holding the file. A trailing slash is optional
            and ``~`` is expanded.

    Returns:
        The stored frame, with the first CSV column used as the index.
    """
    # pathlib join instead of string concatenation: "./data" and "./data/"
    # must both point at a file inside the directory.
    csv_path = Path(directory).expanduser() / f"{name}_{timestamp}.csv"
    return pd.read_csv(csv_path, index_col=0)

In [6]:
def load_latest_df(name: str = 'samples', directory: str = './data/') -> pd.DataFrame:
    """Load the most recently saved ``<name>_<timestamp>.csv`` from ``directory``.

    Only files whose prefix is exactly ``name`` are considered, so a newer
    file saved under a different name is never selected.

    Args:
        name: File-name prefix placed before the timestamp.
        directory: Directory to scan and load from.

    Returns:
        The frame read by ``load_df`` for the newest matching timestamp.

    Raises:
        ValueError: If ``directory`` contains no ``<name>_<timestamp>.csv`` file.
    """
    pattern = re.compile(rf'{re.escape(name)}_(\d{{14}})\.csv')
    candidates = (pattern.fullmatch(fn) for fn in list_files(directory))
    # Fixed-width digit strings compare chronologically, so max() on the raw
    # strings yields the newest stamp without altering its text.
    timestamps = [m.group(1) for m in candidates if m is not None]

    if not timestamps:
        raise ValueError(f"No '{name}_<timestamp>.csv' files found in {directory!r}")
    return load_df(timestamp=max(timestamps), name=name, directory=directory)

## Generate Synthetic Labeled Samples

In [37]:
# Empty accumulator frame: one row per (person, snippet) pair.
samples_df = pd.DataFrame(
    columns=[
        # person-level fields (repeated on every row for that person)
        'First Name',
        'Last Name',
        'City',
        'State',
        'Birth Year',
        # snippet-level fields
        'Text Snippet',
        'Source',
        'Is Match',
    ]
)

In [9]:
# read in env vars from the .env file
# Must run first: the lookup below reads from the process environment.
load_dotenv()

# Prompt ID later passed as `prompt={"id": ...}` to `client.responses.parse`.
# Indexing `os.environ` raises KeyError right here if the variable is missing.
PROMPT_ID_SYNTH_GEN = os.environ['PROMPT_ID_SYNTH_GEN']

# connect and get an OpenAI client
# NOTE(review): presumably picks up its API key from the environment populated above — confirm
client = OpenAI()

In [25]:
# One labeled snippet about a person, as requested from the model.
# NOTE(review): documented with `#` comments on purpose; pydantic appears to copy a
# class docstring into the generated JSON schema, which would change what is sent
# as `text_format` — confirm before converting these to a docstring.
class BioSnippet(BaseModel):
    text_snippet: str  # brief bio text, e.g. a directory listing or a social-media bio
    source: str  # where the snippet would come from (e.g. whitepages, reddit, instagram)
    is_match: bool  # True if the snippet really describes the person, False for a near miss

In [26]:
# A generated person plus the labeled snippets that may or may not refer to them.
# Used as the `text_format` of the `client.responses.parse` request.
# NOTE(review): documented with `#` comments on purpose; pydantic appears to copy a
# class docstring into the generated JSON schema — confirm before adding one.
class UserWithLabeledSnippets(BaseModel):
    first_name: str
    last_name: str
    city: str
    state: str  # the sample output uses a two-letter code ("OR"); the format is not enforced here
    birth_year: int
    snippets: list[BioSnippet]  # the prompt asks for 3 per person; the length is not enforced here

In [27]:
# Ask the model for one fictional person with labeled snippets.
# The stored prompt referenced by PROMPT_ID_SYNTH_GEN is combined with the
# user message below; `text_format` names the pydantic model describing the
# expected output.
# NOTE(review): this is a live API call, so each run presumably returns a
# different person — confirm whether results should be cached.
resp = client.responses.parse(
    prompt={"id": PROMPT_ID_SYNTH_GEN},
    input=[
        {
            "role": "user",
            "content": """
                Generate an example person. 
                - The person should have 3 snippets. 
                - Each snippet should represent a brief bio of the fictional person found via googling. e.g. a whitepages.com listing, an instagram bio, a reddit post, etc.
                - Each snippet should either actually match the person, and hence have a label of is_match=True, or it should be a close, but not actual, match, and have is_match=False.
                - Each snippet should have a source (e.g. whitepages, reddit, instagram)
            """,
        },
    ],
    text_format=UserWithLabeledSnippets,  # output is expected to follow this schema
)

In [29]:
# Show the raw response text (a JSON document, parsed with json.loads in a later cell).
print(resp.output_text)

{"first_name":"Dana","last_name":"Whitfield","city":"Portland","state":"OR","birth_year":1990,"snippets":[{"text_snippet":"Dana Whitfield, age 34, lives in Portland, Oregon (97214). Ceramic artist and part‑time barista. Known for hand‑thrown minimalist tableware sold at local weekend markets. Previously studied fine arts at a community college in Seattle before relocating to Portland in 2016.","source":"whitepages-style directory","is_match":true},{"text_snippet":"Dana L. Whitfield, age 52, resides in Eugene, Oregon (97401). Works as a real estate broker and volunteers with the local Rotary Club. Associated names: Daniel Whitfield, Anne Whitfield. Prior addresses in Salem and Corvallis.","source":"whitepages","is_match":false},{"text_snippet":"u/danawhitfield90: \"Hey r/Portland! I’m Dana, a ceramic artist and barista living around the Hawthorne area. I run a tiny home studio where I make small-batch mugs and ramen bowls. I moved here from Seattle a few years ago and I’m always looking

In [36]:
# Decode the response text into a plain dict (keys follow UserWithLabeledSnippets).
# NOTE(review): the parsed response presumably also exposes an already-validated
# model instance (e.g. `resp.output_parsed`) — confirm and consider using it
# instead of re-parsing the text.
user_0 = json.loads(resp.output_text)
user_0

{'first_name': 'Dana',
 'last_name': 'Whitfield',
 'city': 'Portland',
 'state': 'OR',
 'birth_year': 1990,
 'snippets': [{'text_snippet': 'Dana Whitfield, age 34, lives in Portland, Oregon (97214). Ceramic artist and part‑time barista. Known for hand‑thrown minimalist tableware sold at local weekend markets. Previously studied fine arts at a community college in Seattle before relocating to Portland in 2016.',
   'source': 'whitepages-style directory',
   'is_match': True},
  {'text_snippet': 'Dana L. Whitfield, age 52, resides in Eugene, Oregon (97401). Works as a real estate broker and volunteers with the local Rotary Club. Associated names: Daniel Whitfield, Anne Whitfield. Prior addresses in Salem and Corvallis.',
   'source': 'whitepages',
   'is_match': False},
  {'text_snippet': 'u/danawhitfield90: "Hey r/Portland! I’m Dana, a ceramic artist and barista living around the Hawthorne area. I run a tiny home studio where I make small-batch mugs and ramen bowls. I moved here from Se

In [38]:
# Flatten the user into one record per snippet: the person-level fields are
# repeated on every row, the snippet-level fields vary.
snippet_rows = pd.DataFrame([
    {
        "First Name": user_0['first_name'],
        "Last Name": user_0['last_name'],
        "City": user_0['city'],
        "State": user_0['state'],
        "Birth Year": user_0['birth_year'],
        "Text Snippet": snippet['text_snippet'],
        "Source": snippet['source'],
        "Is Match": snippet['is_match'],
    }
    for snippet in user_0['snippets']
])

# Append all rows in a single step instead of re-copying the whole frame once
# per snippet. An empty accumulator is replaced rather than concatenated:
# pandas deprecates concat with empty frames (FutureWarning), and the record
# keys above are the accumulator's column names in the same order.
# NOTE: re-running this cell appends the same rows again.
if not snippet_rows.empty:
    samples_df = (
        snippet_rows
        if samples_df.empty
        else pd.concat([samples_df, snippet_rows], ignore_index=True)
    )

In [39]:
# Inspect the accumulated samples: one row per (person, snippet) pair.
samples_df

Unnamed: 0,First Name,Last Name,City,State,Birth Year,Text Snippet,Source,Is Match
0,Dana,Whitfield,Portland,OR,1990,"Dana Whitfield, age 34, lives in Portland, Ore...",whitepages-style directory,True
1,Dana,Whitfield,Portland,OR,1990,"Dana L. Whitfield, age 52, resides in Eugene, ...",whitepages,False
2,Dana,Whitfield,Portland,OR,1990,"u/danawhitfield90: ""Hey r/Portland! I’m Dana, ...",reddit,True
