In [30]:
import pandas as pd
from openai import OpenAI
from datetime import datetime
from pathlib import Path
from typing import List
import re
from dotenv import load_dotenv
import os
from pydantic import BaseModel
import json

## Helper Functions

In [2]:
def save_df_to_csv(samples_df: pd.DataFrame, name: str = 'samples', directory: str = './data/') -> None:
    """Write ``samples_df`` to ``<directory>/<name>_<YYYYmmddHHMMSS>.csv``.

    The timestamp comes from the local clock at call time, so each save gets
    its own file unless two saves with the same name happen within one second.

    Args:
        samples_df: Frame to persist. Its index is written as the first column.
        name: File-name prefix placed before the timestamp.
        directory: Target folder, with or without a trailing separator. ``~``
            is expanded and the folder is created if it does not exist yet.
    """
    timestamp_str = datetime.now().strftime("%Y%m%d%H%M%S")
    # Build the path with pathlib rather than string concatenation so that a
    # directory passed without a trailing "/" does not fuse with the file name.
    target_dir = Path(directory).expanduser()
    target_dir.mkdir(parents=True, exist_ok=True)
    samples_df.to_csv(target_dir / f"{name}_{timestamp_str}.csv")

In [3]:
def list_files(directory: str) -> List[str]:
    """List the names of the regular files directly inside ``directory``.

    Sub-directories are skipped and ``~`` in ``directory`` is expanded. Only
    bare file names are returned, in whatever order the OS yields them.
    """
    root = Path(directory).expanduser()
    file_names: List[str] = []
    for entry in root.iterdir():
        if not entry.is_file():
            continue
        file_names.append(entry.name)
    return file_names

In [4]:
def get_last_saved_timestamp(directory: str = './data/') -> str:
    """Return the newest 14-digit timestamp found among CSV file names in ``directory``.

    A file counts when its name ends in ``<14 digits>.csv``; whatever precedes
    the digits is ignored, so files saved under different name prefixes all
    take part in the comparison.

    Args:
        directory: Folder to scan (not recursive).

    Returns:
        The largest timestamp, as a string of digits.

    Raises:
        ValueError: If no file in ``directory`` carries such a timestamp.
    """
    timestamp_pattern = re.compile(r'.*?(\d{14})\.csv$')
    timestamps = []
    for fn in list_files(directory):
        m = timestamp_pattern.search(fn)
        if m is not None:
            timestamps.append(int(m.group(1)))

    # Fail with an actionable message instead of the bare
    # "max() arg is an empty sequence" that max([]) would produce.
    if not timestamps:
        raise ValueError(
            f"no timestamped CSV files (ending in <YYYYmmddHHMMSS>.csv) found in {directory!r}"
        )
    return str(max(timestamps))

# get_last_saved_timestamp()

In [5]:
def load_df(timestamp: str, name: str = 'samples', directory: str = './data/') -> pd.DataFrame:
    """Load ``<directory>/<name>_<timestamp>.csv`` into a DataFrame.

    Args:
        timestamp: The timestamp part of the file name (as written by
            ``save_df_to_csv``, 14 digits).
        name: File-name prefix placed before the timestamp.
        directory: Folder holding the file, with or without a trailing
            separator. ``~`` is expanded.

    Returns:
        The frame read from the CSV, using its first column as the index.
    """
    # pathlib join instead of string concatenation: a directory passed without
    # a trailing "/" must not fuse with the file name.
    csv_path = Path(directory).expanduser() / f"{name}_{timestamp}.csv"
    return pd.read_csv(csv_path, index_col=0)

In [6]:
def load_latest_df(name: str = 'samples', directory: str = './data/') -> pd.DataFrame:
    """Load the most recently saved ``<name>_<timestamp>.csv`` from ``directory``.

    The newest timestamp is looked up among files saved under ``name`` only.
    Picking the newest timestamp across *all* CSVs (as a plain call to
    ``get_last_saved_timestamp`` does) fails whenever a file with a different
    prefix was saved more recently.

    Args:
        name: File-name prefix to look for.
        directory: Folder to scan.

    Returns:
        The frame read from the newest matching file.

    Raises:
        ValueError: If ``directory`` holds no timestamped CSV at all.
        FileNotFoundError: If timestamped CSVs exist but none for ``name``.
    """
    own_pattern = re.compile(rf'{re.escape(name)}_(\d{{14}})\.csv')
    candidate_matches = (own_pattern.fullmatch(fn) for fn in list_files(directory))
    own_timestamps = [m.group(1) for m in candidate_matches if m is not None]

    if own_timestamps:
        # All candidates are 14-digit strings, so the lexicographic maximum
        # is also the chronological maximum.
        timestamp = max(own_timestamps)
    else:
        # Nothing saved under this name: defer to the directory-wide lookup so
        # callers see the same exceptions as before this change.
        timestamp = get_last_saved_timestamp(directory=directory)
    return load_df(timestamp=timestamp, name=name, directory=directory)

## Generate and Score Synthetic Samples

In [51]:
# Start from an empty frame that only fixes the column layout: five profile
# fields for the person, then the snippet text, its source, and the label.
samples_df = pd.DataFrame(
    columns=[
        'First Name',
        'Last Name',
        'City',
        'State',
        'Birth Year',
        'Text Snippet',
        'Source',
        'Is Match',
    ]
)

In [56]:
# Read variables from a local .env file into the process environment.
load_dotenv()

# IDs of the two stored prompts used below: one for generating synthetic
# people/snippets, one for scoring how likely a snippet matches a person.
# Indexing os.environ raises KeyError when a variable is missing, so a
# misconfigured environment fails here rather than at the first API call.
PROMPT_ID_SYNTH_GEN = os.environ['PROMPT_ID_SYNTH_GEN']
PROMPT_ID_SCORER = os.environ['PROMPT_ID_SCORER']

# Create the OpenAI client shared by every API call in this notebook.
# NOTE(review): no credentials are passed explicitly; presumably the SDK picks
# up OPENAI_API_KEY from the environment populated above — confirm.
client = OpenAI()

In [25]:
# One bio-like text snippet about a person, together with its ground-truth label.
# NOTE(review): documented with comments rather than a class docstring because
# pydantic may copy docstrings into the JSON schema passed via `text_format`,
# which would change what the model sees — confirm before converting.
class BioSnippet(BaseModel):
    text_snippet: str  # the snippet text itself (e.g. a whitepages listing, an instagram bio, a reddit post)
    source: str  # where the snippet is said to come from (e.g. whitepages, reddit, instagram)
    is_match: bool  # True if the snippet really refers to the person, False for a close-but-wrong match

In [26]:
# A fictional person's profile plus the labeled snippets generated for them.
# The five profile fields map onto the 'First Name' … 'Birth Year' columns of
# samples_df; each entry of `snippets` becomes one row.
class UserWithLabeledSnippets(BaseModel):
    first_name: str
    last_name: str
    city: str
    state: str
    birth_year: int
    snippets: list[BioSnippet]  # mix of true matches and near-miss non-matches for this person

In [41]:
# Top-level wrapper used as the structured-output format for the generation
# call: the response is a single object whose "users" key holds every person.
class ListOfUsersWithLabeledSnippets(BaseModel):
    users: list[UserWithLabeledSnippets]

In [42]:
# Ask the stored synthetic-data prompt for a batch of fictional people, each
# with labeled snippets. `text_format` constrains the reply to the
# ListOfUsersWithLabeledSnippets structure so it can be parsed as JSON below.
resp = client.responses.parse(
    prompt=dict(id=PROMPT_ID_SYNTH_GEN),
    input=[
        dict(
            role="user",
            content="""
                Generate some example fictional people. 
                - Each person should have 5 to 10 snippets. 
                - Each snippet should represent a brief bio of the fictional person found via googling. e.g. a whitepages.com listing, an instagram bio, a reddit post, etc.
                - Each snippet should either actually match the person, and hence have a label of is_match=True, or it should be a close, but not actual, match, and have is_match=False.
                - Each snippet should have a source (e.g. whitepages, reddit, instagram)
            """,
        ),
    ],
    text_format=ListOfUsersWithLabeledSnippets,
)

In [50]:
# The structured reply arrives as JSON text; decode it and keep the list stored
# under "users" (one dict per person, each carrying its own "snippets" list).
users_list = json.loads(resp.output_text)['users']

In [53]:
# Flatten the nested users → snippets structure into one record per snippet,
# repeating the person's profile fields on every row.
new_rows = [
    {
        "First Name": user['first_name'],
        "Last Name": user['last_name'],
        "City": user['city'],
        "State": user['state'],
        "Birth Year": user['birth_year'],
        "Text Snippet": snippet['text_snippet'],
        "Source": snippet['source'],
        "Is Match": snippet['is_match'],
    }
    for user in users_list
    for snippet in user['snippets']
]

# Append everything with a single concat. Concatenating inside the loop copies
# the whole frame once per snippet (quadratic in the number of rows).
# NOTE: this still appends to the existing samples_df, so re-running the cell
# adds the same rows again.
if new_rows:
    samples_df = pd.concat([samples_df, pd.DataFrame(new_rows)], ignore_index=True)

In [58]:
# Preview the first ten flattened rows as a sanity check on the generated data.
samples_df.head(10)

Unnamed: 0,First Name,Last Name,City,State,Birth Year,Text Snippet,Source,Is Match
0,Emily,Carter,Portland,OR,1990,Emily Carter is a UX designer based in Portlan...,linkedin,True
1,Emily,Carter,Portland,OR,1990,@emily.draws | UX designer @CloudHarbor | PDX ...,instagram,True
2,Emily,Carter,Portland,OR,1990,"Emily J Carter, age 34–35, current address nea...",whitepages,True
3,Emily,Carter,Portland,OR,1990,"Speaker bio: Emily Carter is a Portland, OR-ba...",conference_website,True
4,Emily,Carter,Portland,OR,1990,Medium profile: Emily Carter writes about desi...,medium,True
5,Emily,Carter,Portland,OR,1990,Meetup organizer profile: Emily C. hosts the m...,meetup,True
6,Emily,Carter,Portland,OR,1990,"Emily Anne Carter, freelance photographer in P...",personal_website,False
7,Emily,Carter,Portland,OR,1990,"Reddit u/emily_carter90: ""Any other Carters in...",reddit,False
8,Jamal,Thompson,Atlanta,GA,1985,Jamal D. Thompson is an Atlanta-based high sch...,school_website,True
9,Jamal,Thompson,Atlanta,GA,1985,@CoachJT404 | Math teacher • Varsity boys’ bas...,twitter,True


In [55]:
# Persist the generated samples under a fresh timestamped file name so this
# batch can be reloaded later instead of calling the API again.
save_df_to_csv(samples_df)

In [64]:
# Structured-output format for the scoring call: a single number.
class MatchScoreResponse(BaseModel):
    match_prob: float  # requested as a probability in [0, 1]; the type alone does not enforce that range

In [96]:
def get_match_prob(row, prompt_id, current_year: int = 2025):
    """Ask the scoring prompt how likely a snippet refers to the given person.

    Builds a user message from the person's profile and the snippet, sends it
    to the stored prompt ``prompt_id`` through the module-level ``client``, and
    reads ``match_prob`` from the JSON reply.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'First Name', 'Last Name', 'City', 'State', 'Birth Year',
            'Text Snippet' and 'Source'.
        prompt_id: ID of the stored scoring prompt.
        current_year: Year stated to the model as "the current year", which
            lets it relate birth years to ages mentioned in a snippet.
            Defaults to 2025, the value previously hard-coded in the message.

    Returns:
        The model's answer as a float. The message asks for a value between
        0 and 1, but the range is not checked here.
    """
    first_name = row['First Name']
    last_name = row['Last Name']
    city = row['City']
    state = row['State']
    birth_year = row['Birth Year']
    snippet = row['Text Snippet']
    source = row['Source']

    # The message text (including its indentation) is sent to the model as-is.
    input_message = f"""
        Here is the profile of a person. (The current year is {current_year}). 
        - First Name: {first_name}
        - Last Name: {last_name}
        - City: {city}
        - State: {state}
        - Birth Year: {birth_year}
        Here is a snippet found online via googling that person: "{snippet}"

        It is from {source}.

        Please output the probability (a number from 0 to 1 rounded to two decimal places) that the snippet is 
        referring to that specific person. 1 means you are absolutely sure that it is a match, and 0 means that you
        are absolutely sure it is not.
    """

    resp = client.responses.parse(
        prompt={"id": prompt_id},
        input=[
            {
                "role": "user",
                "content": input_message
            },
        ],
        text_format=MatchScoreResponse,
    )

    return float(json.loads(resp.output_text)['match_prob'])

In [103]:
def get_all_match_probs(df, prompt_id, verbose: bool = True):
    """Score every row of ``df`` and store the results in a 'Match Prob' column.

    Rows are scored one at a time with ``get_match_prob``. ``df`` is modified
    in place and nothing is returned.

    Args:
        df: Frame whose rows carry the fields ``get_match_prob`` reads.
        prompt_id: ID of the stored scoring prompt, passed through unchanged.
        verbose: When True (the default, matching the previous behavior),
            print each row before it is scored and its probability afterwards.
            Pass False to keep the cell output quiet on larger frames.
    """
    match_probs_list = []
    for _, row in df.iterrows():
        if verbose:
            # Printed before the call so a slow or failing request can be
            # traced to the row that triggered it.
            print(row)
        match_prob = get_match_prob(row, prompt_id)
        if verbose:
            print(f"Match Prob: {match_prob}\n")
        match_probs_list.append(match_prob)

    # Explicit float dtype so an empty frame still gets a numeric column
    # instead of an object-typed one.
    df['Match Prob'] = pd.Series(match_probs_list, index=df.index, dtype=float)

In [104]:
# Smoke-test the scorer on the first two rows only. The .copy() keeps the new
# 'Match Prob' column out of samples_df itself.
samples_df_subset = samples_df.iloc[:2].copy()
display(samples_df_subset)  # state before scoring
get_all_match_probs(samples_df_subset, PROMPT_ID_SCORER)  # adds 'Match Prob' in place
samples_df_subset  # state after scoring

Unnamed: 0,First Name,Last Name,City,State,Birth Year,Text Snippet,Source,Is Match
0,Emily,Carter,Portland,OR,1990,Emily Carter is a UX designer based in Portlan...,linkedin,True
1,Emily,Carter,Portland,OR,1990,@emily.draws | UX designer @CloudHarbor | PDX ...,instagram,True


First Name                                                  Emily
Last Name                                                  Carter
City                                                     Portland
State                                                          OR
Birth Year                                                   1990
Text Snippet    Emily Carter is a UX designer based in Portlan...
Source                                                   linkedin
Is Match                                                     True
Name: 0, dtype: object
Match Prob: 0.68

First Name                                                  Emily
Last Name                                                  Carter
City                                                     Portland
State                                                          OR
Birth Year                                                   1990
Text Snippet    @emily.draws | UX designer @CloudHarbor | PDX ...
Source                             

Unnamed: 0,First Name,Last Name,City,State,Birth Year,Text Snippet,Source,Is Match,Match Prob
0,Emily,Carter,Portland,OR,1990,Emily Carter is a UX designer based in Portlan...,linkedin,True,0.68
1,Emily,Carter,Portland,OR,1990,@emily.draws | UX designer @CloudHarbor | PDX ...,instagram,True,0.22
