In [None]:
import json

# Load the raw investor entries. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open('/Users/darshil/projects/freemoney/darshil_local/american_entries_max_100k.json') as infile:
    us_entries = json.load(infile)

print(len(us_entries))

# Last expression of the cell: display the first entry as a sample.
us_entries[0]

In [None]:
from typing import List, Optional
from pydantic import BaseModel, HttpUrl, Field

# Structured-output schema for a single investor profile; it is passed as
# `text_format` to the OpenAI call in get_investor_profile.
# NOTE(review): documented with `#` comments on purpose. A class docstring on a
# pydantic model is presumably copied into the generated JSON schema
# "description" and would change what is sent to the model — confirm against
# the pydantic version in use before adding one.
class InvestorQuickProfile(BaseModel):
    # Identity. investor_name is the only field typed as a plain str;
    # firm_name is the only field with an explicit default (None).
    investor_name: str
    firm_name: Optional[str] = None

    # Free-text background. The `description` strings below are the per-field
    # guidance attached to the schema (length and content limits).
    # NOTE(review): these Field(...) calls set no default; whether an Optional
    # field without a default is required depends on the pydantic major
    # version — confirm.
    personal_bio: Optional[str] = Field(description="2–4 sentences, summarized") 
    interests: Optional[List[str]] = Field(description="3–8 concise topics the investor is passionate about")
    career_background: Optional[str] = Field(description="one short paragraph")
    previous_investments: Optional[List[str]] = Field(description="up to 15 company names")
    hometown: Optional[str] = Field(description="only if explicitly stated")

    # Social links, typed as plain strings (HttpUrl is imported but not used
    # here), so no URL validation happens at parse time.
    personal_twitter_profile: Optional[str] = Field(description="full profile URL")
    personal_linkedin_url: Optional[str] = Field(description="full profile URL")
    firm_twitter_profile: Optional[str] = Field(description="full profile URL")
    firm_linkedin_url: Optional[str] = Field(description="full profile URL")

    # What the investor tends to fund.
    investment_focus: Optional[str] = Field(description="e.g., sectors/stages/types of startups")

In [None]:
from openai import OpenAI

# API client. No arguments are passed, so all configuration is left to the
# SDK defaults (presumably the OPENAI_API_KEY environment variable — confirm).
client = OpenAI()

# Prompt template for the research agent. `{investor}` is the only placeholder
# and is filled in with str.format by get_investor_profile, so the template
# must not contain any other literal braces.
# The "Don't cite any sources." rule is a complete sentence on its own line;
# no unfinished instruction fragment may follow it.
prompt = """
You are a web research agent that find information on investors.

You will get an investor's name and their firm name if its available.

Task:
Return a concise investor profile as JSON that matches the schema exactly.

Rules:
- Output JSON only (no prose).
- Be brief and factual; avoid hype.
- If a field can’t be verified from credible sources, set it to null (don’t guess).
- Disambiguate names using firm pages, LinkedIn, and geography.
- Do NOT infer “hometown” from “based in”; only fill if explicitly stated as hometown/grew up in.
- Limit `interests` to 3–8 items and `previous_investments` to at most 15 notable companies.
- Normalize URLs to https when possible.

Don't cite any sources.

Here's the given investor details:
{investor}
"""

def get_investor_profile(entry):
    """Research one investor on the web and return a structured profile.

    Args:
        entry: Mapping that provides the investor's "name" and, optionally,
            the "firm" they belong to.

    Returns:
        The `output_parsed` attribute of the API response, parsed against
        InvestorQuickProfile. NOTE(review): it may presumably be None when
        nothing could be parsed (to_plain_dict in this notebook guards for
        that) — confirm against the SDK.

    Raises:
        ValueError: if the entry has no usable name, so no API call is spent
            on an empty query.
        Exception: anything raised by the OpenAI client is propagated.
    """
    raw_name = entry.get("name")
    raw_firm = entry.get("firm")
    name_text = "" if raw_name is None else str(raw_name).strip()
    firm_text = "" if raw_firm is None else str(raw_firm).strip()

    if not name_text:
        raise ValueError("entry has no investor name to research")

    # Mention the firm only when one is known: str(None) would put the literal
    # text "None" into the prompt, and an empty firm would leave a dangling
    # "from the following firm: " clause for the model to interpret.
    if firm_text:
        investor = name_text + " from the following firm: " + firm_text
    else:
        investor = name_text + " (firm not available)"

    response = client.responses.parse(
        model="gpt-5-mini",
        tools=[{"type": "web_search_preview"}],
        input=prompt.format(investor=investor),
        text_format=InvestorQuickProfile
    )

    return response.output_parsed

# get_investor_profile(us_entries[0])

In [None]:
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Same file the entries were loaded from in the first cell; the batch loop in
# this cell also writes the enriched list back to this path.
DATA_PATH = "/Users/darshil/projects/freemoney/darshil_local/american_entries_max_100k.json"
# Number of entries submitted per batch; the data is persisted after each batch.
BATCH_SIZE = 800
# Upper bound on concurrent worker threads handed to ThreadPoolExecutor.
MAX_WORKERS = 800  # tune based on rate limits


def to_plain_dict(model_obj):
    """Convert a pydantic-style object into a plain dict.

    None stays None. Otherwise the first available of `model_dump()` and
    `dict()` is called and its result returned; an object offering neither
    is handed back unchanged.
    """
    if model_obj is None:
        return None
    # Try the newer serializer name first, then the legacy one.
    for dumper_name in ("model_dump", "dict"):
        if hasattr(model_obj, dumper_name):
            return getattr(model_obj, dumper_name)()
    return model_obj


def enrich_single_entry(entry):
    """Best-effort enrichment of one entry.

    Returns the profile as a plain dict, or None when the lookup or the
    conversion fails. Failures are printed and swallowed so that one bad
    entry cannot abort a whole batch.
    """
    result = None
    try:
        result = to_plain_dict(get_investor_profile(entry))
    except Exception as error:
        print(f"[enrich_single_entry] error for '{entry.get('name')}' ({entry.get('firm')}): {error}")
    return result


import os

# Determine remaining items to process (skip already enriched).
# A failed attempt is stored as enrichment=None, so those entries are picked
# up again the next time this cell runs.
TOTAL = len(us_entries)
indices_to_process = [i for i in range(TOTAL) if us_entries[i].get("enrichment") is None]


def _persist_entries(entries, path):
    """Write `entries` as JSON to `path` without leaving a half-written file.

    DATA_PATH is both the input and the output of this notebook, so a crash in
    the middle of a plain open(path, "w") dump would destroy the only copy of
    the data. The dump therefore goes to a sibling temporary file which is
    then moved over `path` with os.replace.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w") as outfile:
        json.dump(entries, outfile, ensure_ascii=False)
    os.replace(tmp_path, path)


if not indices_to_process:
    print("Nothing to process. All entries are already enriched.")
else:
    total_progress = tqdm(total=len(indices_to_process), desc="Total items", unit="inv", leave=True)
    try:
        for batch_start in range(0, len(indices_to_process), BATCH_SIZE):
            batch_indices = indices_to_process[batch_start: batch_start + BATCH_SIZE]
            batch_desc = f"Batch {batch_start // BATCH_SIZE + 1}"
            batch_progress = tqdm(total=len(batch_indices), desc=batch_desc, unit="inv", leave=False)

            try:
                with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
                    future_by_index = {executor.submit(enrich_single_entry, us_entries[i]): i for i in batch_indices}
                    for future in as_completed(future_by_index):
                        idx = future_by_index[future]
                        try:
                            enrichment_value = future.result()
                        except Exception as error:
                            print(f"[batch] future failed at index {idx}: {error}")
                            enrichment_value = None
                        us_entries[idx]["enrichment"] = enrichment_value
                        batch_progress.update(1)
                        total_progress.update(1)
            finally:
                batch_progress.close()
                # Persist after each batch, including a batch that was
                # interrupted, so results that already completed are kept.
                _persist_entries(us_entries, DATA_PATH)
    finally:
        total_progress.close()

# Final summary
to_process_count = len(indices_to_process)
enriched_count = sum(1 for i in indices_to_process if us_entries[i].get("enrichment") is not None)
print(f"Finished. Enriched {enriched_count}/{to_process_count} entries into {DATA_PATH}.")


In [1]:
import json

OUTPUT_PATH = "/Users/darshil/projects/freemoney/darshil_local/american_entries_max_100k_enriched_only.json"

# Reuse the in-memory entries when an earlier cell left them in the kernel;
# otherwise read them back from disk (DATA_PATH if defined, else the default).
if "us_entries" in globals():
    entries_source = us_entries
else:
    source_path = globals().get(
        "DATA_PATH",
        "/Users/darshil/projects/freemoney/darshil_local/american_entries_max_100k.json",
    )
    with open(source_path, "r") as infile:
        entries_source = json.load(infile)


def has_nonempty_enrichment(entry):
    """Tell whether `entry` carries an enrichment that is neither None nor {}."""
    enrichment = entry.get("enrichment")
    return not (enrichment is None or enrichment == {})


# Keep only the entries whose enrichment is present and non-empty.
filtered_entries = list(filter(has_nonempty_enrichment, entries_source))

with open(OUTPUT_PATH, "w") as enriched_file:
    json.dump(filtered_entries, enriched_file, ensure_ascii=False)

print(f"Saved {len(filtered_entries)} enriched entries to {OUTPUT_PATH}.")


Saved 6597 enriched entries to /Users/darshil/projects/freemoney/darshil_local/american_entries_max_100k_enriched_only.json.


In [3]:
import json
import re
from urllib.parse import urlparse

# Same path literal as OUTPUT_PATH in the previous cell, i.e. the file that
# holds only the entries with a non-empty enrichment.
ENRICHED_ONLY_PATH = "/Users/darshil/projects/freemoney/darshil_local/american_entries_max_100k_enriched_only.json"

# Read from disk rather than from kernel state, so this cell does not depend
# on variables left behind by the earlier cells.
with open(ENRICHED_ONLY_PATH, "r") as infile:
    enriched_only = json.load(infile)


def is_valid_twitter_url(url: str) -> bool:
    """Basic validation for Twitter/X profile URLs.

    Accepts http(s) URLs on twitter.com or x.com (or a subdomain of either)
    whose path is exactly one segment that looks like a handle: 1-15 letters,
    digits or underscores.

    Args:
        url: Candidate URL. Non-string and blank values are rejected.

    Returns:
        True when the URL has the shape of a profile link, otherwise False.
        Malformed URLs yield False instead of raising.
    """
    if not isinstance(url, str) or not url.strip():
        return False
    try:
        parsed = urlparse(url.strip())
        # hostname (unlike netloc) excludes any port or userinfo part.
        host = (parsed.hostname or "").lower()
    except ValueError:
        # urlparse raises on malformed netlocs, e.g. an unbalanced "[".
        return False
    if parsed.scheme not in ("http", "https"):
        return False
    # Match the domain itself or a true subdomain. A bare endswith("x.com")
    # would also accept unrelated hosts such as netflix.com.
    if not any(host == domain or host.endswith("." + domain) for domain in ("twitter.com", "x.com")):
        return False
    segments = [seg for seg in parsed.path.split("/") if seg]
    if len(segments) != 1:
        return False
    return re.fullmatch(r"[A-Za-z0-9_]{1,15}", segments[0]) is not None


def is_valid_linkedin_url(url: str) -> bool:
    """Basic validation for LinkedIn personal-profile URLs.

    Accepts http(s) URLs on linkedin.com (or a subdomain) whose path starts
    with /in/<slug>, where the slug is 1-100 letters, digits or hyphens.
    Extra path segments after the slug are tolerated.

    Args:
        url: Candidate URL. Non-string and blank values are rejected.

    Returns:
        True when the URL has the shape of a personal profile link, otherwise
        False. Malformed URLs yield False instead of raising.
    """
    if not isinstance(url, str) or not url.strip():
        return False
    try:
        parsed = urlparse(url.strip())
        # hostname (unlike netloc) excludes any port or userinfo part.
        host = (parsed.hostname or "").lower()
    except ValueError:
        # urlparse raises on malformed netlocs, e.g. an unbalanced "[".
        return False
    if parsed.scheme not in ("http", "https"):
        return False
    # Match the domain itself or a true subdomain; a bare
    # endswith("linkedin.com") would also accept e.g. notlinkedin.com.
    if host != "linkedin.com" and not host.endswith(".linkedin.com"):
        return False
    segments = [seg for seg in parsed.path.split("/") if seg]
    if len(segments) < 2 or segments[0] != "in":
        return False
    return re.fullmatch(r"[A-Za-z0-9\-]{1,100}", segments[1]) is not None


total = len(enriched_only)

# Counters for personal_twitter_profile.
twitter_with_value = twitter_invalid = twitter_valid = twitter_missing = 0

# Counters for personal_linkedin_url.
linkedin_with_value = linkedin_invalid = linkedin_valid = linkedin_missing = 0

for entry in enriched_only:
    enrichment = entry.get("enrichment") or {}

    # Twitter: None or a whitespace-only string counts as missing.
    twitter_url = enrichment.get("personal_twitter_profile")
    twitter_is_blank = isinstance(twitter_url, str) and twitter_url.strip() == ""
    if twitter_url is not None and not twitter_is_blank:
        twitter_with_value += 1
        if not is_valid_twitter_url(twitter_url):
            twitter_invalid += 1
        else:
            twitter_valid += 1
    else:
        twitter_missing += 1

    # LinkedIn: same missing/present/valid classification.
    linkedin_url = enrichment.get("personal_linkedin_url")
    linkedin_is_blank = isinstance(linkedin_url, str) and linkedin_url.strip() == ""
    if linkedin_url is not None and not linkedin_is_blank:
        linkedin_with_value += 1
        if not is_valid_linkedin_url(linkedin_url):
            linkedin_invalid += 1
        else:
            linkedin_valid += 1
    else:
        linkedin_missing += 1

# Report, one line per print call; an empty string prints a blank line.
report_lines = [
    f"Total enriched entries: {total}",
    "",
    "Twitter Profile Stats:",
    f"personal_twitter_profile present: {twitter_with_value}",
    f"Valid links: {twitter_valid}",
    f"Invalid links: {twitter_invalid}",
    f"Missing/null personal_twitter_profile: {twitter_missing}",
    "",
    "LinkedIn Profile Stats:",
    f"personal_linkedin_url present: {linkedin_with_value}",
    f"Valid links: {linkedin_valid}",
    f"Invalid links: {linkedin_invalid}",
    f"Missing/null personal_linkedin_url: {linkedin_missing}",
]
for report_line in report_lines:
    print(report_line)


Total enriched entries: 6597

Twitter Profile Stats:
personal_twitter_profile present: 885
Valid links: 874
Invalid links: 11
Missing/null personal_twitter_profile: 5712

LinkedIn Profile Stats:
personal_linkedin_url present: 1762
Valid links: 1734
Invalid links: 28
Missing/null personal_linkedin_url: 4835


In [4]:
import json
import re
from urllib.parse import urlparse

# Input: the enriched-only file written by an earlier cell.
INPUT_PATH = "/Users/darshil/projects/freemoney/darshil_local/american_entries_max_100k_enriched_only.json"
# Outputs: one file per link type, each holding the entries whose personal
# link of that type passes the validator defined below in this cell.
OUTPUT_TWITTER = "/Users/darshil/projects/freemoney/darshil_local/american_entries_max_100k_with_twitter.json"
OUTPUT_LINKEDIN = "/Users/darshil/projects/freemoney/darshil_local/american_entries_max_100k_with_linkedin.json"

# Re-read from disk so this cell does not depend on earlier kernel state.
with open(INPUT_PATH, "r") as infile:
    enriched_only = json.load(infile)


def is_valid_twitter_url(url: str) -> bool:
    """Return True if `url` looks like a Twitter/X profile link.

    Requires an http(s) URL on twitter.com or x.com (or a subdomain of either)
    whose path is a single handle of 1-15 letters, digits or underscores.
    Non-string, blank and malformed values yield False.
    """
    if not isinstance(url, str) or not url.strip():
        return False
    try:
        parsed = urlparse(url.strip())
        # hostname (unlike netloc) excludes any port or userinfo part.
        host = (parsed.hostname or "").lower()
    except ValueError:
        # urlparse raises on malformed netlocs, e.g. an unbalanced "[".
        return False
    if parsed.scheme not in ("http", "https"):
        return False
    # Match the domain itself or a true subdomain. A bare endswith("x.com")
    # would also accept unrelated hosts such as netflix.com.
    if not any(host == domain or host.endswith("." + domain) for domain in ("twitter.com", "x.com")):
        return False
    username = (parsed.path or "").strip("/")
    if not username:
        return False
    if "/" in username:
        # More than one path segment (e.g. /user/status/...) is not a profile.
        return False
    return re.fullmatch(r"[A-Za-z0-9_]{1,15}", username) is not None


def is_valid_linkedin_url(url: str) -> bool:
    """Return True if `url` looks like a LinkedIn personal-profile link.

    Requires an http(s) URL on linkedin.com (or a subdomain) whose path starts
    with /in/ followed by at least one more segment. The slug's characters are
    not checked here, which makes this laxer than the validator defined in the
    statistics cell earlier in the notebook. Non-string, blank and malformed
    values yield False.
    """
    if not isinstance(url, str) or not url.strip():
        return False
    try:
        parsed = urlparse(url.strip())
        # hostname (unlike netloc) excludes any port or userinfo part.
        host = (parsed.hostname or "").lower()
    except ValueError:
        # urlparse raises on malformed netlocs, e.g. an unbalanced "[".
        return False
    if parsed.scheme not in ("http", "https"):
        return False
    # Match the domain itself or a true subdomain; a bare
    # endswith("linkedin.com") would also accept e.g. notlinkedin.com.
    if host != "linkedin.com" and not host.endswith(".linkedin.com"):
        return False
    path = (parsed.path or "").strip("/")
    if not path:
        return False
    # Personal profiles only: linkedin.com/in/<slug>
    segments = [seg for seg in path.split("/") if seg]
    if len(segments) < 2:
        return False
    # Company, school and other non-/in/ pages are rejected.
    return segments[0] == "in"


# Split the enriched entries by which personal link validates; an entry can
# land in both lists.
with_twitter = []
with_linkedin = []

for entry in enriched_only:
    enrichment = entry.get("enrichment") or {}
    tw = enrichment.get("personal_twitter_profile")
    li = enrichment.get("personal_linkedin_url")

    # Falsy values (None, "") are skipped without calling the validator.
    if tw and is_valid_twitter_url(tw):
        with_twitter.append(entry)
    if li and is_valid_linkedin_url(li):
        with_linkedin.append(entry)

# Write the Twitter subset first, then the LinkedIn subset.
for subset_path, subset in ((OUTPUT_TWITTER, with_twitter), (OUTPUT_LINKEDIN, with_linkedin)):
    with open(subset_path, "w") as subset_file:
        json.dump(subset, subset_file, ensure_ascii=False)

print(f"Saved {len(with_twitter)} entries with valid Twitter profiles to {OUTPUT_TWITTER}")
print(f"Saved {len(with_linkedin)} entries with valid LinkedIn profiles to {OUTPUT_LINKEDIN}")


Saved 874 entries with valid Twitter profiles to /Users/darshil/projects/freemoney/darshil_local/american_entries_max_100k_with_twitter.json
Saved 1749 entries with valid LinkedIn profiles to /Users/darshil/projects/freemoney/darshil_local/american_entries_max_100k_with_linkedin.json
