# Disease Demographics Verification

This notebook creates:
- **ChatGPT batch prompts** - copy-paste each batch into ChatGPT for manual lookup

In [1]:
import pandas as pd
import numpy as np
import json
import requests
import re
import time
from pathlib import Path
from bs4 import BeautifulSoup
from typing import Dict, Optional, Tuple
import warnings
warnings.filterwarnings('ignore')

# The project root is taken to be the parent of the current working directory.
# NOTE(review): this presumably assumes the kernel is started from a direct
# subfolder of the project (e.g. a notebooks/ directory) — confirm.
project_root = Path.cwd().parent
print(f"Project root: {project_root}")

Project root: c:\Users\henry\Desktop\Programming\Python\Multimodal_Diagnosis


In [2]:
# Load the category -> disease-list mapping.
# The encoding is pinned to UTF-8 because open() otherwise falls back to the
# platform locale encoding (e.g. cp1252 on Windows), which can mis-decode or
# reject non-ASCII characters in the JSON file.
with open(project_root / "data" / "disease_mapping.json", encoding="utf-8") as f:
    disease_mapping = json.load(f)

# Flatten to one record per (disease, category) pair
diseases_with_categories = [
    {"disease": disease, "category": category}
    for category, disease_list in disease_mapping.items()
    for disease in disease_list
]

diseases_df = pd.DataFrame(diseases_with_categories)
print(f"Total diseases: {len(diseases_df)}")
diseases_df.head()

Total diseases: 702


Unnamed: 0,disease,category
0,acute stress reaction,Mental and Behavioral Health
1,adjustment reaction,Mental and Behavioral Health
2,anxiety,Mental and Behavioral Health
3,asperger syndrome,Mental and Behavioral Health
4,attention deficit hyperactivity disorder (adhd),Mental and Behavioral Health



## Part 1: ChatGPT Batch Prompts

Copy each batch prompt into ChatGPT and save each response as `batch_XX_response.csv` in `data/demographics_prompts/`.

In [3]:
def create_chatgpt_prompt(diseases: list, batch_num: int) -> str:
    """Build the ChatGPT prompt text for one batch of diseases.

    Args:
        diseases: Disease names to list in the prompt; they are numbered
            starting at 1 in the order given.
        batch_num: Batch number shown in the "BATCH n" heading.

    Returns:
        The full prompt: instructions, the numbered disease list, and a
        closing line asking for CSV-only output.
    """
    instructions = f"""BATCH {batch_num}: For each disease below, provide the typical demographics.

Format your response EXACTLY as CSV rows like this:
disease_name,age_min,age_max,peak_age,male_percent

Rules:
- age_min/max: typical age range of onset (integers)
- peak_age: most common age of diagnosis
- male_percent: 0-100 (use 50 if roughly equal, 0 for female-only, 100 for male-only)

DISEASES:
"""
    # One "<n>. <name>" line per disease, each terminated by a newline
    numbered_list = "".join(
        f"{position}. {name}\n" for position, name in enumerate(diseases, start=1)
    )
    closing = "\nRespond with ONLY the CSV data, no explanations."
    return instructions + numbered_list + closing

In [4]:
# Split the disease names into consecutive chunks of BATCH_SIZE (one prompt each)
BATCH_SIZE = 25
all_diseases = diseases_df['disease'].tolist()
batches = [
    all_diseases[start:start + BATCH_SIZE]
    for start in range(0, len(all_diseases), BATCH_SIZE)
]

# Rough manual-effort estimate at ~1.5 minutes per batch
estimated_minutes = len(batches) * 1.5

print(f"Total batches: {len(batches)}")
print(f"Diseases per batch: {BATCH_SIZE}")
print(f"\nEstimated time: {estimated_minutes:.0f} minutes (if ~1.5 min per batch)")

Total batches: 29
Diseases per batch: 25

Estimated time: 44 minutes (if ~1.5 min per batch)


In [5]:
# Save one prompt file per batch under data/demographics_prompts
prompts_dir = project_root / "data" / "demographics_prompts"
prompts_dir.mkdir(exist_ok=True)

# Batches are numbered from 1 so the file name matches the "BATCH n" heading
for batch_num, batch in enumerate(batches, start=1):
    prompt = create_chatgpt_prompt(batch, batch_num)
    # Explicit UTF-8: the default is the platform locale encoding, which can
    # fail on or garble disease names containing non-ASCII characters.
    with open(prompts_dir / f"batch_{batch_num:02d}.txt", "w", encoding="utf-8") as f:
        f.write(prompt)

print(f"Saved {len(batches)} prompt files to:")
print(f"  {prompts_dir}")
print(f"\nOpen each file, paste into ChatGPT, save response as batch_XX_response.csv")

Saved 29 prompt files to:
  c:\Users\henry\Desktop\Programming\Python\Multimodal_Diagnosis\data\demographics_prompts

Open each file, paste into ChatGPT, save response as batch_XX_response.csv


In [6]:
# Preview the first batch's prompt, framed by separator rules
separator = "=" * 70
print(separator)
print("EXAMPLE - BATCH 1 (copy this into ChatGPT):")
print(separator)
print(create_chatgpt_prompt(batches[0], 1))

EXAMPLE - BATCH 1 (copy this into ChatGPT):
BATCH 1: For each disease below, provide the typical demographics.

Format your response EXACTLY as CSV rows like this:
disease_name,age_min,age_max,peak_age,male_percent

Rules:
- age_min/max: typical age range of onset (integers)
- peak_age: most common age of diagnosis
- male_percent: 0-100 (use 50 if roughly equal, 0 for female-only, 100 for male-only)

DISEASES:
1. acute stress reaction
2. adjustment reaction
3. anxiety
4. asperger syndrome
5. attention deficit hyperactivity disorder (adhd)
6. autism
7. bipolar disorder
8. conduct disorder
9. conversion disorder
10. delirium
11. depression
12. dissociative disorder
13. drug abuse (barbiturates)
14. drug abuse (cocaine)
15. drug abuse (methamphetamine)
16. drug abuse (opioids)
17. dysthymic disorder
18. eating disorder
19. factitious disorder
20. impulse control disorder
21. marijuana abuse
22. neurosis
23. obsessive compulsive disorder (ocd)
24. oppositional disorder
25. panic attack

Re


## Part 2: Process ChatGPT Responses

After you've pasted all batches into ChatGPT and saved the responses, load and combine them:

In [14]:
def parse_chatgpt_response(response_file: Path) -> pd.DataFrame:
    """Parse one saved ChatGPT response file into a DataFrame.

    The file is read as header-less CSV with the columns
    disease, age_min, age_max, peak_age, male_pct. If the first row is the
    "disease_name,..." header line that the prompt shows as its format
    example, that row is skipped instead of being returned as data.

    Args:
        response_file: Path to a saved response CSV.

    Returns:
        The parsed rows with an added 'source' column set to 'chatgpt', or an
        empty DataFrame if the file cannot be read or parsed (a message with
        the reason is printed in that case).
    """
    columns = ['disease', 'age_min', 'age_max', 'peak_age', 'male_pct']
    try:
        df = pd.read_csv(response_file, names=columns)
        # ChatGPT may echo the header line from the prompt. Left in place it
        # would count as a disease and force the numeric columns to strings,
        # so re-read with header=0 to skip it and restore dtype inference.
        if not df.empty and str(df['disease'].iloc[0]).strip().lower() == 'disease_name':
            df = pd.read_csv(response_file, names=columns, header=0)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError, ValueError) as exc:
        # Best-effort: report and skip unreadable files, but do not swallow
        # KeyboardInterrupt/SystemExit as a bare `except:` would.
        print(f"Could not parse {response_file}: {exc}")
        return pd.DataFrame()

    df['source'] = 'chatgpt'
    return df

def load_all_chatgpt_responses() -> pd.DataFrame:
    """Collect every saved ChatGPT response into a single DataFrame.

    Looks for files matching "*_response.csv" in data/demographics_prompts
    under the project root, parses them in sorted filename order, and skips
    any file whose parse result is empty.

    Returns:
        The concatenation of all non-empty parsed frames with a fresh index,
        or an empty DataFrame when there is nothing to combine.
    """
    responses_dir = project_root / "data" / "demographics_prompts"
    parsed_frames = (
        parse_chatgpt_response(path)
        for path in sorted(responses_dir.glob("*_response.csv"))
    )
    usable_frames = [frame for frame in parsed_frames if not frame.empty]

    if not usable_frames:
        return pd.DataFrame()
    return pd.concat(usable_frames, ignore_index=True)

# Load whatever responses have been saved so far and report the outcome
chatgpt_df = load_all_chatgpt_responses()
if len(chatgpt_df) == 0:
    print("No ChatGPT responses found yet.")
    print("Save ChatGPT responses as: data/demographics_prompts/batch_XX_response.csv")
else:
    print(f"Loaded {len(chatgpt_df)} ChatGPT responses")

Loaded 702 ChatGPT responses
