In [None]:
!pip install --upgrade openai




In [None]:
import getpass
import json
import os
import random
import re
import time

import pandas as pd
from openai import OpenAI

# Take the OpenAI API key from the environment instead of hardcoding it in the
# notebook. If it is missing (or still the template placeholder), ask for it
# interactively so the key never ends up saved in the notebook source.
api_key = os.environ.get("OPENAI_API_KEY", "")
if not api_key or api_key == "INSERT-API-KEY-HERE":
    api_key = getpass.getpass("OpenAI API key: ")
    os.environ["OPENAI_API_KEY"] = api_key
client = OpenAI(api_key=api_key)


In [None]:
# Value pools sampled at random when building each synthetic patient record.
diagnoses = [
    "Hypertension",
    "COPD",
    "CHF",
    "Pneumonia",
    "Cellulitis",
    "UTI",
    "DEHYDRATION",
    "FEVER",
    "Bronchiectasis",
    "DIARRHEA",
    "influenza",
    "gastroenteritis",
    "Skin infections",
    "heart failure",
    "hyperemesis",
]
genders = [
    "Male",
    "Female",
]
condition_changes = [
    "no change",
    "improvement",
    "deterioration",
]

def generate_prompt(patient_id, age, gender, diagnosis, change, description_length, tone):
    """Build the LLM prompt for one clean (non-noisy) two-day patient record.

    Args:
        patient_id: Identifier echoed into the requested JSON object.
        age: Patient age, inserted unquoted as a JSON number.
        gender: Patient gender string.
        diagnosis: Diagnosis the narratives should match.
        change: Condition-change label echoed into the requested JSON object.
        description_length: Phrase describing the target message length
            (for example "1–2 sentence").
        tone: Writing tone requested for the patient messages
            (for example "formal" or "conversational").

    Returns:
        The prompt text asking the model to answer with a single JSON object.
    """
    # The tone is part of the Day 1 instruction, in the same wording as
    # generate_noisy_prompt, so the tone chosen by the caller affects the output.
    return f"""
You are simulating progress notes from a patient at home. Write a JSON object describing their experience over two days.

Fields to include:
- "patient_id": "{patient_id}"
- "age": {age}
- "gender": "{gender}"
- "diagnosis": "{diagnosis}"
- "change": "{change}"
- "narratives": {{
    "Day 1": a short first-person message written by the patient (about {description_length}, in a {tone} tone). It should describe how they feel in simple, everyday language, matching their diagnosis. Use natural, human phrasing like someone texting or journaling. At the end of the message, include their vitals: HR, BP, Temp (°C), RR.
    "Day 2": a similar first-person message, with different symptoms or experiences. Include new vitals at the end.
}}
- "reasoning": A brief, objective explanation of how the patient's change is inferred from their complaints and vital signs. Avoid emotional or vague wording like "better" or "worse".

⚠️ Return valid JSON only. No markdown, no extra commentary. Only the JSON object.
"""


In [None]:
def extract_json(text):
    """Extract the first JSON object embedded in a model response.

    Decoding starts at the first "{" in ``text`` and stops at the end of that
    JSON object, so surrounding material such as markdown fences or trailing
    commentary (even commentary containing braces) does not break parsing.

    Args:
        text: Raw response text. May be None or empty.

    Returns:
        The decoded dict, or None when ``text`` is not a string, contains no
        "{", or the text starting at the first "{" is not valid JSON.
    """
    if not isinstance(text, str):
        return None

    start = text.find("{")
    if start == -1:
        return None

    try:
        # raw_decode parses one JSON value and ignores whatever follows it.
        # Only the outermost candidate is tried, so a malformed object never
        # yields one of its nested fragments.
        parsed, _ = json.JSONDecoder().raw_decode(text[start:])
    except json.JSONDecodeError:
        return None
    return parsed


In [None]:
def call_openai_api(prompt, retries=3):
    """Send ``prompt`` to the chat model and return the JSON object it produced.

    An attempt fails when the API call raises or when the response text does
    not contain parseable JSON. Failed attempts are retried up to ``retries``
    times in total.

    Args:
        prompt: User message sent to the model.
        retries: Maximum number of attempts.

    Returns:
        The parsed JSON object from the first successful attempt, or None if
        every attempt failed.
    """
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.9
            )
            parsed = extract_json(response.choices[0].message.content)
            if parsed is not None:
                return parsed
            # Sampling is non-deterministic (temperature 0.9), so a fresh
            # request can still yield valid JSON.
            print(f"Error on attempt {attempt + 1}: response did not contain valid JSON")
        except Exception as e:
            print(f"Error on attempt {attempt + 1}: {e}")

        # Back off a little longer each time, but do not wait after the last attempt.
        if attempt < retries - 1:
            time.sleep(2 ** (attempt + 1))
    return None


In [None]:
# Accumulates every successfully parsed record.
data = []

for record_index in range(650):
    # Draw the attributes of this synthetic patient.
    patient_id = f"PID{record_index + 1:04d}"
    age = random.randint(25, 110)
    gender = random.choice(genders)
    diagnosis = random.choice(diagnoses)
    change = random.choice(condition_changes)
    description_length = random.choice(["1–2 sentence", "3–4 sentence"])
    tone = random.choice(["formal", "conversational"])

    # Build the prompt and ask the model for the record.
    prompt = generate_prompt(
        patient_id,
        age,
        gender,
        diagnosis,
        change,
        description_length,
        tone,
    )
    result = call_openai_api(prompt)

    # A falsy result means no usable JSON came back for this patient.
    if not result:
        print(f" Invalid JSON format at record {record_index}")
        continue
    data.append(result)


In [None]:
def generate_noisy_prompt(patient_id, age, gender, diagnosis, change, description_length, tone):
    """Build the LLM prompt for one deliberately noisy two-day patient record.

    The given patient attributes are interpolated into a fixed template that
    asks the model for a single JSON object with noisy first-person narratives
    for two days plus an objective reasoning field.

    Returns:
        The prompt text.
    """
    noisy_prompt = f"""
You are simulating a noisy, real-world patient journal entry for a clinical dataset used in text classification.

Generate a JSON object with:
- "patient_id": "{patient_id}"
- "age": {age}
- "gender": "{gender}"
- "diagnosis": "{diagnosis}"
- "change": "{change}"
- "narratives": {{
    "Day 1": A short, first-person message written by the patient (about {description_length}, in a {tone} tone). Introduce natural noise: misspellings, ASR-like errors, misplaced grammar, confusion, or irrelevant phrases. Still include vitals at the end (HR, BP, Temp °C, RR).
    "Day 2": Another noisy, first-person message with different symptoms or behavior. Include new vitals at the end.
}}
- "reasoning": Explain how the change can be inferred from the combination of the noisy complaints and vitals. Keep it objective.

⚠️ Output a valid JSON object only. Do not include markdown, extra commentary, or formatting.
"""
    return noisy_prompt


In [None]:
# 2. Generate 50 noisy records and append them to `data`
for i in range(50):
    # NOTE(review): the "PIG" prefix differs from the "PID" prefix used for the
    # clean records — confirm this is intentional and not a typo.
    patient_id = f"PIG{i+1:04d}"
    age = random.randint(25, 105)
    gender = random.choice(genders)
    diagnosis = random.choice(diagnoses)
    change = random.choice(condition_changes)
    # Draw length and tone for each record here rather than relying on whatever
    # values an earlier cell happened to leave behind in the kernel.
    description_length = random.choice(["1–2 sentence", "3–4 sentence"])
    tone = random.choice(["formal", "conversational"])

    prompt = generate_noisy_prompt(patient_id, age, gender, diagnosis, change, description_length, tone)

    result = call_openai_api(prompt)

    if result:
        data.append(result)
    else:
        print(f"Invalid JSON format at noisy record {i}")


Invalid JSON format at noisy record 40


In [None]:
# Shuffle the collected records in place so the noisy records appended last
# are mixed in with the clean ones instead of sitting at the end of the list.
random.shuffle(data)


In [None]:
# Persist the raw records exactly as returned by the model.
with open("patient_dataset.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)

# Optional: export to CSV for manual review or labeling
flat_data = []
skipped_records = 0
for record in data:
    try:
        flat_data.append({
            "patient_id": record["patient_id"],
            "age": record["age"],
            "gender": record["gender"],
            "diagnosis": record["diagnosis"],
            "change": record["change"],
            "day1_note": record["narratives"]["Day 1"],
            "day2_note": record["narratives"]["Day 2"],
            "reasoning": record["reasoning"],
        })
    except (KeyError, TypeError):
        # A field is missing, or "narratives" is not a mapping: leave the
        # record out of the CSV, but count it so the loss is visible.
        skipped_records += 1

if skipped_records:
    print(f"Skipped {skipped_records} malformed record(s) during CSV export")

# `pd` comes from the import cell at the top of the notebook.
df = pd.DataFrame(flat_data)
df.to_csv("patient_dataset.csv", index=False)


In [None]:
import pandas as pd

# Convert JSON records to flat rows for CSV
flat_data = []
skipped_records = 0
for record in data:
    try:
        flat_data.append({
            "patient_id": record["patient_id"],
            "age": record["age"],
            "gender": record["gender"],
            "diagnosis": record["diagnosis"],
            "change": record["change"],
            "day1_note": record["narratives"]["Day 1"],
            "day2_note": record["narratives"]["Day 2"],
            "reasoning": record["reasoning"]
        })
    except (KeyError, TypeError):
        # Skip malformed records (missing field, or "narratives" is not a
        # mapping), but keep a count so the loss is visible.
        skipped_records += 1

if skipped_records:
    print(f"Skipped {skipped_records} malformed record(s) during CSV export")

# Convert to DataFrame
df = pd.DataFrame(flat_data)

# Save as CSV
df.to_csv("Condition_description.csv", index=False)

# Download in Colab. Outside Colab the module is not installed; the CSV has
# already been written to disk, so just report that and carry on.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; Condition_description.csv was saved locally")
else:
    files.download("Condition_description.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>