### Generate synthetic EHR Data for Database population

In [None]:
# install the faker library
!pip install faker



### Import necessary packages

In [None]:

import json
import os
import random
from datetime import datetime, timedelta

from faker import Faker

# Module-level Faker instance.
# NOTE(review): none of the generator functions visible in this notebook
# reference `fake` -- confirm it is still needed before keeping it.
fake = Faker()

### Helper functions to generate data

In [7]:
def get_random_diagnosis():
    """Pick one diagnosis label uniformly at random.

    Returns:
        str: One of the eight diagnosis names listed in ``candidate_labels``.
    """
    candidate_labels = (
        "Anxiety",
        "Depression",
        "Bipolar Disorder",
        "ADHD",
        "Insomnia",
        "PTSD",
        "OCD",
        "Schizophrenia",
    )
    return random.choice(candidate_labels)

def get_random_medications(diagnosis):
    """Draw between one and three distinct medications for a diagnosis.

    Args:
        diagnosis: Diagnosis name; must be a key of the ``formulary`` table.

    Returns:
        list[str]: One to three unique "drug dose" strings, in random order.

    Raises:
        KeyError: If ``diagnosis`` is not present in ``formulary``.
    """
    formulary = {
        "Anxiety": ["Alprazolam 0.5mg", "Diazepam 5mg", "Buspirone 10mg"],
        "Depression": ["Sertraline 50mg", "Fluoxetine 20mg", "Escitalopram 10mg"],
        "Bipolar Disorder": ["Lithium 300mg", "Quetiapine 100mg", "Lamotrigine 100mg"],
        "ADHD": ["Methylphenidate 10mg", "Adderall 20mg", "Atomoxetine 40mg"],
        "Insomnia": ["Zolpidem 5mg", "Eszopiclone 2mg", "Ramelteon 8mg"],
        "PTSD": ["Sertraline 100mg", "Prazosin 2mg", "Paroxetine 20mg"],
        "OCD": ["Fluvoxamine 100mg", "Clomipramine 25mg", "Sertraline 50mg"],
        "Schizophrenia": ["Risperidone 2mg", "Olanzapine 5mg", "Aripiprazole 10mg"],
    }
    options = formulary[diagnosis]
    # The count is drawn before the sample, so the RNG is consumed in that order.
    how_many = random.randint(1, 3)
    return random.sample(options, how_many)

def generate_lab_results():
    """Produce one randomly generated lab panel.

    Each value is drawn uniformly from the range given in ``panel`` and then
    rounded; entries whose precision is ``None`` are rounded to an integer.

    Returns:
        dict: Maps "Hb", "WBC", "Platelets", "TSH", "Vitamin D", "LDL" and
        "HDL" (in that order) to the generated number.
    """
    # (result key, lower bound, upper bound, decimal places or None for int)
    panel = (
        ("Hb", 11.0, 16.0, 1),
        ("WBC", 4.0, 11.0, 1),
        ("Platelets", 150, 450, None),
        ("TSH", 0.4, 4.0, 2),
        ("Vitamin D", 20, 50, None),
        ("LDL", 70, 130, None),
        ("HDL", 40, 60, None),
    )
    results = {}
    for analyte, low, high, places in panel:
        results[analyte] = round(random.uniform(low, high), places)
    return results
    
def generate_encounter_note(diagnosis):
    """Compose a one-line clinical note for an encounter.

    Two distinct symptoms associated with ``diagnosis`` and one treatment are
    chosen at random and inserted into a fixed sentence template.

    Args:
        diagnosis: Diagnosis name; must be a key of ``symptom_catalog``.

    Returns:
        str: "Patient reports <a> and <b>. Recommended <treatment>."

    Raises:
        KeyError: If ``diagnosis`` is not present in ``symptom_catalog``.
    """
    symptom_catalog = {
        "Anxiety": ["persistent worry", "panic attacks", "restlessness", "sleep difficulties"],
        "Depression": ["low mood", "fatigue", "loss of interest", "poor concentration"],
        "Bipolar Disorder": ["mood swings", "elevated energy", "racing thoughts", "impulsivity"],
        "ADHD": ["inattention", "hyperactivity", "impulsivity", "organization difficulties"],
        "Insomnia": ["difficulty falling asleep", "early morning awakening", "daytime fatigue"],
        "PTSD": ["flashbacks", "nightmares", "avoidance", "hypervigilance"],
        "OCD": ["intrusive thoughts", "compulsive behaviors", "anxiety", "ritualistic actions"],
        "Schizophrenia": ["hallucinations", "delusions", "disorganized thinking", "social withdrawal"],
    }
    therapy_options = [
        "CBT",
        "medication adjustment",
        "psychotherapy",
        "group therapy",
        "mindfulness training",
        "stress management techniques",
    ]

    # Symptoms are sampled before the treatment is chosen.
    reported = random.sample(symptom_catalog[diagnosis], 2)
    plan = random.choice(therapy_options)

    joined = " and ".join(reported)
    return f"Patient reports {joined}. Recommended {plan}."

#### Main Patient Data Generator

In [8]:
def generate_patient_data(patient_id):
    """Assemble one synthetic patient record.

    A diagnosis is drawn first; it drives both the encounter notes and the
    medication list. Between one and five encounters are generated, each dated
    somewhere in the window from 365 days before the current time up to now,
    and returned in ascending date order.

    Args:
        patient_id: Identifier stored unchanged under "PatientID".

    Returns:
        dict: Keys "PatientID", "Age", "Gender", "Diagnosis", "Medications"
        and "Encounters" (a list of dicts with "Date", "Reason", "LabResults"
        and "Notes").
    """
    visit_reasons = ["Follow-up", "Initial Consultation", "Medication Review", "Crisis Intervention"]

    diagnosis = get_random_diagnosis()

    visit_count = random.randint(1, 5)
    window_start = datetime.now() - timedelta(days=365)

    visits = []
    for _ in range(visit_count):
        offset_days = random.randint(0, 365)
        visit_day = window_start + timedelta(days=offset_days)
        visit = {
            "Date": visit_day.strftime("%Y-%m-%d"),
            "Reason": random.choice(visit_reasons),
            "LabResults": generate_lab_results(),
            "Notes": generate_encounter_note(diagnosis),
        }
        visits.append(visit)

    # ISO "YYYY-MM-DD" strings sort chronologically as plain text.
    visits.sort(key=lambda visit: visit["Date"])

    age = random.randint(18, 80)
    gender = random.choice(["Male", "Female", "Other"])
    medications = get_random_medications(diagnosis)

    return {
        "PatientID": patient_id,
        "Age": age,
        "Gender": gender,
        "Diagnosis": diagnosis,
        "Medications": medications,
        "Encounters": visits,
    }

In [24]:
# Generate 1,000 synthetic patient records with PatientIDs 1000-1999.
# The global RNG is seeded first so that re-running this cell reproduces the
# same patients; without a seed every run produces a different dataset.
# Encounter dates are still computed relative to the current date inside
# generate_patient_data, so they shift from day to day.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

num_patients = 1000
dataset = [generate_patient_data(1000 + i) for i in range(num_patients)]

In [25]:
# Pretty-print the first generated record as indented JSON so its structure
# (top-level fields plus the nested encounter list) can be inspected.
print(json.dumps(dataset[0], indent=2))

{
  "PatientID": 1000,
  "Age": 72,
  "Gender": "Female",
  "Diagnosis": "Depression",
  "Medications": [
    "Fluoxetine 20mg",
    "Escitalopram 10mg"
  ],
  "Encounters": [
    {
      "Date": "2024-08-17",
      "Reason": "Crisis Intervention",
      "LabResults": {
        "Hb": 11.5,
        "WBC": 9.2,
        "Platelets": 324,
        "TSH": 1.55,
        "Vitamin D": 40,
        "LDL": 120,
        "HDL": 58
      },
      "Notes": "Patient reports loss of interest and fatigue. Recommended stress management techniques."
    },
    {
      "Date": "2024-08-29",
      "Reason": "Medication Review",
      "LabResults": {
        "Hb": 14.6,
        "WBC": 9.3,
        "Platelets": 418,
        "TSH": 1.12,
        "Vitamin D": 27,
        "LDL": 118,
        "HDL": 46
      },
      "Notes": "Patient reports fatigue and loss of interest. Recommended mindfulness training."
    },
    {
      "Date": "2024-10-06",
      "Reason": "Initial Consultation",
      "LabResults": {
      

#### Populating the database for function calling retrieval

In [30]:
# "mongodb+srv://frshafi49:<db_password>@demoprojectcluster.jqxbv.mongodb.net/?retryWrites=true&w=majority&appName=DemoProjectCluster"
# SECURITY: the connection string embeds the database password, so it must not
# be stored in the notebook. It is read from the MONGO_URI environment variable
# instead. The password that was previously committed on this line should be
# treated as leaked and rotated.
MONGO_URI = os.environ.get("MONGO_URI")
if not MONGO_URI:
    raise RuntimeError(
        "MONGO_URI is not set. Export the MongoDB connection string as the "
        "MONGO_URI environment variable before running this notebook."
    )

In [26]:
# Install required package
!pip install pymongo

Collecting pymongo
  Downloading pymongo-4.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Using cached dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached dnspython-2.7.0-py3-none-any.whl (313 kB)
Installing collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.10.1


In [34]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from tqdm import tqdm

In [36]:

# Open a client for MONGO_URI with the server API set to version '1', then
# take handles to the database and collection the records are written to.
client = MongoClient(MONGO_URI, server_api=ServerApi('1'))
db = client['ehr_database']
collection = db['patient_records']

# Issue a ping to check that the deployment is reachable. A failure is printed
# rather than raised, so the cell itself never errors out.
try:
    client.admin.command('ping')
except Exception as ping_error:
    print(ping_error)
else:
    print("You successfully connected to MongoDB!")

You successfully connected to MongoDB!


### Batch insert the 1,000 records into the DB

In [35]:
# Insert the generated records in fixed-size batches, with a tqdm progress bar.
# NOTE(review): PyMongo's insert_many is documented to add an `_id` key to each
# inserted dict in place, so the dicts in `dataset` are mutated by this cell;
# re-running it on the same `dataset` presumably hits duplicate-key errors
# rather than inserting the patients twice -- confirm before relying on it.
BATCH_SIZE = 100

total_docs = len(dataset)
inserted_count = 0

try:
    for start in tqdm(range(0, total_docs, BATCH_SIZE)):
        # Slicing clamps at the end of the list, so the final batch may be short.
        batch = dataset[start:start + BATCH_SIZE]
        result = collection.insert_many(batch)
        inserted_count += len(result.inserted_ids)

    print(f"Successfully inserted {inserted_count} documents")

except Exception as e:
    print(f"Error during batch insert: {str(e)}")
    # Batches completed before the failure stay in the collection; report how
    # far the load got so the partial state is visible.
    print(f"Inserted {inserted_count} of {total_docs} documents before the failure")

finally:
    # The client is closed whether or not the load succeeded.
    # NOTE(review): a closed client may not be reusable -- re-run the
    # connection cell before issuing further database calls.
    client.close()

100%|██████████| 10/10 [00:10<00:00,  1.03s/it]

Successfully inserted 1000 documents



