In [None]:
# Install packages
!pip install faker

Collecting faker
  Downloading faker-37.0.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.0.0-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/1.9 MB[0m [31m86.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.0.0


In [None]:
# MOCK DATASETS
import pandas as pd
import random
from faker import Faker

# Reproducibility: seed the stdlib `random` module and Faker separately so a
# fresh "Restart & Run All" regenerates the same mock records.
# NOTE(review): Date_of_Birth is derived from age bounds, so the exact dates
# presumably still shift with the day the cell is run -- confirm if exact
# byte-for-byte CSV reproducibility across days is required.
SEED = 42
N_PATIENTS = 100
random.seed(SEED)
Faker.seed(SEED)

# Initialize Faker
fake = Faker()

# Generate unique, zero-padded patient IDs: P0001 ... P0100
patient_ids = [f"P{i:04d}" for i in range(1, N_PATIENTS + 1)]


def _fake_address() -> str:
    """Return a one-line mock street address assembled from Faker parts."""
    return (
        f"{fake.building_number()} {fake.street_name()}, "
        f"{fake.city()}, {fake.state_abbr()} {fake.zipcode()}"
    )


# Generate demographic dataset (one row per patient ID; columns are generated
# independently of each other, so e.g. first name and gender are not correlated)
patient_data = {
    "Patient_ID": patient_ids,
    "First_Name": [fake.first_name() for _ in patient_ids],
    "Last_Name": [fake.last_name() for _ in patient_ids],
    "Date_of_Birth": [fake.date_of_birth(minimum_age=30, maximum_age=90) for _ in patient_ids],
    "Gender": [random.choice(["Male", "Female", "Other"]) for _ in patient_ids],
    "Address": [_fake_address() for _ in patient_ids],
    "Insurance_Information": [
        random.choice(["Medicare", "Medicaid", "Private", "Uninsured"]) for _ in patient_ids
    ],
}
df_patient = pd.DataFrame(patient_data)

# Patient_ID is the join key for the other datasets, so it must be unique here.
assert df_patient["Patient_ID"].is_unique
assert len(df_patient) == N_PATIENTS

# Generate Diagnosis dataset

# Link diagnoses with corresponding ICD-10-CM codes as (description, code) pairs.
diagnoses = [
    ("Type 2 diabetes mellitus with hyperglycemia", "E11.65"),
    ("Osteoarthritis", "M15.0"),
    ("Mixed hyperlipidemia", "E78.2"),
    ("Essential (primary) hypertension", "I10"),
    ("Gastroesophageal reflux disease without esophagitis", "K21.9"),
    ("Lumbar spondylosis", "M47.896"),
    # Was "J45.0", which is not a billable ICD-10-CM code; unspecified asthma
    # (uncomplicated) is J45.909.
    ("Asthma, unspecified", "J45.909"),
    # Was the truncated "G31."; mild cognitive impairment is G31.84.
    ("Mild cognitive impairment", "G31.84"),
]

# Guard against truncated codes such as "G31." slipping back into the lookup.
assert all(code and not code.endswith(".") for _, code in diagnoses)

# Number of diagnosis rows; patients are sampled with replacement, so one
# patient can have several diagnoses and some patients may have none.
N_DIAGNOSES = 150

# Generate data
diagnosis_patient_ids = random.choices(patient_ids, k=N_DIAGNOSES)

# Assign diagnoses randomly. Each draw picks a whole (description, code) pair,
# so a code can never end up next to another diagnosis's description.
diagnosis_picks = [random.choice(diagnoses) for _ in range(N_DIAGNOSES)]

diagnosis_data = {
    "Patient_ID": diagnosis_patient_ids,
    "Dx_Code": [code for _, code in diagnosis_picks],
    "Dx_Description": [description for description, _ in diagnosis_picks],
    "Dx_Date": [fake.date_between(start_date="-2y", end_date="today") for _ in range(N_DIAGNOSES)],
}

df_diagnosis = pd.DataFrame(diagnosis_data)


# Procedure lookup: each entry pairs a description with its CPT code
procedures = [
    ("Preventive exam for patient over 65", "99397"),
    ("Preventive exam for new adult patient age 40-64", "99386"),
    ("Established patient office visit (10-19 minutes)", "99212"),
    ("Established patient office visit (20-29 minutes)", "99213"),
    ("Automated urinalysis without microscopy", "81003"),
    ("Influenza virus vaccine, inactivated", "90653"),
    ("Cardiovascular stress testing", "93015"),
]

# Build the procedure dataset column by column
procedure_row_count = 150

# Patient IDs are drawn with replacement, so a patient may appear several times
procedure_patient_ids = random.choices(patient_ids, k=procedure_row_count)
procedure_dates = [
    fake.date_between(start_date="-2y", end_date="today")
    for _ in range(procedure_row_count)
]

# Draw whole (description, code) pairs so the two columns always stay matched
procedure_picks = [random.choice(procedures) for _ in range(procedure_row_count)]

procedure_data = {
    "Patient_ID": procedure_patient_ids,
    "Procedure_Code": [cpt_code for _, cpt_code in procedure_picks],
    "Procedure_Description": [description for description, _ in procedure_picks],
    "Procedure_Date": procedure_dates,
}

df_procedure = pd.DataFrame(procedure_data)


# Build the Appointment dataset, one column at a time
appointment_row_count = 150
appointment_locations = ["Clinic A", "Clinic B", "Clinic C", "Telehealth"]
appointment_types = ["New Patient", "Follow Up", "Sick Visit"]
appointment_statuses = ["Checked-In", "Cancelled", "No-Show"]

appointment_data = {}
# IDs are sampled with replacement, so some patients have multiple appointments
appointment_data["Patient_ID"] = random.choices(patient_ids, k=appointment_row_count)
# Dates fall within the last year; times are formatted as 24-hour HH:MM strings
appointment_data["Appointment_Date"] = [
    fake.date_between(start_date="-1y", end_date="today")
    for _ in range(appointment_row_count)
]
appointment_data["Appointment_Time"] = [
    fake.time(pattern="%H:%M") for _ in range(appointment_row_count)
]
# The categorical columns are each drawn independently of the others
appointment_data["Location"] = [
    random.choice(appointment_locations) for _ in range(appointment_row_count)
]
appointment_data["Appointment_Type"] = [
    random.choice(appointment_types) for _ in range(appointment_row_count)
]
appointment_data["Appointment_Status"] = [
    random.choice(appointment_statuses) for _ in range(appointment_row_count)
]
df_appointment = pd.DataFrame(appointment_data)

# Export every dataset to its own CSV file in the working directory,
# leaving out the DataFrame row index
csv_exports = [
    (df_patient, "patient_dataset.csv"),
    (df_diagnosis, "diagnosis_dataset.csv"),
    (df_procedure, "procedure_dataset.csv"),
    (df_appointment, "appointment_dataset.csv"),
]
for frame, csv_name in csv_exports:
    frame.to_csv(csv_name, index=False)

print("All datasets successfully created as CSV files!")

All datasets successfully created as CSV files!
