In [1]:
pip install pandas numpy faker


Collecting faker
  Downloading faker-40.1.2-py3-none-any.whl.metadata (16 kB)
Downloading faker-40.1.2-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: faker
Successfully installed faker-40.1.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

# Seed every random source used in this notebook so that a fresh
# "Restart Kernel -> Run All" regenerates identical CSV files.
# Faker draws from its own shared generator, which random.seed() and
# np.random.seed() do not touch, so it has to be seeded explicitly.
SEED = 42

fake = Faker()
Faker.seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# -----------------------------
# CONFIG
# -----------------------------
# Row counts for the generated tables.
NUM_PATIENTS = 10_000
NUM_DOCTORS = 120
NUM_BEDS = 650
NUM_ADMISSIONS = 18_000

# Date window handed to Faker when drawing timestamps.
START_DATE = datetime(year=2023, month=1, day=1)
END_DATE = datetime(year=2024, month=12, day=31)

# Each branch is a (branch_id, branch_name, city) tuple.
branches = [
    ("B1", "CityCare Hospital", "Chennai"),
    ("B2", "Metro Health Center", "Bangalore"),
    ("B3", "Sunrise Multispecialty", "Hyderabad"),
]

# Each department is a (dept_id, department_name) tuple; ids are D1..D6
# in the order the names are listed here.
department_names = [
    "Cardiology",
    "Oncology",
    "Orthopedics",
    "Pediatrics",
    "Emergency",
    "General Medicine",
]
departments = [
    (f"D{position}", dept_name)
    for position, dept_name in enumerate(department_names, start=1)
]

# Categorical value pools sampled by the table generators.
insurance_types = ["Private", "Government", "Self-pay"]
admission_types = ["Emergency", "Scheduled"]
bed_types = ["General", "ICU", "Ventilator"]
outcomes = ["Recovered", "Improved", "Transferred", "Deceased"]
procedure_types = [
    "Surgery",
    "MRI Scan",
    "CT Scan",
    "Dialysis",
    "Chemotherapy",
    "X-Ray",
]

# -----------------------------
# DIM PATIENT
# -----------------------------
# Build one row per patient: id, fake name, gender, age and insurance type.
patients = []
for pid in range(1, NUM_PATIENTS + 1):
    patient_name = fake.name()
    patient_gender = random.choice(["Male", "Female"])
    # Age is drawn from a normal distribution (mean 45, sd 20), clamped to
    # the range [0, 90] and then truncated to a whole number.
    sampled_age = np.random.normal(45, 20)
    patient_age = int(np.clip(sampled_age, 0, 90))
    # Weighted pick: the weights line up with the order of insurance_types.
    patient_insurance = random.choices(insurance_types, weights=[0.4, 0.35, 0.25])[0]
    patients.append([pid, patient_name, patient_gender, patient_age, patient_insurance])

df_patients = pd.DataFrame(
    data=patients,
    columns=["patient_id", "name", "gender", "age", "insurance_type"],
)

# -----------------------------
# DIM DEPARTMENT
# -----------------------------
# One row per (dept_id, department_name) tuple in `departments`.
df_departments = pd.DataFrame(
    data=departments,
    columns=["dept_id", "department_name"],
)

# -----------------------------
# DIM BRANCH
# -----------------------------
# One row per (branch_id, branch_name, city) tuple in `branches`.
df_branches = pd.DataFrame(
    data=branches,
    columns=["branch_id", "branch_name", "city"],
)

# -----------------------------
# DIM DOCTOR
# -----------------------------
# Build one row per doctor: id, fake name, department, branch, specialization.
# `departments` holds (dept_id, department_name) pairs, so dict() gives a
# direct dept_id -> department_name lookup.
dept_name_by_id = dict(departments)

doctors = []
for did in range(1, NUM_DOCTORS + 1):
    dept = random.choice(departments)[0]
    branch = random.choice(branches)[0]
    doctors.append([
        did,
        fake.name(),
        dept,
        branch,
        # The specialization is taken from the doctor's department. The
        # previous fake.job() produced arbitrary occupations unrelated to
        # the assigned dept_id.
        dept_name_by_id[dept],
    ])

df_doctors = pd.DataFrame(doctors, columns=[
    "doctor_id", "name", "dept_id", "branch_id", "specialization"
])

# -----------------------------
# DIM BED
# -----------------------------
# Build one row per bed: id, branch, department, bed type and a fixed status.
beds = []
for bid in range(1, NUM_BEDS + 1):
    bed_branch = random.choice(branches)[0]
    bed_dept = random.choice(departments)[0]
    # Weighted pick: the weights line up with the order of bed_types.
    bed_kind = random.choices(bed_types, weights=[0.7, 0.2, 0.1])[0]
    beds.append([bid, bed_branch, bed_dept, bed_kind, "Active"])

df_beds = pd.DataFrame(
    data=beds,
    columns=["bed_id", "branch_id", "dept_id", "bed_type", "status"],
)

# -----------------------------
# FACT ADMISSION
# -----------------------------
# Build one row per admission. The attending doctor is picked first and the
# admission inherits that doctor's branch and department, so every row's
# (doctor_id, branch_id, dept_id) agrees with the doctor dimension. Drawing
# the three values independently produced admissions handled by doctors
# from other branches and departments.
doctor_assignments = list(zip(
    df_doctors["doctor_id"].tolist(),
    df_doctors["branch_id"].tolist(),
    df_doctors["dept_id"].tolist(),
))

admissions = []
for aid in range(1, NUM_ADMISSIONS + 1):
    admit_time = fake.date_time_between(start_date=START_DATE, end_date=END_DATE)
    # randint's upper bound is exclusive, so a stay lasts 1 to 9 whole days.
    los_days = int(np.random.randint(1, 10))
    discharge_time = admit_time + timedelta(days=los_days)

    patient_id = random.randint(1, NUM_PATIENTS)
    doctor_id, branch_id, dept_id = random.choice(doctor_assignments)
    # Weighted pick: the weights line up with the order of admission_types.
    admission_type = random.choices(admission_types, weights=[0.35, 0.65])[0]

    admissions.append([
        aid,
        patient_id,
        branch_id,
        dept_id,
        admission_type,
        admit_time,
        discharge_time,
        los_days * 24,  # length of stay expressed in hours
        doctor_id,
    ])

df_admissions = pd.DataFrame(admissions, columns=[
    "admission_id", "patient_id", "branch_id", "dept_id",
    "admission_type", "admission_time", "discharge_time",
    "length_of_stay_hours", "doctor_id"
])

# -----------------------------
# FACT BED OCCUPANCY
# -----------------------------
# Build one occupancy row per admission, spanning admission to discharge.
# Beds are indexed by location so that an admission is placed in a bed of its
# own branch and department; picking from all beds put patients in beds that
# belong to other hospitals.
all_bed_ids = df_beds["bed_id"].tolist()
beds_by_branch = {}
beds_by_branch_dept = {}
for bed_id, bed_branch, bed_dept in zip(
    all_bed_ids,
    df_beds["branch_id"].tolist(),
    df_beds["dept_id"].tolist(),
):
    beds_by_branch.setdefault(bed_branch, []).append(bed_id)
    beds_by_branch_dept.setdefault((bed_branch, bed_dept), []).append(bed_id)

# Iterate over plain column lists instead of DataFrame.iterrows(), which
# builds a Series object for every row.
admission_columns = zip(
    df_admissions["admission_id"].tolist(),
    df_admissions["branch_id"].tolist(),
    df_admissions["dept_id"].tolist(),
    df_admissions["admission_time"].tolist(),
    df_admissions["discharge_time"].tolist(),
)

occupancy = []
for occupancy_id, (admission_id, adm_branch, adm_dept, start_time, end_time) in enumerate(
    admission_columns, start=1
):
    # Prefer a bed in the same branch and department; fall back to any bed in
    # the branch, then to any bed at all, if no closer match exists.
    candidate_beds = (
        beds_by_branch_dept.get((adm_branch, adm_dept))
        or beds_by_branch.get(adm_branch)
        or all_bed_ids
    )
    # NOTE: beds are drawn independently per admission, so two overlapping
    # stays can still be assigned the same bed.
    occupancy.append([
        occupancy_id,
        random.choice(candidate_beds),
        admission_id,
        start_time,
        end_time,
    ])

df_occupancy = pd.DataFrame(occupancy, columns=[
    "occupancy_id", "bed_id", "admission_id", "start_time", "end_time"
])

# -----------------------------
# FACT PROCEDURE
# -----------------------------
# Build zero to two procedure rows per admission (randint's upper bound is
# exclusive). Each procedure time is drawn inside the admission's own stay;
# drawing from the global START_DATE..END_DATE window produced procedures
# dated before admission or after discharge.
stay_windows = zip(
    df_admissions["admission_id"].tolist(),
    df_admissions["admission_time"].tolist(),
    df_admissions["discharge_time"].tolist(),
)

procedures = []
proc_id = 1
for aid, admitted_at, discharged_at in stay_windows:
    # Hand Faker plain datetime objects rather than pandas Timestamps.
    stay_start = admitted_at.to_pydatetime()
    stay_end = discharged_at.to_pydatetime()
    for _ in range(np.random.randint(0, 3)):
        procedures.append([
            proc_id,
            aid,
            random.choice(procedure_types),
            fake.date_time_between(start_date=stay_start, end_date=stay_end),
            round(random.uniform(2000, 50000), 2),
        ])
        proc_id += 1

df_procedures = pd.DataFrame(procedures, columns=[
    "procedure_id", "admission_id", "procedure_type",
    "procedure_time", "cost"
])

# -----------------------------
# FACT BILLING
# -----------------------------
# Build one billing row per admission. Procedure charges are the sum of that
# admission's rows in df_procedures, so the bill agrees with the procedure
# fact table and admissions without procedures are charged 0 for them. A
# separately drawn random amount contradicted the recorded procedure costs.
procedure_cost_by_admission = (
    df_procedures.groupby("admission_id")["cost"].sum().to_dict()
)

billing = []
for aid in df_admissions["admission_id"]:
    # Amounts are rounded to 2 decimals, matching the procedure cost column.
    room = round(random.uniform(5000, 20000), 2)
    procedure = round(float(procedure_cost_by_admission.get(aid, 0.0)), 2)
    medicine = round(random.uniform(1000, 10000), 2)
    other = round(random.uniform(500, 5000), 2)
    # Sum the already-rounded parts so the total matches the stored columns.
    total = round(room + procedure + medicine + other, 2)

    billing.append([aid, room, procedure, medicine, other, total])

df_billing = pd.DataFrame(billing, columns=[
    "admission_id", "room_charges", "procedure_charges",
    "medicine_charges", "other_charges", "total_cost"
])

# -----------------------------
# FACT OUTCOME
# -----------------------------
# Build one outcome row per admission: outcome type plus a 30-day
# readmission flag that is set for roughly 10% of surviving patients.
outcome_data = []
for aid in df_admissions["admission_id"]:
    # Weighted pick: the weights line up with the order of outcomes.
    outcome_type = random.choices(outcomes, weights=[0.7, 0.2, 0.07, 0.03])[0]
    # The draw is made for every row, including deceased ones, so the random
    # stream consumed by this loop is unchanged.
    readmit_draw = random.random() < 0.1
    # A deceased patient cannot be readmitted, so the flag is forced to 0.
    readmitted = 1 if (readmit_draw and outcome_type != "Deceased") else 0
    outcome_data.append([aid, outcome_type, readmitted])

df_outcomes = pd.DataFrame(outcome_data, columns=[
    "admission_id", "outcome_type", "readmitted_within_30d"
])

# -----------------------------
# SAVE TO CSV
# -----------------------------
# Output file name -> table. Files are written to the current working
# directory in the order listed, without the DataFrame index column.
csv_exports = {
    "dim_patient.csv": df_patients,
    "dim_department.csv": df_departments,
    "dim_branch.csv": df_branches,
    "dim_doctor.csv": df_doctors,
    "dim_bed.csv": df_beds,
    "fact_admission.csv": df_admissions,
    "fact_bed_occupancy.csv": df_occupancy,
    "fact_procedure.csv": df_procedures,
    "fact_billing.csv": df_billing,
    "fact_outcome.csv": df_outcomes,
}
for csv_name, table in csv_exports.items():
    table.to_csv(csv_name, index=False)

print("✅ All hospital dataset CSV files generated successfully!")


✅ All hospital dataset CSV files generated successfully!
