# Generate Healthcare Data for Analysis

Create a mock dataset for ambulatory practices (the code below generates 50 practices; adjust `n_practices` to change this) including:
- Visit volume
- Average wait time
- Patient satisfaction score
- Appointment no-show rate
- Follow-up adherence rate
- Staff-to-patient ratio
- Provider productivity (visits per FTE)
- Quality measure compliance (e.g., A1C screening for diabetic patients)

Each row holds one practice's metrics for a single quarter of a given year (the code below samples a quarter and a year per row).

Refer to the [Data Dictionary](Practice/sr_data_analyst_project1_data_dictionary.md) for the data variables used in this dataset.

In [9]:
# Import Packages
import random
import numpy as np
import pandas as pd
from faker import Faker  # For realistic names/locations (install with: pip install faker)

In [10]:
# --- Initialize Faker for realistic practice names ---
# Shared generator used by later cells to build practice IDs and names.
# NOTE(review): no seed is set in the visible cells, so generated values
# presumably differ between runs — confirm whether reproducibility is needed.
fake = Faker()

In [11]:
# --- Generation settings ---
n_practices = 50    # how many distinct practices to simulate
n_entries = 5000    # how many rows the dataset will contain in total
years = list(range(2021, 2024))             # calendar years 2021 through 2023
quarters = [f"Q{q}" for q in range(1, 5)]   # quarter labels Q1 through Q4

In [12]:
# --- Generate realistic practice IDs with names ---
# fix_len=True forces exactly five digits; without it random_number() may
# return shorter values, giving IDs of inconsistent length.
practice_ids = [
    f"PRAC{fake.unique.random_number(digits=5, fix_len=True)}"
    for _ in range(n_practices)
]
# Draw names through fake.unique as well so two different practice IDs can
# never end up sharing the same practice name.
practice_names = [fake.unique.company() for _ in range(n_practices)]
# One type per practice, weighted so Primary Care is the most common and
# Hospital the rarest.
practice_types = random.choices(
    ['Primary Care', 'Specialty', 'Urgent Care', 'Hospital'],
    weights=[0.5, 0.3, 0.15, 0.05],
    k=n_practices,
)

In [13]:
# --- Column store for the generated rows ---
# Every output column starts as its own empty list; the generation loop
# appends one value per column for each row it produces.
data = {
    column: []
    for column in (
        'Practice_ID',
        'Practice_Name',
        'Practice_Type',
        'Quarter',
        'Year',
        'Visit_Volume',
        'Avg_Wait_Time_Min',
        'Patient_Satisfaction_Score',
        'No_Show_Rate',
        'Followup_Adherence_Rate',
        'Staff_to_Patient_Ratio',
        'Provider_Productivity',
        'A1C_Screening_Compliance',
        'Total_Providers_FTE',
        'Total_Staff_FTE',
        'Unique_Patients_Seen',
    )
}

In [17]:
# Generate n_entries rows, appending one value to every column list in `data`.
#
# The sampling weights do not change between rows, so build them once:
# the first 10 practices get weight 0.5 and the rest 0.02, which makes
# those first practices heavily overrepresented.
practice_weights = [0.5 if i < 10 else 0.02 for i in range(n_practices)]

for _ in range(n_entries):
    # Select practice with some practices being overrepresented
    prac_idx = random.choices(range(n_practices), weights=practice_weights)[0]

    # Generate correlated metrics
    base_volume = random.randint(500, 5000)
    providers = max(1, int(np.random.normal(5, 2)))  # at least one provider
    staff = providers * random.randint(2, 4)         # 2-4 staff per provider

    # Add seasonal effects
    quarter = random.choice(quarters)
    q_multiplier = 1.1 if quarter in ['Q1', 'Q4'] else 0.9  # Higher volume in winter

    # --- Populate data with realistic distributions ---
    data['Practice_ID'].append(practice_ids[prac_idx])
    data['Practice_Name'].append(practice_names[prac_idx])
    data['Practice_Type'].append(practice_types[prac_idx])
    data['Quarter'].append(quarter)
    data['Year'].append(random.choice(years))

    # Visit volume: base volume scaled by season and log-normal noise
    visit_volume = int(base_volume * q_multiplier * np.random.lognormal(0, 0.2))
    data['Visit_Volume'].append(visit_volume)

    # Wait times correlate inversely with staff ratio
    staff_ratio = staff / (base_volume / 30)  # Approximate daily patients
    wait_time = max(5, min(120, np.random.normal(40 - staff_ratio * 2, 10)))  # clamp to 5-120
    data['Avg_Wait_Time_Min'].append(wait_time)

    # Satisfaction drops as this row's wait time rises, plus noise; clamp to 1-5
    satisfaction = np.random.normal(4 - (wait_time / 40), 0.7)
    data['Patient_Satisfaction_Score'].append(max(1, min(5, satisfaction)))

    # Other metrics: independent draws clamped to a fixed range
    data['No_Show_Rate'].append(min(0.5, max(0.01, np.random.beta(2, 10))))
    data['Followup_Adherence_Rate'].append(min(0.95, max(0.3, np.random.beta(5, 2))))
    data['Staff_to_Patient_Ratio'].append(staff_ratio)
    data['Provider_Productivity'].append(min(40, max(5, np.random.normal(20, 5))))
    data['A1C_Screening_Compliance'].append(min(1, max(0.5, np.random.beta(8, 2))))
    data['Total_Providers_FTE'].append(providers)
    data['Total_Staff_FTE'].append(staff)

    # Unique patients are 60-90% of the base volume, but the seasonal and
    # noise factors can push visit volume below that figure. Cap at the
    # visit volume so a row never reports more unique patients than visits.
    unique_patients = int(base_volume * random.uniform(0.6, 0.9))
    data['Unique_Patients_Seen'].append(min(visit_volume, unique_patients))


In [18]:
# Create DataFrame
# Each key of `data` becomes a column; each list position becomes one row.
df = pd.DataFrame(data)

In [19]:
# Add some missing data (5% random NAs)
# Columns 0-4 are the identifier/period columns (Practice_ID .. Year) and are
# left intact; every metric column from index 5 onward gets its own
# independent 5% sample of rows blanked out.
for col in df.columns[5:]:
    # Integer columns cannot hold NaN. Cast to float explicitly instead of
    # relying on pandas' silent upcasting, which is deprecated.
    df[col] = df[col].astype(float)
    df.loc[df.sample(frac=0.05).index, col] = np.nan

In [1]:
# Write the generated dataset to CSV, omitting the DataFrame index column.
# NOTE(review): the recorded output for this cell is a NameError — it was
# executed (In [1]) before `df` existed in the kernel; run the cells that
# build `df` first.
# NOTE(review): the destination is an absolute, machine-specific path —
# confirm it exists on the machine running this notebook.
df.to_csv('D:/GitHub/important-reference-repo/Data/fake_healthcare_data_v3.csv', index=False)

NameError: name 'df' is not defined