In [None]:
Understanding the Connections Behind No-Show Appointments

In [1]:
import getpass
import os
from datetime import datetime

import mysql.connector
import pandas as pd
import requests

In [2]:
# Open the MySQL connection and cursor shared by the cells that load and query data.
# Connection settings are read from environment variables so that no credential
# is stored in the notebook. Host, user and database fall back to the values this
# notebook has always used; the password has no fallback value and is requested
# interactively when MYSQL_PASSWORD is not set.
conn = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ.get("MYSQL_PASSWORD") or getpass.getpass("MySQL password: "),
    database=os.environ.get("MYSQL_DATABASE", "medical_db"),
)
cursor = conn.cursor()

In [3]:
# Read the hospital general-information CSV and, in the same expression,
# switch the headers that are used later to snake_case names.
hospital_df = pd.read_csv(
    "/Users/nirusmadahal/medical_data/Hospital_General_Information.csv"
).rename(
    columns={
        "Facility ID": "facility_id",
        "Facility Name": "hospital_name",
        "City/Town": "city",
        "State": "state",
        "Hospital Type": "hospital_type",
        "Hospital Ownership": "hospital_ownership",
        "Hospital overall rating": "hospital_rating",
    }
)

In [4]:
# Read the appointment-level CSV; the patient frame below is cut from it.
kaggle_df = pd.read_csv("/Users/nirusmadahal/medical_data/KaggleV2-May-2016.csv")

# Source column -> notebook column. The keys double as the list of columns to
# keep, in this order ("Hipertension" is the name the code selects from the CSV).
patient_columns = {
    "PatientId": "patient_id",
    "Age": "age",
    "Gender": "gender",
    "Diabetes": "has_diabetes",
    "Hipertension": "has_hypertension",
}
patients_df = kaggle_df[list(patient_columns)].rename(columns=patient_columns)

In [20]:
# Download up to 500 rows from the CDC open-data endpoint and decode the JSON body.
# NOTE(review): an earlier comment here asked for this endpoint to be replaced
# with the correct dataset — confirm that hksd-2xuw is the intended one.
cdc_api_url = "https://data.cdc.gov/resource/hksd-2xuw.json?$limit=500"

# requests has no default timeout; without one a stalled connection would
# block this cell indefinitely.
response = requests.get(cdc_api_url, timeout=30)

# Stop here on an HTTP error (4xx/5xx) instead of parsing an error payload
# and passing it on as if it were data.
response.raise_for_status()
cdc_data = response.json()

In [6]:
# Source column -> notebook column for the appointment frame. The keys double
# as the list of columns to keep, in this order.
appointment_columns = {
    "AppointmentID": "appointment_id",
    "PatientId": "patient_id",
    "ScheduledDay": "scheduled_date",
    "AppointmentDay": "appointment_date",
    "No-show": "status",
}
appointments_df = kaggle_df[list(appointment_columns)].rename(columns=appointment_columns)

In [7]:
# Parse both date columns. pd.to_datetime leaves an already-parsed column as
# datetimes, so running this cell again does not change them.
appointments_df["scheduled_date"] = pd.to_datetime(appointments_df["scheduled_date"])
appointments_df["appointment_date"] = pd.to_datetime(appointments_df["appointment_date"])

# Encode the no-show flag: 1 for "Yes", 0 for anything else.
# Only encode while the column is still non-numeric. Once it holds 0/1, comparing
# those numbers with "Yes" is False for every row, so an unguarded second run
# of this cell would silently reset every no-show to 0.
if not pd.api.types.is_numeric_dtype(appointments_df["status"]):
    appointments_df["status"] = appointments_df["status"].eq("Yes").astype("int64")

In [8]:
# Keep only the columns that are inserted into the `hospitals` table.
# .copy() makes this an independent frame, so the column assignments below do
# not write into a slice of the wider frame (which triggers pandas'
# SettingWithCopyWarning).
hospital_df = hospital_df[["facility_id", "hospital_name", "city", "state",
                           "hospital_type", "hospital_ownership", "hospital_rating"]].copy()

# The rating column can hold the text "Not Available". That text, and any other
# missing or non-numeric rating, is stored as 0 so the column can be cast to
# integer without raising on NaN.
hospital_df["hospital_rating"] = (
    pd.to_numeric(hospital_df["hospital_rating"], errors="coerce")
    .fillna(0)
    .astype(int)
)

# Give missing text fields a visible placeholder instead of NaN.
hospital_df = hospital_df.fillna({
    "facility_id": "UNKNOWN", "hospital_name": "UNKNOWN",
    "city": "UNKNOWN", "state": "UNKNOWN",
    "hospital_type": "UNKNOWN", "hospital_ownership": "UNKNOWN"
})

# Convert facility_id to string to avoid data type mismatches.
hospital_df["facility_id"] = hospital_df["facility_id"].astype(str)

# Insert into MySQL in batches: executemany sends many rows per call instead of
# one round trip per row, and the batch size keeps each statement small.
insert_hospital_sql = """
    INSERT INTO hospitals (facility_id, hospital_name, city, state, hospital_type, hospital_ownership, hospital_rating)
    VALUES (%s, %s, %s, %s, %s, %s, %s)
"""
# itertuples(index=False, name=None) yields plain tuples in column order,
# which is the order of the column list in the INSERT above.
hospital_rows = list(hospital_df.itertuples(index=False, name=None))
HOSPITAL_BATCH_SIZE = 1000
for batch_start in range(0, len(hospital_rows), HOSPITAL_BATCH_SIZE):
    cursor.executemany(
        insert_hospital_sql,
        hospital_rows[batch_start:batch_start + HOSPITAL_BATCH_SIZE],
    )

# Single commit once every batch has been sent.
conn.commit()

In [9]:
# Keep the patient columns and one row per patient_id (the first occurrence).
# .copy() makes the result an independent frame, so the column assignments
# below do not write into a filtered view (pandas' SettingWithCopyWarning).
patients_df = (
    patients_df[["patient_id", "age", "gender", "has_diabetes", "has_hypertension"]]
    .drop_duplicates(subset=["patient_id"])
    .copy()
)

# Convert data types before inserting.
# NOTE(review): if patient_id was parsed as a float column, astype(str) produces
# text such as "123.0" — confirm this matches the key format expected by the
# patients table and by appointments.patient_id.
patients_df["patient_id"] = patients_df["patient_id"].astype(str)
patients_df["age"] = patients_df["age"].fillna(0).astype(int)  # missing age -> 0
patients_df["gender"] = patients_df["gender"].fillna("Unknown")  # missing gender -> "Unknown"
patients_df["has_diabetes"] = patients_df["has_diabetes"].fillna(0).astype(int)  # NaN -> 0
patients_df["has_hypertension"] = patients_df["has_hypertension"].fillna(0).astype(int)  # NaN -> 0

# Insert into MySQL in batches: executemany sends many rows per call instead of
# one round trip per patient, and the batch size keeps each statement small.
insert_patient_sql = """
    INSERT INTO patients (patient_id, age, gender, has_diabetes, has_hypertension)
    VALUES (%s, %s, %s, %s, %s)
"""
# Plain tuples in column order, matching the column list in the INSERT above.
patient_rows = list(patients_df.itertuples(index=False, name=None))
PATIENT_BATCH_SIZE = 1000
for batch_start in range(0, len(patient_rows), PATIENT_BATCH_SIZE):
    cursor.executemany(
        insert_patient_sql,
        patient_rows[batch_start:batch_start + PATIENT_BATCH_SIZE],
    )

conn.commit()
print("Patient data loaded into MySQL successfully!")

✅ Patient data loaded into MySQL successfully!


In [10]:
# Populate date_dimension with one row per distinct appointment date.
# The rows are derived from the in-memory appointments frame rather than from
# the `appointments` table: in notebook order that table is only filled by the
# cell after this one, so an INSERT ... SELECT against it adds no rows when the
# notebook is run top to bottom on an empty database.
# NOTE(review): assumes date_dimension.full_date accepts a DATE value and that
# day_of_week stores the English day name (what MySQL DAYNAME returns under the
# default locale) — confirm against the table definition.
appointment_days = (
    pd.to_datetime(appointments_df["appointment_date"])  # no-op if already parsed
    .dropna()
    .dt.normalize()       # drop any time-of-day so each calendar date appears once
    .drop_duplicates()
    .sort_values()
)
date_rows = [
    (day.date(), day.year, day.month, day.day, day.day_name(), day.quarter)
    for day in appointment_days
]
cursor.executemany("""
    INSERT INTO date_dimension (full_date, year, month, day, day_of_week, quarter)
    VALUES (%s, %s, %s, %s, %s, %s)
""", date_rows)
conn.commit()

In [11]:
# Load every appointment into MySQL in batches: executemany sends many rows per
# call instead of one round trip per appointment, and the batch size keeps
# each statement small.
# NOTE(review): this INSERT sets neither hospital_id nor date_id, which the
# analysis queries later in the notebook join on — confirm where those columns
# are populated.
insert_appointment_sql = """
    INSERT INTO appointments (appointment_id, patient_id, scheduled_date, appointment_date, status)
    VALUES (%s, %s, %s, %s, %s)
"""
# Plain tuples in column order, matching the column list in the INSERT above.
appointment_rows = list(appointments_df.itertuples(index=False, name=None))
APPOINTMENT_BATCH_SIZE = 1000
for batch_start in range(0, len(appointment_rows), APPOINTMENT_BATCH_SIZE):
    cursor.executemany(
        insert_appointment_sql,
        appointment_rows[batch_start:batch_start + APPOINTMENT_BATCH_SIZE],
    )
# Single commit once every batch has been sent.
conn.commit()

In [27]:
# Shape the CDC JSON records into the four columns inserted into
# chronic_diseases, in one pipeline:
#   1. keep topic / locationdesc / datavalue / yearstart and rename them;
#   2. coerce prevalence and year to numbers (unparseable values become NaN),
#      round prevalence to 2 decimals, upper-case the state text;
#   3. drop rows whose prevalence or year could not be parsed.
# NOTE(review): `state` comes from `locationdesc`; confirm it uses the same
# format as hospitals.state, since the analysis queries join on that column.
cdc_df = (
    pd.DataFrame(cdc_data)[["topic", "locationdesc", "datavalue", "yearstart"]]
    .rename(columns={
        "topic": "condition_name",
        "locationdesc": "state",
        "datavalue": "prevalence",
        "yearstart": "year",
    })
    .assign(
        prevalence=lambda frame: pd.to_numeric(frame["prevalence"], errors="coerce").round(2),
        year=lambda frame: pd.to_numeric(frame["year"], errors="coerce"),
        state=lambda frame: frame["state"].str.upper(),
    )
    .dropna(subset=["prevalence", "year"])
)

In [28]:
# Write each cleaned CDC record to chronic_diseases, one INSERT per record,
# then commit once. itertuples(index=False, name=None) yields plain tuples in
# column order: condition_name, state, prevalence, year.
for record in cdc_df.itertuples(index=False, name=None):
    cursor.execute("""
        INSERT INTO chronic_diseases (condition_name, state, prevalence, year)
        VALUES (%s, %s, %s, %s)
    """, record)
conn.commit()

In [32]:
# No-show rate per hospital
# For each hospital_name: number of appointments, number of no-shows (SUM of
# `status`, which this notebook encodes as 1 for a no-show and 0 otherwise) and
# the no-show percentage rounded to 2 decimals; the 10 highest rates are kept.
# The statement is only stored in `sql_noshow`; nothing in this cell executes it.
# NOTE(review): the join needs appointments.hospital_id, but the appointments
# INSERT in this notebook does not list that column — confirm where it is set.
sql_noshow = """SELECT h.hospital_name, COUNT(a.appointment_id) AS total_appointments,
       SUM(a.status) AS total_no_shows,
       ROUND((SUM(a.status) / COUNT(a.appointment_id)) * 100, 2) AS no_show_rate
FROM appointments a
JOIN hospitals h ON a.hospital_id = h.hospital_id
GROUP BY h.hospital_name
ORDER BY no_show_rate DESC
LIMIT 10;"""



In [None]:
# No-Show Trends Over Time
# For each (year, month) in date_dimension: number of appointments, number of
# no-shows (SUM of `status`) and the no-show percentage rounded to 2 decimals,
# newest month first. The statement is only stored here, not executed.
# NOTE(review): the join needs appointments.date_id, but the appointments INSERT
# in this notebook does not list that column — confirm where it is set.
sql_overtime = """
SELECT d.year, d.month, COUNT(a.appointment_id) AS total_appointments,
       SUM(a.status) AS total_no_shows,
       ROUND((SUM(a.status) / COUNT(a.appointment_id)) * 100, 2) AS no_show_rate
FROM appointments a
JOIN date_dimension d ON a.date_id = d.date_id
GROUP BY d.year, d.month
ORDER BY d.year DESC, d.month DESC;"""

# Backward-compatible alias: the query was first defined under this misspelled
# name, so existing references to it keep working.
sql_overyime = sql_overtime

In [31]:
# Chronic Disease Impact on No-Shows
# For chronic_diseases rows whose condition_name is 'Diabetes': per
# (state, condition_name, prevalence), the number of appointments at hospitals in
# that state, the number of no-shows (SUM of `status`) and the no-show
# percentage rounded to 2 decimals, highest rate first.
# The statement is only stored in `sql_query`; nothing in this cell executes it.
# NOTE(review): hospitals and chronic_diseases are matched on `state`. The CDC
# load in this notebook stores the upper-cased `locationdesc` value there —
# confirm it uses the same format as hospitals.state, otherwise no rows match.
# NOTE(review): the join also needs appointments.hospital_id, which the
# appointments INSERT in this notebook does not list — confirm where it is set.
sql_query = """SELECT c.state, c.condition_name, c.prevalence,
       COUNT(a.appointment_id) AS total_appointments,
       SUM(a.status) AS total_no_shows,
       ROUND((SUM(a.status) / COUNT(a.appointment_id)) * 100, 2) AS no_show_rate
FROM appointments a
JOIN hospitals h ON a.hospital_id = h.hospital_id
JOIN chronic_diseases c ON h.state = c.state
WHERE c.condition_name = 'Diabetes'
GROUP BY c.state, c.condition_name, c.prevalence
ORDER BY no_show_rate DESC;
"""

In [None]:
# Compare No-Show Rates by Chronic Disease
# For each condition_name: average prevalence, number of joined appointment
# rows, number of no-shows (SUM of `status`) and the no-show percentage rounded
# to 2 decimals, highest rate first.
# The statement is only stored in `sql_query2`; nothing in this cell executes it.
# NOTE(review): every appointment is joined to each chronic_diseases row of its
# hospital's state, so if a state has several rows for one condition (for
# example several years), COUNT and SUM count that appointment once per row,
# which inflates the totals and re-weights the rate — confirm this is intended.
# NOTE(review): the joins rely on appointments.hospital_id and on hospitals.state
# matching chronic_diseases.state — confirm both against the load cells.
sql_query2 = """SELECT c.condition_name, AVG(c.prevalence) AS avg_prevalence,
       COUNT(a.appointment_id) AS total_appointments,
       SUM(a.status) AS total_no_shows,
       ROUND((SUM(a.status) / COUNT(a.appointment_id)) * 100, 2) AS no_show_rate
FROM appointments a
JOIN hospitals h ON a.hospital_id = h.hospital_id
JOIN chronic_diseases c ON h.state = c.state
GROUP BY c.condition_name
ORDER BY no_show_rate DESC;"""