## Eligibility for mobilization - Analysis

Run this notebook after running [cohort_identification.ipynb](cohort_identification.ipynb).

## Load libraries 

In [None]:
#! pip install pandas numpy duckdb seaborn matplotlib tableone
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
from tableone import TableOne
import pyCLIF
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from upsetplot import UpSet, from_indicators

In [None]:
# Load the three intermediate parquet artifacts this notebook works from.
# NOTE(review): presumably final_df has one row per encounter block per hour
# (file name "final_df_hourly") — confirm against the producing notebook.
final_df = pd.read_parquet('../output/intermediate/final_df_hourly.parquet')
all_ids_w_outcome = pd.read_parquet('../output/intermediate/cohort_all_ids_w_outcome.parquet')
final_df_blocks = pd.read_parquet('../output/intermediate/final_df_blocks.parquet')

## Forward and Backward fill the final dataset

In [None]:
# Per-column percentage of missing values, captured before any filling so it
# can be compared with the post-fill percentages.
before_filling = final_df.isnull().sum() / len(final_df) * 100
print("Shape of final_df: ", final_df.shape)

In [None]:
# 0 ── safety ordering ───────────────────────────────────────
# Every fill below is positional within an encounter block, so the rows must
# be in chronological order before anything is propagated.
final_df = final_df.sort_values(
    by=['encounter_block', 'recorded_date', 'recorded_hour']
)

# 1 ── identify column groups ────────────────────────────────
# Binary indicators: a missing value is treated as "not present" (0).
flag_columns = [
    'hourly_trach','hourly_on_vent','nicardipine_flag','nitroprusside_flag',
    'clevidipine_flag','red_meds_flag','cisatracurium_flag','vecuronium_flag',
    'rocuronium_flag','paralytics_flag'
]
# Identifiers, time keys, and columns that get their own fill rule further
# down; none of these receive the blanket forward/backward fill.
exclude_columns = [
    'patient_id','hospitalization_id','encounter_block',
     'recorded_date','recorded_hour',
    'time_from_vent','time_from_vent_adjusted', 'lactate',
    'last_ne_dose_last_6_hours','ne_calc_last']

all_cols           = set(final_df.columns)
potential_fill     = all_cols - set(flag_columns) - set(exclude_columns)
# Only numeric columns are filled; non-numeric leftovers are left untouched.
continuous_columns = [
    c for c in potential_fill
    if pd.api.types.is_numeric_dtype(final_df[c])
]

# 2 ── binary flags → 0/1 ints ───────────────────────────────
for col in flag_columns:
    final_df[col] = final_df[col].fillna(0).astype(int)

# 3 ── forward / backward fill for numeric variables ---------
# Forward fill first (carry the last observation), then backward fill so the
# leading rows of a block inherit the first observed value. Both directions
# stay inside the encounter block.
final_df[continuous_columns] = (
    final_df
      .groupby('encounter_block')[continuous_columns]
      .transform(lambda s: s.ffill().bfill())
)

# 4 ── lactate: forward-fill at most 24 rows ------------------
# The native grouped ffill replaces groupby.apply + fillna(method='ffill'):
# the `method=` argument is deprecated in recent pandas, and the apply/
# reset_index pattern changes shape when only a single block is present.
# `limit` counts rows, so it equals 24 h only with one row per hour.
final_df['lactate'] = (
    final_df
      .groupby('encounter_block')['lactate']
      .ffill(limit=24)
)

# 5 ── tracheostomy flag stays 1 once first seen -------------
final_df['hourly_trach'] = (
    final_df.groupby('encounter_block')['hourly_trach']
            .cummax()
            .astype(int)
)

# Carry the last recorded norepinephrine dose forward within the block
# (no backward fill: rows before the first recorded dose stay missing).
final_df['ne_calc_last'] = (
    final_df
      .groupby('encounter_block')['ne_calc_last']
      .ffill()
)


# 6 ── norepinephrine: 6-row look-back, but
#     **only replace rows that were NA already**
# ------------------------------------------------
def add_exact_6h_ne(block: pd.DataFrame) -> pd.DataFrame:
    """Complete ``last_ne_dose_last_6_hours`` for one encounter block.

    Rows that already carry a value are left unchanged. Rows that are missing
    receive ``ne_calc_last`` from six rows earlier in the block. If no such
    row exists, or that earlier value is itself missing, the result is 0.

    The look-back is positional (``shift(6)``), so it corresponds to six hours
    only when *block* is time-sorted with exactly one row per hour.

    Parameters
    ----------
    block : pd.DataFrame
        Rows of a single encounter block. Must contain ``ne_calc_last``;
        ``last_ne_dose_last_6_hours`` is created when absent.

    Returns
    -------
    pd.DataFrame
        A copy of *block* whose ``last_ne_dose_last_6_hours`` column has no
        missing values. The input frame is not modified.
    """
    target = 'last_ne_dose_last_6_hours'
    block = block.copy()

    # Create the target column when absent. The previous code wrote to
    # `block[col]`, picking up an unrelated module-level loop variable.
    if target not in block.columns:
        block[target] = np.nan

    # Candidate fill values: the dose recorded six rows earlier.
    six_rows_back = block['ne_calc_last'].shift(6)

    # Existing values win; gaps take the look-back value; anything still
    # missing becomes 0 (no dose available six rows back).
    block[target] = block[target].fillna(six_rows_back).fillna(0)

    return block


# Re-sort chronologically, then complete the 6-row norepinephrine look-back
# one encounter block at a time. group_keys=False keeps the block key out of
# the result index; the index is then reset to a fresh 0..n-1 range.
_time_sorted = final_df.sort_values(
    ['encounter_block', 'recorded_date', 'recorded_hour']
)
_per_block = _time_sorted.groupby('encounter_block', group_keys=False)
final_df = _per_block.apply(add_exact_6h_ne).reset_index(drop=True)


In [None]:
# Per-column percentage of missing values after all fill steps have run.
after_filling = final_df.isnull().sum() / len(final_df) * 100

In [None]:
# Side-by-side missingness report: one row per column of final_df, holding
# the percentage missing before and after filling.
before_after = (
    pd.DataFrame({'before_filling': before_filling, 'after_filling': after_filling})
    .reset_index()
    .rename(columns={'index': 'column'})
)
# Persist the report alongside the other final outputs.
before_after.to_csv('../output/final/missingness_final_df.csv', index=False)

In [None]:
# Checkpoint: persist the filled frame so it can be compared with the
# original hourly frame when checking the fill logic.
final_df.to_parquet('../output/intermediate/final_df_filled.parquet')

## Create Criteria Flags

### Patel et al. Criteria:

Cardio
* Mean arterial blood pressure: 65-110 mm Hg
* Systolic blood pressure: ≤ 200 mm Hg
* Heart rate: 40-130 beats per minute

Respiratory
* Respiratory rate: 5-40 breaths per minute
* Pulse oximetry: ≥ 88%

In [None]:
# Patel et al. criteria
# ---------------------------------------------------------------------------
# Each component flag is 1 when the hourly value satisfies the criterion.

# 1. Mean arterial pressure: 65-110 mm Hg.
#    Since the 5/5/25 update this is evaluated on the hourly average MAP
#    (avg_map) rather than on the min_map / max_map pair. A missing MAP fails.
_patel_map = final_df['avg_map']
final_df['patel_map_flag'] = ((_patel_map >= 65) & (_patel_map <= 110)).astype(int)

# 2. Systolic blood pressure: ≤ 200 mm Hg (a missing SBP passes).
_patel_sbp = final_df['max_sbp']
final_df['patel_sbp_flag'] = (_patel_sbp.isna() | (_patel_sbp <= 200)).astype(int)

# 3. Heart rate: the hourly minimum must be ≥ 40 and the maximum ≤ 130 bpm.
final_df['patel_pulse_flag'] = (
    (final_df['min_heart_rate'] >= 40) & (final_df['max_heart_rate'] <= 130)
).astype(int)

# 4. Respiratory rate: hourly minimum ≥ 5 and maximum ≤ 40 breaths/min.
final_df['patel_resp_rate_flag'] = (
    (final_df['min_respiratory_rate'] >= 5) & (final_df['max_respiratory_rate'] <= 40)
).astype(int)

# 5. Pulse oximetry: SpO2 ≥ 88% (a missing SpO2 passes).
_patel_spo2 = final_df['min_spo2']
final_df['patel_spo2_flag'] = (_patel_spo2.isna() | (_patel_spo2 >= 88)).astype(int)

# Conditions shared by every composite flag: no tracheostomy, no paralytics,
# and time_from_vent_adjusted different from -1.
# NOTE(review): -1 looks like a sentinel for hours that should not count —
# confirm against the notebook that builds time_from_vent_adjusted.
_patel_base = (
    (final_df['hourly_trach'] == 0)
    & (final_df['paralytics_flag'] == 0)
    & (final_df['time_from_vent_adjusted'] != -1)
)
# Daytime window: recorded_hour from 8 up to (not including) 17.
_patel_daytime = (final_df['recorded_hour'] >= 8) & (final_df['recorded_hour'] < 17)

# Vital-sign components grouped by organ system.
_patel_cardio_ok = (
    final_df['patel_map_flag']
    & final_df['patel_sbp_flag']
    & final_df['patel_pulse_flag']
)
_patel_resp_ok = final_df['patel_resp_rate_flag'] & final_df['patel_spo2_flag']

# Respiratory-only and cardiovascular-only eligibility (daytime window).
final_df['patel_resp_flag'] = (_patel_resp_ok & _patel_base & _patel_daytime).astype(int)
final_df['patel_cardio_flag'] = (_patel_cardio_ok & _patel_base & _patel_daytime).astype(int)

# Overall Patel eligibility: every component met, inside the daytime window ...
final_df['patel_flag'] = (
    _patel_cardio_ok & _patel_resp_ok & _patel_base & _patel_daytime
).astype(int)

# ... and the same definition without the hour-of-day restriction.
final_df['patel_flag_all_hours'] = (
    _patel_cardio_ok & _patel_resp_ok & _patel_base
).astype(int)

### TEAM criteria

Cardio
* Heart rate: ≤ 150 bpm
* Most recent lactate: ≤ 4.0 mmol/L
* Noradrenaline infusion rate: < 0.2 mcg/kg/min; if the infusion rate has increased by more than 25% in the last 6 hours, the dose must be < 0.1 mcg/kg/min.

Respiratory
* Sufficient respiratory stability:
    *  FiO2: ≤ 0.6
    *  PEEP: ≤ 16 cm H2O (use peep_observed)
* Current respiratory rate: ≤ 45 (use resp_rate_obs)

In [None]:
# TEAM criteria
# ---------------------------------------------------------------------------
# Every component flag is 1 when the value is within its limit OR missing.
# NOTE(review): the criteria text names peep_observed / resp_rate_obs, while
# the code reads max_peep_set / max_respiratory_rate — confirm.
def _team_within_or_missing(column, upper):
    """Return an int array: 1 where final_df[column] is NA or <= upper, else 0."""
    values = final_df[column]
    return np.where(values.isna(), 1, (values <= upper).astype(int))


# 1. Heart rate: ≤ 150 bpm
final_df['team_pulse_flag'] = _team_within_or_missing('max_heart_rate', 150)

# 2. Most recent lactate: ≤ 4.0 mmol/L
final_df['team_lactate_flag'] = _team_within_or_missing('lactate', 4.0)

# 3. Noradrenaline infusion rate, evaluated on the last recorded dose of the
#    hour (ne_calc_last). Earlier iterations of this cell used ne_calc_max and
#    min_ne_dose_last_6_hours instead.
# NOTE(review): the written criterion is "< 0.2" but the comparison is
# "<= 0.2" — confirm which boundary is intended.
final_df['team_ne_flag'] = _team_within_or_missing('ne_calc_last', 0.2)

# print the number of team_ne_flag == 1
print("TEAM NE flag counts when ne < 0.2\n", final_df['team_ne_flag'].value_counts(), "\n")

# 3b. Withdraw the flag when the dose rose by more than 25% relative to the
#     6-hour look-back value and the current dose is above 0.1 mcg/kg/min.
# NOTE(review): the written criterion is "must be < 0.1", whereas this tests
# "> 0.1", so a dose of exactly 0.1 keeps the flag — confirm.
_team_ne_now = final_df['ne_calc_last']
_team_ne_escalating = (
    (_team_ne_now > 1.25 * final_df['last_ne_dose_last_6_hours'])
    & (_team_ne_now > 0.1)
)
final_df['team_ne_flag'] = np.where(_team_ne_escalating, 0, final_df['team_ne_flag'])
print("TEAM NE flag counts adjusting for change in the last 6 hrs\n", final_df['team_ne_flag'].value_counts(), "\n")

# 4. Sufficient respiratory stability:
#    a. FiO2: ≤ 0.6
final_df['team_fio2_flag'] = _team_within_or_missing('min_fio2_set', 0.6)
#    b. PEEP: ≤ 16 cm H2O
final_df['team_peep_flag'] = _team_within_or_missing('max_peep_set', 16)

# 5. Current respiratory rate: ≤ 45
final_df['team_resp_rate_flag'] = _team_within_or_missing('max_respiratory_rate', 45)

# Conditions shared by every composite flag: no tracheostomy, no paralytics,
# and time_from_vent_adjusted different from -1.
_team_base = (
    (final_df['hourly_trach'] == 0)
    & (final_df['paralytics_flag'] == 0)
    & (final_df['time_from_vent_adjusted'] != -1)
)
# Daytime window: recorded_hour from 8 up to (not including) 17.
_team_daytime = (final_df['recorded_hour'] >= 8) & (final_df['recorded_hour'] < 17)

# Components grouped by organ system.
_team_cardio_ok = (
    final_df['team_pulse_flag']
    & final_df['team_lactate_flag']
    & final_df['team_ne_flag']
)
_team_resp_ok = (
    final_df['team_fio2_flag']
    & final_df['team_peep_flag']
    & final_df['team_resp_rate_flag']
)

# Cardio flag: heart rate, lactate and norepinephrine criteria (daytime).
final_df['team_cardio_flag'] = (_team_cardio_ok & _team_base & _team_daytime).astype(int)

# Resp flag: FiO2, PEEP and respiratory rate criteria (daytime).
final_df['team_resp_flag'] = (_team_resp_ok & _team_base & _team_daytime).astype(int)

# Overall TEAM eligibility inside the daytime window ...
final_df['team_flag'] = (
    _team_cardio_ok & _team_resp_ok & _team_base & _team_daytime
).astype(int)

# ... and without the hour-of-day restriction.
final_df['team_flag_all_hours'] = (
    _team_cardio_ok & _team_resp_ok & _team_base
).astype(int)

### Consensus criteria

* Green Criteria
    * Respiratory
        * Saturation ≥ 90% and
        * Respiratory rate ≤ 30 breaths/min
        * Current FiO2 ≤ 0.6 and
        * PEEP ≤ 10 cm H2O
    * Cardiovascular:
        * Blood pressure greater than lower limit of target range (MAP 65+) while on no or low level of support (low support- define as <0.1 μg/kg/min of Norepi equivalents)
        * Heart rate <120 beats/min
        * lactate < 4mmol/L
        * HR > 40
* Yellow Criteria
    * Respiratory
        * Sat >= 90%
        * Current FiO2 >0.6
        * Respiratory rate >30breaths/min
        * PEEP >10cm H20
    * Cardiovascular
        * Blood pressure greater than lower limit of target range (MAP 65+) while receiving moderate level of support (medium-define as 0.1–0.3 μg/kg/min of Norepi equivalents)
        * Heart rate 120-150 beats/min
        * Shock of any cause with lactate >4mmol/L
        * HR > 40
* Red Criteria
    * Respiratory
        * Sat <90%
    * Cardiovascular
        * Below target MAP despite support (MAP <65) or
        * greater than lower limit MAP (MAP 65+) but on high level support (high defined as >0.3 μg/kg/min of Norepi equivalents)
        * IV therapy for hypertensive emergency (SBP >200mmHg or MAP >110 and on nicardipine, nitroprusside, or clevidipine gtt)
        * HR >150 bpm
        * Bradycardia <40


**Consensus criteria - redefined**

* all_red: All red subcomponents must be met.
* all_green: All green subcomponents must be met, and no red subcomponents are met.
* all_yellow: All yellow subcomponents must be met, no red subcomponents are met, and all green subcomponents are not met.
* any_yellow: Any yellow subcomponent is met, no green subcomponents are fully met, and no red subcomponents are met.
* any_yellow_or_green_no_red: Any yellow or green subcomponents are met, but no red subcomponents are met.
* no_red: No red criteria is met

In [None]:
# Consensus (red / yellow / green) criteria
# ---------------------------------------------------------------------------
# Step 1 computes one 0/1 component flag per criterion for every hourly row.
# Step 2 combines them into the composite flags used downstream.

_spo2 = final_df['min_spo2']
_map = final_df['avg_map']
_sbp = final_df['max_sbp']
_ne = final_df['ne_calc_last']
_hr_min = final_df['min_heart_rate']
_hr_max = final_df['max_heart_rate']
_fio2 = final_df['min_fio2_set']
_peep = final_df['min_peep_set']
_rr = final_df['max_respiratory_rate']
_lactate = final_df['lactate']

# ── Red components ─────────────────────────────────────────────────────────
# Missing SpO2, MAP or minimum heart rate counts as red.
final_df['red_resp_spo2_flag'] = ((_spo2 < 90) | _spo2.isna()).astype(int)
final_df['red_map_flag'] = ((_map < 65) | _map.isna()).astype(int)

# High support (norepinephrine equivalents > 0.3 μg/kg/min)
final_df['red_high_support_flag'] = (_ne > 0.3).astype(int)

# Hypertensive emergency: (SBP > 200 mmHg or MAP > 110 mmHg) while red_meds_flag is set
final_df['red_hypertensive_flag'] = (
    ((_sbp > 200) | (_map > 110)) & (final_df['red_meds_flag'] == 1)
).astype(int)

# High heart rate (HR > 150 bpm) and low heart rate (HR < 40 bpm)
final_df['red_pulse_high_flag'] = (_hr_max > 150).astype(int)
final_df['red_pulse_low_flag'] = ((_hr_min < 40) | _hr_min.isna()).astype(int)

# ── Yellow components ──────────────────────────────────────────────────────
# NOTE(review): yellow_resp_spo2_flag uses the same expression as
# green_resp_spo2_flag, so any_yellow is set whenever SpO2 ≥ 90 or missing —
# confirm this is intended.
final_df['yellow_resp_spo2_flag'] = ((_spo2 >= 90) | _spo2.isna()).astype(int)
final_df['yellow_fio2_flag'] = (_fio2 > 0.6).astype(int)
final_df['yellow_resp_rate_flag'] = (_rr > 30).astype(int)
final_df['yellow_peep_flag'] = (_peep > 10).astype(int)

# MAP at target on moderate support (0.1–0.3, both bounds inclusive)
final_df['yellow_map_flag'] = ((_map >= 65) & _ne.between(0.1, 0.3)).astype(int)
# NOTE(review): this tests the hourly *minimum* heart rate against 120–150,
# while the red high-pulse flag uses the maximum — confirm.
final_df['yellow_pulse_flag'] = _hr_min.between(120, 150).astype(int)
final_df['yellow_lactate_flag'] = (_lactate > 4).astype(int)

# ── Green components (a missing value passes) ──────────────────────────────
final_df['green_resp_spo2_flag'] = ((_spo2 >= 90) | _spo2.isna()).astype(int)
final_df['green_resp_rate_flag'] = ((_rr <= 30) | _rr.isna()).astype(int)
final_df['green_fio2_flag'] = ((_fio2 <= 0.6) | _fio2.isna()).astype(int)
final_df['green_peep_flag'] = ((_peep <= 10) | _peep.isna()).astype(int)

# MAP at target on low support, or no norepinephrine value at all
final_df['green_map_flag'] = (((_map >= 65) & (_ne < 0.1)) | _ne.isna()).astype(int)
final_df['green_pulse_flag'] = ((_hr_min < 120) | _hr_min.isna()).astype(int)
final_df['green_lactate_flag'] = ((_lactate < 4) | _lactate.isna()).astype(int)
final_df['green_hr_flag'] = ((_hr_min > 40) | _hr_min.isna()).astype(int)

# ── Step 2: composite flags ────────────────────────────────────────────────
_red_cols = [
    'red_resp_spo2_flag', 'red_map_flag', 'red_high_support_flag',
    'red_hypertensive_flag', 'red_pulse_high_flag', 'red_pulse_low_flag',
]
_yellow_resp_cols = [
    'yellow_resp_spo2_flag', 'yellow_fio2_flag',
    'yellow_resp_rate_flag', 'yellow_peep_flag',
]
_yellow_cardio_cols = ['yellow_map_flag', 'yellow_pulse_flag', 'yellow_lactate_flag']
_green_resp_cols = [
    'green_resp_spo2_flag', 'green_resp_rate_flag',
    'green_fio2_flag', 'green_peep_flag',
]
_green_cardio_cols = [
    'green_map_flag', 'green_pulse_flag', 'green_lactate_flag', 'green_hr_flag',
]
_yellow_cols = _yellow_resp_cols + _yellow_cardio_cols
_green_cols = _green_resp_cols + _green_cardio_cols


def _any_set(columns):
    """Boolean Series: True where at least one of the 0/1 *columns* is 1."""
    return final_df[columns].eq(1).any(axis=1)


def _all_set(columns):
    """Boolean Series: True where every one of the 0/1 *columns* is 1."""
    return final_df[columns].eq(1).all(axis=1)


# Eligibility context shared by all composites: no tracheostomy, no
# paralytics, and time_from_vent_adjusted different from -1.
_eligible = (
    (final_df['hourly_trach'] == 0)
    & (final_df['paralytics_flag'] == 0)
    & (final_df['time_from_vent_adjusted'] != -1)
)
# Daytime window: recorded_hour from 8 up to (not including) 17.
_daytime = (final_df['recorded_hour'] >= 8) & (final_df['recorded_hour'] < 17)
_window = _eligible & _daytime

# Raw colour status, independent of eligibility and hour of day. Working with
# real booleans avoids applying bitwise `~` to integer columns and lets the
# all-hours variants test red status outside the daytime window too.
_red = _any_set(_red_cols)
_yellow = _any_set(_yellow_cols)
_green_any = _any_set(_green_cols)
_green_all = _all_set(_green_cols)

## Green subcomponent flags
final_df['green_resp_flag'] = (_all_set(_green_resp_cols) & _window).astype(int)
# Green cardio flag: MAP, pulse, lactate and heart-rate criteria
final_df['green_cardio_flag'] = (_all_set(_green_cardio_cols) & _window).astype(int)

final_df['any_red'] = (_red & _window).astype(int)
final_df['no_red'] = (~_red & _window).astype(int)
final_df['any_yellow'] = (_yellow & _window).astype(int)
final_df['any_green'] = (_green_any & _window).astype(int)

final_df['all_green'] = (_green_all & _window).astype(int)
final_df['all_green_all_hours'] = (_green_all & _eligible).astype(int)
final_df['all_green_no_red'] = (_green_all & ~_red & _window).astype(int)
final_df['all_green_no_red_yellow'] = (
    _green_all & ~_red & ~_yellow & _window
).astype(int)

final_df['all_yellow_no_red_green'] = (
    _all_set(_yellow_cols) & ~_red & ~_green_any & _window
).astype(int)
final_df['any_yellow_no_red_green'] = (
    _yellow & ~_red & ~_green_any & _window
).astype(int)

final_df['any_yellow_or_green_no_red'] = (
    (_yellow | _green_any) & ~_red & _window
).astype(int)

# All-hours variant. This previously tested `any_red == 0`, but any_red is
# itself limited to the daytime window, so red criteria were ignored for
# every hour outside 08:00-16:59. The raw red status is used instead.
final_df['any_yellow_or_green_no_red_all_hours'] = (
    (_yellow | _green_any) & ~_red & _eligible
).astype(int)

# Any yellow-or-green respiratory / cardiovascular component with no red.
final_df['yellow_resp_flag'] = (
    _any_set(_yellow_resp_cols + _green_resp_cols) & ~_red & _window
).astype(int)
final_df['yellow_cardio_flag'] = (
    _any_set(_yellow_cardio_cols + _green_cardio_cols) & ~_red & _window
).astype(int)

final_df['yellow_all_green'] = (
    final_df['all_green_no_red'].eq(1) & final_df['any_yellow'].eq(0)
).astype(int)

final_df['yellow_not_all_green'] = (
    final_df['any_yellow_or_green_no_red'].eq(1) & final_df['all_green_no_red'].eq(0)
).astype(int)

In [None]:
# Number of hourly rows on which each composite consensus flag is set
# (the flags are 0/1, so the column sum is the count of 1s).
_consensus_summary_flags = [
    'any_red', 'any_yellow', 'any_green', 'all_green',
    'all_green_no_red', 'all_green_no_red_yellow', 'all_yellow_no_red_green',
    'any_yellow_no_red_green', 'any_yellow_or_green_no_red', 'no_red',
    'yellow_all_green', 'yellow_not_all_green',
]
print(final_df[_consensus_summary_flags].sum())

In [None]:
# Persist the hourly frame together with all criteria flags.
final_df.to_parquet('../output/intermediate/final_df_w_criteria.parquet')

## TableOne

In [None]:
# One row per encounter block telling whether each criteria set was ever met:
# the max of a 0/1 hourly flag is 1 as soon as a single hour qualifies.
_ever_met_flags = [
    'patel_flag',
    'team_flag',
    'any_yellow_or_green_no_red',
    'all_green',
    'all_green_no_red',
]
criteria_block_results = (
    final_df
    .groupby('encounter_block')
    .agg({flag: 'max' for flag in _ever_met_flags})
    .reset_index()
)

# Attach the ever-met indicators to the block-level table (left join keeps
# every block, including those without hourly rows).
final_df_blocks_merged = pd.merge(
    final_df_blocks, criteria_block_results, on='encounter_block', how='left'
)

# Block-level summaries of the hourly support variables.
_support_summaries = {
    'ne_calc_last': 'max',   # > 0 means the block had a nonzero dose in some hour
    'max_peep_set': 'mean',  # average of the hourly maximum PEEP
    'min_fio2_set': 'mean',  # average of the hourly minimum FiO2
}
vaso_peep_fio2_stats = (
    final_df.groupby('encounter_block').agg(_support_summaries).reset_index()
)

# Block table + criteria indicators + support summaries.
all_encounters = final_df_blocks_merged.merge(
    vaso_peep_fio2_stats, on='encounter_block', how='left'
)

# Create subsets and map race for each
def map_race_column(df, race_column='race'):
    """Add a collapsed ``race_new`` column to *df* and return *df*.

    Values of ``df[race_column]`` are recoded to 'Black', 'White' or 'Other';
    anything not listed below (including missing values) becomes 'Missing'.
    The frame is modified in place and the same object is returned.
    """
    grouped_as_other = (
        'Asian',
        'American Indian or Alaska Native',
        'Native Hawaiian or Other Pacific Islander',
        'Other',
        'Unknown',
    )
    recode = {
        'Black or African-American': 'Black',
        'Black or African American': 'Black',
        'White': 'White',
    }
    recode.update(dict.fromkeys(grouped_as_other, 'Other'))

    collapsed = df[race_column].map(recode)
    df['race_new'] = collapsed.fillna('Missing')
    return df

# Recode race on the full block table, then build one subset per criteria set
# containing the blocks whose ever-met indicator equals 1.
all_encounters = map_race_column(all_encounters, 'race_category')


def _encounters_meeting(flag):
    """Return a race-recoded copy of the blocks where *flag* equals 1."""
    return map_race_column(all_encounters[all_encounters[flag] == 1].copy(), 'race_category')


patel_subset = _encounters_meeting('patel_flag')
team_subset = _encounters_meeting('team_flag')
yellow_subset = _encounters_meeting('any_yellow_or_green_no_red')
green_subset = _encounters_meeting('all_green')
green_no_red_subset = _encounters_meeting('all_green_no_red')

# Calculate vasopressor usage for each subset
def calculate_vasopressor_stats(df):
    """Summarise ``ne_calc_last`` across the rows of *df*.

    Returns a 4-tuple ``(n_vaso, n_zero, n_missing, total)``: rows with a
    value above 0, rows equal to 0, rows with a missing value, and the total
    row count.
    """
    doses = df['ne_calc_last']
    n_missing = doses.isna().sum()
    n_zero = doses.eq(0).sum()
    # A missing dose never compares greater than 0, so no separate NA guard
    # is required here.
    n_vaso = doses.gt(0).sum()
    return n_vaso, n_zero, n_missing, len(df)

# Vasopressor exposure counts for the full cohort and each criteria subset.
_cohorts = [
    ('All Encounters', all_encounters),
    ('Patel Criteria', patel_subset),
    ('TEAM Criteria', team_subset),
    ('Yellow Criteria', yellow_subset),
    ('Green Criteria', green_subset),
    ('Green-No-Red Criteria', green_no_red_subset),
]
vaso_stats = {
    label: calculate_vasopressor_stats(frame) for label, frame in _cohorts
}

# Variables reported in the table.
categorical = [
    'sex_category', 'race_new', 'ethnicity_category',
    'location_category', 'is_dead',
]
continuous = [
    'age_at_admission', 'sofa_cv_97', 'sofa_coag', 'sofa_renal',
    'sofa_liver', 'sofa_resp', 'sofa_cns', 'sofa_total',
    'ne_calc_last', 'max_peep_set', 'min_fio2_set',
]

# Table for the full cohort; its row layout is the template every criteria
# subset is aligned to. All continuous variables are declared non-normal.
table_all = TableOne(
    all_encounters,
    columns=categorical + continuous,
    categorical=categorical,
    groupby=None,
    nonnormal=continuous,
    pval=False,
)
df_all = table_all.tableone.reset_index()

# Dropping the 'n' row from the template is intentionally disabled:
# df_all = df_all[~((df_all['level_0'] == 'n') & (df_all['level_1'].isna()))]

# Keep the two row-label columns plus the last (value) column.
_all_value_column = df_all.columns[-1]
df_template = pd.DataFrame({
    'Characteristics': df_all['level_0'],
    'Category': df_all['level_1'],
    'All Encounters': df_all[_all_value_column],
})

# Function to process each criteria subset
def process_criteria_subset(subset_df, criteria_name, template):
    """Build one table column for *subset_df*, aligned to *template*'s rows.

    A TableOne summary of *subset_df* is computed with the module-level
    ``categorical`` and ``continuous`` variable lists.  Rows where
    ``level_0 == 'n'`` and ``level_1`` is missing are dropped, and the last
    column of the summary is left-merged onto the template's
    ('Characteristics', 'Category') pairs, so template rows that are absent
    from the subset come back as NaN.

    Returns the merged column as a Series named *criteria_name*.
    """
    summary = TableOne(
        subset_df,
        columns=categorical + continuous,
        categorical=categorical,
        groupby=None,
        nonnormal=continuous,
        pval=False,
    ).tableone.reset_index()

    # Drop the 'n' row
    is_n_row = (summary['level_0'] == 'n') & (summary['level_1'].isna())
    summary = summary[~is_n_row]

    # Same three-column layout as the template
    subset_column = pd.DataFrame({
        'Characteristics': summary['level_0'],
        'Category': summary['level_1'],
        criteria_name: summary[summary.columns[-1]],
    })

    # Left merge keeps every template row, in template order
    aligned = template[['Characteristics', 'Category']].merge(
        subset_column,
        on=['Characteristics', 'Category'],
        how='left',
    )
    return aligned[criteria_name]

# One template-aligned column per eligibility criterion
patel_col, team_col, yellow_col, green_col, green_no_red_col = (
    process_criteria_subset(subset_df, criteria_label, df_template)
    for subset_df, criteria_label in (
        (patel_subset, 'Patel Criteria'),
        (team_subset, 'TEAM Criteria'),
        (yellow_subset, 'Yellow Criteria'),
        (green_subset, 'Green Criteria'),
        (green_no_red_subset, 'Green-No-Red Criteria'),
    )
)

# Place the criteria columns next to the template columns
final_table = pd.concat(
    [
        df_template[['Characteristics', 'Category', 'All Encounters']],
        patel_col, team_col, yellow_col, green_col, green_no_red_col,
    ],
    axis=1,
)

# Clean up: drop a 'Missing' category row whose 'All Encounters' value
# starts with '0'
final_table = final_table.loc[
    ~(
        (final_table['Category'] == 'Missing')
        & (final_table['All Encounters'].str.startswith('0'))
    )
]

# Format mortality rows
# Data frame behind each table column; any column not listed here falls
# back to the green-no-red subset.
_mortality_groups = {
    'All Encounters': all_encounters,
    'Patel Criteria': patel_subset,
    'TEAM Criteria': team_subset,
    'Yellow Criteria': yellow_subset,
    'Green Criteria': green_subset,
}
mortality_rows = final_table.loc[final_table['Characteristics'] == 'is_dead']
for col in final_table.columns[2:]:  # skip 'Characteristics' and 'Category'
    _group = _mortality_groups.get(col, green_no_red_subset)
    total = len(_group)
    deaths = _group['is_dead'].sum()
    if total > 0:
        percentage = deaths / total * 100
    else:
        percentage = 0
    # Overwrite the is_dead == '1' cell with "deaths (percent of all rows)"
    mortality_rows.loc[mortality_rows['Category'] == '1', col] = f"{deaths} ({percentage:.1f})"

# Write the reformatted rows back into the table
final_table.loc[final_table['Characteristics'] == 'is_dead'] = mortality_rows

# Relabel the rows
final_table.loc[final_table['Characteristics'] == 'is_dead', 'Characteristics'] = 'Mortality'
# final_table.loc[final_table['Category'] == '1', 'Category'] = ''

# Append one 'Vasopressor Status' row per status, formatted as "count (percent)"
vaso_rows = []
for status in ['Received Vasopressors', 'No Vasopressors', 'Missing Vasopressor Data']:
    row_data = {'Characteristics': 'Vasopressor Status', 'Category': status}
    for col in final_table.columns[2:]:  # skip 'Characteristics' and 'Category'
        if col not in vaso_stats:
            continue
        n_vaso, n_zero, n_missing, total = vaso_stats[col]
        # Any status other than the first two reports the missing count
        value = {
            'Received Vasopressors': n_vaso,
            'No Vasopressors': n_zero,
        }.get(status, n_missing)

        percentage = (value / total * 100) if total > 0 else 0
        row_data[col] = f"{value} ({percentage:.1f})"
    vaso_rows.append(row_data)

vaso_df = pd.DataFrame(vaso_rows)
final_table = pd.concat([final_table, vaso_df], ignore_index=True)

# Header row holding the number of encounters in each group
n_row = pd.DataFrame({
    'Characteristics': ['n'],
    'Category': [''],
    **{
        group_label: [str(len(group_df))]
        for group_label, group_df in (
            ('All Encounters', all_encounters),
            ('Patel Criteria', patel_subset),
            ('TEAM Criteria', team_subset),
            ('Yellow Criteria', yellow_subset),
            ('Green Criteria', green_subset),
            ('Green-No-Red Criteria', green_no_red_subset),
        )
    },
})

# Put the n row first and renumber the index
final_table = pd.concat([n_row, final_table]).reset_index(drop=True)

# Persist the table, then display it as the cell output
final_table.to_csv('../output/final/table1_results.csv', index=False)

print("Table 1 has been generated and saved to table1_results.csv")
final_table

## TableOne - 72 hours 

In [None]:
import pandas as pd
from tableone import TableOne

# 1) restrict all the raw hourly rows to the first 72 hours
final_df_72h = final_df.query("time_from_vent <= 72").copy()

# One row per encounter block: 1 if the criterion was met in any retained hour
criteria_block_results = final_df_72h.groupby('encounter_block').agg({
    'patel_flag': 'max',  # 1 if criteria ever met
    'team_flag': 'max',
    'any_yellow_or_green_no_red': 'max',
    'all_green': 'max',
    'all_green_no_red': 'max'
}).reset_index()

# Keep only the block-level rows of encounters that have hourly data in the window
final_df_blocks_72h = final_df_blocks[
    final_df_blocks['encounter_block'].isin(criteria_block_results['encounter_block'])
].copy()

final_df_blocks_72h = final_df_blocks_72h.merge(
    criteria_block_results,
    on='encounter_block',
    how='left'
)

# Aggregate vasopressor / PEEP / FiO2 values at encounter_block level.
# These are taken from the 72-hour frame (not the full final_df) so that
# they describe the same window as the criteria flags above.
vaso_peep_fio2_stats = final_df_72h.groupby('encounter_block').agg({
    'ne_calc_last': 'max',  # If any value is > 0, the block received vasopressors
    'max_peep_set': 'mean',  # Average of max PEEP in the block
    'min_fio2_set': 'mean'   # Average of min FiO2 in the block
}).reset_index()

# Merge these stats with the 72-hour block-level table
all_encounters = final_df_blocks_72h.merge(
    vaso_peep_fio2_stats,
    on='encounter_block',
    how='left'
)

# Create subsets and map race for each
def map_race_column(df, race_column='race'):
    """Add a collapsed ``race_new`` column to *df* and return *df*.

    Values of ``df[race_column]`` are mapped to 'Black', 'White' or 'Other';
    anything not in the mapping (including missing values) becomes
    'Missing'.  The column is added to *df* in place; the same object is
    returned.
    """
    collapsed = {
        'Black or African-American': 'Black',
        'Black or African American': 'Black',
        'White': 'White',
    }
    # Every remaining recognised label is reported as 'Other'
    for other_label in (
        'Asian',
        'American Indian or Alaska Native',
        'Native Hawaiian or Other Pacific Islander',
        'Other',
        'Unknown',
    ):
        collapsed[other_label] = 'Other'

    mapped = df[race_column].map(collapsed)
    df['race_new'] = mapped.fillna('Missing')
    return df

# Map race on the full table first, then build one subset per criterion
# (rows whose flag equals 1), each on its own copy.
all_encounters = map_race_column(all_encounters, 'race_category')
patel_subset, team_subset, yellow_subset, green_subset, green_no_red_subset = (
    map_race_column(all_encounters[all_encounters[flag_col] == 1].copy(), 'race_category')
    for flag_col in (
        'patel_flag',
        'team_flag',
        'any_yellow_or_green_no_red',
        'all_green',
        'all_green_no_red',
    )
)

# Calculate vasopressor usage for each subset
def calculate_vasopressor_stats(df):
    """Count rows of *df* by the value of its ``ne_calc_last`` column.

    Returns a 4-tuple ``(n_vaso, n_zero, n_missing, total)``:
    rows with a value greater than 0, rows with a value equal to 0,
    rows with a missing value, and the number of rows in *df*.
    Negative values are counted in none of the first three numbers.
    """
    dose = df['ne_calc_last']
    is_missing = dose.isna()
    # A missing dose never counts as vasopressor use.
    received = ~is_missing & (dose > 0)
    return received.sum(), (dose == 0).sum(), is_missing.sum(), len(df)

# Vasopressor counts for every group, keyed by the column label used in the table
vaso_stats = {
    group_label: calculate_vasopressor_stats(group_df)
    for group_label, group_df in (
        ('All Encounters', all_encounters),
        ('Patel Criteria', patel_subset),
        ('TEAM Criteria', team_subset),
        ('Yellow Criteria', yellow_subset),
        ('Green Criteria', green_subset),
        ('Green-No-Red Criteria', green_no_red_subset),
    )
}

# Variables summarised in the table
categorical = [
    'sex_category',
    'race_new',
    'ethnicity_category',
    'location_category',
    'is_dead',
]

continuous = [
    'age_at_admission',
    'sofa_cv_97', 'sofa_coag', 'sofa_renal', 'sofa_liver',
    'sofa_resp', 'sofa_cns', 'sofa_total',
    'ne_calc_last', 'max_peep_set', 'min_fio2_set',
]

# Summary of all encounters; its row layout is the template that the
# per-criteria columns are aligned to.
table_all = TableOne(
    all_encounters,
    columns=categorical + continuous,
    categorical=categorical,
    groupby=None,
    nonnormal=continuous,
    pval=False,
)
df_all = table_all.tableone.reset_index()

# The filter that would drop the 'n' row from the template is disabled:
# df_all = df_all[~((df_all['level_0'] == 'n') & (df_all['level_1'].isna()))]

# Template = the two index columns plus the last column of the summary
df_template = pd.DataFrame({
    'Characteristics': df_all['level_0'],
    'Category': df_all['level_1'],
    'All Encounters': df_all[df_all.columns[-1]],
})

# Function to process each criteria subset
def process_criteria_subset(subset_df, criteria_name, template):
    """Build one table column for *subset_df*, aligned to *template*'s rows.

    A TableOne summary of *subset_df* is computed with the module-level
    ``categorical`` and ``continuous`` variable lists.  Rows where
    ``level_0 == 'n'`` and ``level_1`` is missing are dropped, and the last
    column of the summary is left-merged onto the template's
    ('Characteristics', 'Category') pairs, so template rows that are absent
    from the subset come back as NaN.

    Returns the merged column as a Series named *criteria_name*.
    """
    summary = TableOne(
        subset_df,
        columns=categorical + continuous,
        categorical=categorical,
        groupby=None,
        nonnormal=continuous,
        pval=False,
    ).tableone.reset_index()

    # Drop the 'n' row
    is_n_row = (summary['level_0'] == 'n') & (summary['level_1'].isna())
    summary = summary[~is_n_row]

    # Same three-column layout as the template
    subset_column = pd.DataFrame({
        'Characteristics': summary['level_0'],
        'Category': summary['level_1'],
        criteria_name: summary[summary.columns[-1]],
    })

    # Left merge keeps every template row, in template order
    aligned = template[['Characteristics', 'Category']].merge(
        subset_column,
        on=['Characteristics', 'Category'],
        how='left',
    )
    return aligned[criteria_name]

# One template-aligned column per eligibility criterion
patel_col, team_col, yellow_col, green_col, green_no_red_col = (
    process_criteria_subset(subset_df, criteria_label, df_template)
    for subset_df, criteria_label in (
        (patel_subset, 'Patel Criteria'),
        (team_subset, 'TEAM Criteria'),
        (yellow_subset, 'Yellow Criteria'),
        (green_subset, 'Green Criteria'),
        (green_no_red_subset, 'Green-No-Red Criteria'),
    )
)

# Place the criteria columns next to the template columns
final_table = pd.concat(
    [
        df_template[['Characteristics', 'Category', 'All Encounters']],
        patel_col, team_col, yellow_col, green_col, green_no_red_col,
    ],
    axis=1,
)

# Clean up: drop a 'Missing' category row whose 'All Encounters' value
# starts with '0'
final_table = final_table.loc[
    ~(
        (final_table['Category'] == 'Missing')
        & (final_table['All Encounters'].str.startswith('0'))
    )
]

# Format mortality rows
# Data frame behind each table column; any column not listed here falls
# back to the green-no-red subset.
_mortality_groups = {
    'All Encounters': all_encounters,
    'Patel Criteria': patel_subset,
    'TEAM Criteria': team_subset,
    'Yellow Criteria': yellow_subset,
    'Green Criteria': green_subset,
}
mortality_rows = final_table.loc[final_table['Characteristics'] == 'is_dead']
for col in final_table.columns[2:]:  # skip 'Characteristics' and 'Category'
    _group = _mortality_groups.get(col, green_no_red_subset)
    total = len(_group)
    deaths = _group['is_dead'].sum()
    if total > 0:
        percentage = deaths / total * 100
    else:
        percentage = 0
    # Overwrite the is_dead == '1' cell with "deaths (percent of all rows)"
    mortality_rows.loc[mortality_rows['Category'] == '1', col] = f"{deaths} ({percentage:.1f})"

# Write the reformatted rows back into the table
final_table.loc[final_table['Characteristics'] == 'is_dead'] = mortality_rows

# Relabel the rows
final_table.loc[final_table['Characteristics'] == 'is_dead', 'Characteristics'] = 'Mortality'
# final_table.loc[final_table['Category'] == '1', 'Category'] = ''

# Append one 'Vasopressor Status' row per status, formatted as "count (percent)"
vaso_rows = []
for status in ['Received Vasopressors', 'No Vasopressors', 'Missing Vasopressor Data']:
    row_data = {'Characteristics': 'Vasopressor Status', 'Category': status}
    for col in final_table.columns[2:]:  # skip 'Characteristics' and 'Category'
        if col not in vaso_stats:
            continue
        n_vaso, n_zero, n_missing, total = vaso_stats[col]
        # Any status other than the first two reports the missing count
        value = {
            'Received Vasopressors': n_vaso,
            'No Vasopressors': n_zero,
        }.get(status, n_missing)

        percentage = (value / total * 100) if total > 0 else 0
        row_data[col] = f"{value} ({percentage:.1f})"
    vaso_rows.append(row_data)

vaso_df = pd.DataFrame(vaso_rows)
final_table = pd.concat([final_table, vaso_df], ignore_index=True)

# Add n row at the top (only once): number of encounters in each group
n_row = pd.DataFrame({
    'Characteristics': ['n'],
    'Category': [''],
    'All Encounters': [str(len(all_encounters))],
    'Patel Criteria': [str(len(patel_subset))],
    'TEAM Criteria': [str(len(team_subset))],
    'Yellow Criteria': [str(len(yellow_subset))],
    'Green Criteria': [str(len(green_subset))],
    'Green-No-Red Criteria': [str(len(green_no_red_subset))]
})

# Put the n row first and renumber the index
final_table = pd.concat([n_row, final_table]).reset_index(drop=True)

# Save to CSV. The file name now matches the message printed below
# (it was previously written as 'table1_resultss_72hrs.csv').
final_table.to_csv('../output/final/table1_results_72hrs.csv', index=False)

print("Table 1 has been generated and saved to table1_results_72hrs.csv")
final_table

## Missingness

In [None]:
# Identifier / timing columns
key_variables = [
    'encounter_block', 'hospitalization_id',
    'recorded_date', 'recorded_hour', 'time_from_vent',
    'hourly_trach', 'paralytics_flag',
]

# Fields listed for the TEAM criteria
reqd_team_fields = [
    'hourly_trach', 'paralytics_flag',
    'lactate', 'max_heart_rate', 'ne_calc_last',
    'last_ne_dose_last_6_hours', 'min_fio2_set', 'max_peep_set',
    'max_resp_rate_obs',
    'team_pulse_flag', 'team_lactate_flag', 'team_ne_flag',
    'team_fio2_flag', 'team_peep_flag', 'team_resp_rate_flag',
]

# Building blocks shared by the yellow and green field lists
_clinical_measurements = [
    'min_spo2', 'min_map', 'max_map', 'ne_calc_last', 'max_sbp', 'avg_map',
    'max_heart_rate', 'min_heart_rate', 'min_fio2_set',
    'max_resp_rate_obs', 'min_peep_set', 'lactate',
]
_green_flags = [
    'green_resp_spo2_flag', 'green_resp_rate_flag', 'green_fio2_flag',
    'green_peep_flag', 'green_map_flag', 'green_pulse_flag',
    'green_lactate_flag', 'green_hr_flag',
]

# Yellow: clinical measurements, then red / yellow / green flags, then composites
reqd_yellow_fields = (
    _clinical_measurements
    + [
        # Red Flags
        'red_resp_spo2_flag', 'red_map_flag', 'red_high_support_flag',
        'red_hypertensive_flag', 'red_pulse_high_flag', 'red_pulse_low_flag',
        'red_meds_flag',
        # Yellow Flags
        'yellow_resp_spo2_flag', 'yellow_fio2_flag', 'yellow_resp_rate_flag',
        'yellow_peep_flag', 'yellow_map_flag', 'yellow_pulse_flag',
        'yellow_lactate_flag',
    ]
    + _green_flags
    + [
        # Composite Flags
        'any_red', 'any_yellow', 'any_green', 'all_green',
        'all_green_no_red', 'all_green_no_red_yellow',
        'all_yellow_no_red_green', 'any_yellow_no_red_green',
        'any_yellow_or_green_no_red', 'yellow_resp_flag',
        'yellow_cardio_flag', 'yellow_all_green', 'yellow_not_all_green',
    ]
)

# Fields listed for the Patel criteria
reqd_patel_fields = [
    'min_map', 'max_map', 'max_sbp', 'min_sbp', 'avg_map',
    'min_heart_rate', 'max_heart_rate', 'min_respiratory_rate', 'min_spo2',
    'max_respiratory_rate',
    'patel_map_flag', 'patel_sbp_flag', 'patel_pulse_flag',
    'patel_resp_rate_flag', 'patel_spo2_flag',
    'patel_resp_flag', 'patel_cardio_flag',
]

# Green: clinical measurements, green flags, then the two green composites
reqd_green_fields = _clinical_measurements + _green_flags + [
    'all_green',
    'all_green_no_red',
]

In [None]:
# Function to calculate percentage of missing values per encounter block
def calculate_missing_percentage(df, variable_list, exclude_flags=True):
    """Return, per variable, the percentage of encounter blocks never measured.

    A block counts as "never measured" for a variable when every row of that
    block is missing (NaN) in the variable's column.

    Parameters
    ----------
    df : pandas.DataFrame
        Hourly data with an ``encounter_block`` column and one column per
        name in *variable_list*.
    variable_list : list of str
        Candidate column names.  Repeated names are evaluated once.
    exclude_flags : bool, default True
        When True, names containing ``"flag"`` (case-insensitive) are skipped.

    Returns
    -------
    pandas.Series
        Percentages indexed by variable name, sorted in descending order.
        The identifier / timing columns listed below are always skipped,
        whatever *exclude_flags* is.

    Raises
    ------
    KeyError
        If a variable that is kept is not a column of *df*.
    """
    administrative = {
        'encounter_block', 'hospitalization_id',
        'recorded_dttm', 'recorded_date', 'recorded_hour',
        'time_from_vent', 'hourly_trach', 'paralytics_flag',
    }
    # dict.fromkeys removes repeated names while keeping the caller's order
    vars_to_check = [
        var for var in dict.fromkeys(variable_list)
        if var not in administrative
        and not (exclude_flags and 'flag' in var.lower())
    ]

    total_blocks = df['encounter_block'].nunique()

    # One vectorised pass for all variables: True where every row of the
    # block is missing in that column.
    never_measured = (
        df[vars_to_check]
        .isna()
        .groupby(df['encounter_block'])
        .all()
    )

    # With zero blocks the Series division yields NaN instead of raising.
    missing_pct = never_measured.sum() / total_blocks * 100
    return missing_pct.sort_values(ascending=False)

# Never-measured percentages for each criteria set.
# NOTE(review): green_no_red_missing is computed from reqd_yellow_fields -
# confirm that this is the intended field list.
(
    team_missing,
    yellow_missing,
    patel_missing,
    green_missing,
    green_no_red_missing,
) = (
    calculate_missing_percentage(final_df, _fields)
    for _fields in (
        reqd_team_fields,
        reqd_yellow_fields,
        reqd_patel_fields,
        reqd_green_fields,
        reqd_yellow_fields,
    )
)

# Optional bar plots (disabled):
# def plot_missing_data(missing_series, title):
#     plt.figure(figsize=(12, 6))
#     sns.barplot(x=missing_series.values, y=missing_series.index)
#     plt.title(f"Percentage of Blocks with Never Measured Variables - {title}")
#     plt.xlabel("Percentage of Blocks (%)")
#     plt.ylabel("Variables")
#     plt.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
#     plt.tight_layout()
#     plt.show()

# # Plot for each criteria set
# plot_missing_data(team_missing, "TEAM Criteria")
# plot_missing_data(yellow_missing, "Yellow Criteria")
# plot_missing_data(patel_missing, "Patel Criteria")
# plot_missing_data(green_missing, "Green Criteria")

# Print tabular summaries (green_missing is saved below but not printed)
for _header, _summary in (
    ("\nTEAM Criteria Missing Data Summary:", team_missing),
    ("\nYellow Criteria Missing Data Summary:", yellow_missing),
    ("\nPatel Criteria Missing Data Summary:", patel_missing),
    ("\nGreen No Red Criteria Missing Data Summary:", green_no_red_missing),
):
    print(_header)
    print(_summary.round(2))

# Save every summary to its own CSV
for _summary, _csv_path in (
    (team_missing, '../output/final/team_missing_data.csv'),
    (yellow_missing, '../output/final/yellow_missing_data.csv'),
    (patel_missing, '../output/final/patel_missing_data.csv'),
    (green_missing, '../output/final/green_missing_data.csv'),
    (green_no_red_missing, '../output/final/green_no_red_missing_data.csv'),
):
    _summary.to_csv(_csv_path)

## Competing Risk Analysis Setup

Create a dataframe for each criterion with the following columns:

1. encounter_block: identify the patient encounter
2. time_eligibility: earliest eligibility time from first intubation episode per encounter block
3. time_death: time from ventilation start to death, if applicable. Missing if not dead
4. time_discharge_alive: time from ventilation start to discharge. If not dead, assumed discharged and the last recorded vital time is discharge time.
5. t_event: earliest of the above three times
6. outcome: 1(eligibility), 2(death), 3(discharge)

### Competing risk updated (4/14)

In [None]:
##############################################################################
# Helper: build block-level data set for competing-risk analysis
##############################################################################
def create_competing_risk_dataset(
    criteria_df: pd.DataFrame,
    all_ids_w_outcome: pd.DataFrame,
    flag_col: str = "patel_flag"
) -> pd.DataFrame:
    """
    Build one row per encounter_block for a competing-risk analysis.

    Output columns
    --------------
      time_eligibility        - smallest time_from_vent where <flag_col> == 1
                                (NaN if the flag is never 1)
      time_death              - hours from vent start to final outcome when
                                is_dead == 1 (NaN otherwise)
      time_discharge_alive    - hours from vent start to final outcome when
                                is_dead == 0 (NaN otherwise)
      t_event                 - row-wise minimum of the three times, NaNs skipped
      outcome                 - 1(eligible)/2(death)/3(discharge)

    Parameters
    ----------
    criteria_df : hourly rows; must contain encounter_block, time_from_vent,
        recorded_date, recorded_hour and <flag_col>.
    all_ids_w_outcome : block-level rows with encounter_block,
        block_vent_start_dttm, final_outcome_dttm and is_dead.  Only blocks
        that also appear in criteria_df are kept.
    flag_col : name of the eligibility flag column in criteria_df.

    Raises
    ------
    ValueError if criteria_df lacks one of the required columns.

    Assumptions
    -----------
    • time_from_vent (in hours) is already *after* the 4hour cool-off.
    • all_ids_w_outcome has one row per encounter_block.

    Notes
    -----
    Ties are resolved in favour of eligibility, then death.  Rows that match
    neither (including rows where every time is NaN) get outcome 3.
    """

    ###################################################################
    # 0) Basic column checks
    ###################################################################
    needed_cols = [
        "encounter_block",
        "time_from_vent",          # raw hours since intubation (already cooled-off)
        "recorded_date",
        "recorded_hour",
        flag_col
    ]
    missing = [c for c in needed_cols if c not in criteria_df.columns]
    if missing:
        raise ValueError(f"criteria_df is missing columns: {missing}")

    ###################################################################
    # 1) FIRST ELIGIBILITY TIME  (earliest hour where flag==1)
    ###################################################################
    first_elig = (
        criteria_df
        .loc[criteria_df[flag_col] == 1, ["encounter_block", "time_from_vent"]]
        .groupby("encounter_block", as_index=False)
        .min()
        .rename(columns={"time_from_vent": "time_eligibility"})
    )

    ###################################################################
    # 2) BLOCK-LEVEL death / discharge times
    ###################################################################
    block_cols = [
        "encounter_block",
        "block_vent_start_dttm",
        "final_outcome_dttm",
        "is_dead"
    ]
    block_level = (
        all_ids_w_outcome
        .loc[all_ids_w_outcome["encounter_block"].isin(criteria_df["encounter_block"]),
             block_cols]
        .copy()
    )

    # convert to datetime once; unparseable values become NaT
    for dttm_col in ("block_vent_start_dttm", "final_outcome_dttm"):
        block_level[dttm_col] = pd.to_datetime(block_level[dttm_col], errors="coerce")

    # hours from vent start to the *final* outcome
    hrs_from_vent = (
        (block_level["final_outcome_dttm"] - block_level["block_vent_start_dttm"])
        .dt.total_seconds() / 3600
    )

    block_level["time_death"] = np.where(
        block_level["is_dead"] == 1, hrs_from_vent, np.nan
    )
    block_level["time_discharge_alive"] = np.where(
        block_level["is_dead"] == 0, hrs_from_vent, np.nan
    )

    ###################################################################
    # 3) MERGE and decide which event happened first
    ###################################################################
    result = (
        block_level[["encounter_block", "time_death", "time_discharge_alive"]]
        .merge(first_elig, on="encounter_block", how="left")
    )

    # earliest non-NaN time
    result["t_event"] = result[
        ["time_eligibility", "time_death", "time_discharge_alive"]
    ].min(axis=1, skipna=True)

    # Outcome code, computed column-wise.  A row-wise DataFrame.apply would
    # hand back a DataFrame for an empty frame, which cannot be assigned to a
    # single column; np.select handles zero rows as well.
    t_event = result["t_event"].to_numpy(dtype=float)
    t_elig = result["time_eligibility"].to_numpy(dtype=float)
    t_death = result["time_death"].to_numpy(dtype=float)
    result["outcome"] = np.select(
        [
            np.isfinite(t_elig) & (t_event == t_elig),    # eligibility first
            np.isfinite(t_death) & (t_event == t_death),  # death first
        ],
        [1, 2],
        default=3,   # otherwise discharge
    )

    return result[
        ["encounter_block",
         "time_eligibility",
         "time_death",
         "time_discharge_alive",
         "t_event",
         "outcome"]
    ].reset_index(drop=True)

In [None]:
# Attach the block-level timing / outcome columns to every hourly row
df_merged = final_df.merge(
    all_ids_w_outcome[[
        'encounter_block',
        'block_vent_start_dttm', 'block_vent_end_dttm',
        'block_first_vital_dttm', 'block_last_vital_dttm', 'discharge_dttm',
        'discharge_category', 'death_dttm', 'final_outcome_dttm', 'is_dead',
    ]],
    on='encounter_block',
    how='left',
)

# One independent copy per criteria set
(
    df_merged_team,
    df_merged_yellow,
    df_merged_patel,
    df_merged_green,
    df_merged_green_no_red,
) = (df_merged.copy() for _ in range(5))

In [None]:
def _build_and_save_competing(criteria_df, flag_col, out_path):
    """Build the competing-risk table for *flag_col* and write it to *out_path* as parquet."""
    competing = create_competing_risk_dataset(
        criteria_df=criteria_df,
        all_ids_w_outcome=all_ids_w_outcome,
        flag_col=flag_col,
    )
    competing.to_parquet(out_path)
    return competing


# One competing-risk table per criteria set, each saved as it is built
df_patel_competing = _build_and_save_competing(
    df_merged_patel,
    "patel_flag",
    "../output/intermediate/competing_risk_patel_final.parquet",
)

df_team_competing = _build_and_save_competing(
    df_merged_team,
    "team_flag",
    "../output/intermediate/competing_risk_team_final.parquet",
)

df_yellow_competing = _build_and_save_competing(
    df_merged_yellow,
    "any_yellow_or_green_no_red",
    "../output/intermediate/competing_risk_yellow_final.parquet",
)

df_green_competing = _build_and_save_competing(
    df_merged_green,
    "all_green",
    "../output/intermediate/competing_risk_green_final.parquet",
)

df_green_no_red_competing = _build_and_save_competing(
    df_merged_green_no_red,
    "all_green_no_red",
    "../output/intermediate/competing_risk_green_no_red_final.parquet",
)

In [None]:
import os
import matplotlib.pyplot as plt
from upsetplot import from_indicators, UpSet

import os
import matplotlib.pyplot as plt
from upsetplot import from_indicators, UpSet

def analyse_discharge_without_elig(competing_df, flag_prefix, title):
    """Summarise which sub-criteria were ever failed in blocks with outcome 3.

    The hourly rows of the selected blocks are read from the module-level
    ``final_df``, not from an argument.

    Parameters
    ----------
    competing_df : DataFrame with ``outcome`` and ``encounter_block`` columns.
    flag_prefix : str
        Prefix of the sub-criterion columns (e.g. ``"team_"``).  Columns that
        start with it and end with ``"_flag"`` are used, except
        ``<prefix>flag``, ``<prefix>cardio_flag`` and ``<prefix>resp_flag``.
    title : str
        Used in plot titles and in the output file names.

    Returns
    -------
    DataFrame with one column ``prop_blocks_failed``: for each sub-criterion
    column, the proportion of selected blocks in which it equalled 0 in at
    least one row.

    Side effects
    ------------
    Creates ``../output/final/graphs`` if needed and writes one CSV and two
    PNG files there.
    """
    # ── prepare output directory ──
    out_dir = "../output/final/graphs"
    os.makedirs(out_dir, exist_ok=True)

    # ── 1a) select blocks that failed entirely ──
    # (outcome code 3 in the competing-risk table)
    no_elig_blocks = competing_df.loc[
        competing_df["outcome"] == 3, "encounter_block"
    ].unique()
    df_fail = final_df[final_df["encounter_block"].isin(no_elig_blocks)]

    # ── 1b) pick sub-criteria columns ──
    crit_cols = [
        c for c in df_fail.columns
        if c.startswith(flag_prefix) and c.endswith("_flag")
           and c not in (f"{flag_prefix}flag", f"{flag_prefix}cardio_flag", f"{flag_prefix}resp_flag")
    ]

    # ── 1c) for each block, did it ever FAIL? ──
    # A flag equal to 0 counts as a failure; missing flags do not.
    ever_fail = (df_fail[crit_cols] == 0).groupby(df_fail["encounter_block"]).max()

    # ── 2) BAR PLOT & CSV ──
    # Mean of the per-block booleans = proportion of blocks that ever failed
    freq = ever_fail.mean().sort_values(ascending=True)
    # save raw data
    freq.to_csv(
        os.path.join(out_dir, f"{title}_eligibility_failures_freq.csv"),
        header=["prop_blocks_failed"]
    )

    # plot
    plt.figure(figsize=(6, 4))
    plt.barh(freq.index.str.replace(flag_prefix, ""), freq.values, color="#4c72b0")
    plt.xlabel("Proportion of encounter-blocks where criterion ever failed")
    plt.title(f"{title}: which sub-criteria blocked eligibility?")
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, f"{title}_eligibility_failures.png"), dpi=300)
    plt.close()

    # ----------------------------------------------------------------------
    # 3) UPSET PLOT & CSV
    # ----------------------------------------------------------------------
    upset_data = from_indicators(ever_fail.columns, ever_fail)

    # Manually build a DataFrame of each combination + its count
    # NOTE(review): upset_df is not used afterwards because the CSV export
    # below is commented out; whether "count" really holds counts depends on
    # what from_indicators returns - confirm before re-enabling the export.
    # ------------------------------------------------------------
    #  a) pull out the indicator combinations as a DataFrame
    combos = pd.DataFrame(list(upset_data.index), columns=upset_data.index.names)

    #  b) pull out the counts as a 1D array
    if isinstance(upset_data, pd.Series):
        counts = upset_data.values
    else:
        # if it's a DataFrame, assume the last column is the count
        counts = upset_data.iloc[:, -1].values

    #  c) assemble
    upset_df = combos.copy()
    upset_df["count"] = counts

    # save to CSV
    # upset_df.to_csv(
    #     os.path.join(out_dir, f"{title}_eligibility_failures_upset_data.csv"),
    #     index=False
    # )

    # now plot
    UpSet(
        upset_data,
        show_counts=True,
        sort_by="cardinality",
        intersection_plot_elements=15,
        element_size=None
    ).plot()
    plt.suptitle(f"{title}: top combinations of failed sub-criteria", y=1.02)
    plt.tight_layout()
    plt.savefig(
        os.path.join(out_dir, f"{title}_eligibility_failures_upset.png"),
        dpi=300
    )
    plt.close()

    # Same proportions as written to the CSV, as a one-column DataFrame
    return freq.to_frame("prop_blocks_failed")


# ---------------------------------------------------------------------------
# 2.  Run for each criterion
# ---------------------------------------------------------------------------
team_summary, patel_summary, yellow_summary, green_summary = (
    analyse_discharge_without_elig(_competing, flag_prefix=_prefix, title=_title)
    for _competing, _prefix, _title in (
        (df_team_competing, "team_", "TEAM"),
        (df_patel_competing, "patel_", "Patel"),
        (df_yellow_competing, "yellow_", "Yellow"),
        (df_green_competing, "green_", "Green"),
    )
)

# ---------------------------------------------------------------------------
# 3.  Example: compare the four summaries side-by-side
# ---------------------------------------------------------------------------
# Outer joins keep every sub-criterion; a criterion set that lacks a
# sub-criterion gets 0 for it.
compare = team_summary.rename(columns={"prop_blocks_failed": "TEAM"})
for _label, _summary in (
    ("Patel", patel_summary),
    ("Yellow", yellow_summary),
    ("Green", green_summary),
):
    compare = compare.join(
        _summary.rename(columns={"prop_blocks_failed": _label}),
        how="outer",
    )
compare = compare.fillna(0).sort_index()


### Discharge alive without eligibility

In [None]:
# Columns kept for the TEAM failure analysis
reqd_team_fields = [
    'encounter_block', 'recorded_date', 'recorded_hour',
    'time_from_vent', 'hourly_trach', 'paralytics_flag',
    'lactate', 'max_heart_rate', 'ne_calc_max', 'ne_calc_last',
    'last_ne_dose_last_6_hours', 'min_fio2_set', 'max_peep_set',
    'max_resp_rate_obs',
    'team_pulse_flag', 'team_lactate_flag', 'team_ne_flag',
    'team_fio2_flag', 'team_peep_flag', 'team_resp_rate_flag', 'team_flag',
]

# Blocks whose competing-risk outcome is 3
no_elig_blocks_team = df_team_competing.loc[
    df_team_competing["outcome"] == 3, "encounter_block"
].unique()

# Hourly rows of those blocks, restricted to the TEAM columns and joined
# with each block's discharge category
df_fail_team = final_df[final_df["encounter_block"].isin(no_elig_blocks_team)]
df_fail_team_filtered = (
    df_fail_team[reqd_team_fields]
    .copy()
    .merge(
        all_ids_w_outcome[['discharge_category', 'encounter_block']],
        on='encounter_block',
        how='inner',
    )
)

# Number of distinct blocks per discharge category, largest first
failure_discharge_cats_team = (
    df_fail_team_filtered
    .groupby('discharge_category')['encounter_block']
    .nunique()
    .sort_values(ascending=False)
)
# write to csv
failure_discharge_cats_team.to_csv('../output/final/failure_discharge_cats_team.csv')
failure_discharge_cats_team

In [None]:
# Columns kept when inspecting blocks that failed the Patel criteria:
# identifiers and timing, the raw measurements, then the Patel sub-flags and
# overall flag. Each column is listed exactly once: the original list named
# 'min_spo2' twice, which made the selection below return two identically
# labelled columns.
reqd_patel_fields = [
    'encounter_block', 'recorded_date', 'recorded_hour',
    'time_from_vent', 'hourly_trach', 'paralytics_flag', 'min_spo2',
    'min_map', 'max_map', 'max_sbp', 'min_sbp', 'avg_map',
    'min_heart_rate', 'max_heart_rate', 'min_respiratory_rate',
    'max_respiratory_rate', 'patel_map_flag', 'patel_sbp_flag', 'patel_pulse_flag',
    'patel_resp_rate_flag', 'patel_spo2_flag', 'patel_resp_flag', 'patel_cardio_flag', 'patel_flag',
]

# Encounter blocks whose competing-risk outcome code is 3
# (presumably "discharged alive without ever becoming eligible" -- confirm
# against the cell that builds df_patel_competing).
no_elig_blocks_patel = df_patel_competing.loc[df_patel_competing["outcome"] == 3,
                                      "encounter_block"].unique()

# Hourly rows of those blocks, narrowed to the columns of interest and tagged
# with the discharge category of the block.
df_fail_patel = final_df[final_df["encounter_block"].isin(no_elig_blocks_patel)]
df_fail_patel_filtered = df_fail_patel[reqd_patel_fields].copy()
df_fail_patel_filtered = df_fail_patel_filtered.merge(all_ids_w_outcome[['discharge_category', 'encounter_block']],
                                                    on='encounter_block', how='inner')

# Number of distinct blocks per discharge category, largest first.
failure_discharge_cats_patel = (
    df_fail_patel_filtered
    .groupby('discharge_category')['encounter_block']
    .nunique()
    .sort_values(ascending=False)
)
failure_discharge_cats_patel.to_csv('../output/final/failure_discharge_cats_patel.csv')
failure_discharge_cats_patel

In [None]:
# Columns kept when inspecting blocks that failed the Yellow criteria.
# Each column is listed exactly once: the original list repeated
# 'hourly_trach', 'paralytics_flag' and 'recorded_hour' in the
# "Administrative/Timing" group, which made the selection below return
# duplicate column labels.
reqd_yellow_fields = [
    # Identifiers / timing
    'encounter_block', 'recorded_date', 'recorded_hour',
    'time_from_vent', 'hourly_trach', 'paralytics_flag',
    # Clinical Measurements
    'min_spo2', 'min_map', 'max_map', 'ne_calc_max', 'max_sbp', 'avg_map',
    'max_heart_rate', 'min_heart_rate', 'min_fio2_set',
    'max_resp_rate_obs', 'min_peep_set', 'lactate',

    # Administrative/Timing
    'time_from_vent_adjusted', 'red_meds_flag',
    # Red Flags
    'red_resp_spo2_flag', 'red_map_flag', 'red_high_support_flag',
    'red_hypertensive_flag', 'red_pulse_high_flag', 'red_pulse_low_flag',

    # Yellow Flags
    'yellow_resp_spo2_flag', 'yellow_fio2_flag', 'yellow_resp_rate_flag',
    'yellow_peep_flag', 'yellow_map_flag', 'yellow_pulse_flag',
    'yellow_lactate_flag',

    # Green Flags
    'green_resp_spo2_flag', 'green_resp_rate_flag', 'green_fio2_flag',
    'green_peep_flag', 'green_map_flag', 'green_pulse_flag',
    'green_lactate_flag', 'green_hr_flag',

    # Composite Flags
    'any_red', 'any_yellow', 'any_green', 'all_green',
    'all_green_no_red', 'all_green_no_red_yellow',
    'all_yellow_no_red_green', 'any_yellow_no_red_green',
    'any_yellow_or_green_no_red', 'yellow_resp_flag',
    'yellow_cardio_flag', 'yellow_all_green', 'yellow_not_all_green',
]

# Encounter blocks whose competing-risk outcome code is 3
# (presumably "discharged alive without ever becoming eligible" -- confirm
# against the cell that builds df_yellow_competing).
no_elig_blocks_yellow = df_yellow_competing.loc[df_yellow_competing["outcome"] == 3,
                                      "encounter_block"].unique()

# Hourly rows of those blocks, narrowed to the columns of interest and tagged
# with the discharge category of the block.
df_fail_yellow = final_df[final_df["encounter_block"].isin(no_elig_blocks_yellow)]
df_fail_yellow_filtered = df_fail_yellow[reqd_yellow_fields].copy()
df_fail_yellow_filtered = df_fail_yellow_filtered.merge(all_ids_w_outcome[['discharge_category', 'encounter_block']],
                                                    on='encounter_block', how='inner')

# Number of distinct blocks per discharge category, largest first.
failure_discharge_cats_yellow = (
    df_fail_yellow_filtered
    .groupby('discharge_category')['encounter_block']
    .nunique()
    .sort_values(ascending=False)
)
failure_discharge_cats_yellow.to_csv('../output/final/failure_discharge_cats_yellow.csv')
failure_discharge_cats_yellow

In [None]:
# Columns kept when inspecting blocks that failed the Green criteria.
# Each column is listed exactly once: the original list repeated
# 'hourly_trach', 'paralytics_flag' and 'recorded_hour' in the
# "Administrative/Timing" group, which made the selection below return
# duplicate column labels.
reqd_green_fields = [
    # Identifiers / timing
    'encounter_block', 'recorded_date', 'recorded_hour',
    'time_from_vent', 'hourly_trach', 'paralytics_flag',
    # Clinical Measurements
    'min_spo2', 'min_map', 'max_map', 'ne_calc_max', 'max_sbp',
    'max_heart_rate', 'min_heart_rate', 'min_fio2_set', 'avg_map',
    'max_resp_rate_obs', 'min_peep_set', 'lactate',

    # Administrative/Timing
    'time_from_vent_adjusted', 'red_meds_flag',
    # Red Flags
    'red_resp_spo2_flag', 'red_map_flag', 'red_high_support_flag',
    'red_hypertensive_flag', 'red_pulse_high_flag', 'red_pulse_low_flag',

    # Green Flags
    'green_resp_spo2_flag', 'green_resp_rate_flag', 'green_fio2_flag',
    'green_peep_flag', 'green_map_flag', 'green_pulse_flag',
    'green_lactate_flag', 'green_hr_flag',
    'all_green',
    'all_green_no_red',
]

# Encounter blocks whose competing-risk outcome code is 3
# (presumably "discharged alive without ever becoming eligible" -- confirm
# against the cell that builds df_green_competing).
no_elig_blocks_green = df_green_competing.loc[df_green_competing["outcome"] == 3,
                                      "encounter_block"].unique()

# Hourly rows of those blocks, narrowed to the columns of interest and tagged
# with the discharge category of the block.
df_fail_green = final_df[final_df["encounter_block"].isin(no_elig_blocks_green)]
df_fail_green_filtered = df_fail_green[reqd_green_fields].copy()
df_fail_green_filtered = df_fail_green_filtered.merge(all_ids_w_outcome[['discharge_category', 'encounter_block']],
                                                    on='encounter_block', how='inner')

# Number of distinct blocks per discharge category, largest first.
failure_discharge_cats_green = (
    df_fail_green_filtered
    .groupby('discharge_category')['encounter_block']
    .nunique()
    .sort_values(ascending=False)
)
failure_discharge_cats_green.to_csv('../output/final/failure_discharge_cats_green.csv')
failure_discharge_cats_green

### Mortality

In [None]:
def analyze_death_without_eligibility(df, criteria_name):
    """Summarise how many rows of *df* have ``outcome == 2``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``outcome`` column; every row is counted as one block.
    criteria_name : str
        Label stored under the ``'criteria'`` key of the result.

    Returns
    -------
    dict
        ``criteria``, ``total_blocks`` (row count),
        ``died_without_eligibility`` (rows with outcome code 2) and
        ``percentage`` (the share of such rows, 0-100). ``percentage`` is
        NaN when *df* is empty, instead of raising ``ZeroDivisionError``.
    """
    # Total number of blocks (one row per block).
    total_blocks = len(df)

    # Blocks that died without eligibility (outcome=2).
    died_without_elig = int((df['outcome'] == 2).sum())

    # An empty cohort has no defined percentage; report NaN rather than crash.
    if total_blocks:
        percent = (died_without_elig / total_blocks) * 100
    else:
        percent = float('nan')

    return {
        'criteria': criteria_name,
        'total_blocks': total_blocks,
        'died_without_eligibility': died_without_elig,
        'percentage': percent,
    }

from pathlib import Path

# Analyze each dataset: one summary dict per criteria set.
results = [
    analyze_death_without_eligibility(df_team_competing, 'TEAM'),
    analyze_death_without_eligibility(df_yellow_competing, 'Yellow'),
    analyze_death_without_eligibility(df_patel_competing, 'Patel'),
    analyze_death_without_eligibility(df_green_competing, 'Green'),
    analyze_death_without_eligibility(df_green_no_red_competing, 'Green No Red')
]

# Convert to DataFrame for easier plotting and export.
results_df = pd.DataFrame(results)

# Create the visualization: one bar per criteria set.
plt.figure(figsize=(10, 6))
x = range(len(results_df['criteria']))
width = 0.35
bars = plt.bar(x, results_df['percentage'], width, label='Percentage')

# Customize the plot
plt.title('Percentage of Blocks that Died Without Becoming Eligible by Criteria', pad=20)
plt.xlabel('Criteria')
plt.ylabel('Percentage (%)')
plt.xticks(x, results_df['criteria'])

# Add value labels on top of bars: "<pct>%" over "(died/total)".
for bar, died, total in zip(bars,
                            results_df['died_without_eligibility'],
                            results_df['total_blocks']):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{height:.1f}%\n({died:,}/{total:,})',
             ha='center', va='bottom')

plt.tight_layout()

# The figure used to be closed without ever being written out; save it next
# to the other graphs (creating the folder if this is the first figure).
graphs_dir = Path('../output/final/graphs')
graphs_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(graphs_dir / f'death_without_eligibility_{pyCLIF.helper["site_name"]}.png')
plt.close()

# Print detailed summary
print("\nDetailed Summary:")
print("=" * 80)
for result in results:
    print(f"\n{result['criteria']} Criteria:")
    print(f"Total blocks: {result['total_blocks']:,}")
    print(f"Died without eligibility: {result['died_without_eligibility']:,}")
    print(f"Percentage: {result['percentage']:.1f}%")

results_df.to_csv('../output/final/death_without_eligibility_summary.csv', index=False)

## Final figures and tables

1. Figure 1: Percentage of encounters satisfying the Patel, TEAM, Yellow, and Green criteria
2. Figure 2: Percentage of business hours during which each encounter was eligible under each set of criteria
3. Figure 3: Percentage of business hours not eligible under each set of criteria, broken down by subcomponent failure


### Aggregates - 72 hours 

In [None]:
from datetime import datetime
import pandas as pd

# Criteria label -> flag column used for the "all hours" counts.
CRITS_ALL = {
    'Patel': 'patel_flag_all_hours',
    'TEAM': 'team_flag_all_hours',
    'Yellow': 'any_yellow_or_green_no_red_all_hours',
    'Green': 'all_green_all_hours',
}
# Criteria label -> flag column used for the business-hour counts.
BUSINESS_FLAGS = {
    'Patel': 'patel_flag',
    'TEAM': 'team_flag',
    'Yellow': 'any_yellow_or_green_no_red',
    'Green': 'all_green',
}

BUS_HRS = range(8, 17)  # recorded_hour values 8..16

# Rows at or before hour 72 from ventilation start.
# NOTE(review): `<= 72` keeps hour 72 itself; confirm that is the intended
# definition of "first 72 hours".
df_72h = final_df.loc[final_df['time_from_vent'] <= 72].copy()

# Denominators shared by every criteria row.
in_business_hours = df_72h['recorded_hour'].isin(BUS_HRS)
total_patients = df_72h['encounter_block'].nunique()
total_observed_hours = df_72h.shape[0]
total_business_hours = int(in_business_hours.sum())

# One output row per criteria set, in CRITS_ALL order.
rows = []
for crit, all_hours_flag in CRITS_ALL.items():
    is_elig_all = df_72h[all_hours_flag] == 1
    is_elig_bus = (df_72h[BUSINESS_FLAGS[crit]] == 1) & in_business_hours

    eligible_hours_all = int(is_elig_all.sum())
    eligible_patients = df_72h.loc[is_elig_all, 'encounter_block'].nunique()
    eligible_business_hours = int(is_elig_bus.sum())

    rows.append({
        'Criteria': crit,
        'Total Patients': total_patients,
        'Eligible Patients': eligible_patients,
        'Total Observed Hours': total_observed_hours,
        'Eligible Hours (all hrs)': eligible_hours_all,
        'Total Business Hours': total_business_hours,
        'Eligible Business Hours': eligible_business_hours,
        'Proportion Eligible Hours %': 100 * eligible_hours_all / total_observed_hours,
        'Proportion Eligible BusHrs %': 100 * eligible_business_hours / total_business_hours,
        'Proportion Eligible Patients %': 100 * eligible_patients / total_patients,
    })

aggregate_df = pd.DataFrame(rows)

# Write the site-specific 72-hour aggregate table.
timestamp = datetime.now().date()
aggregate_df.to_csv(f'../output/final/aggregates_72hrs_{pyCLIF.helper["site_name"]}.csv', index=False)

In [None]:
# One bar colour per criteria row of aggregate_df.
custom_colors = ['#983232', '#003f5c', '#fdfd96', '#98FB98']  # Maroon, Dark Blue, Pastel Yellow, Pastel Green

# The three 72-hour figures differ only in the plotted column, the title,
# the y-axis label and the output file, so they are drawn in one loop:
# (aggregate_df column, title, y label, output path).
for y_col, fig_title, y_label, out_path in (
    (
        'Proportion Eligible Patients %',
        'Eligibility by Encounter (During first 72 hours)',
        'Percentage of Encounters Eligible',
        f'../output/final/graphs/eligibility_by_encounters_72hrs_{pyCLIF.helper["site_name"]}.png',
    ),
    (
        'Proportion Eligible Hours %',
        'Eligibility by Total Observed Hours (During first 72 hours)',
        'Percentage of Observed Hours Eligible',
        f'../output/final/graphs/eligibility_by_total_hours_72hrs_{pyCLIF.helper["site_name"]}.png',
    ),
    (
        'Proportion Eligible BusHrs %',
        'Eligibility by Business Hours (During first 72 hours)',
        'Percentage of Business Hours Eligible',
        f'../output/final/graphs/eligibility_by_business_hours_72hrs_{pyCLIF.helper["site_name"]}.png',
    ),
):
    plt.figure(figsize=(10, 6))
    barplot = sns.barplot(
        x='Criteria',
        y=y_col,
        data=aggregate_df,
        palette=custom_colors,
    )

    # Write each bar's value just above it; the row index of aggregate_df
    # is used as the bar's x position.
    for index, row in aggregate_df.iterrows():
        barplot.text(index, row[y_col], f"{row[y_col]:.1f}%",
                     color='black', ha="center", va="bottom")

    plt.title(fig_title)
    plt.xlabel('Criteria')
    plt.ylabel(y_label)

    # Save, then close so figures do not accumulate.
    plt.savefig(out_path)
    plt.close()


### Aggregates for comparison across sites- Full Encounter Trajectory

In [None]:
# --------------------------------------------------------------------
#  Eligibility aggregates over the full trajectory:
#  all recorded hours versus business hours only
# --------------------------------------------------------------------
from datetime import datetime
import pandas as pd

# Criteria label -> flag column counted over every recorded hour.
CRITS_ALL = {
    'Patel': 'patel_flag_all_hours',
    'TEAM': 'team_flag_all_hours',
    'Yellow': 'any_yellow_or_green_no_red_all_hours',
    'Green': 'all_green_all_hours',
}
# Criteria label -> flag column counted only on business-hour rows.
BUSINESS_FLAGS = {
    'Patel': 'patel_flag',
    'TEAM': 'team_flag',
    'Yellow': 'any_yellow_or_green_no_red',
    'Green': 'all_green',
}

BUS_HRS = range(8, 17)   # recorded_hour values 8..16

# Denominators shared by every criteria row.
business_rows = final_df['recorded_hour'].isin(BUS_HRS)
total_patients = final_df['encounter_block'].nunique()
total_observed_hours = final_df.shape[0]
total_business_hours = int(business_rows.sum())


def _pct(numerator, denominator):
    # Share expressed on a 0-100 scale.
    return 100 * numerator / denominator


# One output row per criteria set, in CRITS_ALL order.
rows = []
for crit in CRITS_ALL:
    all_mask = final_df[CRITS_ALL[crit]] == 1
    bus_mask = (final_df[BUSINESS_FLAGS[crit]] == 1) & business_rows

    # Counts over all hours.
    eligible_hours_all = int(all_mask.sum())
    eligible_patients = final_df.loc[all_mask, 'encounter_block'].nunique()

    # Counts restricted to business hours.
    eligible_business_hours = int(bus_mask.sum())

    rows.append({
        'Criteria': crit,
        'Total Patients': total_patients,
        'Eligible Patients': eligible_patients,
        'Total Observed Hours': total_observed_hours,
        'Eligible Hours (all hrs)': eligible_hours_all,
        'Total Business Hours': total_business_hours,
        'Eligible Business Hours': eligible_business_hours,
        'Proportion Eligible Hours %': _pct(eligible_hours_all, total_observed_hours),
        'Proportion Eligible BusHrs %': _pct(eligible_business_hours, total_business_hours),
        'Proportion Eligible Patients %': _pct(eligible_patients, total_patients),
    })

aggregate_df = pd.DataFrame(rows)


# Write the site-specific aggregate table.
timestamp = datetime.now().date()
aggregate_df.to_csv(f'../output/final/aggregates_{pyCLIF.helper["site_name"]}.csv', index=False)


### Eligibility by encounter

In [None]:
# One bar colour per criteria row of aggregate_df.
custom_colors = ['#983232', '#003f5c', '#fdfd96', '#98FB98']  # Maroon, Dark Blue, Pastel Yellow, Pastel Green

# Bar chart: share of encounters that were ever eligible, per criteria set.
patients_col = 'Proportion Eligible Patients %'
plt.figure(figsize=(10, 6))
barplot = sns.barplot(data=aggregate_df, x='Criteria', y=patients_col,
                      palette=custom_colors)

# Label every bar with its value; the row index doubles as the x position.
for index, row in aggregate_df.iterrows():
    pct_value = row[patients_col]
    barplot.text(index, pct_value, f"{pct_value:.1f}%",
                 color='black', ha="center", va="bottom")

# Titles and axis labels.
plt.title('Eligibility by Encounter')
plt.xlabel('Criteria')
plt.ylabel('Percentage of Encounters Eligible')

# Write the figure to disk and release it.
plt.savefig(f'../output/final/graphs/eligibility_by_encounters_{pyCLIF.helper["site_name"]}.png')
plt.close()

### Eligibility by all hours

In [None]:
# Bar chart: share of all observed hours that were eligible, per criteria
# set. Relies on `custom_colors` defined by an earlier cell.
hours_col = 'Proportion Eligible Hours %'
plt.figure(figsize=(10, 6))
barplot = sns.barplot(data=aggregate_df, x='Criteria', y=hours_col,
                      palette=custom_colors)

# Label every bar with its value; the row index doubles as the x position.
for index, row in aggregate_df.iterrows():
    pct_value = row[hours_col]
    barplot.text(index, pct_value, f"{pct_value:.1f}%",
                 color='black', ha="center", va="bottom")

# Titles and axis labels.
plt.title('Eligibility by Total Observed Hours')
plt.xlabel('Criteria')
plt.ylabel('Percentage of Observed Hours Eligible')

# Write the figure to disk and release it.
plt.savefig(f'../output/final/graphs/eligibility_by_total_hours_{pyCLIF.helper["site_name"]}.png')
plt.close()

### Eligibility by business hour

In [None]:
# One bar colour per criteria row of aggregate_df.
custom_colors = ['#983232', '#003f5c', '#fdfd96', '#98FB98']  # Maroon, Dark Blue, Pastel Yellow, Pastel Green

# Bar chart: share of business hours that were eligible, per criteria set.
bus_col = 'Proportion Eligible BusHrs %'
plt.figure(figsize=(10, 6))
barplot = sns.barplot(data=aggregate_df, x='Criteria', y=bus_col,
                      palette=custom_colors)

# Label every bar with its value; the row index doubles as the x position.
for index, row in aggregate_df.iterrows():
    pct_value = row[bus_col]
    barplot.text(index, pct_value, f"{pct_value:.1f}%",
                 color='black', ha="center", va="bottom")

# Titles and axis labels.
plt.title('Eligibility by Business Hours')
plt.xlabel('Criteria')
plt.ylabel('Percentage of Business Hours Eligible')

# Write the figure to disk and release it.
plt.savefig(f'../output/final/graphs/eligibility_by_business_hours_{pyCLIF.helper["site_name"]}.png')
plt.close()


### Eligibility by business hour - One week trend

In [None]:
# Criteria label -> business-hour eligibility flag column in final_df.
BUSINESS_FLAGS = {
    'Patel': 'patel_flag',
    'TEAM': 'team_flag',
    'Yellow': 'any_yellow_or_green_no_red',
    'Green': 'all_green',
}

# ---------------------------------------------------------------
# 1) Restrict to first week (7 days = hours 0..167) and bin by day
# ---------------------------------------------------------------
# Strictly less than 168: hour 168 would floor-divide into a day-7 bin,
# adding an extra, nearly empty point beyond the Day 0-6 range shown below.
df_week = final_df.query("time_from_vent < 168").copy()
df_week["day_bin"] = (df_week["time_from_vent"] // 24).astype(int)

# ---------------------------------------------------------------
# 2) Build trend DataFrame
# ---------------------------------------------------------------
# For every criteria set and day: eligible business-hour rows divided by all
# business-hour rows (NaN when a day has no business-hour rows).
trend_rows = []
for crit, flag in BUSINESS_FLAGS.items():
    for day, g in df_week.groupby("day_bin", sort=True):
        # only count rows during business hours (recorded_hour 8..16)
        bus = g[g["recorded_hour"].between(8, 16)]
        denom = bus.shape[0]
        num   = bus[bus[flag] == 1].shape[0]
        prop  = num / denom if denom else np.nan
        trend_rows.append({
            "criterion": crit,
            "day":       day,
            "prop_bus_hrs": prop
        })

trend_df = pd.DataFrame(trend_rows)
# save trend df
trend_df.to_csv(f'../output/final/eligibility_trend_first_week_{pyCLIF.helper["site_name"]}.csv', index=False)

# ---------------------------------------------------------------
# 3) Plot with custom colors
# ---------------------------------------------------------------
custom_colors = {
    'Patel':  '#983232',  # maroon
    'TEAM':   '#003f5c',  # dark blue
    'Yellow': '#c9b037',  # dark yellow (gold)
    'Green':  '#2e8b57'   # dark green (sea green)
}

plt.figure(figsize=(8, 5))
sns.lineplot(
    data=trend_df,
    x="day",
    y="prop_bus_hrs",
    hue="criterion",
    hue_order=list(custom_colors.keys()),
    palette=custom_colors,
    marker="o",
    linewidth=2
)
plt.xlabel("Days since intubation")
plt.ylabel("Proportion of business hours eligible")
plt.title("Eligibility trend — first week (Day 0-6)")
plt.xticks(range(0, 7))
plt.ylim(0, 1)
plt.legend(title="Criterion")
plt.tight_layout()
# save figure
# NOTE(review): unlike the other figures, this file name has no site suffix.
plt.savefig("../output/final/graphs/eligibility_trend_first_week.png", dpi=300)
# plt.show()
plt.close()

#### ECDF 

x‑axis (fraction of encounter hours eligible) – for each patient, what proportion of their ventilated hours satisfied the rule.
y‑axis (proportion of patients ≤ x) – at any x, the height of a curve tells you what fraction of patients have eligibility no greater than x.

In [None]:
# Per encounter block, the mean of each all-hours flag column, i.e. the
# fraction of that block's rows on which the flag is set.
ecdf_aggs = {label: (flag_col, 'mean') for label, flag_col in CRITS_ALL.items()}
tmp = final_df.groupby(['encounter_block']).agg(**ecdf_aggs)

# Long format (one row per block and criteria set) for seaborn, then one
# ECDF curve per criteria set.
ecdf_long = tmp.melt(var_name='criterion', value_name='pct')
sns.ecdfplot(data=ecdf_long, x='pct', hue='criterion')
plt.xlabel('Fraction of encounter hours eligible')

# Write the figure to disk and release it.
plt.savefig(f'../output/final/graphs/eligibility_hour_ecdf_{pyCLIF.helper["site_name"]}.png')
plt.close()

### Hourly distribution by hours

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Criteria label -> all-hours flag column.
CRITS = {
    'Patel': 'patel_flag_all_hours',
    'TEAM': 'team_flag_all_hours',
    'Yellow': 'any_yellow_or_green_no_red_all_hours',
    'Green': 'all_green_all_hours',
}
BUS_HRS = range(8, 17)
custom_colors = [
    '#983232',  # Maroon
    '#003f5c',  # Dark Blue
    '#c9b037',  # Darker Yellow (Gold/Mustard tone)
    '#2e8b57',  # Darker Green (Sea Green)
]
# Pair each criteria label with its line colour (same order as CRITS).
color_map = dict(zip(CRITS.keys(), custom_colors))

# For each hour of day 0..23, count the distinct encounter blocks that have
# a flagged row at that hour; hours without any flagged row get 0.
hourly_data = pd.DataFrame({'hour': range(24)})
for name, flag in CRITS.items():
    flagged_rows = final_df.loc[final_df[flag] == 1,
                                ['encounter_block', 'recorded_hour']]
    blocks_by_hour = (
        flagged_rows
        .drop_duplicates()
        .groupby('recorded_hour')['encounter_block']
        .nunique()
    )
    by_hour = blocks_by_hour.reindex(range(24), fill_value=0).reset_index(drop=True)
    hourly_data[name] = by_hour

# Long format: one row per (hour, criteria) pair; also exported per site.
hourly_melted = hourly_data.melt(
    id_vars='hour',
    var_name='Criteria',
    value_name='Eligible Blocks',
)
hourly_melted.to_csv(f'../output/final/eligibility_hourly_melted_{pyCLIF.helper["site_name"]}.csv', index=False)

# One line per criteria set over the hour of day.
plt.figure(figsize=(10, 6))
for crit, color in color_map.items():
    sub = hourly_melted.loc[hourly_melted['Criteria'] == crit]
    plt.plot(sub['hour'], sub['Eligible Blocks'],
             label=crit, marker='o', color=color)

# Shade the span from hour 8 to hour 17.
plt.axvspan(8, 17, color='orange', alpha=0.1, label='Business Hours')
plt.title("Hourly Distribution of Eligible Blocks by Criteria")
plt.xlabel("Hour of Day")
plt.ylabel("Number of Eligible Blocks")
plt.xticks(range(24))
plt.grid(True)
plt.legend(loc='upper left', bbox_to_anchor=(1,1), title="Criterion")
plt.tight_layout()

# Write the figure to disk and release it.
plt.savefig(
    f'../output/final/graphs/eligibility_hourly_distribution_blocks_{pyCLIF.helper["site_name"]}.png',
    dpi=300
)
plt.close()

In [None]:
# totals = final_df.groupby("recorded_hour").size()
# totals.plot(marker="o")
# plt.title("All hourly rows by hour of day")
# plt.xlabel("Hour")
# plt.ylabel("Row count")
# plt.grid()
# plt.show()


In [None]:
# # raw hourly eligibility count
# raw = final_df.groupby("recorded_hour")["any_yellow_or_green_no_red"].sum()

# # block-level (all_hours) eligibility count
# blk = final_df[ final_df["any_yellow_or_green_no_red_all_hours"]==1 ] \
#       .groupby("recorded_hour")["any_yellow_or_green_no_red_all_hours"] \
#       .size()

# pd.DataFrame({
#     "Total rows": totals,
#     "Raw Yellow": raw,
#     "Yellow (all_hours)": blk
# }).plot(marker="o")
# plt.title("Raw vs block-level Yellow eligibility")
# plt.ylabel("Count")
# plt.grid()
# plt.show()

In [None]:
# # pick out only the hours where raw yellow drops most
# drop_hours = raw.diff().nsmallest(3).index  # e.g. [8,9,10]

# for hr in drop_hours:
#     slice_hr = final_df[ final_df.recorded_hour == hr ]
#     breakdown = {
#       fld: slice_hr[fld].mean() 
#       for fld in [
#         "yellow_resp_spo2_flag",
#         "yellow_fio2_flag",
#         "yellow_resp_rate_flag",
#         "yellow_peep_flag",
#         "yellow_map_flag",
#         "yellow_pulse_flag",
#         "yellow_lactate_flag"
#       ]
#     }
#     print(f"\nHour {hr} fail-rate per subflag:")
#     for k,v in breakdown.items():
#         print(f"  {k}: {(1-v):.1%} failed")


In [None]:
# import pandas as pd
# import matplotlib.pyplot as plt

# blocks_per_hour = (
#     final_df
#     .groupby("recorded_hour")["encounter_block"]
#     .nunique()
#     .reindex(range(24), fill_value=0)
# )

# blocks_per_hour.plot(marker="o")
# plt.title("Unique encounter_blocks by hour of day")
# plt.xlabel("Hour")
# plt.ylabel("Blocks with ANY row at that hour")
# plt.grid(True)
# plt.show()


In [None]:
# h7 = set(final_df.loc[final_df.recorded_hour==7,   "encounter_block"])
# h8 = set(final_df.loc[final_df.recorded_hour==8,   "encounter_block"])
# missing = list(h7 - h8)
# print("Example blocks missing at 8 AM:", missing)


### Failure by subcomponents

In [None]:
# Define your criteria and corresponding subcomponent flags
# Overall eligibility flag of each criteria set -> its respiratory and
# cardiovascular sub-flag columns.
criteria_info = {
    'patel_flag': {'resp_flag': 'patel_resp_flag', 'cardio_flag': 'patel_cardio_flag'},
    'team_flag': {'resp_flag': 'team_resp_flag', 'cardio_flag': 'team_cardio_flag'},
    'any_yellow_or_green_no_red': {'resp_flag': 'yellow_resp_flag', 'cardio_flag': 'yellow_cardio_flag'},
    'all_green': {'resp_flag': 'green_resp_flag', 'cardio_flag': 'green_cardio_flag'}
}

# Rows (hours) per encounter_block. This denominator is the same for every
# criteria set, so it is computed once instead of once per loop iteration.
total_hours = final_df.groupby('encounter_block').size().rename('total_hours')

# Per-criteria, per-block failure tables collected here.
results = []

for criterion, flags in criteria_info.items():
    resp_flag = flags['resp_flag']
    cardio_flag = flags['cardio_flag']

    # Work on the three needed columns only, rather than copying all of
    # final_df for every criteria set.
    df_failure = final_df[['encounter_block', resp_flag, cardio_flag]].copy()

    # Failure indicators per hourly row. A row counts only when both
    # sub-flags are exactly 0 or 1; any other value matches no category.
    df_failure['resp_only_failure'] = ((df_failure[resp_flag] == 0) & (df_failure[cardio_flag] == 1)).astype(int)
    df_failure['cardio_only_failure'] = ((df_failure[resp_flag] == 1) & (df_failure[cardio_flag] == 0)).astype(int)
    df_failure['both_failures'] = ((df_failure[resp_flag] == 0) & (df_failure[cardio_flag] == 0)).astype(int)

    # Failed-hour counts per encounter_block, joined with the denominator.
    failure_counts = df_failure.groupby('encounter_block')[['resp_only_failure', 'cardio_only_failure', 'both_failures']].sum()
    failure_counts = failure_counts.merge(total_hours, left_index=True, right_index=True)

    # Percentages of each block's hours, rounded to 3 decimals.
    failure_counts['resp_only_failure_perc'] = (failure_counts['resp_only_failure'] * 100 / failure_counts['total_hours']).round(3)
    failure_counts['cardio_only_failure_perc'] = (failure_counts['cardio_only_failure'] * 100 / failure_counts['total_hours']).round(3)
    failure_counts['both_failures_perc'] = (failure_counts['both_failures'] * 100 / failure_counts['total_hours']).round(3)

    # Total failure percentage (left unrounded, as before).
    failure_counts['total_failure_perc'] = (
        failure_counts['resp_only_failure'] + failure_counts['cardio_only_failure'] + failure_counts['both_failures']
    ) * 100 / failure_counts['total_hours']

    # Percentage of hours on which the overall criterion flag is set.
    criterion_met = final_df.groupby('encounter_block')[criterion].sum().rename('criterion_met_hours')
    failure_counts = failure_counts.merge(criterion_met, left_index=True, right_index=True)
    failure_counts['criterion_met_perc'] = (failure_counts['criterion_met_hours'] * 100 / failure_counts['total_hours']).round(3)

    # Tag the rows with the criterion they belong to.
    failure_counts['Criteria'] = criterion

    results.append(failure_counts.reset_index())

# Stack the per-criteria tables.
all_failure_counts = pd.concat(results, ignore_index=True)

# Average the per-block percentages across all encounter blocks, per criterion.
avg_failure_percentages = all_failure_counts.groupby('Criteria').agg({
    'resp_only_failure_perc': 'mean',
    'cardio_only_failure_perc': 'mean',
    'both_failures_perc': 'mean',
    'total_failure_perc': 'mean',
    'criterion_met_perc': 'mean'
}).reset_index()

# Rename columns for clarity
avg_failure_percentages = avg_failure_percentages.rename(columns={
    'resp_only_failure_perc': 'Resp Failure Only',
    'cardio_only_failure_perc': 'Cardio Failure Only',
    'both_failures_perc': 'Both Failures',
    'total_failure_perc': 'Total Failure',
    'criterion_met_perc': 'Criterion Met'
})

# Replace flag column names with display labels.
criteria_mapping = {
    'patel_flag': 'Patel',
    'team_flag': 'TEAM',
    'any_yellow_or_green_no_red': 'Yellow',
    'all_green': 'Green'
}

avg_failure_percentages['Criteria'] = avg_failure_percentages['Criteria'].replace(criteria_mapping)
avg_failure_percentages['site_name'] = pyCLIF.helper["site_name"]
# Export the site-specific table, then leave it as the cell's displayed value.
pd.DataFrame(avg_failure_percentages).to_csv(f'../output/final/avg_failure_percentages_{pyCLIF.helper["site_name"]}.csv',index=False)
avg_failure_percentages

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import kaleido
import plotly.graph_objects as go

# Stacked bar chart of the average failure percentages per criteria set.
fig = go.Figure()

# One trace per failure type, added in stacking order:
# (avg_failure_percentages column / legend name, bar colour).
for component, bar_color in (
    ('Cardio Failure Only', '#003366'),  # Dark Blue
    ('Resp Failure Only', '#983232'),    # Maroon
    ('Both Failures', '#fdfd96'),        # Pastel Yellow
):
    fig.add_trace(
        go.Bar(
            x=avg_failure_percentages['Criteria'],
            y=avg_failure_percentages[component],
            name=component,
            marker_color=bar_color,
        )
    )

# Stack the traces and fix the y axis to the 0-100 range.
fig.update_layout(
    barmode='stack',
    xaxis_title='Criteria',
    yaxis_title='Average Percentage of Business Hours Not Met (%)',
    yaxis=dict(range=[0, 100]),
    template='plotly_white',
    legend_title='Failure Type',
)

# Export a static image tagged with the site name and today's date.
fig.write_image(f'../output/final/graphs/avg_failure_components_{pyCLIF.helper["site_name"]}_{datetime.now().date()}.png')

In [None]:
# ────────────────────────────────────────────────────────────
#  analyse_criterion.py  – plug‑and‑play helper
# ────────────────────────────────────────────────────────────
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from upsetplot import from_indicators, UpSet
from pathlib   import Path

def analyse_criterion(
    df: pd.DataFrame,
    crit_name: str,
    *,
    flag_cols: list,             # sub-criterion flag columns; a value of 0 is counted as "not met"
    master_flag: str,            # overall eligibility flag column for this criterion
    id_col: str        = "encounter_block",
    time_col: str      = "time_from_vent",
    out_dir           = "../output/final",
    save_fig_data: bool = True,
):
    """
    Describe why blocks that never met `master_flag` failed to become eligible.

    Steps
    -----
    1. Keep the `id_col` groups whose maximum `master_flag` is 0 ("never
       eligible") and pull all of their rows from `df`.
    2. For every column in `flag_cols`, compute the share of those rows where
       the flag equals 0, and draw it as a horizontal bar chart.
    3. For every block, find the first `time_col` value at which each flag
       equals 1 (infinity if it never does) and report the flag with the
       largest such value as the "primary blocker". Ties go to whichever
       flag comes first in `flag_cols`.
    4. Draw an UpSet plot of the flag combinations that were 0 at least once
       within a block.
    5. If `df` has an "ne_calc_max" column, draw its histogram for the failed
       blocks with a reference line at 0.2.

    Parameters
    ----------
    df : long table with one row per `id_col` / `time_col` combination.
    crit_name : label used in titles, log messages and (lower-cased) as the
        output sub-folder name.
    flag_cols : sub-criterion columns to analyse.
    master_flag : column whose per-block maximum decides "ever eligible".
    id_col, time_col : grouping key and ordering column.
    out_dir : parent output folder; files go to `out_dir/<crit_name.lower()>/`,
        which is created if missing.
    save_fig_data : when True, also write `subflag_failure_rates.csv`.
        The PNG figures are written regardless of this setting.

    Returns
    -------
    (summary, prim_df) : `summary` has columns `criterion` and
        `prop_hours_failed` (sorted descending); `prim_df` has `id_col` and
        `primary_blocker`. `prim_df` is returned only, never written to disk.
    """
    out_dir = Path(out_dir, crit_name.lower())
    out_dir.mkdir(parents=True, exist_ok=True)

    # -- 1. which blocks NEVER became eligible? -----------------------------
    # A per-block max of 0 means the master flag was never 1 in that block.
    never = (df.groupby(id_col)[master_flag].max()
               .reset_index(name="ever")[lambda d: d["ever"] == 0]
               .drop(columns="ever"))
    # The inner merge keeps only the rows of those never-eligible blocks.
    fail  = never.merge(df, on=id_col, how="inner")
    print(f"[{crit_name}] {fail[id_col].nunique()} blocks never became eligible")

    # -- 2. proportion of rows in which each sub-flag equals 0 --------------
    long = fail.melt(id_vars=[id_col], value_vars=flag_cols,
                     var_name="criterion", value_name="flag")
    summary = (long.groupby("criterion")["flag"]
                     .apply(lambda s: (s == 0).mean())
                     .rename("prop_hours_failed")
                     .reset_index()
                     .sort_values("prop_hours_failed", ascending=False))

    if save_fig_data:
        summary.to_csv(out_dir/"subflag_failure_rates.csv", index=False)

    plt.figure(figsize=(6,4))
    plt.barh(summary["criterion"], summary["prop_hours_failed"])
    plt.gca().invert_yaxis()  # largest failure rate at the top
    plt.xlabel("Proportion of hours NOT satisfied")
    plt.title(f"{crit_name}: which sub‑criteria blocked eligibility?")
    plt.tight_layout()
    plt.savefig(out_dir/"subflag_bar.png", dpi=300);  plt.close()

    # -- 3. "primary blocker" per block (flag whose first 1 comes latest) ---
    def first_true(g, col):
        # Earliest time_col value at which `col` equals 1; inf if it never does.
        hit = g.loc[g[col] == 1, time_col]
        return hit.min() if not hit.empty else np.inf

    prim = []
    for blk, g in fail.groupby(id_col):
        lags = {c: first_true(g, c) for c in flag_cols}
        # max() returns the first key with the largest lag, so ties (including
        # several flags that never reach 1) resolve in flag_cols order.
        prim_blk = max(lags, key=lags.get)
        prim.append([blk, prim_blk])

    prim_df = pd.DataFrame(prim, columns=[id_col, "primary_blocker"])
    # Export of the per-block table is currently disabled:
    # prim_df.to_csv(out_dir/"primary_blocker_per_block.csv", index=False)

    # -- 4. UpSet plot of sub-flag combinations that *ever* fail ------------
    failed = fail[flag_cols].eq(0)                   # True => flag was 0 on that row
    block_fail = failed.groupby(fail[id_col]).max()  # True => 0 on at least one row of the block
    upset_data = from_indicators(block_fail.columns, block_fail)

    # The previous figure was closed above, so this draws on a fresh pyplot figure.
    UpSet(upset_data, show_counts=True,
          sort_by="cardinality").plot()
    plt.suptitle(f"{crit_name}: combinations of failed sub‑criteria")
    plt.savefig(out_dir/"upset.png", dpi=300);  plt.close()

    # -- 5. NE dose distribution, only when the column is available ---------
    if "ne_calc_max" in fail.columns:
        sns.histplot(fail["ne_calc_max"], bins=40, kde=False)
        # Reference line at 0.2 -- presumably the NE dose cut-off used by the
        # criteria; TODO confirm against the flag definitions.
        plt.axvline(0.2, color="red", ls="--")
        plt.title(f"{crit_name}: hourly max NE dose (only failed blocks)")
        plt.xlabel("µg/kg/min")
        plt.tight_layout()
        plt.savefig(out_dir/"ne_hist.png", dpi=300);  plt.close()

    print(f"[{crit_name}] figures + CSV written to {out_dir}\n")
    return summary, prim_df
# ────────────────────────────────────────────────────────────


In [None]:
# Sub-criterion flags analysed for the TEAM criterion.
team_flags = [
    "team_pulse_flag",
    "team_lactate_flag",
    "team_ne_flag",
    "team_fio2_flag",
    "team_peep_flag",
    "team_resp_rate_flag",
]

# Run the helper on the long hourly table. analyse_criterion appends the
# lower-cased criterion name to out_dir, so files land in ../output/final/team/.
summary_team, primary_team = analyse_criterion(
    final_df,
    crit_name="TEAM",
    flag_cols=team_flags,
    master_flag="team_flag",
    out_dir="../output/final",
)

In [None]:
# Columns analysed for the Yellow criterion, grouped by colour.
# NOTE(review): analyse_criterion counts a flag value of 0 as "criterion not
# met". For the red_* columns 0 presumably means "no red flag present" --
# confirm that they should be read this way. The list also contains the
# master flag itself and other composites, which are not sub-criteria.
reqd_yellow_fields = [
    # Red flags
    'red_resp_spo2_flag',
    'red_map_flag',
    'red_high_support_flag',
    'red_hypertensive_flag',
    'red_pulse_high_flag',
    'red_pulse_low_flag',
    # Yellow flags
    'yellow_resp_spo2_flag',
    'yellow_fio2_flag',
    'yellow_resp_rate_flag',
    'yellow_peep_flag',
    'yellow_map_flag',
    'yellow_pulse_flag',
    'yellow_lactate_flag',
    # Green flags
    'green_resp_spo2_flag',
    'green_resp_rate_flag',
    'green_fio2_flag',
    'green_peep_flag',
    'green_map_flag',
    'green_pulse_flag',
    'green_lactate_flag',
    'green_hr_flag',
    # Composite flags
    'any_red',
    'any_yellow',
    'any_green',
    'all_green',
    'all_green_no_red',
    'all_green_no_red_yellow',
    'all_yellow_no_red_green',
    'any_yellow_no_red_green',
    'any_yellow_or_green_no_red',
    'yellow_resp_flag',
    'yellow_cardio_flag',
    'yellow_all_green',
    'yellow_not_all_green',
]

# Yellow eligibility is the "any yellow or green, no red" composite.
summary_yel, primary_yel = analyse_criterion(
    final_df,
    crit_name="Yellow",
    flag_cols=reqd_yellow_fields,
    master_flag="any_yellow_or_green_no_red",
)

In [None]:
# Columns analysed for the Patel criterion.
# NOTE(review): the list includes the composites patel_resp_flag /
# patel_cardio_flag and the master flag patel_flag itself. In blocks that
# never met patel_flag the master flag is 0 on every row, so its failure
# rate is always 1 -- confirm it is meant to be part of flag_cols.
reqd_patel_fields = [
    'patel_map_flag',
    'patel_sbp_flag',
    'patel_pulse_flag',
    'patel_resp_rate_flag',
    'patel_spo2_flag',
    'patel_resp_flag',
    'patel_cardio_flag',
    'patel_flag',
]

summary_patel, primary_patel = analyse_criterion(
    final_df,
    crit_name="Patel",
    flag_cols=reqd_patel_fields,
    master_flag="patel_flag",
)

In [None]:
# Columns analysed for the Green criterion.
# NOTE(review): 'all_green' is also passed as master_flag below, so within
# never-eligible blocks it is 0 on every row -- confirm it belongs in flag_cols.
reqd_green_fields = [
    # Green flags
    'green_resp_spo2_flag',
    'green_resp_rate_flag',
    'green_fio2_flag',
    'green_peep_flag',
    'green_map_flag',
    'green_pulse_flag',
    'green_lactate_flag',
    'green_hr_flag',
    # Composite flags
    'all_green',
    'all_green_no_red',
]

summary_green, primary_green = analyse_criterion(
    final_df,
    crit_name="Green",
    flag_cols=reqd_green_fields,
    master_flag="all_green",
)

### Average Hours Criteria Met on Days 1, 2, and 3

Determine how many hours the criteria are met on specific calendar days (Day 1, Day 2, Day 3 after intubation).

1. First, assign a calendar_day column: the number of whole 24-hour periods elapsed since intubation, plus one (so Day 1 is the first 24 hours after the ventilation start time).
2. Combine recorded_date and recorded_hour into a timestamp, subtract the intubation time from it, and keep the rows that fall on Day 1, Day 2, or Day 3.
3. For each encounter, group the data by encounter_block and calendar_day and sum the hours that meet each criterion.
4. Compute the average number of hours for each criterion per day.


In [None]:
# Attach each block's ventilation start time to the hourly table.
visualization_df = final_df.merge(
    all_ids_w_outcome[['encounter_block', 'block_vent_start_dttm']],
    on='encounter_block',
    how='left',
)

# Make sure both date-like inputs are real datetimes before doing arithmetic.
for _dt_col in ('block_vent_start_dttm', 'recorded_date'):
    visualization_df[_dt_col] = pd.to_datetime(visualization_df[_dt_col])

# Hourly timestamp = recorded date + recorded hour of day.
visualization_df['recorded_dttm'] = (
    visualization_df['recorded_date']
    + pd.to_timedelta(visualization_df['recorded_hour'], unit='h')
)

# Drop timezone information where present so the two timestamps can be
# subtracted from each other (tz-aware and tz-naive values cannot be mixed).
for _dt_col in ('block_vent_start_dttm', 'recorded_dttm'):
    if visualization_df[_dt_col].dt.tz is not None:
        visualization_df[_dt_col] = visualization_df[_dt_col].dt.tz_localize(None)

def assign_calendar_day(df, intubation_col, recorded_col):
    """Add a 1-based 'calendar_day' column counting 24-hour periods since intubation.

    Day 1 covers the first 24 hours after `intubation_col`, Day 2 the next 24
    hours, and so on; rows recorded before intubation get 0 or a negative value.
    The column is added to `df` in place and the same frame is returned.
    """
    elapsed = df[recorded_col] - df[intubation_col]
    # .dt.days is the whole-day component of the elapsed time (floored).
    df['calendar_day'] = elapsed.dt.days + 1
    return df

# Number every hourly row by its 24-hour period since ventilation start.
visualization_df = assign_calendar_day(
    visualization_df,
    intubation_col='block_vent_start_dttm',
    recorded_col='recorded_dttm',
)

# Keep only the keys, timestamps and eligibility flags needed below.
visualization_df = visualization_df.loc[:, [
    'encounter_block',
    'block_vent_start_dttm',
    'recorded_dttm',
    'calendar_day',
    'patel_flag',
    'team_flag',
    'any_yellow_or_green_no_red',
    'all_green',
    'all_green_no_red',
    'any_green',
]]

def compute_avg_hours_by_day(df, criteria_columns):
    """Average number of hours per day (Days 1-3) on which each criterion is met.

    Parameters
    ----------
    df : hourly table with 'encounter_block', 'calendar_day' and one 0/1
        column per entry of `criteria_columns`.
    criteria_columns : names of the flag columns to summarise. The
        aggregation is driven by this list (it was previously ignored in
        favour of four hard-coded column names).

    Returns
    -------
    DataFrame with one row per calendar day in {1, 2, 3} that occurs in `df`,
    a 'calendar_day' column, and one column per criterion holding the mean
    across encounter blocks of the per-block hourly sum.
    """
    criteria_columns = list(criteria_columns)

    # Hours meeting each criterion, per encounter block and day.
    hours_per_day = (
        df.groupby(['encounter_block', 'calendar_day'])
          .agg({col: 'sum' for col in criteria_columns})
          .reset_index()
    )

    # Restrict to Day 1, Day 2 and Day 3.
    hours_per_day = hours_per_day[hours_per_day['calendar_day'].isin([1, 2, 3])]

    # Average across encounter blocks for each day.
    avg_hours_by_day = (
        hours_per_day.groupby('calendar_day')
                     .agg({col: 'mean' for col in criteria_columns})
                     .reset_index()
    )

    return avg_hours_by_day

# Eligibility flags summarised per day.
criteria_columns = ['patel_flag', 'team_flag', 'any_yellow_or_green_no_red', 'all_green']

# Mean hours per block on Days 1-3 for each criterion, tagged with the site name.
avg_hours_by_day = compute_avg_hours_by_day(visualization_df, criteria_columns)
avg_hours_by_day['site_name'] = pyCLIF.helper["site_name"]

# Timestamped export so repeated runs do not overwrite each other.
avg_hours_by_day.to_csv(
    f'../output/final/avg_hours_by_day_{pyCLIF.helper["site_name"]}_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.csv',
    index=False,
)

def plot_avg_hours_by_day_bar(avg_hours_by_day, criteria_columns):
    """Save a grouped bar chart of average eligible hours per day and criterion.

    Parameters
    ----------
    avg_hours_by_day : table with a 'calendar_day' column and one column per
        entry of `criteria_columns`.
    criteria_columns : criterion columns to plot (one bar colour each).

    The figure is written to
    ../output/final/graphs/avg_hours_by_day_<site_name>.png and then closed;
    nothing is returned.
    """
    # Long format: one row per (day, criterion) pair, as seaborn expects.
    melted_df = avg_hours_by_day.melt(
        id_vars='calendar_day',
        value_vars=criteria_columns,
        var_name='Criteria',
        value_name='Average Hours Met',
    )

    plt.figure(figsize=(10, 6))
    sns.barplot(x='calendar_day', y='Average Hours Met', hue='Criteria', data=melted_df, palette='viridis')

    # Label ticks from the days actually present rather than assuming that
    # Days 1-3 all exist, so bars are never mislabelled when a day is missing.
    # seaborn places the sorted category values at positions 0..n-1.
    days_present = sorted(melted_df['calendar_day'].dropna().unique())
    plt.xticks(
        ticks=list(range(len(days_present))),
        labels=[f"Day {int(day)}" for day in days_present],
    )

    plt.title('Average Hours Criteria Met per Day')
    plt.xlabel('Calendar Day')
    plt.ylabel('Average Hours Criteria Met')

    # Legend below the axes so it does not cover the bars.
    plt.legend(title='Criteria', loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3)

    plt.tight_layout()
    plt.savefig(f'../output/final/graphs/avg_hours_by_day_{pyCLIF.helper["site_name"]}.png')
    plt.close()

# Draw the grouped bar chart of average eligible hours per day; the helper
# saves the PNG under ../output/final/graphs/ and closes the figure.
plot_avg_hours_by_day_bar(avg_hours_by_day, criteria_columns)

In [None]:
# Same preparation as for the full cohort, but starting from df_72h:
# attach each block's ventilation start time to the hourly table.
visualization_df = df_72h.merge(
    all_ids_w_outcome[['encounter_block', 'block_vent_start_dttm']],
    on='encounter_block',
    how='left',
)

# Make sure both date-like inputs are real datetimes before doing arithmetic.
for _dt_col in ('block_vent_start_dttm', 'recorded_date'):
    visualization_df[_dt_col] = pd.to_datetime(visualization_df[_dt_col])

# Hourly timestamp = recorded date + recorded hour of day.
visualization_df['recorded_dttm'] = (
    visualization_df['recorded_date']
    + pd.to_timedelta(visualization_df['recorded_hour'], unit='h')
)

# Drop timezone information where present so the two timestamps can be
# subtracted from each other (tz-aware and tz-naive values cannot be mixed).
for _dt_col in ('block_vent_start_dttm', 'recorded_dttm'):
    if visualization_df[_dt_col].dt.tz is not None:
        visualization_df[_dt_col] = visualization_df[_dt_col].dt.tz_localize(None)

def assign_calendar_day(df, intubation_col, recorded_col):
    """Add a 1-based 'calendar_day' column counting 24-hour periods since intubation.

    Day 1 covers the first 24 hours after `intubation_col`, Day 2 the next 24
    hours, and so on; rows recorded before intubation get 0 or a negative value.
    The column is added to `df` in place and the same frame is returned.
    """
    elapsed = df[recorded_col] - df[intubation_col]
    # .dt.days is the whole-day component of the elapsed time (floored).
    df['calendar_day'] = elapsed.dt.days + 1
    return df

# Number every hourly row by its 24-hour period since ventilation start.
visualization_df = assign_calendar_day(
    visualization_df,
    intubation_col='block_vent_start_dttm',
    recorded_col='recorded_dttm',
)

# Keep only the keys, timestamps and eligibility flags needed below.
visualization_df = visualization_df.loc[:, [
    'encounter_block',
    'block_vent_start_dttm',
    'recorded_dttm',
    'calendar_day',
    'patel_flag',
    'team_flag',
    'any_yellow_or_green_no_red',
    'all_green',
    'all_green_no_red',
    'any_green',
]]

def compute_avg_hours_by_day(df, criteria_columns):
    """Average number of hours per day (Days 1-3) on which each criterion is met.

    Parameters
    ----------
    df : hourly table with 'encounter_block', 'calendar_day' and one 0/1
        column per entry of `criteria_columns`.
    criteria_columns : names of the flag columns to summarise. The
        aggregation is driven by this list (it was previously ignored in
        favour of four hard-coded column names).

    Returns
    -------
    DataFrame with one row per calendar day in {1, 2, 3} that occurs in `df`,
    a 'calendar_day' column, and one column per criterion holding the mean
    across encounter blocks of the per-block hourly sum.
    """
    criteria_columns = list(criteria_columns)

    # Hours meeting each criterion, per encounter block and day.
    hours_per_day = (
        df.groupby(['encounter_block', 'calendar_day'])
          .agg({col: 'sum' for col in criteria_columns})
          .reset_index()
    )

    # Restrict to Day 1, Day 2 and Day 3.
    hours_per_day = hours_per_day[hours_per_day['calendar_day'].isin([1, 2, 3])]

    # Average across encounter blocks for each day.
    avg_hours_by_day = (
        hours_per_day.groupby('calendar_day')
                     .agg({col: 'mean' for col in criteria_columns})
                     .reset_index()
    )

    return avg_hours_by_day

# Eligibility flags summarised per day.
criteria_columns = ['patel_flag', 'team_flag', 'any_yellow_or_green_no_red', 'all_green']

# Mean hours per block on Days 1-3 for each criterion, tagged with the site name.
avg_hours_by_day = compute_avg_hours_by_day(visualization_df, criteria_columns)
avg_hours_by_day['site_name'] = pyCLIF.helper["site_name"]

# Timestamped export so repeated runs do not overwrite each other.
avg_hours_by_day.to_csv(
    f'../output/final/avg_hours_by_day_72h_{pyCLIF.helper["site_name"]}_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.csv',
    index=False,
)

def plot_avg_hours_by_day_bar(avg_hours_by_day, criteria_columns):
    """Save a grouped bar chart of average eligible hours per day and criterion.

    Parameters
    ----------
    avg_hours_by_day : table with a 'calendar_day' column and one column per
        entry of `criteria_columns`.
    criteria_columns : criterion columns to plot (one bar colour each).

    The figure is written to
    ../output/final/graphs/avg_hours_by_day_72h_<site_name>.png and then
    closed; nothing is returned.
    """
    # Long format: one row per (day, criterion) pair, as seaborn expects.
    melted_df = avg_hours_by_day.melt(
        id_vars='calendar_day',
        value_vars=criteria_columns,
        var_name='Criteria',
        value_name='Average Hours Met',
    )

    plt.figure(figsize=(10, 6))
    sns.barplot(x='calendar_day', y='Average Hours Met', hue='Criteria', data=melted_df, palette='viridis')

    # Label ticks from the days actually present rather than assuming that
    # Days 1-3 all exist, so bars are never mislabelled when a day is missing.
    # seaborn places the sorted category values at positions 0..n-1.
    days_present = sorted(melted_df['calendar_day'].dropna().unique())
    plt.xticks(
        ticks=list(range(len(days_present))),
        labels=[f"Day {int(day)}" for day in days_present],
    )

    plt.title('Average Hours Criteria Met per Day')
    plt.xlabel('Calendar Day')
    plt.ylabel('Average Hours Criteria Met')

    # Legend below the axes so it does not cover the bars.
    plt.legend(title='Criteria', loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3)

    plt.tight_layout()
    plt.savefig(f'../output/final/graphs/avg_hours_by_day_72h_{pyCLIF.helper["site_name"]}.png')
    plt.close()

# Draw the grouped bar chart for the df_72h-based averages; the helper saves
# the PNG under ../output/final/graphs/ and closes the figure.
plot_avg_hours_by_day_bar(avg_hours_by_day, criteria_columns)

### Parallel categories plot

In [None]:
## Useful for EDA
# Hour-level table of the four eligibility flags, recoded to plain 0/1.
parallel_df = final_df[['patel_flag', 'team_flag', 'any_yellow_or_green_no_red', 'all_green']].copy()
for _criterion in parallel_df.columns:
    # Truthiness recode: any truthy value becomes 1.
    # NOTE(review): NaN is truthy in Python, so a missing flag would be counted as 1.
    parallel_df[_criterion] = parallel_df[_criterion].apply(lambda x: 1 if x else 0)

# One ribbon per combination of flag values, coloured by the Patel flag.
fig = px.parallel_categories(
    parallel_df,
    dimensions=['patel_flag', 'team_flag', 'any_yellow_or_green_no_red', 'all_green'],
    color="patel_flag",
    labels={
        'patel_flag': 'Patel Met',
        'team_flag': 'TEAM Met',
        'any_yellow_or_green_no_red': 'Yellow Flag',
        'all_green': 'Green Flag',
    },
    color_continuous_scale=px.colors.sequential.Inferno,
)
fig.update_layout(title="Parallel Categories Plot: Comparison of Criteria Satisfaction")

# Static export of the figure.
fig.write_image(f'../output/final/graphs/parallel_categories_{pyCLIF.helper["site_name"]}.png')


In [None]:
# Sanity check: hours on which TEAM is met although Patel is not.
patel_fail_team_pass = final_df.loc[
    final_df['patel_flag'].eq(0) & final_df['team_flag'].eq(1)
]
print(f"\nTotal number of hours where Patel failed and Team passed: {len(patel_fail_team_pass)}\n")

if not patel_fail_team_pass.empty:
    print("Primary cause of Patel Criteria non-compliance")
    # Number of those hours on which each Patel sub-flag equals 0. The counts
    # are not mutually exclusive: one hour can fail several sub-criteria.
    failure_counts = {
        label: (patel_fail_team_pass[flag_col] == 0).sum()
        for label, flag_col in (
            ('MAP', 'patel_map_flag'),
            ('SBP', 'patel_sbp_flag'),
            ('Pulse', 'patel_pulse_flag'),
            ('Respiratory Rate', 'patel_resp_rate_flag'),
            ('SpO2', 'patel_spo2_flag'),
        )
    }
    failure_df = pd.DataFrame({
        'Criteria': list(failure_counts.keys()),
        'Count': list(failure_counts.values()),
    })
    failure_df.to_csv(
        f'../output/final/patel_fail_team_pass_subcomponents_{pyCLIF.helper["site_name"]}_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.csv',
        index=False,
    )
    print(failure_df)

### Yellow-Green spectrum criteria distribution

In [None]:
# Columns treated as green / yellow sub-criteria: every column named
# green_*_flag or yellow_*_flag.
# NOTE(review): this name match also picks up yellow_resp_flag and
# yellow_cardio_flag, which look like composites rather than single
# sub-criteria -- confirm they should be counted.
green_cols  = [c for c in final_df.columns if c.startswith("green_") and c.endswith("_flag")]
yellow_cols = [c for c in final_df.columns if c.startswith("yellow_") and c.endswith("_flag")]

# One row per block: the earliest hour at which the block was eligible.
# drop_duplicates(keep="first") keeps that hour's row intact, whereas
# GroupBy.first() returns the first *non-null* value of each column
# separately and could therefore mix values from different hours.
first_hit = (
    final_df.loc[final_df["any_yellow_or_green_no_red"] == 1]
             .sort_values(["encounter_block", "time_from_vent"])
             .drop_duplicates(subset="encounter_block", keep="first")
             .set_index("encounter_block")
)

# Count how many green / yellow sub-criteria were satisfied at that hour.
first_hit["n_green"]  = first_hit[green_cols].sum(axis=1)
first_hit["n_yellow"] = first_hit[yellow_cols].sum(axis=1)

# yellow fraction: 0 = all satisfied criteria were green, 1 = all yellow
first_hit["yellow_frac"] = (
    first_hit["n_yellow"] /
    (first_hit["n_green"] + first_hit["n_yellow"])
).fillna(0)              # 0/0 (nothing satisfied) becomes 0

# green fraction: 1 = all satisfied criteria were green, 0 = all yellow
first_hit["green_frac"] = (
    first_hit["n_green"] /
    (first_hit["n_green"] + first_hit["n_yellow"])
).fillna(0)              # 0/0 (nothing satisfied) becomes 0

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

import textwrap  # stdlib; used to pre-wrap the caption (see step 4)

# ------------------------------------------------------------
# 1.  Build the jittered scatter data
# ------------------------------------------------------------
x = np.random.normal(0, 0.002, size=len(first_hit))   # tiny horizontal jitter
y = first_hit["green_frac"].values                    # 1 = pure green, 0 = pure yellow

# ------------------------------------------------------------
# 2.  Custom colormap: pure yellow (0)  ->  pure green (1)
# ------------------------------------------------------------
green_yellow = mcolors.LinearSegmentedColormap.from_list(
    "YellowGreen", ["#ffeb3b", "#2ca02c"]   #   0 (yellow)   ->   1 (green)
)

# ------------------------------------------------------------
# 3.  Plot
# ------------------------------------------------------------
fig, ax = plt.subplots(figsize=(6, 4))

sc = ax.scatter(x, y,
                s=14, alpha=0.7,
                c=y, cmap=green_yellow, vmin=0, vmax=1)

ax.set_xlim(-0.02, 0.02)
ax.set_xticks([])          # x only carries jitter, so hide its ticks
ax.set_ylim(0, 1)
ax.set_ylabel("Green fraction  (1=pure green  |  0=pure yellow)")
ax.set_title("Eligibility colour spectrum per encounter block", pad=12)

cbar = fig.colorbar(sc, ax=ax, pad=0.02, shrink=0.8)
cbar.set_label("Green fraction")

# ------------------------------------------------------------
# 4.  Caption
# ------------------------------------------------------------
caption = (
    "Each dot = first eligible hour of an encounter block. "
    "Vertical position/colour show the fraction of satisfied criteria that were "
    "GREEN (physiologically safer) versus YELLOW (less conservative). "
    "Horizontal spread is tiny random jitter to avoid overplotting; the x-axis has no meaning."
)
# The caption sits below the figure canvas, so the file has to be saved with
# bbox_inches="tight" or the text is cropped away. Matplotlib's automatic
# wrap=True does not work together with a tight bounding box, so the text is
# wrapped manually instead.
fig.text(0.01, -0.02, textwrap.fill(caption, width=95),
         ha="left", va="top", fontsize=9)
fig.savefig(
    f'../output/final/graphs/yellow_eligibility_colour_spectrum_{pyCLIF.helper["site_name"]}.png',
    bbox_inches="tight",
)
plt.close(fig)

### Sensitivity analysis: Weekends- WIP