# Eligibility for mobilization: Cohort identification. 

This script identifies the cohort using CLIF 2.0 tables. 

Requirements:
* Required table filenames should be clif_patient, clif_hospitalization, clif_adt, clif_vitals, clif_labs, clif_medication_admin_continuous, clif_respiratory_support

## Load Libraries

In [1]:
import sys
import os
import time
import pandas as pd
import numpy as np
import duckdb
import pyCLIF

import seaborn as sns
import matplotlib.pyplot as plt

Loaded configuration from config.json
{'site_name': 'UCMC', 'tables_path': '/Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19', 'file_type': 'parquet'}


## Required columns and categories

In [2]:
rst_required_columns = [
    'hospitalization_id',
    'recorded_dttm',
    'device_name',
    'device_category',
    'mode_name', 
    'mode_category',
    'tracheostomy',
    'fio2_set',
    'lpm_set',
    'resp_rate_set',
    'peep_set',
    'resp_rate_obs',
    'tidal_volume_set'
]

vitals_required_columns = [
    'hospitalization_id',
    'recorded_dttm',
    'vital_category',
    'vital_value'
]
vitals_of_interest = ['heart_rate', 'resp_rate', 'sbp', 'dbp', 'map', 'resp_rate', 'spo2']

labs_required_columns = [
    'hospitalization_id',
    'lab_result_dttm',
    'lab_category',
    'lab_value',
    'lab_value_numeric'
]
labs_of_interest = ['lactate']

meds_required_columns = [
    'hospitalization_id',
    'admin_dttm',
    'med_name',
    'med_category',
    'med_dose',
    'med_dose_unit'
]
meds_of_interest = [
    'norepinephrine', 'epinephrine', 'phenylephrine', 'vasopressin',
    'dopamine', 'angiotensin', 'nicardipine', 'nitroprusside',
    'clevidipine', 'cisatracurium'
]

## Load data

In [3]:
patient = pyCLIF.load_data('clif_patient')
hospitalization = pyCLIF.load_data('clif_hospitalization')

Data loaded successfully from /Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19/clif_patient.parquet
Data loaded successfully from /Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19/clif_hospitalization.parquet


In [4]:
# Standardize all _dttm variables to the same format
patient = pyCLIF.standardize_datetime(patient)
hospitalization = pyCLIF.standardize_datetime(hospitalization)

In [5]:
patient = pyCLIF.remove_duplicates(patient, ['patient_id'], 'patient')
hospitalization = pyCLIF.remove_duplicates(hospitalization, ['hospitalization_id'], 'hospitalization')

Processing DataFrame: patient
No duplicates found based on columns: ['patient_id'].
Processing DataFrame: hospitalization
No duplicates found based on columns: ['hospitalization_id'].


In [6]:
print(f"Total Number of unique encounters in the data: {pyCLIF.count_unique_encounters(hospitalization, 'hospitalization_id')}")

Total Number of unique encounters in the data: 448402


## Cohort Identification

### Inclusion Criteria:

* Filter Admissions for March 1, 2020 - March 31, 2022
* Encounters receiving invasive mechanical ventilation during this period

### Exclusion criteria:

1. Encounters that were on vent for less than 2 hours
2. Encounters that were on trach in the first 72 hours 
3. Encounters that received Cisatracurium for 4 hours or more

In [34]:
cohort = hospitalization[
    (hospitalization['admission_dttm'] >= '2018-03-01') &
    (hospitalization['admission_dttm'] <= '2023-03-31') &
    (hospitalization['age_at_admission'] >= 18)
].reset_index(drop=True)[['hospitalization_id']].drop_duplicates()

cohort_ids = cohort['hospitalization_id'].unique().tolist()
print(f"Number of unique encounters after filtering by date and age:", cohort['hospitalization_id'].nunique())

Number of unique encounters after filtering by date and age: 265523


In [35]:
# Import clif respiratory table for this cohort
rst_filters = {
    'hospitalization_id': cohort_ids
}
resp_support_raw = pyCLIF.load_data('clif_respiratory_support', columns=rst_required_columns, filters=rst_filters)

Data loaded successfully from /Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19/clif_respiratory_support.parquet


In [36]:
filtered_rst = resp_support_raw[resp_support_raw['hospitalization_id'] == '11633026']

In [9]:
resp_support = resp_support_raw.copy()
resp_support['recorded_dttm'] = pd.to_datetime(resp_support['recorded_dttm'])
resp_support['device_category'] = resp_support['device_category'].str.lower()
resp_support['mode_category'] = resp_support['mode_category'].str.lower()

In [10]:
# Apply Nick's Waterfall fill logic for respiratory support table
# This can take time- 1- 12 mins depending on data size
processed_resp_support = pyCLIF.process_resp_support(resp_support)

Initiating waterfall processing...
Fixing out-of-range values for 'fio2_set', 'peep_set', and 'resp_rate_set'...
Creating recorded_date and recorded_hour...
Sorting data by 'hospitalization_id' and 'recorded_dttm'...
Fixing missing 'device_category' and 'device_name' based on 'mode_category'...
Fixing 'device_category' and 'device_name' based on neighboring records...
Handling duplicates and removing rows with all key variables missing...
Filling forward 'device_category' within each hospitalization...
Creating 'device_cat_id' to track changes in 'device_category'...
Filling 'device_name' within each 'device_cat_id'...
Creating 'device_id' to track changes in 'device_name'...
Filling 'mode_category' within each 'device_id'...
Creating 'mode_cat_id' to track changes in 'mode_category'...
Filling 'mode_name' within each 'mode_cat_id'...
Creating 'mode_name_id' to track changes in 'mode_name'...
Adjusting 'fio2_set' for 'room air' device_category...
Adjusting 'mode_category' for 't-piece'

In [11]:
processed_resp_support = pyCLIF.remove_duplicates(processed_resp_support, 
                                        ['hospitalization_id', 'recorded_dttm',
                                         'device_category','mode_category' ], 
                                         'processed_resp_support')

Processing DataFrame: processed_resp_support
No duplicates found based on columns: ['hospitalization_id', 'recorded_dttm', 'device_category', 'mode_category'].


In [15]:
# Identify the cohort on invasive mechanical ventilation 
columns_to_keep = [
    'hospitalization_id', 'recorded_dttm', 'device_name','device_category',
    'mode_name', 'mode_category' , 'tracheostomy',
    'fio2_set', 'lpm_set', 'peep_set', 
    'resp_rate_obs', 'resp_rate_set'
]

ventilator_usage = processed_resp_support[processed_resp_support['device_category'].str.contains("imv", case=False, na=False)]
cohort_on_vent = ventilator_usage.merge(cohort, on='hospitalization_id', how='left')
cohort_ids = cohort_on_vent['hospitalization_id'].unique().tolist()

cohort_on_vent = cohort_on_vent[columns_to_keep]
cohort_on_vent['on_vent'] = cohort_on_vent['device_category'].str.contains("imv", case=False, na=False).astype(int)
cohort_on_vent = cohort_on_vent.sort_values(by=['hospitalization_id', 'recorded_dttm'])
cohort_on_vent = cohort_on_vent[cohort_on_vent['on_vent'] == 1]

cohort_on_vent.loc[:, 'recorded_dttm'] = pd.to_datetime(cohort_on_vent['recorded_dttm'])
# Apply thresholds and replace values outside these with NaN using .loc[]
# UPDATE THIS TO USE CSV / JSON FROM OUTLIER DIRECTORY
# cohort_on_vent.loc[:, 'fio2_set'] = cohort_on_vent['fio2_set'].where(cohort_on_vent['fio2_set'].between(0.21, 1, inclusive='both'), np.nan)
# Calculate the mean of 'fio2_set', excluding NaN values
fio2_mean = cohort_on_vent['fio2_set'].mean(skipna=True)
print("FIO2_SET MEAN", fio2_mean)
# If the mean is greater than 1, divide 'fio2_set' by 100
if fio2_mean > 1:
    # Only divide values greater than 1 to avoid re-dividing already correct values
    print("Updated fio2_set to be between 0.21 and 1")
    cohort_on_vent.loc[cohort_on_vent['fio2_set'] > 1, 'fio2_set'] = \
        cohort_on_vent.loc[cohort_on_vent['fio2_set'] > 1, 'fio2_set'] / 100

cohort_on_vent.loc[:, 'fio2_set'] = cohort_on_vent['fio2_set'].where(cohort_on_vent['fio2_set'].between(0.21, 1, inclusive='both'), np.nan)
cohort_on_vent.loc[:, 'resp_rate_set'] = cohort_on_vent['resp_rate_set'].where(cohort_on_vent['resp_rate_set'].between(0, 60, inclusive='both'), np.nan)
cohort_on_vent.loc[:, 'peep_set'] = cohort_on_vent['peep_set'].where(cohort_on_vent['peep_set'].between(0, 50, inclusive='both'), np.nan)
cohort_on_vent.loc[:, 'resp_rate_obs'] = cohort_on_vent['resp_rate_obs'].where(cohort_on_vent['resp_rate_obs'].between(0, 100, inclusive='both'), np.nan)
cohort_on_vent.loc[:, 'lpm_set'] = cohort_on_vent['lpm_set'].where(cohort_on_vent['lpm_set'].between(0, 60, inclusive='both'), np.nan)

cohort_on_vent['recorded_date'] = cohort_on_vent['recorded_dttm'].dt.date
cohort_on_vent['recorded_hour'] = cohort_on_vent['recorded_dttm'].dt.hour

print(f"Number of unique encounters after filtering for ventilator usage: {cohort_on_vent['hospitalization_id'].nunique()}")

FIO2_SET MEAN 0.48376030677256426
Number of unique encounters after filtering for ventilator usage: 2755


In [16]:
vent_start_end = cohort_on_vent.groupby('hospitalization_id').agg(
    vent_start_time=('recorded_dttm', 'min'),
    vent_end_time=('recorded_dttm', 'max')
).reset_index()
# Exclude encounters that were on vent for less than 2 hours
vent_start_end = vent_start_end[vent_start_end['vent_start_time'] != vent_start_end['vent_end_time']]

In [17]:
# import required vitals
vitals_filters = {
    'hospitalization_id': cohort_ids,
    'vital_category': vitals_of_interest
}
vitals = pyCLIF.load_data('clif_vitals', columns=vitals_required_columns, filters=vitals_filters)

Data loaded successfully from /Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19/clif_vitals.parquet


In [18]:
# Get first_vital_dttm and last_vital_dttm for each hospitalization_id 
# We use this as proxy for admission and discharge dttm
vital_dttm_bounds = vitals.groupby('hospitalization_id')['recorded_dttm'].agg(['min', 'max']).reset_index()
vital_dttm_bounds.columns = ['hospitalization_id', 'first_vital_dttm', 'last_vital_dttm']
print("unique encounters in vitals", pyCLIF.count_unique_encounters(vital_dttm_bounds))

unique encounters in vitals 2745


## Hourly sequence for the cohort

In [19]:
final_cohort = vent_start_end.merge(vital_dttm_bounds, on='hospitalization_id', how='inner')
print("unique encounters in resp filtered", pyCLIF.count_unique_encounters(final_cohort))

unique encounters in resp filtered 2638


In [20]:
# sanity check - last recorded vital shouldn't be less than vent start time
cases_before_vent_start = final_cohort[final_cohort['last_vital_dttm'] < final_cohort['vent_start_time']]
print("Cases where last vital dttm is before vent_start time:", len(cases_before_vent_start))
cases_before_vent_start

Cases where last vital dttm is before vent_start time: 0


Unnamed: 0,hospitalization_id,vent_start_time,vent_end_time,first_vital_dttm,last_vital_dttm


In [21]:
# Function to generate hourly sequence for each group (hospitalization_id)
def generate_hourly_sequence(group):
    # Get the vent start time and discharge time
    start_time = group['vent_start_time'].iloc[0]
    end_time = group['last_vital_dttm'].iloc[0]
    
    # Generate the sequence of hourly timestamps
    hourly_timestamps = pd.date_range(start=start_time, end=end_time, freq='h')
    
    # Create a new DataFrame for this sequence
    return pd.DataFrame({
        'hospitalization_id': group['hospitalization_id'].iloc[0],
        'recorded_dttm': hourly_timestamps
    })

# Apply the function to each group and concatenate the results
hour_sequence = final_cohort.groupby('hospitalization_id')\
    .apply(generate_hourly_sequence)\
    .reset_index(drop=True)

# Add `recorded_date` and `recorded_hour` columns
# Convert recorded_dttm to datetime sanity check
hour_sequence['recorded_dttm'] = pd.to_datetime(hour_sequence['recorded_dttm'])
hour_sequence['recorded_date'] = hour_sequence['recorded_dttm'].dt.date
hour_sequence['recorded_hour'] = hour_sequence['recorded_dttm'].dt.hour
hour_sequence['time_from_vent'] = hour_sequence.groupby('hospitalization_id').cumcount()

  hour_sequence = final_cohort.groupby('hospitalization_id')\


## Hourly Respiratory support

In [22]:
hourly_vent_df = cohort_on_vent.groupby(['hospitalization_id', 'recorded_date', 'recorded_hour']).agg(
    min_resp_rate_obs=pd.NamedAgg(column='resp_rate_obs', aggfunc='min'),
    min_lpm_set=pd.NamedAgg(column='lpm_set', aggfunc='min'),
    min_fio2_set=pd.NamedAgg(column='fio2_set', aggfunc='min'),
    min_peep_set=pd.NamedAgg(column='peep_set', aggfunc='min'),
    max_resp_rate_obs=pd.NamedAgg(column='resp_rate_obs', aggfunc='max'),
    max_lpm_set=pd.NamedAgg(column='lpm_set', aggfunc='max'),
    max_fio2_set=pd.NamedAgg(column='fio2_set', aggfunc='max'),
    max_peep_set=pd.NamedAgg(column='peep_set', aggfunc='max'),
    hourly_trach=pd.NamedAgg(column='tracheostomy', aggfunc=lambda x: 1 if x.max() == 1 else 0),
    hourly_on_vent=pd.NamedAgg(column='on_vent', aggfunc=lambda x: 1 if x.max() == 1 else 0)
).reset_index()

In [26]:
# Merge hourly_vent_df with hour_sequence on hospitalization_id, recorded_date, and recorded_hour
final_df = pd.merge(hour_sequence, hourly_vent_df, on=['hospitalization_id', 'recorded_date', 'recorded_hour'], 
                     how='left')
print("unique encounters in merged df", pyCLIF.count_unique_encounters(final_df))

unique encounters in merged df 2638


## Hourly Vitals

In [23]:
vitals['recorded_dttm'] = pd.to_datetime(vitals['recorded_dttm'])
vitals['recorded_hour'] = vitals['recorded_dttm'].dt.hour
vitals['recorded_date'] = vitals['recorded_dttm'].dt.date

vitals_min_max = vitals.groupby(['hospitalization_id', 'recorded_date', 'recorded_hour', 'vital_category']).agg(
    min=pd.NamedAgg(column='vital_value', aggfunc='min'),
    max=pd.NamedAgg(column='vital_value', aggfunc='max')
).reset_index()

# Pivot the table to reshape it
vitals_pivot = vitals_min_max.pivot_table(
    index=['hospitalization_id', 'recorded_date', 'recorded_hour'],
    columns='vital_category',
    values=['min', 'max']
).reset_index()

# Flatten the column multi-index after pivot
vitals_pivot.columns = ['_'.join(col).strip() if type(col) is tuple else col for col in vitals_pivot.columns]
# Remove trailing underscores
vitals_pivot.columns = [col.rstrip('_') for col in vitals_pivot.columns]

In [24]:
print(vitals_pivot.columns)

Index(['hospitalization_id', 'recorded_date', 'recorded_hour', 'max_dbp',
       'max_heart_rate', 'max_map', 'max_sbp', 'max_spo2', 'min_dbp',
       'min_heart_rate', 'min_map', 'min_sbp', 'min_spo2'],
      dtype='object')


In [27]:
# merge vitals with the blocked resp support data
final_df = pd.merge(final_df, vitals_pivot, on=['hospitalization_id', 'recorded_date', 'recorded_hour'], 
                   how='left')
final_df.head()

Unnamed: 0,hospitalization_id,recorded_dttm,recorded_date,recorded_hour,time_from_vent,min_resp_rate_obs,min_lpm_set,min_fio2_set,min_peep_set,max_resp_rate_obs,...,max_dbp,max_heart_rate,max_map,max_sbp,max_spo2,min_dbp,min_heart_rate,min_map,min_sbp,min_spo2
0,1000314,2020-07-01 23:30:00-05:00,2020-07-01,23,0,20.0,,1.0,5.0,20.0,...,112.0,126.0,95.0,163.0,100.0,58.0,66.0,80.0,131.0,99.0
1,1000314,2020-07-02 00:30:00-05:00,2020-07-02,0,1,20.0,,0.5,5.0,20.0,...,59.0,114.0,79.0,135.0,98.0,59.0,114.0,79.0,135.0,98.0
2,1000314,2020-07-02 01:30:00-05:00,2020-07-02,1,2,20.0,,0.5,5.0,20.0,...,57.0,109.0,76.0,120.0,95.0,57.0,109.0,76.0,120.0,95.0
3,1000314,2020-07-02 02:30:00-05:00,2020-07-02,2,3,,,,,,...,48.0,99.0,64.0,108.0,92.0,48.0,99.0,64.0,108.0,92.0
4,1000314,2020-07-02 03:30:00-05:00,2020-07-02,3,4,20.0,,0.6,5.0,27.0,...,53.0,92.0,68.0,103.0,95.0,53.0,92.0,68.0,103.0,92.0


In [28]:
final_df.columns

Index(['hospitalization_id', 'recorded_dttm', 'recorded_date', 'recorded_hour',
       'time_from_vent', 'min_resp_rate_obs', 'min_lpm_set', 'min_fio2_set',
       'min_peep_set', 'max_resp_rate_obs', 'max_lpm_set', 'max_fio2_set',
       'max_peep_set', 'hourly_trach', 'hourly_on_vent', 'max_dbp',
       'max_heart_rate', 'max_map', 'max_sbp', 'max_spo2', 'min_dbp',
       'min_heart_rate', 'min_map', 'min_sbp', 'min_spo2'],
      dtype='object')

In [29]:
checkpoint_vitals = pyCLIF.remove_duplicates(final_df, [
    'hospitalization_id', 'recorded_dttm', 'recorded_date', 'recorded_hour',
       'time_from_vent', 'min_resp_rate_obs', 'min_lpm_set', 'min_fio2_set',
       'min_peep_set', 'max_resp_rate_obs', 'max_lpm_set', 'max_fio2_set',
       'max_peep_set', 'hourly_trach', 'hourly_on_vent', 'max_dbp',
       'max_heart_rate', 'max_map', 'max_sbp', 'max_spo2', 'min_dbp',
       'min_heart_rate', 'min_map', 'min_sbp', 'min_spo2'
], 'final_df')

Processing DataFrame: final_df
No duplicates found based on columns: ['hospitalization_id', 'recorded_dttm', 'recorded_date', 'recorded_hour', 'time_from_vent', 'min_resp_rate_obs', 'min_lpm_set', 'min_fio2_set', 'min_peep_set', 'max_resp_rate_obs', 'max_lpm_set', 'max_fio2_set', 'max_peep_set', 'hourly_trach', 'hourly_on_vent', 'max_dbp', 'max_heart_rate', 'max_map', 'max_sbp', 'max_spo2', 'min_dbp', 'min_heart_rate', 'min_map', 'min_sbp', 'min_spo2'].


## Hourly Lab

Get most recent lactate defined as closest lab result time to the start of first intubation event

In [30]:
# Import clif continuous meds and clif labs table for the cohort on vent during the required time period
labs_filters = {
    'hospitalization_id': cohort_ids,
    'lab_category': labs_of_interest
}
labs = pyCLIF.load_data('clif_labs', columns=labs_required_columns, filters=labs_filters)
print("unique encounters in labs", pyCLIF.count_unique_encounters(labs))

Data loaded successfully from /Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19/clif_labs.parquet
unique encounters in labs 2686


In [31]:
labs['lab_result_dttm'] = pd.to_datetime(labs['lab_result_dttm'])
labs['recorded_hour'] = labs['lab_result_dttm'].dt.hour
labs['recorded_date'] = labs['lab_result_dttm'].dt.date

lactate_df = pd.merge(labs, vent_start_end, on='hospitalization_id', how='left')
lactate_df['time_since_vent_start_hours'] = (
    (lactate_df['lab_result_dttm'] - lactate_df['vent_start_time']).dt.total_seconds() / 3600
)

# Calculate the absolute time difference between lab_result_dttm and vent_start_time in hours
lactate_df['time_diff_hours'] = abs((lactate_df['lab_result_dttm'] - lactate_df['vent_start_time']).dt.total_seconds() / 3600)

# Filter for observations within the first 72 hours since vent_start_time
# lactate_df = lactate_df[(lactate_df['time_since_vent_start_hours'] >= 0) & 
#                         (lactate_df['time_since_vent_start_hours'] <= 72)]

# Sort by hospitalization_id, recorded_hour, and time_diff_hours to find the closest measurement to vent_start_time
lactate_df = lactate_df.sort_values(by=['hospitalization_id', 'recorded_date', 'recorded_hour', 'time_diff_hours'])

# Group by hospitalization_id and recorded_hour, and get the first row in each group (which is the closest measurement)
# closest lactate measurement is defined as closest to the vent_start_time in that hour. 
closest_lactate_df = lactate_df.groupby(['hospitalization_id', 'recorded_date','recorded_hour']).first().reset_index()

labs_final = closest_lactate_df[['hospitalization_id', 'recorded_date', 'recorded_hour', 'lab_value_numeric']].copy()

# Rename the 'lab_value_numeric' column to 'lactate'
labs_final = labs_final.rename(columns={'lab_value_numeric': 'lactate'})

final_df = pd.merge(final_df, labs_final, on=['hospitalization_id', 'recorded_date', 'recorded_hour'], 
                   how='left')

# checkpoint_labs = pyCLIF.remove_duplicates(final_df, 
#                                            ['hospitalization_id', 'recorded_dttm', 'recorded_date', 'recorded_hour',
#        'time_from_vent', 'min_resp_rate_obs', 'min_lpm_set', 'min_fio2_set',
#        'min_peep_set', 'max_resp_rate_obs', 'max_lpm_set', 'max_fio2_set',
#        'max_peep_set', 'hourly_trach', 'hourly_on_vent', 'max_dbp',
#        'max_heart_rate', 'max_map', 'max_sbp', 'max_spo2', 'min_dbp',
#        'min_heart_rate', 'min_map', 'min_sbp', 'min_spo2', 'lactate'], 'final_df')
       

In [32]:
final_df.columns
# final_df

Index(['hospitalization_id', 'recorded_dttm', 'recorded_date', 'recorded_hour',
       'time_from_vent', 'min_resp_rate_obs', 'min_lpm_set', 'min_fio2_set',
       'min_peep_set', 'max_resp_rate_obs', 'max_lpm_set', 'max_fio2_set',
       'max_peep_set', 'hourly_trach', 'hourly_on_vent', 'max_dbp',
       'max_heart_rate', 'max_map', 'max_sbp', 'max_spo2', 'min_dbp',
       'min_heart_rate', 'min_map', 'min_sbp', 'min_spo2', 'lactate'],
      dtype='object')

## Hourly Meds

* Exclude encounters that received Cisatracurium for 4 hours or more
* Calculate NE equivalent levels using "norepinephrine", "epinephrine", "phenylephrine", "vasopressin", "dopamine",  "angiotensin"
* Create flags for "nicardipine", "nitroprusside", "clevidipine"


In [38]:
# Import clif continuous meds for the cohort on vent during the required time period
meds_filters = {
    'hospitalization_id': cohort_ids,
    'med_category': meds_of_interest
}
meds = pyCLIF.load_data('clif_medication_admin_continuous', columns=meds_required_columns, filters=meds_filters)
print("unique encounters in meds", pyCLIF.count_unique_encounters(meds))

Data loaded successfully from /Users/kavenchhikara/Desktop/CLIF/CLIF-UCMC/rclif/c19/clif_medication_admin_continuous.parquet
unique encounters in meds 15077


In [39]:
meds['admin_dttm'] = pd.to_datetime(meds['admin_dttm'], format='%Y-%m-%d %H:%M:%S')
meds['med_dose'] = pd.to_numeric(meds['med_dose'], errors='coerce')
# Create 'date' and 'hour_of_day' columns
meds['recorded_date'] = meds['admin_dttm'].dt.date
meds['recorded_hour'] = meds['admin_dttm'].dt.hour

In [48]:
meds.value_counts('med_category')

med_category
norepinephrine    226364
vasopressin        77774
phenylephrine      62007
dopamine           39412
epinephrine        33962
clevidipine        24971
angiotensin        14239
cisatracurium      12093
nicardipine         5569
nitroprusside        546
Name: count, dtype: int64

Handle norepinephrine equivalent calculations and exclusion based on cisatracurium



In [56]:
meds.columns

Index(['hospitalization_id', 'admin_dttm', 'med_name', 'med_category',
       'med_dose', 'med_dose_unit', 'recorded_date', 'recorded_hour'],
      dtype='object')

Exclude encounters that are on cisatracurium for more than 4 hours in the first 72 hours

In [41]:
# Ensure 'admin_dttm' is in datetime format
cisatracurium_filtered = meds[meds['med_category'].str.contains("cisatracurium", case=False, na=False)].drop_duplicates()

# Sort by 'hospitalization_id' and 'admin_dttm'
cisatracurium_filtered = cisatracurium_filtered.sort_values(['hospitalization_id', 'admin_dttm'])

# Define the maximum allowed gap between doses (e.g., 1 hour)
max_gap = pd.Timedelta(hours=1)

# Function to identify continuous periods
def identify_continuous_periods(group):
    group = group.copy()
    group['time_diff'] = group['admin_dttm'].diff()
    group['new_period'] = (group['time_diff'] > max_gap) | (group['time_diff'].isna())
    group['period_id'] = group['new_period'].cumsum()
    return group

# Apply the function to each 'hospitalization_id'
cis_periods = cisatracurium_filtered.groupby('hospitalization_id').apply(identify_continuous_periods).reset_index(drop=True)

# Calculate the duration of each continuous period
period_durations = cis_periods.groupby(['hospitalization_id', 'period_id']).agg(
    period_start=('admin_dttm', 'min'),
    period_end=('admin_dttm', 'max')
).reset_index()

period_durations['period_duration'] = (
    period_durations['period_end'] - period_durations['period_start']
).dt.total_seconds() / 3600  # Convert to hours

# Identify patients with any continuous period >= 4 hours
cis_flag_df = period_durations.groupby('hospitalization_id').agg(
    max_period_duration=('period_duration', 'max')
).reset_index()

cis_flag_df['cis_flag'] = (cis_flag_df['max_period_duration'] >= 4).astype(int)

# Merge 'cis_flag' back to your main DataFrame (e.g., 'vent_start_end')
vent_start_end = vent_start_end.merge(
    cis_flag_df[['hospitalization_id', 'cis_flag']],
    on='hospitalization_id',
    how='left'
)

# Fill NaN values in 'cis_flag' with 0 (patients who didn't meet the criteria)
vent_start_end['cis_flag'] = vent_start_end['cis_flag'].fillna(0).astype(int)


  cis_periods = cisatracurium_filtered.groupby('hospitalization_id').apply(identify_continuous_periods).reset_index(drop=True)


In [40]:
# Filter cisatracurium administrations
cisatracurium_filtered = meds[meds['med_category'].str.contains("cisatracurium", case=False, na=False)].drop_duplicates()
cisatracurium_filtered = cisatracurium_filtered.sort_values(['hospitalization_id', 'admin_dttm'])

# Merge with vent_start_end to get vent_start_time
cisatracurium_filtered = cisatracurium_filtered.merge(
    vent_start_end[['hospitalization_id', 'vent_start_time']], 
    on='hospitalization_id', 
    how='left'
)

# Filter administrations that occurred after vent_start_time
cisatracurium_filtered = cisatracurium_filtered[
    cisatracurium_filtered['admin_dttm'] >= cisatracurium_filtered['vent_start_time']
]

# Group by 'hospitalization_id' and calculate duration
cis_duration = cisatracurium_filtered.groupby('hospitalization_id').agg(
    first_admin=('admin_dttm', 'min'),
    last_admin=('admin_dttm', 'max'),
    vent_start_time=('vent_start_time', 'first')
).reset_index()

# Calculate the duration between first and last cisatracurium administration
cis_duration['cis_admin_duration'] = (
    cis_duration['last_admin'] - cis_duration['first_admin']
).dt.total_seconds() / 3600  # Convert to hours

# Check for continuous administration of cisatracurium for 4 hours or more
cis_duration['cis_flag'] = cis_duration.apply(
    lambda row: 1 if (row['last_admin'] - row['first_admin']).total_seconds() / 3600 >= 4 else 0,
    axis=1
)

In [54]:
cisatracurium_filtered = meds[meds['med_category'].str.contains("cisatracurium", case=False, na=False)].drop_duplicates().sort_values('hospitalization_id')
cisatracurium_filtered = cisatracurium_filtered.merge(vent_start_end, on='hospitalization_id', how='left')
cisatracurium_filtered['duration'] = (cisatracurium_filtered['admin_dttm'] - cisatracurium_filtered['vent_start_time']).dt.total_seconds() / 3600
cisatracurium_filtered['cis_flag'] = cisatracurium_filtered['duration'].apply(lambda x: 1 if x > 4 else 0)


In [None]:
meds_list = [
    "norepinephrine", "epinephrine", "phenylephrine", 
    "vasopressin", "dopamine",  
    "angiotensin", "cisatracurium"
]

#  Filter for Cisatracurium from the meds table - entire dataset


In [None]:
# Pivot the DataFrame to aggregate min and max doses by medication and hour
pivoted_med_df = meds_filtered.pivot_table(
    index=['hospitalization_id', 'recorded_date', 'recorded_hour'],
    columns='med_category',
    values='med_dose',
    aggfunc=['min', 'max']
).reset_index()

# Flatten the MultiIndex columns
pivoted_med_df.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in pivoted_med_df.columns]
# Remove trailing underscores
pivoted_med_df.columns = [col.rstrip('_') for col in pivoted_med_df.columns]

In [50]:
red_meds_list = [
    "nicardipine", "nitroprusside", "clevidipine"
]

# Filter meds_filtered for the medications in red_meds_list
red_meds_df = meds[meds['med_category'].isin(red_meds_list)].copy()

# Create a flag for each medication in red_meds_list
for med in red_meds_list:
    # Create a flag that is 1 if the medication was administered in that hour, 0 otherwise
    red_meds_df[med + '_flag'] = np.where(red_meds_df['med_category'] == med, 1, 0).astype(int)

# Aggregate to get the maximum value for each flag (per hospitalization_id, recorded_date, recorded_hour)
# This ensures that if the medication was administered even once in the hour, the flag is 1
red_meds_flags = red_meds_df.groupby(['hospitalization_id', 'recorded_date', 'recorded_hour']).agg(
    {med + '_flag': 'max' for med in red_meds_list}
).reset_index()

#  combine all flags into a single 'red_meds_flag', you can do so like this:
red_meds_flags['red_meds_flag'] = red_meds_flags[[med + '_flag' for med in red_meds_list]].max(axis=1)

# Select the relevant columns
red_meds_flags_final = red_meds_flags[[
    'hospitalization_id', 'recorded_date', 'recorded_hour',
    'nicardipine_flag', 'nitroprusside_flag',
    'clevidipine_flag', 'red_meds_flag'
]].drop_duplicates(subset=['hospitalization_id', 'recorded_date', 'recorded_hour'])

red_meds_flags_final['nicardipine_flag'] = pd.to_numeric(red_meds_flags_final['nicardipine_flag'], errors='coerce').fillna(0).astype(int)
red_meds_flags_final['nitroprusside_flag'] = pd.to_numeric(red_meds_flags_final['nitroprusside_flag'], errors='coerce').fillna(0).astype(int)
red_meds_flags_final['clevidipine_flag'] = pd.to_numeric(red_meds_flags_final['clevidipine_flag'], errors='coerce').fillna(0).astype(int)
red_meds_flags_final['red_meds_flag'] = pd.to_numeric(red_meds_flags_final['red_meds_flag'], errors='coerce').fillna(0).astype(int)

In [39]:
meds.columns

Index(['hospitalization_id', 'admin_dttm', 'med_name', 'med_category',
       'med_dose', 'med_dose_unit'],
      dtype='object')