# Sepsis EHR Data Extraction

In [9]:
import pandas as pd
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import datetime
from tempo_ql import QueryEngine, FileVariableStore, MEDSDataset, GenericDataset, formats, TimeSeriesSet

In [25]:
# Dataset to extract from: 'eicu', 'mimiciv', or 'ehrshot'.
dataset_name = 'ehrshot'
# Per-dataset working directory, relative to this notebook.
data_path = '../data/' + dataset_name

# Set to True to remove the cohort definition if you've already created one using this script.
reset_cohort = False

In [26]:
# Create the per-dataset working directory once, including any missing parent
# directories; exist_ok makes this safe to re-run.
os.makedirs(data_path, exist_ok=True)

# Variable store rooted at <data_path>/_cache; it is handed to the query engine below.
var_store = FileVariableStore(os.path.join(data_path, '_cache'))

# Build the dataset object for the selected source.
if dataset_name == 'eicu':
    dataset = GenericDataset('bigquery://ai-clinician', formats.eicu(),
                             scratch_schema_name='ai-clinician.tempo_ql_scratch',
                             # NOTE(review): presumably converts minute offsets to seconds - confirm
                             time_field_transform=lambda x: x * 60)
elif dataset_name == 'mimiciv':
    dataset = GenericDataset('bigquery://ai-clinician', formats.mimiciv(),
                             scratch_schema_name='ai-clinician.tempo_ql_scratch_mimic')
elif dataset_name == 'ehrshot':
    # MEDS-format parquet files read from data_path, with a DuckDB database
    # file (variables.db) in the same directory as the connection target.
    dataset = MEDSDataset(os.path.join(data_path, "data/*.parquet"),
                          os.path.join(data_path, "metadata/*.parquet"),
                          connection_string='duckdb:///' + os.path.join(data_path, 'variables.db'))
else:
    # Fail immediately instead of leaving `dataset`/`query_engine` undefined
    # and hitting a NameError in a later cell.
    raise ValueError(
        f"Unsupported dataset_name {dataset_name!r}; expected 'eicu', 'mimiciv', or 'ehrshot'"
    )

# Optionally drop a previously stored cohort before the engine is created.
if reset_cohort:
    dataset.reset_trajectory_ids()

query_engine = QueryEngine(dataset, variable_stores=[var_store])



# Cohort Definition

We want to select patients who have an antibiotic administered and a culture taken within 24 hours of each other, or who have a sepsis diagnosis code. We also restrict the cohort to patients who are at least 18 years old and who stay in the ICU for at least 4 hours.

In [12]:
# Read the Gemini API key inside a context manager so the file handle is closed,
# and strip surrounding whitespace so a trailing newline in the key file is not
# passed along as part of the key.
with open('gemini_key.txt', encoding='utf-8') as key_file:
    gemini_api_key = key_file.read().strip()

# Open the interactive widget on a scratch query file with the key configured.
query_engine.interactive(file_path='test.json', api_key=gemini_api_key)

TempoQLWidget(api_status='Configured', file_contents={'Query1': '', 'Query2': '{Gender}'}, ids_length=56897, l…

In [None]:
# Uncomment to start interactive widget to edit cohorts
# query_engine.interactive(file_path=f'queries/cohort_{dataset_name}.json')

In [None]:
# Run every query in the dataset-specific cohort file. The result is indexed
# by query name; its 'Cohort' entry is what the filtering step reads.
cohort_information = query_engine.query_from(
    f'queries/cohort_{dataset_name}.json',
    show_progress=True,
    variable_store=var_store,
)

In [None]:
# Keep only the IDs whose 'Cohort' value is positive, then restrict the
# dataset to those trajectories.
cohort_result = cohort_information['Cohort']
candidate_ids = cohort_result.get_ids()
traj_ids = candidate_ids[cohort_result.get_values() > 0]
print(f"Filtering to {len(traj_ids)} IDs")
dataset.set_trajectory_ids(traj_ids)

# Signal Extraction

In this stage we extract consolidated concepts for each of the variables we are ultimately interested in.

In [None]:
# Uncomment to start interactive widget to edit extracted data
# query_engine.interactive(file_path=f'queries/extraction_{dataset_name}.json')

In [None]:
# Run the dataset-specific extraction queries, using var_store as the variable
# store. The trailing semicolon keeps the notebook from echoing the return value.
query_engine.query_from(
    f'queries/extraction_{dataset_name}.json',
    show_progress=True,
    variable_store=var_store,
);

# Modeling Features

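Finally, we aggregate the features using a timestep definition: every 4 hours, starting at sepsis onset (or at admission when there is no onset before discharge) and ending at discharge or 14 days after admission, whichever comes first.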

In [None]:
# Wrap each query from the shared feature file in the timestep definition
# before it is evaluated: every 4 h, starting at SepsisOnset (only when it
# precedes Discharge, otherwise Admission) and ending at the earlier of
# Discharge and Admission + 14 days.
model_features = query_engine.query_from(
    'queries/model_features.json',
    query_transform=lambda _, query: (
        f"({query}) every 4 h "
        "from ((SepsisOnset where #value < Discharge) impute Admission) "
        "to min(Discharge, Admission + 14 days)"
    ),
    show_progress=True,
)

Model:CVP:  12%|████████████████▍                                                                                                                             | 26/224 [00:16<02:13,  1.48it/s]

def compiled_fn(ids, times, var_c39d46435063b9b=None):  return var_c39d46435063b9b
def compiled_fn(ids, times, var_c39d46435063b9b=None):  return var_c39d46435063b9b


  return var_exp.replace(pd.NA, np.nan).astype(np.float64).where(nan_mask, numpy_func(var_exp.get_values().replace(pd.NA, np.nan).astype(float)))
  return var_exp.replace(pd.NA, np.nan).astype(np.float64).where(nan_mask, numpy_func(var_exp.get_values().replace(pd.NA, np.nan).astype(float)))
  return var_exp.replace(pd.NA, np.nan).astype(np.float64).where(nan_mask, numpy_func(var_exp.get_values().replace(pd.NA, np.nan).astype(float)))
  return var_exp.replace(pd.NA, np.nan).astype(np.float64).where(nan_mask, numpy_func(var_exp.get_values().replace(pd.NA, np.nan).astype(float)))
  return var_exp.replace(pd.NA, np.nan).astype(np.float64).where(nan_mask, numpy_func(var_exp.get_values().replace(pd.NA, np.nan).astype(float)))
  return var_exp.replace(pd.NA, np.nan).astype(np.float64).where(nan_mask, numpy_func(var_exp.get_values().replace(pd.NA, np.nan).astype(float)))
  return var_exp.replace(pd.NA, np.nan).astype(np.float64).where(nan_mask, numpy_func(var_exp.get_values().replace(pd.NA, np

In [None]:
from tempo_ql.data_types import TimeSeriesSet

# Combine the features in sorted-name order so the written file has a
# consistent layout between runs.
feature_names = sorted(model_features.keys())
df = TimeSeriesSet.from_series(
    [model_features[name].rename(name) for name in feature_names]
).serialize()[1]

# Cast the first column to int, rename the first two columns to 'id' and
# 'timestep', and write the table as CSV with 4 significant digits for floats.
(
    df.assign(**{df.columns[0]: df[df.columns[0]].astype(int)})
    .rename(columns={df.columns[0]: 'id', df.columns[1]: 'timestep'})
    .to_csv(
        os.path.join(data_path, "extracted_model_features.csv"),
        index=False,
        float_format='%.4g',
    )
)

# Downstream Targets

Here we define the variables used as predictive targets.

In [17]:
# Open the interactive widget on the predictive-target query file.
# No api_key is passed in this call.
query_engine.interactive(file_path='queries/predictive_targets.json')

TempoQLWidget(api_status='Not configured - please provide a valid Gemini API key', file_contents={'Mortality':…

In [24]:
# Evaluate the predictive-target queries and combine them, in the order the
# result yields them, into a single serialized table.
targets = query_engine.query_from(
    "queries/predictive_targets.json",
    show_progress=True,
)
df = TimeSeriesSet.from_series(
    [targets[name].rename(name) for name in targets]
).serialize()[1]

# Cast the first column to int, rename the first two columns to 'id' and
# 'timestep', and write the table as CSV with 4 significant digits for floats.
(
    df.assign(**{df.columns[0]: df[df.columns[0]].astype(int)})
    .rename(columns={df.columns[0]: 'id', df.columns[1]: 'timestep'})
    .to_csv(
        os.path.join(data_path, "predictive_targets.csv"),
        index=False,
        float_format='%.4g',
    )
)

SOFA: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:59<00:00, 14.92s/it]
