## import data and merge readings

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
dynamic_df = pd.read_csv("../dynamic_info_cleaned.csv")

In [3]:
def merge_readings(row, col_name_from, col_name_into):
    """Merge readings from 2 columns of data:
    
    if col_name_from's value is NaN, it is replaced with value 
    from col_name_into
    """
    if not np.isnan(row[col_name_from]):
        return row[col_name_from]
    return row[col_name_into]

chart_cols = [
    'cvp', 
    'temperature', 
    'heartrate', 
    'respiration',
    'systemicsystolic',
    'systemicdiastolic',
    'systemicmean'      
]
nurse_cols = [
    'CVP - CVP', 
    'Temperature - Temperature (C)', 
    'Heart Rate - Heart Rate', 
    'Respiratory Rate - Respiratory Rate',
    'Invasive BP - Invasive BP Systolic',
    'Invasive BP - Invasive BP Diastolic', 
    'Invasive BP - Invasive BP Mean'
]

# Merge chart and nurse readings into a single column, followed by removing 
# original columns
for i in range(7):
    dynamic_df[chart_cols[i] + "_merged"] = dynamic_df.apply(
        lambda x:merge_readings(x, nurse_cols[i], chart_cols[i]), 
        axis=1
    )
dynamic_df = dynamic_df.drop(columns=chart_cols+nurse_cols)

In [4]:
# Merging in non-invasive BP measurements as well to reduce the number of 
# missing BP measurements
nurse_cols = [
    'systemicsystolic_merged', 
    'systemicdiastolic_merged', 
    'systemicmean_merged',
]
chart_cols = [
    'Non-Invasive BP - Non-Invasive BP Systolic', 
    'Non-Invasive BP - Non-Invasive BP Diastolic', 
    'Non-Invasive BP - Non-Invasive BP Mean',
]

final_col_names = [
    'Invasive and Non-Invasive BP Systolic merged',
    'Invasive and Non-Invasive BP Diastolic merged',
    'Invasive and Non-Invasive BP Mean merged',
]

for i in range(3):
    dynamic_df[final_col_names[i]] = dynamic_df.apply(
        lambda x:merge_readings(x, nurse_cols[i], chart_cols[i]), 
        axis=1
    )

# (dynamic_df["cvp"].isna() | dynamic_df['CVP - CVP'].isna()).value_counts()

In [5]:
has_hypertension = {}
for uid in dynamic_df["patientunitstayid"].unique():
    has_hypertension[uid] = int(any(
                                dynamic_df[dynamic_df["patientunitstayid"] == uid]["icp"]
                                .rolling(6)
                                .apply(lambda x: (x > 22).sum() > 4, raw=True)
                                .dropna()
                            ))

## Define the ICPV columns

In [6]:
df_a = dynamic_df[[
    "patientunitstayid",
    "observationoffset",
    "icp",
    "sao2",
    "etco2",
    "pasystolic",
    "padiastolic",
    "pamean",
    "Glasgow coma score - GCS Total",
    "sodium",
    "glucose",
    "cvp_merged",
    "temperature_merged",
    "heartrate_merged",
    "respiration_merged",
    "Invasive and Non-Invasive BP Systolic merged",
    "Invasive and Non-Invasive BP Diastolic merged",
    "Invasive and Non-Invasive BP Mean merged",
    "Intracranial operations_cumsum",
]].copy()
##calculate CPP
df_a['CPP'] = df_a["Invasive and Non-Invasive BP Mean merged"] - df_a['icp']

In [7]:
##RSD Definition
df_a['stddev_roll_10'] = df_a['icp'].rolling(2).std()
df_a['stddev_roll_15'] = df_a['icp'].rolling(3).std()
df_a['stddev_roll_20'] = df_a['icp'].rolling(4).std()
df_a['stddev_roll_30'] = df_a['icp'].rolling(6).std()
df_a['stddev_roll_45'] = df_a['icp'].rolling(9).std()
df_a['stddev_roll_60'] = df_a['icp'].rolling(12).std()
df_a['stddev_roll_70'] = df_a['icp'].rolling(14).std()
df_a['stddev_roll_80'] = df_a['icp'].rolling(16).std()
df_a['stddev_roll_90'] = df_a['icp'].rolling(18).std()

In [8]:
df_a['mean_roll_10_min'] = df_a['icp'].rolling(2).mean()
df_a['icpv_new_10'] = np.abs(df_a['icp'] - df_a['mean_roll_10_min'])

df_a['mean_roll_15_min'] = df_a['icp'].rolling(3).mean()
df_a['icpv_new_15'] = np.abs(df_a['icp'] - df_a['mean_roll_15_min'])

df_a['mean_roll_20_min'] = df_a['icp'].rolling(4).mean()
df_a['icpv_new_20'] = np.abs(df_a['icp'] - df_a['mean_roll_20_min'])

df_a['mean_roll_30_min'] = df_a['icp'].rolling(6).mean()
df_a['icpv_new_30'] = np.abs(df_a['icp'] - df_a['mean_roll_30_min'])

df_a['mean_roll_45_min'] = df_a['icp'].rolling(9).mean()
df_a['icpv_new_45'] = np.abs(df_a['icp'] - df_a['mean_roll_45_min'])

df_a['mean_roll_60_min'] = df_a['icp'].rolling(12).mean()
df_a['icpv_new_60'] = np.abs(df_a['icp'] - df_a['mean_roll_60_min'])

df_a['mean_roll_70_min'] = df_a['icp'].rolling(14).mean()
df_a['icpv_new_70'] = np.abs(df_a['icp'] - df_a['mean_roll_70_min'])

df_a['mean_roll_80_min'] = df_a['icp'].rolling(16).mean()
df_a['icpv_new_80'] = np.abs(df_a['icp'] - df_a['mean_roll_80_min'])

df_a['mean_roll_90_min'] = df_a['icp'].rolling(18).mean()
df_a['icpv_new_90'] = np.abs(df_a['icp'] - df_a['mean_roll_90_min'])

In [10]:
##mean ICP per patient -- not for LSTM analysis
df_a['ICPmean'] = df_a.groupby('patientunitstayid')['icp'].transform('mean')
df_a['icpv_mean_def2_15'] = df_a.groupby('patientunitstayid')['icpv_new_15'].transform('mean')
df_a['icpv_var_def2_15'] = df_a.groupby('patientunitstayid')['icpv_new_15'].transform('var')
df_a['icpv_mean_def2_20'] = df_a.groupby('patientunitstayid')['icpv_new_20'].transform('mean')
df_a['icpv_var_def2_20'] = df_a.groupby('patientunitstayid')['icpv_new_20'].transform('var')
df_a['icpv_mean_def2_30'] = df_a.groupby('patientunitstayid')['icpv_new_30'].transform('mean')
df_a['icpv_var_def2_30'] = df_a.groupby('patientunitstayid')['icpv_new_30'].transform('var')
df_a['icpv_mean_def2_45'] = df_a.groupby('patientunitstayid')['icpv_new_45'].transform('mean')
df_a['icpv_var_def2_45'] = df_a.groupby('patientunitstayid')['icpv_new_45'].transform('var')
df_a['icpv_mean_def2_60'] = df_a.groupby('patientunitstayid')['icpv_new_60'].transform('mean')
df_a['icpv_var_def2_60'] = df_a.groupby('patientunitstayid')['icpv_new_60'].transform('var')
df_a['icpv_mean_def1_15'] = df_a.groupby('patientunitstayid')['stddev_roll_15'].transform('mean')
df_a['icpv_var_def1_15'] = df_a.groupby('patientunitstayid')['stddev_roll_15'].transform('var')
df_a['icpv_mean_def1_20'] = df_a.groupby('patientunitstayid')['stddev_roll_20'].transform('mean')
df_a['icpv_var_def1_20'] = df_a.groupby('patientunitstayid')['stddev_roll_20'].transform('var')
df_a['icpv_mean_def1_30'] = df_a.groupby('patientunitstayid')['stddev_roll_30'].transform('mean')
df_a['icpv_var_def1_30'] = df_a.groupby('patientunitstayid')['stddev_roll_30'].transform('var')
df_a['icpv_mean_def1_45'] = df_a.groupby('patientunitstayid')['stddev_roll_45'].transform('mean')
df_a['icpv_var_def1_45'] = df_a.groupby('patientunitstayid')['stddev_roll_45'].transform('var')
df_a['icpv_mean_def1_60'] = df_a.groupby('patientunitstayid')['stddev_roll_60'].transform('mean')
df_a['icpv_var_def1_60'] = df_a.groupby('patientunitstayid')['stddev_roll_60'].transform('var')

df_a['icpv_mean_def1_10'] = df_a.groupby('patientunitstayid')['stddev_roll_10'].transform('mean')
df_a['icpv_var_def1_10'] = df_a.groupby('patientunitstayid')['stddev_roll_10'].transform('var')
df_a['icpv_mean_def1_70'] = df_a.groupby('patientunitstayid')['stddev_roll_70'].transform('mean')
df_a['icpv_var_def1_70'] = df_a.groupby('patientunitstayid')['stddev_roll_70'].transform('var')
df_a['icpv_mean_def1_80'] = df_a.groupby('patientunitstayid')['stddev_roll_80'].transform('mean')
df_a['icpv_var_def1_80'] = df_a.groupby('patientunitstayid')['stddev_roll_80'].transform('var')
df_a['icpv_mean_def1_90'] = df_a.groupby('patientunitstayid')['stddev_roll_90'].transform('mean')
df_a['icpv_var_def1_90'] = df_a.groupby('patientunitstayid')['stddev_roll_90'].transform('var')
df_a['icpv_mean_def2_10'] = df_a.groupby('patientunitstayid')['icpv_new_10'].transform('mean')
df_a['icpv_var_def2_10'] = df_a.groupby('patientunitstayid')['icpv_new_10'].transform('var')
df_a['icpv_mean_def2_70'] = df_a.groupby('patientunitstayid')['icpv_new_70'].transform('mean')
df_a['icpv_var_def2_70'] = df_a.groupby('patientunitstayid')['icpv_new_70'].transform('var')
df_a['icpv_mean_def2_80'] = df_a.groupby('patientunitstayid')['icpv_new_80'].transform('mean')
df_a['icpv_var_def2_80'] = df_a.groupby('patientunitstayid')['icpv_new_80'].transform('var')
df_a['icpv_mean_def2_90'] = df_a.groupby('patientunitstayid')['icpv_new_90'].transform('mean')
df_a['icpv_var_def2_90'] = df_a.groupby('patientunitstayid')['icpv_new_90'].transform('var')

#df_a['icp_v2'] = df_a['icp'] - df_a['ICPV1']
#df_a['var_v2'] = df_a.groupby('patientunitstayid')['icp_v2'].transform('var')
#df_a['icp_var'] = df_a.groupby('patientunitstayid')['icp'].transform('var')

df_a.to_csv("df_a_dataset.csv")

##if want to create the table for per patient instead of per entry
## df_one_patient = df_a.groupby('patientunitstayid').first()

In [11]:
df_a.shape

(1949970, 84)

## Static DF

In [12]:
static_df = pd.read_csv("../static_info_labelled.csv")

#select which columns you want to use in the static df here:
static_df_used = static_df.dropna(subset=["BMI", "GCS"]).drop(columns=['TBI','label','IH','AIS','CVT','HE','CNC','ethnicity','unitdischargestatus'])

In [13]:
len(static_df_used)

868

## Prepare data

In [14]:
dynamic_df_datathon = df_a[[
    "patientunitstayid", 
    "observationoffset", 
    "icp",
    'stddev_roll_10', 'stddev_roll_15', 'stddev_roll_20', 
    'stddev_roll_30', 'stddev_roll_45', 'stddev_roll_60', 
    'stddev_roll_70', 'stddev_roll_80', 'stddev_roll_90', 
    'icpv_new_10', 'icpv_new_15', 'icpv_new_20', 
    'icpv_new_30', 'icpv_new_45', 'icpv_new_60', 
    'icpv_new_70', 'icpv_new_80', 'icpv_new_90'
]].dropna()

In [15]:
#split into training and testing sets
sample_patients = np.array(list(set(df_a["patientunitstayid"].unique()) & set(static_df_used["patientunitstayid"])))
print(sample_patients.shape)
sets = np.array_split(sample_patients, 5)


(868,)


## Sample Extractor Function

In [16]:
#changing the slide to 1!

def sample_extractor(df, static_df, patients, col_name, input_len=18, label_len=6, offset_len=6, slide=1, max_samples=200):
    """Generates up to <max_samples> samples from patients
    Sample window:
        ------------------------------------------------
        |       input            | offset |    label   |
        ------------------------------------------------
    :param df: dataframe of patient records
    :param static_df: dataframe of patient static records
    :param patients: list of patients to extract records from
    :param col_name: str or list of column names from df to extract as features.
                     Feature columns can only contain numeric values
    :param input_len: int Number of timestamps as input
    :param label_len: int Number of timestamps as output
    :param offset_len: int Number of timestamps to skip between input and label. Defaults to 0
    :param slide: int Number of timestamps to slide the current window to extract the next sample
    :param max_samples: int Maximum number of samples to extract from each patient. Defaults to 50

    :return: np.ndarray of features extracted split as samples with shape=
             (number of samples, input_len + label_len, number of features = len(col_name))
             Input, labels can be separated from the array via:
                input, labels = array[:, :input_len, :], array[:, input_len:, :]
    """
    # list to store extracted dynamic and static samples
    samples = []
    static_samples = []
    
    # keeps track of start position of samples extracted
    starts = []
    for patient in patients:
        # keep track of number of samples extracted from each sample
        sample_count = 0
        
        static_info = static_df[static_df['patientunitstayid']==patient].iloc[:,2:].values.flatten()
        
        # Extract contiguous time blocks by:
        # 1. Extract patient readings
        patient_df = df[df['patientunitstayid'] == patient]
        # 2. calculate change in time in minutes between consecutive observations. 
        diff = patient_df["observationoffset"].diff()
        # 3. If the difference is not equal to 5 mins, that suggests missing/skipped observations
        # diff_index then stores the row index of observations that are taken more than 5 mins
        # after the previous observation, which are the start index of contiguous observations
        diff_index = list(diff[diff != 5].index)
        
        # For each start index of contiguous observations
        for i in range(len(diff_index)):
            # if i is the last index, the time block is i to the last observation
            if i == len(diff_index) - 1:
                feature_time_series = patient_df[col_name].loc[diff_index[i]:, ].to_numpy()
            # else, the time block is from i to before the next start index
            else:
                feature_time_series = patient_df[col_name].loc[diff_index[i]:diff_index[i+1], ].to_numpy()
            
            # if the feature matrix is 1D (when only 1 feature is chosen), make feature matrix 2D
            if len(feature_time_series.shape) == 1:
                feature_time_series = feature_time_series[..., np.newaxis]
            
            # starting from time 0:
            start = 0
            # while there are enough observations to extract another sample from this timeblock
            # or less than <max_samples> samples have been extracted
            while start + input_len + offset_len + label_len < len(feature_time_series) and sample_count < max_samples:
                # Extract input and label features starting from <start> position
                sample = np.concatenate((
                    # <input_len> number of observations as input
                    feature_time_series[start:start+input_len, :],
                    # <label_len> number of observations as label, after <offset_len> 
                    # number of observations skipped
                    feature_time_series[start + input_len + offset_len:start + input_len + offset_len + label_len, :]
                ), axis=0)
                
                # store extracted dynamic and static variables, as well as time offset of 
                # first observation in sample
                samples.append(sample)
                starts.append(start+input_len)
                start += slide
                sample_count += 1
                # Get the patient's static info
                static_samples.append(static_info)
    return np.stack(samples),np.stack(static_samples), np.stack(starts)
  
# input_len = 18
# offset_len = 6
# label_len = 6
# sliding_interval = 1

## Extract the npy files - one by one lol 

In [17]:
dynamic_df_datathon.columns

Index(['patientunitstayid', 'observationoffset', 'icp', 'stddev_roll_10',
       'stddev_roll_15', 'stddev_roll_20', 'stddev_roll_30', 'stddev_roll_45',
       'stddev_roll_60', 'stddev_roll_70', 'stddev_roll_80', 'stddev_roll_90',
       'icpv_new_10', 'icpv_new_15', 'icpv_new_20', 'icpv_new_30',
       'icpv_new_45', 'icpv_new_60', 'icpv_new_70', 'icpv_new_80',
       'icpv_new_90'],
      dtype='object')

In [39]:
# used at training time instead
# def is_sustained_increased_icp(vals):
#     return (vals > 22).sum(axis=1) > 4


# for each of the columns of interest (default setting)
for patients in sets:
    cols_of_int = dynamic_df_datathon.columns[2:]
    
    # Extract samples with all features; feature selection at training time
    dynamic, static_x, _ = sample_extractor(
        dynamic_df_datathon, static_df_used, patients, cols_of_int, 
        input_len=18, label_len=6, offset_len=6, slide=1, max_samples=100
    )
    
    dynamic_x = dynamic[:, :18, :]
    dynamic_y = dynamic[:, 18:, 0]
    
    # Get number of values to replace with 0 based on time (integer at the end of column name)
    # for all columns except icp column(column 0)
    for i, col_of_int in enumerate(cols_of_int[1:]):
        num_zeros = int(col_of_int.split("_")[-1]) // 5 - 1
        dynamic_x[:, :num_zeros, 1 + i] = 0
    
    with open(f'dataset/train_test_allcols_label6_offset6_slide1_max100.npy', 'ab') as f:
        np.save(f, dynamic_x)
        np.save(f, static_x)
        np.save(f, dynamic_y)
        
for patients in sets:
    cols_of_int = dynamic_df_datathon.columns[2:]
    
    # Extract samples with all features; feature selection at training time
    dynamic, static_x, _ = sample_extractor(
        dynamic_df_datathon, static_df_used, patients, cols_of_int, 
        input_len=18, label_len=12, offset_len=0, slide=1, max_samples=100
    )
    
    dynamic_x = dynamic[:, :18, :]
    dynamic_y = dynamic[:, 18:, 0]
    
    # Get number of values to replace with 0 based on time (integer at the end of column name)
    # for all columns except icp column(column 0)
    for i, col_of_int in enumerate(cols_of_int[1:]):
        num_zeros = int(col_of_int.split("_")[-1]) // 5 - 1
        dynamic_x[:, :num_zeros, 1 + i] = 0
    
    with open(f'dataset/train_test_allcols_label12_offset0_slide1_max100.npy', 'ab') as f:
        np.save(f, dynamic_x)
        np.save(f, static_x)
        np.save(f, dynamic_y)

In [38]:
# saving dataset column names, just in case:
with open("dataset/dynam_col_names.txt", "w") as f:
    f.write("\n".join(cols_of_int))

In [40]:
dynamic_x.shape, dynamic_y.shape, static_x.shape

((17170, 18, 19), (17170, 12), (17170, 3))