# Resample cleaned chartevents chunks

The purpose of this script is to resample the previously cleaned and chunked CHARTEVENTS data.

The vital parameter value series and the alarm threshold value series originally have inconsistent sampling rates. However, forecasting based on these data requires a consistent sampling rate. Therefore, the series are resampled.

In [None]:
import pandas as pd
import pyarrow as pa

# Load the cleaned CHARTEVENTS table (values and thresholds, with chunk IDs) from its parquet file
chartevents_clean_values_and_thresholds_with_chunkid_65 = pd.read_parquet(
    '../data/chartevents_clean_values_and_thresholds_with_chunkid_65.parquet',
    engine='pyarrow',
)

In [None]:
import numpy as np
import pandas as pd
import pyarrow as pa

# Work on the cleaned CHARTEVENTS table that was read from the parquet file.
chartevents = chartevents_clean_values_and_thresholds_with_chunkid_65

# unique_chunkids = chartevents.CHUNK_ID_FILLED_TH.unique()
unique_chunkids = ['296490.0_220045.0_2192-09-26 23:51:00'] # To run on selected chunks

# Create data frame with the vital parameter ITEMIDs and associated alarm threshold ITEMIDs
parameters = pd.DataFrame({
    'VITAL_PARAMETER_LABEL':                    ['HR',      'NBPs',     'SpO2'],
    'VITAL_PARAMETER_ITEMID_VALUE':             [220045,    220179,     220277],
    'VITAL_PARAMETER_ITEMID_THRESHOLD_HIGH':    [220046,    223751,     223769],
    'VITAL_PARAMETER_ITEMID_THRESHOLD_LOW':     [220047,    223752,     223770]})

# Target sampling interval of 60 min (1 hour).
# A Timedelta is passed to resample() instead of the alias string '1H', because the upper-case 'H' alias is deprecated in recent pandas versions.
RESAMPLING_INTERVAL = pd.Timedelta(hours=1)

# Columns holding the alarm threshold value series; these are forward filled after the merge.
THRESHOLD_COLUMNS = ['THRESHOLD_VALUE_HIGH', 'THRESHOLD_VALUE_LOW']


def _get_value_series(events, itemid):
    """Return the VALUENUM_CLEAN values of one ITEMID as a series indexed by CHARTTIME.

    Parameters
    ----------
    events : pd.DataFrame
        Rows to select from; must provide the columns ITEMID, CHARTTIME and VALUENUM_CLEAN.
    itemid : int
        ITEMID whose rows are selected.

    Returns
    -------
    pd.Series
        VALUENUM_CLEAN values sorted by CHARTTIME, indexed by CHARTTIME and named after the ITEMID.
    """
    return (
        events.loc[events['ITEMID'] == itemid, ['CHARTTIME', 'VALUENUM_CLEAN']]
        .sort_values(by=['CHARTTIME'])
        .set_index('CHARTTIME')
        .squeeze(axis=1)  # Single-column data frame -> series (axis=1 keeps a one-row result a series)
        .rename(itemid)
    )


# Dictionaries are used to organize the data within the for loops (all_chunks_dict, current_chunk_dict, current_chunk_parameter_dict).
# After an iteration, the nested dictionary is transformed into a data frame using pd.concat(), so that eventually a flat table can be stored instead of a nested dictionary.

all_chunks_dict = dict()

for chunkid in unique_chunkids:

    # Select the rows of the current chunk once, instead of re-filtering the whole table for every extracted series.
    current_chunk_events = chartevents[chartevents['CHUNK_ID_FILLED_TH'] == chunkid]

    current_chunk_dict = dict()

    for parameter in parameters.itertuples(index=False):

        # Get vital parameter value series for current chunk/ vital parameter combination and ...
        # ... resample it to the target interval using different methods when downsampling (median, mean, max, min).
        vital_parameter_values = _get_value_series(current_chunk_events, parameter.VITAL_PARAMETER_ITEMID_VALUE)
        resampled_values = vital_parameter_values.resample(RESAMPLING_INTERVAL)

        # NOTE: The column names contain the typo 'PARAMTER'. They are kept unchanged because they end up in chartevents_resampled,
        # so renaming them would change the schema of the stored result.
        current_chunk_parameter_dict = {
            'VITAL_PARAMTER_VALUE_MEDIAN_RESAMPLING': resampled_values.median(),
            'VITAL_PARAMTER_VALUE_MEAN_RESAMPLING': resampled_values.mean(),
            'VITAL_PARAMTER_VALUE_MAX_RESAMPLING': resampled_values.max(),
            'VITAL_PARAMTER_VALUE_MIN_RESAMPLING': resampled_values.min(),
            # Alarm threshold value series for current chunk/ vital parameter combination (not resampled)
            'THRESHOLD_VALUE_HIGH': _get_value_series(current_chunk_events, parameter.VITAL_PARAMETER_ITEMID_THRESHOLD_HIGH),
            'THRESHOLD_VALUE_LOW': _get_value_series(current_chunk_events, parameter.VITAL_PARAMETER_ITEMID_THRESHOLD_LOW),
        }

        # Merge resampled vital parameter value series with associated alarm threshold value series into new data frame current_chunk_parameter_df.
        # The rows are sorted by timestamp explicitly, because the forward fill below depends on chronological row order.
        current_chunk_parameter_df = pd.concat(current_chunk_parameter_dict, axis=1).sort_index()

        # Fill missing values of the alarm threshold value series with the last available value (forward fill).
        # If there is no previous value available, no value is inserted. The value remains NaN.
        # The result is assigned back explicitly: an in-place call on a selected column does not reliably modify the data frame (pandas Copy-on-Write).
        current_chunk_parameter_df[THRESHOLD_COLUMNS] = current_chunk_parameter_df[THRESHOLD_COLUMNS].ffill()

        # Filter for rows where the vital parameter value series are not NaN.
        # This removes the rows with an irregular timestamp originating from the merge with the alarm threshold series.
        # This step must not be performed earlier, because those rows are needed for the preceding forward fill of the alarm threshold value series.
        current_chunk_parameter_df = current_chunk_parameter_df[current_chunk_parameter_df['VITAL_PARAMTER_VALUE_MEDIAN_RESAMPLING'].notna()]

        current_chunk_dict[parameter.VITAL_PARAMETER_LABEL] = current_chunk_parameter_df.reset_index()

    # Transform nested dictionary current_chunk_dict into a data frame using pd.concat()
    all_chunks_dict[chunkid] = pd.concat(current_chunk_dict, axis=0).reset_index(level=0).rename(columns={'level_0':'VITAL_PARAMETER_NAME'})

# Transform nested dictionary all_chunks_dict into a data frame using pd.concat()
chartevents_resampled = pd.concat(all_chunks_dict, axis=0).reset_index(level=0).rename(columns={'level_0':'CHUNK_ID_FILLED_TH'})

display(chartevents_resampled)

In [None]:
# The following illustrates how, after resampling the vital parameter value series, the respective alarm threshold values are determined.
# NOTE: The triple-quoted blocks below are bare string literals, not executed statements.
# Each one quotes the statement(s) of the resampling cell that produce the table shown beneath it.
# For brevity, the tables show only the median-resampled column of the vital parameter value series.

# 1. Example data frame excerpt as it exists for a particular chunk/ vital parameter combination ...
# ... after the resampled vital parameter value series have been merged with the associated alarm threshold value series
"""
current_chunk_parameter_df = pd.concat(current_chunk_parameter_dict, axis=1)
"""
# CHARTTIME           | VITAL_PARAMTER_VALUE_MEDIAN_RESAMPLING | THRESHOLD_VALUE_HIGH | THRESHOLD_VALUE_LOW
# ---------------------------------------------------------------------------------------------------------
# 2192-09-26 23:00:00 |                                   95.0 |                  NaN |                 NaN
# 2192-09-26 23:50:00 |                                    NaN |                120.0 |                60.0
# 2192-09-27 00:00:00 |                                   90.5 |                  NaN |                 NaN
# 2192-09-27 01:00:00 |                                   91.0 |                  NaN |                 NaN
# 2192-09-27 02:00:00 |                                   91.0 |                  NaN |                 NaN
# 2192-09-27 03:00:00 |                                   85.0 |                  NaN |                 NaN
#                 ... |                                    ... |                  ... |                 ...


# 2. Data frame after interpolating, i.e. forward filling, the alarm threshold values.
# The first row (23:00:00) keeps NaN thresholds, because no earlier threshold value exists to fill from.
"""
current_chunk_parameter_df['THRESHOLD_VALUE_HIGH'].interpolate('pad', inplace=True)
current_chunk_parameter_df['THRESHOLD_VALUE_LOW'].interpolate('pad', inplace=True)
"""
# CHARTTIME           | VITAL_PARAMTER_VALUE_MEDIAN_RESAMPLING | THRESHOLD_VALUE_HIGH | THRESHOLD_VALUE_LOW
# ---------------------------------------------------------------------------------------------------------
# 2192-09-26 23:00:00 |                                   95.0 |                  NaN |                 NaN
# 2192-09-26 23:50:00 |                                    NaN |                120.0 |                60.0
# 2192-09-27 00:00:00 |                                   90.5 |                120.0 |                60.0
# 2192-09-27 01:00:00 |                                   91.0 |                120.0 |                60.0
# 2192-09-27 02:00:00 |                                   91.0 |                120.0 |                60.0
# 2192-09-27 03:00:00 |                                   85.0 |                120.0 |                60.0
#                 ... |                                    ... |                  ... |                 ...


# 3. Data frame after filtering for rows where the vital parameter value series are not NaN.
# This removes the rows with an irregular timestamp originating from the merge with the alarm threshold series.
# In this example, the 23:50:00 row is dropped; its threshold values have already been carried forward in step 2.
"""
current_chunk_parameter_df = current_chunk_parameter_df[current_chunk_parameter_df['VITAL_PARAMTER_VALUE_MEDIAN_RESAMPLING'].notna()]
"""
# CHARTTIME           | VITAL_PARAMTER_VALUE_MEDIAN_RESAMPLING | THRESHOLD_VALUE_HIGH | THRESHOLD_VALUE_LOW
# ---------------------------------------------------------------------------------------------------------
# 2192-09-26 23:00:00 |                                   95.0 |                  NaN |                 NaN
# 2192-09-27 00:00:00 |                                   90.5 |                120.0 |                60.0
# 2192-09-27 01:00:00 |                                   91.0 |                120.0 |                60.0
# 2192-09-27 02:00:00 |                                   91.0 |                120.0 |                60.0
# 2192-09-27 03:00:00 |                                   85.0 |                120.0 |                60.0
#                 ... |                                    ... |                  ... |                 ...


In [None]:
import pandas as pd
import pyarrow as pa

# Persist the resampled data frame as a parquet file
chartevents_resampled.to_parquet(
    '../data/chartevents_resampled.parquet',
    engine='pyarrow',
)