# Create clean CHARTEVENTS data set

1. Create chartevents_subset by filtering for relevant ITEMIDs
2. Compute unique ICUSTAY_IDs in chartevents_subset
3. Remove rows with insufficient measurements from chartevents_subset
4. Mark parameter values outside clinically valid ranges
5. Threshold cleaning
  - Identify potential candidates for local threshold swap
  - Prepare data set for local threshold swap
  - Perform local threshold swap
6. Combine cleaned values and thresholds to chartevents_clean

## Create chartevents_subset by filtering for relevant ITEMIDs

* Create subset of MIMIC-III data set called `CHARTEVENTS.csv` (see also respective [MIMIC schema website](https://mit-lcp.github.io/mimic-schema-spy/tables/chartevents.html))
* No change in columns, keep all of them.
* Reduce number of rows by filtering for specific ITEMIDs and removing rows without ICUSTAY_ID

In [None]:
import pandas as pd
import dask.dataframe as dd
import pyarrow as pa
from dask.diagnostics import ProgressBar

# Lazily open CHARTEVENTS.csv as a Dask DataFrame.
# Column types follow the MIMIC schema specification https://mit-lcp.github.io/mimic-schema-spy/tables/chartevents.html
# Problem: Integer columns containing NaNs are awkward in Pandas, see https://pandas.pydata.org/pandas-docs/stable/user_guide/gotchas.html#nan-integer-na-values-and-na-type-promotions
# Decision: Floats and integers are read in as 'float64', strings as 'object', and timestamps via Dask's parse_dates provided for this purpose.
float_columns = [
    'ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'ITEMID', 'CGID',  # int4 according to specification
    'VALUENUM',  # float8 according to specification
    'WARNING', 'ERROR']  # int4 according to specification
string_columns = ['VALUE', 'VALUEUOM', 'RESULTSTATUS', 'STOPPED']

csv_dtypes = {column: 'float64' for column in float_columns}
csv_dtypes.update({column: 'object' for column in string_columns})

chartevents = dd.read_csv('../mimic/CHARTEVENTS.csv', parse_dates=['CHARTTIME','STORETIME'], dtype=csv_dtypes)

# ITEMIDs of the vital parameters and their alarm thresholds that are kept
itemid_filter = [
    220045,  # Heart Rate
    220046,  # Heart rate Alarm - High
    220047,  # Heart Rate Alarm - Low
    220179,  # Non Invasive Blood Pressure systolic
    223751,  # Non-Invasive Blood Pressure Alarm - High
    223752,  # Non-Invasive Blood Pressure Alarm - Low
    220180,  # Non Invasive Blood Pressure diastolic
    220277,  # O2 saturation pulseoxymetry
    223769,  # O2 Saturation Pulseoxymetry Alarm - High
    223770,  # O2 Saturation Pulseoxymetry Alarm - Low
]

with ProgressBar():
    # Keep only rows whose ITEMID is one of the relevant ones
    has_relevant_itemid = chartevents['ITEMID'].isin(itemid_filter)
    chartevents_subset = chartevents[has_relevant_itemid]
    # Remove rows without ICUSTAY_ID (The ICUSTAY_ID is missing in 1811 rows, so these are removed.)
    chartevents_subset = chartevents_subset.dropna(subset=['ICUSTAY_ID'])
    # Keep only rows whose ERROR column equals 0, i.e. no error occurred. (5584 rows are dropped because the boolean ERROR column equals 1, indicating an error.)
    chartevents_subset = chartevents_subset[chartevents_subset['ERROR'] == 0]
    # Execute the lazily defined steps above; the result is a Pandas DataFrame.
    chartevents_subset = chartevents_subset.compute()
    # Computing duration on Marius' laptop (Intel i5-5200U CPU @ 2.20GHz): 21min

# Sort the rows (not essential, but gives a better overview) and reset the index
chartevents_subset = (
    chartevents_subset
    .sort_values(by=['ICUSTAY_ID', 'CHARTTIME','ITEMID'])
    .reset_index(drop=True)
)

# Save as parquet file
chartevents_subset.to_parquet('../data/chartevents_subset.parquet', engine='pyarrow')

## Compute unique ICUSTAY_IDs in chartevents_subset

Create a DataFrame consisting of a single `ICUSTAY_ID` column that lists every unique ICUSTAY_ID found in `chartevents_subset.parquet`.

In [None]:
import pandas as pd
import pyarrow as pa

# Load chartevents_subset from its parquet file into a pandas data frame
chartevents_subset = pd.read_parquet('../data/chartevents_subset.parquet', engine='pyarrow')

# Collect the unique ICU stays of chartevents_subset in a Series named ICUSTAY_ID
unique_icustays_in_chartevents_subset = pd.Series(
    chartevents_subset['ICUSTAY_ID'].unique(),
    name='ICUSTAY_ID')

# Save as parquet file (a Series cannot be written directly, so it is converted to a one-column DataFrame first)
unique_icustays_in_chartevents_subset.to_frame().to_parquet('../data/unique_icustays_in_chartevents_subset.parquet', engine='pyarrow')

## Remove rows with insufficient measurements from chartevents_subset

Keep only those ICU stay/parameter combinations for which more than one vital parameter measurement is available. The alarm threshold rows that belong to a retained combination are kept as well.

In [None]:
import pandas as pd
import pyarrow as pa

# Load chartevents_subset (the filtered CHARTEVENTS rows saved as parquet in the first section) into a pandas data frame
chartevents_subset = pd.read_parquet('../data/chartevents_subset.parquet', engine='pyarrow')

In [None]:
# Only ITEMIDs of the vital parameter values themselves are counted; alarm threshold ITEMIDs are intentionally left out.
itemids = [
    220045,  # Heart Rate
    220179,  # Non Invasive Blood Pressure systolic
    220180,  # Non Invasive Blood Pressure diastolic
    220277,  # O2 saturation pulseoxymetry
]

# Restrict chartevents_subset to the rows and columns needed for the measurement analysis
chartevents_subset_measurement_analysis = chartevents_subset.loc[
    chartevents_subset['ITEMID'].isin(itemids),
    ['ICUSTAY_ID', 'ITEMID', 'VALUENUM']
].copy()

# Count the available (non-null) values per ICUSTAY_ID-ITEMID combination as VALUENUM_COUNT
chartevents_subset_measurement_count = (
    chartevents_subset_measurement_analysis
    .groupby(['ICUSTAY_ID', 'ITEMID'])
    .count()
    .rename(columns={'VALUENUM': 'VALUENUM_COUNT'})
    .reset_index()
)
display(chartevents_subset_measurement_count)

In [None]:
import pandas as pd
import pyarrow as pa

# Persist the per ICUSTAY_ID-ITEMID measurement counts as a parquet file (reloaded in the next cell)
chartevents_subset_measurement_count.to_parquet('../data/chartevents_subset_measurement_count.parquet', engine='pyarrow')

In [None]:
import pandas as pd
import pyarrow as pa

# Load chartevents_subset (filtered CHARTEVENTS rows) from its parquet file
chartevents_subset = pd.read_parquet('../data/chartevents_subset.parquet', engine='pyarrow')

# Load chartevents_subset_measurement_count (VALUENUM_COUNT per ICUSTAY_ID-ITEMID combination) from its parquet file
chartevents_subset_measurement_count = pd.read_parquet('../data/chartevents_subset_measurement_count.parquet', engine='pyarrow')

In [None]:
# Select the ICU stay/parameter combinations that have more than one measurement
has_multiple_measurements = chartevents_subset_measurement_count['VALUENUM_COUNT'] > 1
multimeasurement_icustayid_itemid_value = chartevents_subset_measurement_count.loc[
    has_multiple_measurements,
    ['ICUSTAY_ID', 'ITEMID']
].reset_index(drop=True)

display(multimeasurement_icustayid_itemid_value)

In [None]:
# The filter must retain the alarm thresholds along with the vital parameter values.
# Therefore each value ITEMID is complemented by the ITEMIDs of its high and low threshold.
import pandas as pd
import numpy as np

parameters = pd.DataFrame(
    [
        ('HR',   220045, 220046, 220047),
        ('NBPs', 220179, 223751, 223752),
        ('SpO2', 220277, 223769, 223770),
    ],
    columns=['LABEL', 'VALUE', 'THRESHOLD_HIGH', 'THRESHOLD_LOW'])

multimeasurement_icustayid_itemids = multimeasurement_icustayid_itemid_value.rename(columns={'ITEMID': 'ITEMID_VALUE'}).copy()
# Threshold ITEMID columns start out as NaN; they stay NaN for value ITEMIDs that are not listed in `parameters`
multimeasurement_icustayid_itemids['ITEMID_THRESHOLD_HIGH'] = np.nan
multimeasurement_icustayid_itemids['ITEMID_THRESHOLD_LOW'] = np.nan

# Fill in the threshold ITEMIDs for the rows of each parameter
for parameter in parameters.itertuples(index=False):
    is_parameter = multimeasurement_icustayid_itemids['ITEMID_VALUE'] == parameter.VALUE
    multimeasurement_icustayid_itemids.loc[is_parameter, 'ITEMID_THRESHOLD_HIGH'] = parameter.THRESHOLD_HIGH
    multimeasurement_icustayid_itemids.loc[is_parameter, 'ITEMID_THRESHOLD_LOW'] = parameter.THRESHOLD_LOW

display(multimeasurement_icustayid_itemids)

In [None]:
# Build the data frame used as filter; it consists of the two columns ICUSTAY_ID and ITEMID.

# High threshold ITEMIDs per ICU stay.
# dropna() because there are no thresholds available for Non Invasive Blood Pressure diastolic (220180)
multimeasurement_icustayid_itemid_threshold_high = (
    multimeasurement_icustayid_itemids[['ICUSTAY_ID', 'ITEMID_THRESHOLD_HIGH']]
    .dropna()
    .rename(columns={'ITEMID_THRESHOLD_HIGH': 'ITEMID'})
)

# Low threshold ITEMIDs per ICU stay.
# dropna() because there are no thresholds available for Non Invasive Blood Pressure diastolic (220180)
multimeasurement_icustayid_itemid_threshold_low = (
    multimeasurement_icustayid_itemids[['ICUSTAY_ID', 'ITEMID_THRESHOLD_LOW']]
    .dropna()
    .rename(columns={'ITEMID_THRESHOLD_LOW': 'ITEMID'})
)

# Stack value, high threshold and low threshold combinations into filter_multiple_measurements
filter_multiple_measurements = pd.concat([
    multimeasurement_icustayid_itemid_value,
    multimeasurement_icustayid_itemid_threshold_high,
    multimeasurement_icustayid_itemid_threshold_low,
])

display(filter_multiple_measurements)

In [None]:
# Keep only the chartevents_subset rows whose ICUSTAY_ID-ITEMID combination is listed in filter_multiple_measurements
chartevents_cleaning_02_multiple_measurements_only = chartevents_subset.merge(
    filter_multiple_measurements,
    how='inner',
    on=['ICUSTAY_ID', 'ITEMID'])

display(chartevents_cleaning_02_multiple_measurements_only)

In [None]:
# Report how much the multiple-measurement filter reduced the data set
df_before = chartevents_subset
df_after = chartevents_cleaning_02_multiple_measurements_only

# Number of rows
row_count_before, row_count_after = len(df_before), len(df_after)
row_count_dif = row_count_before - row_count_after

# Number of distinct ICU stays
icustay_count_before = len(df_before['ICUSTAY_ID'].unique())
icustay_count_after = len(df_after['ICUSTAY_ID'].unique())
icustay_count_dif = icustay_count_before - icustay_count_after

# Number of distinct ICUSTAY_ID-ITEMID combinations
icustay_itemid_count_before = len(df_before.groupby(['ICUSTAY_ID','ITEMID']).size())
icustay_itemid_count_after = len(df_after.groupby(['ICUSTAY_ID','ITEMID']).size())
icustay_itemid_count_dif = icustay_itemid_count_before - icustay_itemid_count_after

print(f"The length of the data frame is reduced by {row_count_dif:,} rows from {row_count_before:,} rows to {row_count_after:,} rows.")
print(f"The number of ICUSTAY_IDs is reduced by {icustay_count_dif:,} from {icustay_count_before:,} to {icustay_count_after:,} ICUSTAY_IDs.")
print(f"The number of ICUSTAY_ID-ITEMID combinations is reduced by {icustay_itemid_count_dif:,} from {icustay_itemid_count_before:,} to {icustay_itemid_count_after:,} ICUSTAY_ID-ITEMID combinations.")

In [None]:
import pandas as pd
import pyarrow as pa

# Save chartevents_cleaning_02_multiple_measurements_only as parquet file
chartevents_cleaning_02_multiple_measurements_only.to_parquet('../data/chartevents_cleaning_02_multiple_measurements_only.parquet', engine='pyarrow')

## Mark parameter values outside clinically valid ranges

Values outside the clinically valid ranges are flagged in a new column, not deleted.

In [None]:
import pandas as pd
import pyarrow as pa

# Load chartevents_cleaning_02_multiple_measurements_only (result of the previous cleaning step) from its parquet file
chartevents_cleaning_02_multiple_measurements_only = pd.read_parquet('../data/chartevents_cleaning_02_multiple_measurements_only.parquet', engine='pyarrow')

In [None]:
# Clinically valid value ranges (inclusive lower and upper bound) per vital parameter ITEMID
import numpy as np
valid_value_ranges = {
    220045: (0, 350),  # Heart Rate
    220179: (0, 375),  # Non Invasive Blood Pressure systolic
    220180: (0, 375),  # Non Invasive Blood Pressure diastolic
    220277: (0, 100),  # O2 saturation pulseoxymetry
}

# Add new column CLEANING_FLAG, which is used to mark values outside the respective clinically valid range as "Below valid value range" or "Above valid value range".
flagged_values = chartevents_cleaning_02_multiple_measurements_only[['ROW_ID','ITEMID','VALUENUM']].copy()
# The column is created with object dtype (filled with None) because it will hold strings.
# Writing strings into a float NaN column is deprecated in recent pandas versions.
flagged_values['CLEANING_FLAG'] = None

for itemid, (lower_bound, upper_bound) in valid_value_ranges.items():
    is_item = flagged_values['ITEMID'] == itemid
    # Comparisons with a missing VALUENUM evaluate to False, so missing values are never flagged
    flagged_values.loc[is_item & (flagged_values['VALUENUM'] < lower_bound), 'CLEANING_FLAG'] = "Below valid value range"
    flagged_values.loc[is_item & (flagged_values['VALUENUM'] > upper_bound), 'CLEANING_FLAG'] = "Above valid value range"

# Reduce to relevant columns and rows (only those needed for merging with chartevents_cleaning_02_multiple_measurements_only data frame)
# dropna() removes every row without a flag, so only ROW_IDs of out-of-range values remain
flagged_values = flagged_values.drop(['ITEMID','VALUENUM'], axis = 1).dropna().reset_index(drop=True)

display(flagged_values)

In [None]:
# Summarise how many values were flagged, in total and per direction
is_above = flagged_values['CLEANING_FLAG'] == "Above valid value range"
is_below = flagged_values['CLEANING_FLAG'] == "Below valid value range"

value_affected_count = len(flagged_values)
value_affected_above_count = int(is_above.sum())
value_affected_below_count = int(is_below.sum())

print(f"{value_affected_count:,} values are outside the clinically valid range, {value_affected_above_count:,} above and {value_affected_below_count:,} below.")

In [None]:
import pandas as pd
import pyarrow as pa

# Save flagged_values (ROW_ID and CLEANING_FLAG of the out-of-range values) as parquet file
flagged_values.to_parquet('../data/flagged_values.parquet', engine='pyarrow')

In [None]:
import pandas as pd
import pyarrow as pa

# Load chartevents_cleaning_02_multiple_measurements_only from its parquet file
chartevents_cleaning_02_multiple_measurements_only = pd.read_parquet('../data/chartevents_cleaning_02_multiple_measurements_only.parquet', engine='pyarrow')

# Load flagged_values (ROW_ID and CLEANING_FLAG of the out-of-range values) from its parquet file
flagged_values = pd.read_parquet('../data/flagged_values.parquet', engine='pyarrow')

In [None]:
# Combine chartevents_cleaning_02_multiple_measurements_only with flagged_values into the new data frame
# chartevents_cleaning_03_values_in_valid_range, which is extended/modified further by the following cleaning steps.

# 1. Add column 'VALUENUM_CLEAN' as a copy of 'VALUENUM'
# 2. Attach the CLEANING_FLAG of each flagged ROW_ID (left join, so unflagged rows keep a missing flag)
chartevents_cleaning_03_values_in_valid_range = (
    chartevents_cleaning_02_multiple_measurements_only
    .assign(VALUENUM_CLEAN=chartevents_cleaning_02_multiple_measurements_only['VALUENUM'])
    .merge(flagged_values, how='left', on=['ROW_ID'])
)

# 3. Blank out 'VALUENUM_CLEAN' wherever the value was flagged as below or above the valid range
is_out_of_range = chartevents_cleaning_03_values_in_valid_range['CLEANING_FLAG'].isin(
    ["Below valid value range", "Above valid value range"])
chartevents_cleaning_03_values_in_valid_range.loc[is_out_of_range, 'VALUENUM_CLEAN'] = None

display(chartevents_cleaning_03_values_in_valid_range)

In [None]:
# Demo: list the flagged rows, restricted to the columns relevant for inspection
has_cleaning_flag = chartevents_cleaning_03_values_in_valid_range['CLEANING_FLAG'].notna()
chartevents_cleaning_03_values_in_valid_range.loc[
    has_cleaning_flag,
    ['ROW_ID','ICUSTAY_ID','ITEMID','CHARTTIME','VALUENUM','VALUENUM_CLEAN','CLEANING_FLAG']]

In [None]:
import pandas as pd
import pyarrow as pa

# Save chartevents_cleaning_03_values_in_valid_range (now including VALUENUM_CLEAN and CLEANING_FLAG) as parquet file
chartevents_cleaning_03_values_in_valid_range.to_parquet('../data/chartevents_cleaning_03_values_in_valid_range.parquet', engine='pyarrow')

## Threshold cleaning

In the data of some ICU stays, the high and low thresholds overlap at certain points. For example, during a certain period of time, the threshold for a heart rate that is too high may be below the threshold for a heart rate that is too low, and vice versa.

According to the medical experts consulted, there is no plausible reason for this. In fact, medical devices should not allow the setting of such overlapping alarm thresholds in the first place.

The overlap can show up in different ways. Roughly speaking, we have observed three variants looking at time series plots:

1. The low threshold temporarily exceeds the high threshold, while the latter continues 'normally'.
2. The high threshold value temporarily falls below the low threshold value, while the latter continues 'normally'.
3. Both thresholds temporarily overlap so that they appear swapped, which is 'abnormal' for both.

There are two sub-variants for variant (3):

- 3a The threshold values are swapped, but do not decrease/increase to the same extent, so it is not an exact swap.
- 3b The thresholds are swapped, decreasing/increasing to the same extent, so it looks like an exact swap.

According to the agreement with medical experts, the two threshold values for case 3b (exact swap) are swapped back for the time period affected.

In principle, the same could be done for case 3a. However, this seems very complex, as this case is difficult for us to distinguish from cases 1 and 2.

### Identify potential candidates for local threshold swap

Coarse detection of the ICU stay/parameter combinations for which the local swapping of thresholds is a possible option. The purpose of this preliminary step is to reduce the computational effort by restricting the relatively complex threshold swap step to potentially affected cases.

The coarse detection is done by comparing the minimum high threshold to the maximum low threshold for each ICU stay/parameter combination. If the minimum high threshold is below the maximum low threshold, the ICU stay/parameter combination is considered for threshold swapping.

The output of this section is a data frame that includes the ICU stay/parameter combinations that are possible candidates for swapping. It is to be expected that the number of candidates is higher than the number of swaps eventually performed. The reason for this is that not all threshold overlaps allow a meaningful swap (see following section).

In [None]:
import pandas as pd
import pyarrow as pa

# Load chartevents_cleaning_03_values_in_valid_range (result of the value range cleaning step) from its parquet file
chartevents_cleaning_03_values_in_valid_range = pd.read_parquet('../data/chartevents_cleaning_03_values_in_valid_range.parquet', engine='pyarrow')

In [None]:
import pandas as pd

parameters = pd.DataFrame({
    'LABEL':            ['HR',      'NBPs',     'SpO2'],
    'VALUE':            [220045,    220179,     220277],
    'THRESHOLD_HIGH':   [220046,    223751,     223769],
    'THRESHOLD_LOW':    [220047,    223752,     223770]})

# Per-parameter results are collected in lists and concatenated once after the loop.
# (DataFrame.append, which was used here before, was removed in pandas 2.0.)
min_threshold_high_frames = []
max_threshold_low_frames = []

for i, parameter in parameters.iterrows():

    # For current parameter, compute minimum value of high threshold for all ICU stays
    is_threshold_high = chartevents_cleaning_03_values_in_valid_range['ITEMID'] == parameter['THRESHOLD_HIGH']
    min_threshold_high = (
        chartevents_cleaning_03_values_in_valid_range[is_threshold_high]
        .groupby('ICUSTAY_ID')['VALUENUM'].min()
        .reset_index()
        .rename(columns={'VALUENUM': 'THRESHOLD_HIGH_MIN'})
        .assign(ITEMID_VALUE=parameter['VALUE'])
    )

    # For current parameter, compute maximum value of low threshold for all ICU stays
    is_threshold_low = chartevents_cleaning_03_values_in_valid_range['ITEMID'] == parameter['THRESHOLD_LOW']
    max_threshold_low = (
        chartevents_cleaning_03_values_in_valid_range[is_threshold_low]
        .groupby('ICUSTAY_ID')['VALUENUM'].max()
        .reset_index()
        .rename(columns={'VALUENUM': 'THRESHOLD_LOW_MAX'})
        .assign(ITEMID_VALUE=parameter['VALUE'])
    )

    min_threshold_high_frames.append(min_threshold_high)
    max_threshold_low_frames.append(max_threshold_low)

# Combine the results of all parameters
min_threshold_high_per_icustay = pd.concat(min_threshold_high_frames, ignore_index=True)
max_threshold_low_per_icustay = pd.concat(max_threshold_low_frames, ignore_index=True)
# Former (misleading) name of the low-threshold maxima, kept so that existing references keep working
max_threshold_high_per_icustay = max_threshold_low_per_icustay

# Merge data frames; only ICU stay/parameter combinations with both a high and a low threshold remain (inner join)
threshold_min_max_per_icustay = min_threshold_high_per_icustay.merge(max_threshold_low_per_icustay, on=['ICUSTAY_ID','ITEMID_VALUE'])
threshold_min_max_per_icustay = threshold_min_max_per_icustay[['ICUSTAY_ID', 'ITEMID_VALUE', 'THRESHOLD_HIGH_MIN', 'THRESHOLD_LOW_MAX']]

display(threshold_min_max_per_icustay)

In [None]:
# Identify threshold swap candidates by comparing the minimum high threshold to the maximum low threshold for each ICU stay/parameter combination.
# If the minimum high threshold is below the maximum low threshold, the thresholds cross at some point and the ICU stay/parameter combination is considered for threshold swapping.
threshold_min_max_per_icustay['CROSS'] = threshold_min_max_per_icustay['THRESHOLD_HIGH_MIN'] < threshold_min_max_per_icustay['THRESHOLD_LOW_MAX']
# CROSS is a boolean column and can be used as row mask directly
threshold_swap_candidates = threshold_min_max_per_icustay.loc[
    threshold_min_max_per_icustay['CROSS'],
    ['ICUSTAY_ID','ITEMID_VALUE']
].reset_index(drop=True)

display(threshold_swap_candidates)

In [None]:
import numpy as np
import pandas as pd
import pyarrow as pa

# Save threshold_swap_candidates (ICUSTAY_ID and ITEMID_VALUE of the combinations with crossing thresholds) as parquet file
threshold_swap_candidates.to_parquet('../data/threshold_swap_candidates.parquet', engine='pyarrow')

### Prepare data set for local threshold swap

In [None]:
import pandas as pd
import pyarrow as pa

# Load chartevents_cleaning_03_values_in_valid_range from its parquet file
chartevents_cleaning_03_values_in_valid_range = pd.read_parquet('../data/chartevents_cleaning_03_values_in_valid_range.parquet', engine='pyarrow')

# Load threshold_swap_candidates (ICUSTAY_ID and ITEMID_VALUE of the combinations with crossing thresholds) from its parquet file
threshold_swap_candidates = pd.read_parquet('../data/threshold_swap_candidates.parquet', engine='pyarrow')

In [None]:
# The threshold_swap_candidates data frame contains only the ITEMIDs of the vital sign values, not the ITEMIDs of the associated thresholds.
# To facilitate the subsequent subsetting of the CHARTEVENTS data frame, auxiliary data frames are created with the threshold ITEMIDs.
# The threshold ITEMIDs are combined into one data frame, which is then used to filter the CHARTEVENTS data frame.

import pandas as pd

parameters = pd.DataFrame({
    'LABEL':            ['HR',      'NBPs',     'SpO2'],
    'VALUE':            [220045,    220179,     220277],
    'THRESHOLD_HIGH':   [220046,    223751,     223769],
    'THRESHOLD_LOW':    [220047,    223752,     223770]})

# Per-parameter results are collected in lists and concatenated once after the loop.
# (DataFrame.append, which was used here before, was removed in pandas 2.0.)
itemid_threshold_high_frames = []
itemid_threshold_low_frames = []

for i, parameter in parameters.iterrows():

    # ICU stays that are swap candidates for the current parameter
    candidate_icustays = threshold_swap_candidates.loc[
        threshold_swap_candidates['ITEMID_VALUE'] == parameter['VALUE'],
        ['ICUSTAY_ID']]

    # Pair each of these ICU stays with the parameter's high and low threshold ITEMID
    itemid_threshold_high = candidate_icustays.assign(ITEMID=parameter['THRESHOLD_HIGH'])
    itemid_threshold_low = candidate_icustays.assign(ITEMID=parameter['THRESHOLD_LOW'])

    itemid_threshold_high_frames.append(itemid_threshold_high)
    itemid_threshold_low_frames.append(itemid_threshold_low)

# Combine the results of all parameters
itemid_threshold_high_per_icustay = pd.concat(itemid_threshold_high_frames, ignore_index=True)
itemid_threshold_low_per_icustay = pd.concat(itemid_threshold_low_frames, ignore_index=True)

# Merge data frames vertically
threshold_swap_filter = pd.concat([itemid_threshold_high_per_icustay, itemid_threshold_low_per_icustay], axis=0)

# Sort to make it pretty (not important)
threshold_swap_filter = threshold_swap_filter.sort_values(by=['ICUSTAY_ID','ITEMID']).reset_index(drop=True)

display(threshold_swap_filter)

In [None]:
# Keep only the rows of chartevents_cleaning_03_values_in_valid_range whose ICUSTAY_ID-ITEMID combination is listed in threshold_swap_filter
threshold_swap_data = chartevents_cleaning_03_values_in_valid_range.merge(
    threshold_swap_filter,
    how='inner',
    on=['ICUSTAY_ID', 'ITEMID'])
display(threshold_swap_data)

In [None]:
import pandas as pd
import pyarrow as pa

# Save threshold_swap_data (threshold rows of the swap candidates) as parquet file
threshold_swap_data.to_parquet('../data/threshold_swap_data.parquet', engine='pyarrow')

### Perform local threshold swap

In [None]:
import pandas as pd
import pyarrow as pa

# Load chartevents_cleaning_03_values_in_valid_range from its parquet file
chartevents_cleaning_03_values_in_valid_range = pd.read_parquet('../data/chartevents_cleaning_03_values_in_valid_range.parquet', engine='pyarrow')

# Load threshold_swap_data (threshold rows of the swap candidates) from its parquet file
threshold_swap_data = pd.read_parquet('../data/threshold_swap_data.parquet', engine='pyarrow')

In [None]:
import numpy as np
import pandas as pd

parameters = pd.DataFrame({
    'LABEL':            ['HR',      'NBPs',     'SpO2'],
    'VALUE':            [220045,    220179,     220277],
    'THRESHOLD_HIGH':   [220046,    223751,     223769],
    'THRESHOLD_LOW':    [220047,    223752,     223770]})

icustays = threshold_swap_data.ICUSTAY_ID.unique()

thresholds_fixed_columns = ['ICUSTAY_ID', 'ITEMID_VALUE', 'CHARTTIME', 'THRESHOLD_LOW_FIXED', 'THRESHOLD_HIGH_FIXED']

# Results per ICU stay/parameter are collected in a list and concatenated once after the loops.
# (DataFrame.append, which was used here before, was removed in pandas 2.0.)
thresholds_fixed_frames = []

# Split the data once per ICU stay instead of scanning the whole data frame for every ICU stay/parameter combination.
# sort=False keeps the ICU stays in their order of first appearance.
for icustay, icustay_data in threshold_swap_data.groupby('ICUSTAY_ID', sort=False):

    for i, parameter in parameters.iterrows():

        # High and low threshold time series of the current ICU stay/parameter combination
        threshold_high = (
            icustay_data.loc[icustay_data['ITEMID'] == parameter['THRESHOLD_HIGH'], ['CHARTTIME','VALUENUM']]
            .sort_values(by=['CHARTTIME'])
            .rename(columns={'VALUENUM': 'THRESHOLD_HIGH'})
        )
        threshold_low = (
            icustay_data.loc[icustay_data['ITEMID'] == parameter['THRESHOLD_LOW'], ['CHARTTIME','VALUENUM']]
            .sort_values(by=['CHARTTIME'])
            .rename(columns={'VALUENUM': 'THRESHOLD_LOW'})
        )

        # Only points in time at which both thresholds were charted are compared (inner join on CHARTTIME)
        thresholds_by_icustay_parameter_charttime = threshold_high.merge(threshold_low, on=['CHARTTIME']).assign(ICUSTAY_ID=icustay, ITEMID_VALUE=parameter['VALUE'])

        # Create a new column that contains the chronologically following threshold value of the same type
        thresholds_by_icustay_parameter_charttime['THRESHOLD_HIGH_NEXT'] = thresholds_by_icustay_parameter_charttime['THRESHOLD_HIGH'].shift(-1)
        thresholds_by_icustay_parameter_charttime['THRESHOLD_LOW_NEXT'] = thresholds_by_icustay_parameter_charttime['THRESHOLD_LOW'].shift(-1)

        # Change of each threshold from the current to the following point in time
        thresholds_by_icustay_parameter_charttime['DIF_THRESHOLD_HIGH_NEXT'] = thresholds_by_icustay_parameter_charttime['THRESHOLD_HIGH_NEXT'] - thresholds_by_icustay_parameter_charttime['THRESHOLD_HIGH']
        thresholds_by_icustay_parameter_charttime['DIF_THRESHOLD_LOW_NEXT'] = thresholds_by_icustay_parameter_charttime['THRESHOLD_LOW_NEXT'] - thresholds_by_icustay_parameter_charttime['THRESHOLD_LOW']

        # Exact swap: the high threshold is below the low threshold and both thresholds change by the same absolute amount at the next point in time
        is_exact_swap = (
            (thresholds_by_icustay_parameter_charttime['THRESHOLD_HIGH'] < thresholds_by_icustay_parameter_charttime['THRESHOLD_LOW']) &
            (thresholds_by_icustay_parameter_charttime['DIF_THRESHOLD_HIGH_NEXT'].abs() == thresholds_by_icustay_parameter_charttime['DIF_THRESHOLD_LOW_NEXT'].abs())
        )

        # Swap the two thresholds back where an exact swap was detected; all other rows get NaN
        thresholds_by_icustay_parameter_charttime['THRESHOLD_HIGH_FIXED'] = thresholds_by_icustay_parameter_charttime['THRESHOLD_LOW'].where(is_exact_swap)
        thresholds_by_icustay_parameter_charttime['THRESHOLD_LOW_FIXED'] = thresholds_by_icustay_parameter_charttime['THRESHOLD_HIGH'].where(is_exact_swap)

        # dropna() removes all rows without fixed thresholds (and any other row with a missing value)
        thresholds_fixed_for_icustayid_itemid = thresholds_by_icustay_parameter_charttime.dropna()[thresholds_fixed_columns]

        if not thresholds_fixed_for_icustayid_itemid.empty:
            thresholds_fixed_frames.append(thresholds_fixed_for_icustayid_itemid)

if thresholds_fixed_frames:
    thresholds_fixed = pd.concat(thresholds_fixed_frames, ignore_index=True)
else:
    # No exact swap found at all: keep an empty data frame with the expected columns
    thresholds_fixed = pd.DataFrame(columns=thresholds_fixed_columns)

display(thresholds_fixed)

In [None]:
import pandas as pd
import pyarrow as pa

# Save thresholds_fixed (swapped-back threshold values per ICUSTAY_ID, ITEMID_VALUE and CHARTTIME) as parquet file
thresholds_fixed.to_parquet('../data/thresholds_fixed.parquet', engine='pyarrow')

In [None]:
import pandas as pd
import pyarrow as pa

# Load chartevents_cleaning_03_values_in_valid_range from its parquet file
chartevents_cleaning_03_values_in_valid_range = pd.read_parquet('../data/chartevents_cleaning_03_values_in_valid_range.parquet', engine='pyarrow')

# Load thresholds_fixed (swapped-back threshold values per ICUSTAY_ID, ITEMID_VALUE and CHARTTIME) from its parquet file
thresholds_fixed = pd.read_parquet('../data/thresholds_fixed.parquet', engine='pyarrow')

In [None]:
# The new data frame chartevents_cleaning_04_exact_threshold_swaps_reverted starts out as an independent copy of chartevents_cleaning_03_values_in_valid_range.
# The values from thresholds_fixed are written into it in the following cells; the data frame will be extended/modified again in the next cleaning step and so on.

chartevents_cleaning_04_exact_threshold_swaps_reverted = chartevents_cleaning_03_values_in_valid_range.copy()

In [None]:
# Unnecessarily complicated step; could be simplified by better preparation of the data frame to be merged.
# Needed because the thresholds_fixed data frame does contain the ITEMID_VALUE but not the threshold ITEMIDs
import pandas as pd
import numpy as np

# One row per vital parameter: the ITEMID of the measured value and the
# ITEMIDs of the associated high/low alarm thresholds.
parameters = pd.DataFrame({
    'LABEL': ['HR', 'NBPs', 'SpO2'],
    'VALUE': [220045, 220179, 220277],
    'THRESHOLD_HIGH': [220046, 223751, 223769],
    'THRESHOLD_LOW': [220047, 223752, 223770],
})

# Add the two threshold ITEMID columns at the end of the frame, initially empty (NaN)
for new_column in ('ITEMID_THRESHOLD_HIGH', 'ITEMID_THRESHOLD_LOW'):
    thresholds_fixed.insert(len(thresholds_fixed.columns), new_column, np.nan)

# Fill both columns by looking up the threshold ITEMIDs that belong to each row's ITEMID_VALUE
for parameter in parameters.itertuples(index=False):
    belongs_to_parameter = thresholds_fixed['ITEMID_VALUE'] == parameter.VALUE
    thresholds_fixed.loc[belongs_to_parameter, 'ITEMID_THRESHOLD_HIGH'] = parameter.THRESHOLD_HIGH
    thresholds_fixed.loc[belongs_to_parameter, 'ITEMID_THRESHOLD_LOW'] = parameter.THRESHOLD_LOW

display(thresholds_fixed)

In [None]:
# Bring the fixed thresholds into the chart events layout (one row per threshold ITEMID):
# the fixed threshold becomes VALUENUM_CLEAN and the threshold ITEMID becomes ITEMID.
chartevents_layout = ['ICUSTAY_ID', 'ITEMID', 'CHARTTIME', 'VALUENUM_CLEAN']

# High thresholds
thresholds_fixed_high = thresholds_fixed.rename(columns={
    'THRESHOLD_HIGH_FIXED': 'VALUENUM_CLEAN',
    'ITEMID_THRESHOLD_HIGH': 'ITEMID',
})[chartevents_layout]

# Low thresholds
thresholds_fixed_low = thresholds_fixed.rename(columns={
    'THRESHOLD_LOW_FIXED': 'VALUENUM_CLEAN',
    'ITEMID_THRESHOLD_LOW': 'ITEMID',
})[chartevents_layout]

In [None]:
# Write the swapped-back threshold values into VALUENUM_CLEAN and mark the affected rows.
# The former row-by-row loop scanned the complete chart events frame several times per
# fixed threshold (approx. 11min); a keyed merge finds all affected rows in one pass.
import pandas as pd


def _apply_fixed_thresholds(chartevents, fixed_thresholds, cleaning_flag):
    """Overwrite VALUENUM_CLEAN and CLEANING_FLAG of all matching rows, in place.

    A row of ``chartevents`` matches if its ICUSTAY_ID, ITEMID and CHARTTIME are
    equal to those of a row in ``fixed_thresholds``. Matching rows receive the
    VALUENUM_CLEAN of that row and ``cleaning_flag`` as CLEANING_FLAG.
    """
    key_columns = ['ICUSTAY_ID', 'ITEMID', 'CHARTTIME']

    # A missing key never compares equal with ==, whereas merge would pair NaN with NaN,
    # therefore such rows are excluded up front.
    fixes = fixed_thresholds.dropna(subset=key_columns)
    # If a key occurs more than once, the last occurrence wins (same as sequential overwriting).
    fixes = fixes.drop_duplicates(subset=key_columns, keep='last')
    if fixes.empty:
        return

    # Align the key dtypes with the chart events so that the merge compares like with like
    fixes = fixes.astype({column: chartevents[column].dtype for column in key_columns})

    # Left merge with unique right keys: exactly one result row per chart event, in original order
    matched = chartevents[key_columns].merge(
        fixes[key_columns + ['VALUENUM_CLEAN']],
        on=key_columns,
        how='left',
        indicator=True,
    )
    is_fixed = (matched['_merge'] == 'both').to_numpy()

    chartevents.loc[is_fixed, 'VALUENUM_CLEAN'] = matched.loc[is_fixed, 'VALUENUM_CLEAN'].to_numpy()
    chartevents.loc[is_fixed, 'CLEANING_FLAG'] = cleaning_flag


_apply_fixed_thresholds(
    chartevents_cleaning_04_exact_threshold_swaps_reverted,
    thresholds_fixed_high,
    "High threshold fixed by swap",
)
_apply_fixed_thresholds(
    chartevents_cleaning_04_exact_threshold_swaps_reverted,
    thresholds_fixed_low,
    "Low threshold fixed by swap",
)

display(chartevents_cleaning_04_exact_threshold_swaps_reverted[
    chartevents_cleaning_04_exact_threshold_swaps_reverted.CLEANING_FLAG.isin(['High threshold fixed by swap','Low threshold fixed by swap'])])

In [None]:
import pandas as pd
import pyarrow as pa

# Write the result of cleaning step 04 to disk in parquet format (pyarrow engine)
chartevents_cleaning_04_exact_threshold_swaps_reverted.to_parquet(
    '../data/chartevents_cleaning_04_exact_threshold_swaps_reverted.parquet',
    engine='pyarrow',
)

### Clean thresholds outside clinically valid ranges

In [None]:
import pandas as pd
import pyarrow as pa

# Load the result of cleaning step 04 from its parquet file into a pandas data frame
chartevents_cleaning_04_exact_threshold_swaps_reverted = pd.read_parquet(
    '../data/chartevents_cleaning_04_exact_threshold_swaps_reverted.parquet',
    engine='pyarrow',
)

In [None]:
# Clinically valid threshold ranges
# At present they equal the value ranges and are the same for the low and the high threshold.
# Separate ranges per threshold type would be conceivable, but we currently lack the medical basis for this.
#   Heart rate Alarm - High (220046) / Low (220047):                        0-350
#   Non-Invasive Blood Pressure Alarm - High (223751) / Low (223752):       0-375
#   O2 Saturation Pulseoxymetry Alarm - High (223769) / Low (223770):       0-100

# The flagging works like the value flagging performed above, but it is based on VALUENUM_CLEAN
# instead of the original VALUENUM. This way the swapped back thresholds (see above), which may
# themselves lie outside the valid ranges, are checked as well.

chartevents_cleaning_05_thresholds_in_valid_range = chartevents_cleaning_04_exact_threshold_swaps_reverted.copy()

# All masks are computed before anything is modified
threshold_itemid = chartevents_cleaning_05_thresholds_in_valid_range['ITEMID']
threshold_value = chartevents_cleaning_05_thresholds_in_valid_range['VALUENUM_CLEAN']

is_hr_threshold = threshold_itemid.isin([220046, 220047])
is_nbp_threshold = threshold_itemid.isin([223751, 223752])
is_spo2_threshold = threshold_itemid.isin([223769, 223770])

# The lower bound (0) is shared by all three parameters
below_valid_range = (is_hr_threshold | is_nbp_threshold | is_spo2_threshold) & (threshold_value < 0)

# The upper bound differs per parameter
above_valid_range = (
    (is_hr_threshold & (threshold_value > 350)) |
    (is_nbp_threshold & (threshold_value > 375)) |
    (is_spo2_threshold & (threshold_value > 100))
)

chartevents_cleaning_05_thresholds_in_valid_range.loc[below_valid_range, 'CLEANING_FLAG'] = "Below valid threshold range"
chartevents_cleaning_05_thresholds_in_valid_range.loc[above_valid_range, 'CLEANING_FLAG'] = "Above valid threshold range"

# Remove the cleaned value of every row that carries one of the two range flags
has_range_flag = chartevents_cleaning_05_thresholds_in_valid_range['CLEANING_FLAG'].isin(
    ["Below valid threshold range","Above valid threshold range"])
chartevents_cleaning_05_thresholds_in_valid_range.loc[has_range_flag, 'VALUENUM_CLEAN'] = None

display(chartevents_cleaning_05_thresholds_in_valid_range[has_range_flag])

In [None]:
import pandas as pd
import pyarrow as pa

# Write the result of cleaning step 05 to disk in parquet format (pyarrow engine)
chartevents_cleaning_05_thresholds_in_valid_range.to_parquet(
    '../data/chartevents_cleaning_05_thresholds_in_valid_range.parquet',
    engine='pyarrow',
)

### Identify potential candidates for local threshold removal due to overlap

The approach to identifying potential candidates for local threshold removal due to overlap is similar to the candidate identification performed above in the context of threshold swapping. However, in this case not the original VALUENUM is used but the VALUENUM_CLEAN.

In [None]:
import pandas as pd
import pyarrow as pa

# Load the result of cleaning step 05 from its parquet file
chartevents_cleaning_05_thresholds_in_valid_range = pd.read_parquet(
    '../data/chartevents_cleaning_05_thresholds_in_valid_range.parquet',
    engine='pyarrow',
)

In [None]:
import pandas as pd

# One row per vital parameter: ITEMID of the measured value and ITEMIDs of its high/low thresholds
parameters = pd.DataFrame({
    'LABEL':            ['HR',      'NBPs',     'SpO2'],
    'VALUE':            [220045,    220179,     220277],
    'THRESHOLD_HIGH':   [220046,    223751,     223769],
    'THRESHOLD_LOW':    [220047,    223752,     223770]})

# The per-parameter results are collected in lists and concatenated once after the loop
# (DataFrame.append is deprecated since pandas 1.4 and was removed in pandas 2.0).
min_threshold_high_parts = []
max_threshold_low_parts = []

for i, parameter in parameters.iterrows():

    # For current parameter, compute minimum value of high threshold for all ICU stays
    min_threshold_high = chartevents_cleaning_05_thresholds_in_valid_range[
        chartevents_cleaning_05_thresholds_in_valid_range['ITEMID'] == parameter['THRESHOLD_HIGH']
        ].groupby(['ICUSTAY_ID','ITEMID'])['VALUENUM_CLEAN'].min()
    min_threshold_high = min_threshold_high.reset_index()
    min_threshold_high = min_threshold_high[['ICUSTAY_ID','VALUENUM_CLEAN']].rename(columns = {'VALUENUM_CLEAN':'THRESHOLD_HIGH_MIN'}).assign(ITEMID_VALUE=parameter.VALUE)

    # For current parameter, compute maximum value of low threshold for all ICU stays
    max_threshold_low = chartevents_cleaning_05_thresholds_in_valid_range[
        chartevents_cleaning_05_thresholds_in_valid_range['ITEMID'] == parameter['THRESHOLD_LOW']
        ].groupby(['ICUSTAY_ID','ITEMID'])['VALUENUM_CLEAN'].max()
    max_threshold_low = max_threshold_low.reset_index()
    max_threshold_low = max_threshold_low[['ICUSTAY_ID','VALUENUM_CLEAN']].rename(columns = {'VALUENUM_CLEAN':'THRESHOLD_LOW_MAX'}).assign(ITEMID_VALUE=parameter.VALUE)

    min_threshold_high_parts.append(min_threshold_high)
    max_threshold_low_parts.append(max_threshold_low)

# Overall results across all parameters
# (the second frame holds the maximum LOW thresholds and is named accordingly)
min_threshold_high_per_icustay = pd.concat(min_threshold_high_parts, ignore_index=True)
max_threshold_low_per_icustay = pd.concat(max_threshold_low_parts, ignore_index=True)

# Merge data frames; only ICU stay/parameter combinations having both a high and a low threshold remain
threshold_min_max_per_icustay = min_threshold_high_per_icustay.merge(max_threshold_low_per_icustay, on=['ICUSTAY_ID','ITEMID_VALUE'])
threshold_min_max_per_icustay = threshold_min_max_per_icustay[['ICUSTAY_ID', 'ITEMID_VALUE', 'THRESHOLD_HIGH_MIN', 'THRESHOLD_LOW_MAX']]

display(threshold_min_max_per_icustay)

In [None]:
# Threshold removal candidates are found by comparing, per ICU stay/parameter combination,
# the minimum high threshold with the maximum low threshold.
# If the minimum high threshold is below the maximum low threshold, the thresholds cross
# and the ICU stay/parameter combination is considered for threshold removal.
thresholds_cross = (
    threshold_min_max_per_icustay['THRESHOLD_HIGH_MIN'] < threshold_min_max_per_icustay['THRESHOLD_LOW_MAX']
)
threshold_min_max_per_icustay['CROSS'] = thresholds_cross

threshold_removal_candidates = (
    threshold_min_max_per_icustay
    .loc[thresholds_cross, ['ICUSTAY_ID', 'ITEMID_VALUE']]
    .reset_index(drop=True)
)

display(threshold_removal_candidates)

In [None]:
import numpy as np
import pandas as pd
import pyarrow as pa

# Write threshold_removal_candidates to disk in parquet format (pyarrow engine)
threshold_removal_candidates.to_parquet(
    '../data/threshold_removal_candidates.parquet',
    engine='pyarrow',
)

### Prepare data set for local threshold removal due to overlap

In [None]:
import pandas as pd
import pyarrow as pa

# Load the result of cleaning step 05 from its parquet file
chartevents_cleaning_05_thresholds_in_valid_range = pd.read_parquet(
    '../data/chartevents_cleaning_05_thresholds_in_valid_range.parquet',
    engine='pyarrow',
)

# Load the ICU stay/parameter combinations considered for threshold removal
threshold_removal_candidates = pd.read_parquet(
    '../data/threshold_removal_candidates.parquet',
    engine='pyarrow',
)

In [None]:
# The threshold_removal_candidates data frame contains only the ITEMIDs of the vital sign values, not the ITEMIDs of the associated thresholds.
# To facilitate the subsequent subsetting, auxiliary data frames are created with the threshold ITEMIDs.
# The threshold ITEMIDs are combined into one data frame, which is then used to filter the data frame.
# There is probably a smarter way to do this, but this was fast enough.

import pandas as pd

# One row per vital parameter: ITEMID of the measured value and ITEMIDs of its high/low thresholds
parameters = pd.DataFrame({
    'LABEL':            ['HR',      'NBPs',     'SpO2'],
    'VALUE':            [220045,    220179,     220277],
    'THRESHOLD_HIGH':   [220046,    223751,     223769],
    'THRESHOLD_LOW':    [220047,    223752,     223770]})

# The per-parameter results are collected in lists and concatenated once after the loop
# (DataFrame.append is deprecated since pandas 1.4 and was removed in pandas 2.0).
itemid_threshold_high_parts = []
itemid_threshold_low_parts = []

for i, parameter in parameters.iterrows():

    # ICU stays that are removal candidates for the current parameter
    candidate_icustays = threshold_removal_candidates.loc[
        threshold_removal_candidates['ITEMID_VALUE'] == parameter['VALUE'], ['ICUSTAY_ID']]

    # Pair each of these ICU stays with the parameter's high and low threshold ITEMID
    itemid_threshold_high_parts.append(candidate_icustays.assign(ITEMID=parameter['THRESHOLD_HIGH']))
    itemid_threshold_low_parts.append(candidate_icustays.assign(ITEMID=parameter['THRESHOLD_LOW']))

itemid_threshold_high_per_icustay = pd.concat(itemid_threshold_high_parts, ignore_index=True)
itemid_threshold_low_per_icustay = pd.concat(itemid_threshold_low_parts, ignore_index=True)

# Merge data frames vertically
threshold_removal_filter = pd.concat([itemid_threshold_high_per_icustay, itemid_threshold_low_per_icustay], axis= 0)

# Sort to make it pretty (not important)
threshold_removal_filter = threshold_removal_filter.sort_values(by=['ICUSTAY_ID','ITEMID']).reset_index(drop=True)

display(threshold_removal_filter)

In [None]:
# Keep only those chart events whose columns shared with threshold_removal_filter
# match one of its rows (inner join on all common columns).
threshold_removal_data = chartevents_cleaning_05_thresholds_in_valid_range.merge(
    threshold_removal_filter,
    how='inner',
)
display(threshold_removal_data)

In [None]:
import pandas as pd
import pyarrow as pa

# Write threshold_removal_data to disk in parquet format (pyarrow engine)
threshold_removal_data.to_parquet(
    '../data/threshold_removal_data.parquet',
    engine='pyarrow',
)

### Perform local threshold removal due to overlap

In [None]:
import pandas as pd
import pyarrow as pa

# Load the result of cleaning step 05 from its parquet file
chartevents_cleaning_05_thresholds_in_valid_range = pd.read_parquet(
    '../data/chartevents_cleaning_05_thresholds_in_valid_range.parquet',
    engine='pyarrow',
)

# Load the chart events of the threshold removal candidates
threshold_removal_data = pd.read_parquet(
    '../data/threshold_removal_data.parquet',
    engine='pyarrow',
)

In [None]:
import numpy as np
import pandas as pd

# One row per vital parameter: ITEMID of the measured value and ITEMIDs of its high/low thresholds
parameters = pd.DataFrame({
    'LABEL':            ['HR',      'NBPs',     'SpO2'],
    'VALUE':            [220045,    220179,     220277],
    'THRESHOLD_HIGH':   [220046,    223751,     223769],
    'THRESHOLD_LOW':    [220047,    223752,     223770]})

# The overlapping thresholds of each ICU stay/parameter combination are collected in a list
# and concatenated once after the loops (DataFrame.append is deprecated since pandas 1.4 and
# was removed in pandas 2.0; appending inside the loop also copied the whole result each time).
overlapping_threshold_parts = []

# Grouping once hands over the rows of each ICU stay directly, instead of filtering the
# complete data frame again for every ICU stay. sort=False keeps the order of first appearance.
for icustay, icustay_data in threshold_removal_data.groupby('ICUSTAY_ID', sort=False):

    for i, parameter in parameters.iterrows():

        # High thresholds of the current ICU stay/parameter, ordered by chart time
        threshold_high = icustay_data.loc[
            icustay_data['ITEMID'] == parameter['THRESHOLD_HIGH'],
            ['CHARTTIME','VALUENUM_CLEAN']
            ].sort_values(by=['CHARTTIME']).rename(columns = {'VALUENUM_CLEAN':'THRESHOLD_HIGH'})

        # Low thresholds of the current ICU stay/parameter, ordered by chart time
        threshold_low = icustay_data.loc[
            icustay_data['ITEMID'] == parameter['THRESHOLD_LOW'],
            ['CHARTTIME','VALUENUM_CLEAN']
            ].sort_values(by=['CHARTTIME']).rename(columns = {'VALUENUM_CLEAN':'THRESHOLD_LOW'})

        # Pair the high and low thresholds that were charted at the same time
        thresholds_by_icustay_parameter_charttime = threshold_high.merge(threshold_low, on=['CHARTTIME']).assign(ICUSTAY_ID=icustay, ITEMID_VALUE=parameter.VALUE)

        # Thresholds overlap where the high threshold lies below the low threshold.
        # Comparisons with a missing threshold are False, so such pairs are never selected;
        # dropna additionally discards rows with any other missing entry.
        overlapping_thresholds = thresholds_by_icustay_parameter_charttime[
            thresholds_by_icustay_parameter_charttime['THRESHOLD_HIGH'] < thresholds_by_icustay_parameter_charttime['THRESHOLD_LOW']
            ].dropna()

        if overlapping_thresholds.empty:
            continue

        overlapping_threshold_parts.append(
            overlapping_thresholds[['ICUSTAY_ID', 'ITEMID_VALUE', 'CHARTTIME']].assign(
                CLEANING_FLAG="Threshold removal due to overlap"))

if overlapping_threshold_parts:
    thresholds_to_be_removed = pd.concat(overlapping_threshold_parts, ignore_index=True)
else:
    # Nothing overlaps: empty result that still provides all columns used by the following cells
    thresholds_to_be_removed = pd.DataFrame(columns=['ICUSTAY_ID', 'ITEMID_VALUE', 'CHARTTIME', 'CLEANING_FLAG'])

display(thresholds_to_be_removed)

In [None]:
import pandas as pd
import pyarrow as pa

# Write thresholds_to_be_removed to disk in parquet format (pyarrow engine)
thresholds_to_be_removed.to_parquet(
    '../data/thresholds_to_be_removed.parquet',
    engine='pyarrow',
)

In [None]:
import pandas as pd
import pyarrow as pa

# Load the result of cleaning step 05 from its parquet file
chartevents_cleaning_05_thresholds_in_valid_range = pd.read_parquet(
    '../data/chartevents_cleaning_05_thresholds_in_valid_range.parquet',
    engine='pyarrow',
)

# Load the overlapping thresholds that are to be removed
thresholds_to_be_removed = pd.read_parquet(
    '../data/thresholds_to_be_removed.parquet',
    engine='pyarrow',
)

In [None]:
# Cleaning step 06 starts from an independent (deep) copy of the step 05 result
chartevents_cleaning_06_overlapping_thresholds_removed = (
    chartevents_cleaning_05_thresholds_in_valid_range.copy(deep=True)
)

In [None]:
# Unnecessarily complicated step; could be simplified by better preparation of the data frame to be merged.
# Needed because the thresholds_to_be_removed data frame does contain the ITEMID_VALUE but not the threshold ITEMIDs
import pandas as pd
import numpy as np

# One row per vital parameter: the ITEMID of the measured value and the
# ITEMIDs of the associated high/low alarm thresholds.
parameters = pd.DataFrame({
    'LABEL': ['HR', 'NBPs', 'SpO2'],
    'VALUE': [220045, 220179, 220277],
    'THRESHOLD_HIGH': [220046, 223751, 223769],
    'THRESHOLD_LOW': [220047, 223752, 223770],
})

# Add the two threshold ITEMID columns at the end of the frame, initially empty (NaN)
for new_column in ('ITEMID_THRESHOLD_HIGH', 'ITEMID_THRESHOLD_LOW'):
    thresholds_to_be_removed.insert(len(thresholds_to_be_removed.columns), new_column, np.nan)

# Fill both columns by looking up the threshold ITEMIDs that belong to each row's ITEMID_VALUE
for parameter in parameters.itertuples(index=False):
    belongs_to_parameter = thresholds_to_be_removed['ITEMID_VALUE'] == parameter.VALUE
    thresholds_to_be_removed.loc[belongs_to_parameter, 'ITEMID_THRESHOLD_HIGH'] = parameter.THRESHOLD_HIGH
    thresholds_to_be_removed.loc[belongs_to_parameter, 'ITEMID_THRESHOLD_LOW'] = parameter.THRESHOLD_LOW

display(thresholds_to_be_removed)

In [None]:
# Bring the thresholds to be removed into the chart events layout (one row per threshold ITEMID):
# the respective threshold ITEMID column becomes ITEMID.
removal_layout = ['ICUSTAY_ID', 'ITEMID', 'CHARTTIME', 'CLEANING_FLAG']

# High thresholds
thresholds_to_be_removed_high = thresholds_to_be_removed.rename(
    columns={'ITEMID_THRESHOLD_HIGH': 'ITEMID'})[removal_layout]

# Low thresholds
thresholds_to_be_removed_low = thresholds_to_be_removed.rename(
    columns={'ITEMID_THRESHOLD_LOW': 'ITEMID'})[removal_layout]

In [None]:
# Remove the overlapping thresholds from VALUENUM_CLEAN and mark the affected rows.
# The former row-by-row loop scanned the complete chart events frame several times per
# threshold to be removed (approx. 24min); a keyed merge finds all affected rows in one pass.
import pandas as pd


def _remove_thresholds(chartevents, thresholds_to_remove, cleaning_flag):
    """Clear VALUENUM_CLEAN and set CLEANING_FLAG of all matching rows, in place.

    A row of ``chartevents`` matches if its ICUSTAY_ID, ITEMID and CHARTTIME are
    equal to those of a row in ``thresholds_to_remove``.
    """
    key_columns = ['ICUSTAY_ID', 'ITEMID', 'CHARTTIME']

    # A missing key never compares equal with ==, whereas merge would pair NaN with NaN,
    # therefore such rows are excluded up front. Duplicate keys are irrelevant for a removal.
    removal_keys = thresholds_to_remove[key_columns].dropna().drop_duplicates()
    if removal_keys.empty:
        return

    # Align the key dtypes with the chart events so that the merge compares like with like
    removal_keys = removal_keys.astype({column: chartevents[column].dtype for column in key_columns})

    # Left merge with unique right keys: exactly one result row per chart event, in original order
    matched = chartevents[key_columns].merge(
        removal_keys,
        on=key_columns,
        how='left',
        indicator=True,
    )
    is_removed = (matched['_merge'] == 'both').to_numpy()

    chartevents.loc[is_removed, 'VALUENUM_CLEAN'] = None
    chartevents.loc[is_removed, 'CLEANING_FLAG'] = cleaning_flag


_remove_thresholds(
    chartevents_cleaning_06_overlapping_thresholds_removed,
    thresholds_to_be_removed_high,
    "Threshold removal due to overlap",
)
_remove_thresholds(
    chartevents_cleaning_06_overlapping_thresholds_removed,
    thresholds_to_be_removed_low,
    "Threshold removal due to overlap",
)

display(chartevents_cleaning_06_overlapping_thresholds_removed[
    chartevents_cleaning_06_overlapping_thresholds_removed.CLEANING_FLAG == "Threshold removal due to overlap"])

In [None]:
import pandas as pd
import pyarrow as pa

# Write the result of cleaning step 06 to disk in parquet format (pyarrow engine)
chartevents_cleaning_06_overlapping_thresholds_removed.to_parquet(
    '../data/chartevents_cleaning_06_overlapping_thresholds_removed.parquet',
    engine='pyarrow',
)

## Save final chartevents_clean data frame

In [None]:
import pandas as pd
import pyarrow as pa

# The result of the last cleaning step is additionally stored under the name chartevents_clean.
# Storing it twice is intentional for now: if further cleaning steps are added later,
# chartevents_cleaning_06_overlapping_thresholds_removed will no longer be the final chartevents_clean.
chartevents_clean = chartevents_cleaning_06_overlapping_thresholds_removed

# Write chartevents_clean to disk in parquet format (pyarrow engine)
chartevents_clean.to_parquet(
    '../data/chartevents_clean.parquet',
    engine='pyarrow',
)