# Create clean CHARTEVENTS data set

1. Create chartevents_subset by filtering for relevant ITEMIDs
2. Compute unique ICUSTAY_IDs in chartevents_subset
3. Remove rows with insufficient measurements from chartevents_subset
4. Mark parameter values outside clinically valid ranges
5. Threshold cleaning
  - Identify potential candidates for local threshold swap
  - Prepare data set for local threshold swap
  - Perform local threshold swap
6. Combine cleaned values and thresholds to chartevents_clean

## Create chartevents_subset by filtering for relevant ITEMIDs

* Create subset of MIMIC-III data set called `CHARTEVENTS.csv` (see also respective [MIMIC schema website](https://mit-lcp.github.io/mimic-schema-spy/tables/chartevents.html))
* No change in columns, keep all of them.
* Reduce number of rows by filtering for specific ITEMIDs and removing rows without ICUSTAY_ID

In [None]:
import pandas as pd
import dask.dataframe as dd
import pyarrow as pa
from dask.diagnostics import ProgressBar

# Load CHARTEVENTS.csv lazily as a Dask DataFrame.
# Column types follow the MIMIC schema specification https://mit-lcp.github.io/mimic-schema-spy/tables/chartevents.html
# Problem: integer columns containing NaNs are awkward in Pandas, see https://pandas.pydata.org/pandas-docs/stable/user_guide/gotchas.html#nan-integer-na-values-and-na-type-promotions
# Decision: floats and integers are read as 'float64', strings as 'object', and timestamps via Dask's parse_dates.
chartevents_dtypes = {
    'ROW_ID': 'float64',  # int4 according to specification
    'SUBJECT_ID': 'float64',  # int4 according to specification
    'HADM_ID': 'float64',  # int4 according to specification
    'ICUSTAY_ID': 'float64',  # int4 according to specification
    'ITEMID': 'float64',  # int4 according to specification
    'CGID': 'float64',  # int4 according to specification
    'VALUE': 'object',
    'VALUENUM': 'float64',  # float8 according to specification
    'VALUEUOM': 'object',
    'WARNING': 'float64',  # int4 according to specification
    'ERROR': 'float64',  # int4 according to specification
    'RESULTSTATUS': 'object',
    'STOPPED': 'object',
}
chartevents = dd.read_csv(
    '../mimic/CHARTEVENTS.csv',
    parse_dates=['CHARTTIME', 'STORETIME'],
    dtype=chartevents_dtypes,
)

# ITEMIDs of the vital parameters and their alarm thresholds that are kept
itemid_filter = [
    220045,  # Heart Rate
    220046,  # Heart rate Alarm - High
    220047,  # Heart Rate Alarm - Low
    220179,  # Non Invasive Blood Pressure systolic
    223751,  # Non-Invasive Blood Pressure Alarm - High
    223752,  # Non-Invasive Blood Pressure Alarm - Low
    220180,  # Non Invasive Blood Pressure diastolic
    220277,  # O2 saturation pulseoxymetry
    223769,  # O2 Saturation Pulseoxymetry Alarm - High
    223770,  # O2 Saturation Pulseoxymetry Alarm - Low
]

with ProgressBar():
    # A row is kept only if it satisfies all three conditions:
    # 1. its ITEMID is one of the relevant ITEMIDs
    has_relevant_itemid = chartevents.ITEMID.isin(itemid_filter)
    # 2. it has an ICUSTAY_ID (original note: the ICUSTAY_ID is missing in 1811 rows)
    has_icustay_id = chartevents.ICUSTAY_ID.notnull()
    # 3. no error occurred, which is coded by a 0 (original note: 5584 rows have ERROR equal to 1)
    has_no_error = chartevents.ERROR.isin([0])
    # Evaluate the lazy Dask expression, resulting in the desired Pandas DataFrame.
    # Computing duration on Marius' laptop (Intel i5-5200U CPU @ 2.20GHz): 21min
    chartevents_subset = chartevents[has_relevant_itemid & has_icustay_id & has_no_error].compute()

# Sort the rows (not essential, but gives a better overview) and reset the index
chartevents_subset = (
    chartevents_subset
    .sort_values(by=['ICUSTAY_ID', 'CHARTTIME', 'ITEMID'])
    .reset_index(drop=True)
)

# Save as parquet file
chartevents_subset.to_parquet('../data/chartevents_subset.parquet', engine='pyarrow')

## Compute unique ICUSTAY_IDs in chartevents_subset

Create a DataFrame with a single `ICUSTAY_ID` column that lists every unique ICUSTAY_ID occurring in `chartevents_subset.parquet`.

In [None]:
import pandas as pd
import pyarrow as pa

# Load the filtered chart events written by the previous section
chartevents_subset = pd.read_parquet('../data/chartevents_subset.parquet', engine='pyarrow')

# Keep every ICU stay exactly once, in order of first appearance
unique_icustays_in_chartevents_subset = (
    chartevents_subset['ICUSTAY_ID']
    .drop_duplicates()
    .reset_index(drop=True)
)

# Save as parquet file (a Pandas Series has no to_parquet, so it is converted to a one-column DataFrame first)
unique_icustays_in_chartevents_subset.to_frame().to_parquet('../data/unique_icustays_in_chartevents_subset.parquet', engine='pyarrow')

## Remove rows with insufficient measurements from chartevents_subset

Keep only those ICUSTAY_ID-ITEMID combinations for which more than one measurement is available.

In [None]:
import pandas as pd
import pyarrow as pa

# Read chartevents_subset from parquet file to pandas data frame
# (the file is written at the end of the section "Create chartevents_subset by filtering for relevant ITEMIDs")
chartevents_subset = pd.read_parquet('../data/chartevents_subset.parquet', engine='pyarrow')

In [None]:
# Only ITEMIDs of vital parameter values are analysed here; threshold ITEMIDs are intentionally left out.
itemids = [
    220045,  # Heart Rate
    220179,  # Non Invasive Blood Pressure systolic
    220180,  # Non Invasive Blood Pressure diastolic
    220277,  # O2 saturation pulseoxymetry
]

# Reduce chartevents_subset to the vital parameter rows and the three columns needed for counting
is_vital_parameter = chartevents_subset['ITEMID'].isin(itemids)
chartevents_subset_measurement_analysis = chartevents_subset.loc[
    is_vital_parameter, ['ICUSTAY_ID', 'ITEMID', 'VALUENUM']
].copy()

# Number of available (non-null) values per ICUSTAY_ID-ITEMID combination, stored as VALUENUM_COUNT
chartevents_subset_measurement_count = (
    chartevents_subset_measurement_analysis
    .groupby(['ICUSTAY_ID', 'ITEMID'])
    .count()
    .rename(columns={'VALUENUM': 'VALUENUM_COUNT'})
    .reset_index()
)
display(chartevents_subset_measurement_count)

In [None]:
import pandas as pd
import pyarrow as pa

# Save chartevents_subset_measurement_count as parquet file
# (one row per ICUSTAY_ID-ITEMID combination with its VALUENUM_COUNT; read back in the next cell)
chartevents_subset_measurement_count.to_parquet('../data/chartevents_subset_measurement_count.parquet', engine='pyarrow')

In [None]:
import pandas as pd
import pyarrow as pa

# Read chartevents_subset from parquet file
# (all filtered rows, including the alarm threshold ITEMIDs)
chartevents_subset = pd.read_parquet('../data/chartevents_subset.parquet', engine='pyarrow')

# Read chartevents_subset_measurement_count from parquet file
# (number of values per ICUSTAY_ID-ITEMID combination, vital parameter ITEMIDs only)
chartevents_subset_measurement_count = pd.read_parquet('../data/chartevents_subset_measurement_count.parquet', engine='pyarrow')

In [None]:
# Select the ICUSTAY_ID-ITEMID combinations that have more than one measurement
has_multiple_measurements = chartevents_subset_measurement_count['VALUENUM_COUNT'] > 1
icustayid_and_itemid_with_multiple_measurements = chartevents_subset_measurement_count.loc[
    has_multiple_measurements, ['ICUSTAY_ID', 'ITEMID']
].reset_index(drop=True)

display(icustayid_and_itemid_with_multiple_measurements)

In [None]:
# Keep only the rows of chartevents_subset whose ICUSTAY_ID-ITEMID combination is listed in
# icustayid_and_itemid_with_multiple_measurements (inner join on the two shared columns).
# The resulting data frame will be used as the basis for the final chartevents_clean data frame.
chartevents_subset_multiple_values_only = chartevents_subset.merge(
    icustayid_and_itemid_with_multiple_measurements,
    how='inner',
    on=['ICUSTAY_ID', 'ITEMID'],
)
display(chartevents_subset_multiple_values_only)

In [None]:
import pandas as pd
import pyarrow as pa

# Save chartevents_subset_multiple_values_only as parquet file
# (read back at the start of the section "Mark parameter values outside clinically valid ranges")
chartevents_subset_multiple_values_only.to_parquet('../data/chartevents_subset_multiple_values_only.parquet', engine='pyarrow')

## Mark parameter values outside clinically valid ranges

Values outside the clinically valid ranges are flagged in a new column, not deleted.

In [None]:
import pandas as pd
import pyarrow as pa

# Read chartevents_subset_multiple_values_only from parquet file
# (rows of ICUSTAY_ID-ITEMID combinations with more than one measurement)
chartevents_subset_multiple_values_only = pd.read_parquet('../data/chartevents_subset_multiple_values_only.parquet', engine='pyarrow')

In [None]:
import numpy as np

# Clinically valid value ranges per vital parameter ITEMID as (lowest valid value, highest valid value).
# Values exactly on a boundary count as valid.
valid_value_ranges = {
    220045: (0, 350),  # Heart Rate
    220179: (0, 375),  # Non Invasive Blood Pressure systolic
    220180: (0, 375),  # Non Invasive Blood Pressure diastolic
    220277: (0, 100),  # O2 saturation pulseoxymetry
}

# Work on the columns needed for flagging and for the later merge only
chartevents_subset_values_flagged = chartevents_subset_multiple_values_only[['ICUSTAY_ID','ITEMID','CHARTTIME','VALUENUM']].copy()

# Add new column CLEANING_FLAG, which marks values outside the respective clinically valid range as
# "Below valid value range" or "Above valid value range"; all other rows stay NaN.
# The column is created with object dtype because it will hold strings: writing strings into a
# float column relies on an implicit upcast that newer pandas versions deprecate.
chartevents_subset_values_flagged['CLEANING_FLAG'] = pd.Series(
    np.nan, index=chartevents_subset_values_flagged.index, dtype='object')

for itemid, (lowest_valid_value, highest_valid_value) in valid_value_ranges.items():
    is_current_item = chartevents_subset_values_flagged['ITEMID'] == itemid
    is_below = chartevents_subset_values_flagged['VALUENUM'] < lowest_valid_value
    is_above = chartevents_subset_values_flagged['VALUENUM'] > highest_valid_value

    chartevents_subset_values_flagged.loc[is_current_item & is_below, 'CLEANING_FLAG'] = "Below valid value range"
    chartevents_subset_values_flagged.loc[is_current_item & is_above, 'CLEANING_FLAG'] = "Above valid value range"

# The chartevents_subset_values_flagged data frame will eventually be merged with the chartevents_subset_multiple_values_only data frame to create the final chartevents_clean data frame.
# Only the column CLEANING_FLAG constitutes new information. It is stored together with columns ICUSTAY_ID, ITEMID, CHARTTIME, and VALUENUM, which will be used during merging. The other columns are already available in the chartevents_subset_multiple_values_only data frame. For the sake of storage space and read-in speed, they are not stored again.
display(chartevents_subset_values_flagged)

In [None]:
import pandas as pd
import pyarrow as pa

# Save chartevents_subset_values_flagged as parquet file
# (read back in the section "Combine cleaned values and thresholds to chartevents_clean")
chartevents_subset_values_flagged.to_parquet('../data/chartevents_subset_values_flagged.parquet', engine='pyarrow')

In [None]:
# 215 rows, i.e. values, are below or above valid value range
out_of_range_flags = ["Below valid value range", "Above valid value range"]
chartevents_subset_values_flagged[chartevents_subset_values_flagged.CLEANING_FLAG.isin(out_of_range_flags)]

In [None]:
# 4 values are below valid value range
# (count as noted by the authors; this cell only displays the affected rows)
chartevents_subset_values_flagged[chartevents_subset_values_flagged.CLEANING_FLAG == "Below valid value range"]

In [None]:
# 211 values are above valid value range (displayed in ascending order of VALUENUM)
chartevents_subset_values_flagged[chartevents_subset_values_flagged.CLEANING_FLAG == "Above valid value range"].sort_values(by=['VALUENUM'])

## Threshold cleaning

In the data of some ICU stays, the high and low thresholds overlap at certain points. For example, during a certain period of time, the threshold for a heart rate that is too high may be below the threshold for a heart rate that is too low, and vice versa.

According to the medical experts consulted, there is no plausible reason for this. In fact, medical devices should not allow the setting of such overlapping alarm thresholds in the first place.

The overlap can show up in different ways. Roughly speaking, we have observed three variants looking at time series plots:

1. The low threshold temporarily exceeds the high threshold, while the latter continues 'normally'.
2. The high threshold value temporarily falls below the low threshold value, while the latter continues 'normally'.
3. Both thresholds temporarily overlap so that they appear swapped, which is 'abnormal' for both.

There are two sub-variants for variant (3):

- 3a The threshold values are swapped, but do not decrease/increase to the same extent, so it is not an exact swap.
- 3b The thresholds are swapped, decreasing/increasing  to the same extent, so it looks like an exact swap.

According to the agreement with medical experts, the two threshold values for case 3b (exact swap) are swapped back for the time period affected.

In principle, the same correction might also be applicable to case 3a. However, this seems very complex, since case 3a is difficult for us to distinguish from cases 1 and 2.

In [None]:
# Initial thoughts on the threshold cleaning process, which I don't want to delete yet in case we still need them:

    # Interpolate for timestamps (not yet implemented)

    # For those ICUSTAY_IDs for which the thresholds of a threshold combination overlap,
    # i.e. if LOW threshold > HIGH threshold,
    # perform the following steps

    # For each threshold type, compute the difference between the current threshold and the chronologically preceding threshold
    # If the absolute value is the same for both threshold types, this is either the entry into or the exit from a swap
    # Whether it is an entry or an exit depends on the combination of sign and threshold type

In [None]:
# # DRAFT PREFILTER ONLY CASES WITH CROSSING THRESHOLDS
# thresholds_by_icustay_parameter_charttime['CROSS'] = ( thresholds_by_icustay_parameter_charttime.THRESHOLD_HIGH < thresholds_by_icustay_parameter_charttime.THRESHOLD_LOW)
# index_list = thresholds_by_icustay_parameter_charttime[thresholds_by_icustay_parameter_charttime.CROSS == True].index
# test = thresholds_by_icustay_parameter_charttime[
#     (thresholds_by_icustay_parameter_charttime.CROSS == True) | 
#     (thresholds_by_icustay_parameter_charttime.index == min(index_list)-1) |
#     (thresholds_by_icustay_parameter_charttime.index == max(index_list)+1)
#     ].copy()
# test

### Identify potential candidates for local threshold swap

Coarse detection of the ICU stay/parameter combinations for which the local swapping of thresholds is a possible option. Purpose of this preliminary step is to reduce the computational effort. The aim is to reduce the relatively complex threshold swap step to potentially affected cases.

The coarse detection is done by comparing the minimum high threshold to the maximum low threshold for each ICU stay/parameter combination. If the minimum high threshold is below the maximum low threshold, the ICU stay/parameter combination is considered for threshold swapping.

The output of this section is a data frame that includes the ICU stay/parameter combinations that are possible candidates for swapping. It is to be expected that the number of candidates is higher than the number of swaps eventually performed. The reason for this is that not all threshold overlaps allow a meaningful swap (see following section).

In [None]:
import pandas as pd
import pyarrow as pa

# Read chartevents_subset from parquet file
# (the unreduced subset is used here because it still contains the alarm threshold ITEMIDs)
chartevents_subset = pd.read_parquet('../data/chartevents_subset.parquet', engine='pyarrow')

In [None]:
import pandas as pd

# ITEMIDs of each vital parameter (VALUE) and of its high/low alarm thresholds
parameters = pd.DataFrame({
    'LABEL':            ['HR',      'NBPs',     'SpO2'],
    'VALUE':            [220045,    220179,     220277],
    'THRESHOLD_HIGH':   [220046,    223751,     223769],
    'THRESHOLD_LOW':    [220047,    223752,     223770]})

# Per-parameter results are collected in lists and concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0).
min_threshold_high_frames = []
max_threshold_low_frames = []

for _, parameter in parameters.iterrows():

    # For current parameter, compute minimum value of high threshold for all ICU stays
    high_threshold_rows = chartevents_subset[chartevents_subset['ITEMID'] == parameter['THRESHOLD_HIGH']]
    min_threshold_high = (
        high_threshold_rows.groupby('ICUSTAY_ID')['VALUENUM'].min()
        .rename('THRESHOLD_HIGH_MIN')
        .reset_index()
        .assign(ITEMID_VALUE=parameter['VALUE'])
    )

    # For current parameter, compute maximum value of low threshold for all ICU stays
    low_threshold_rows = chartevents_subset[chartevents_subset['ITEMID'] == parameter['THRESHOLD_LOW']]
    max_threshold_low = (
        low_threshold_rows.groupby('ICUSTAY_ID')['VALUENUM'].max()
        .rename('THRESHOLD_LOW_MAX')
        .reset_index()
        .assign(ITEMID_VALUE=parameter['VALUE'])
    )

    min_threshold_high_frames.append(min_threshold_high)
    max_threshold_low_frames.append(max_threshold_low)

# Overall results: columns ICUSTAY_ID, THRESHOLD_HIGH_MIN / THRESHOLD_LOW_MAX, ITEMID_VALUE
min_threshold_high_per_icustay = pd.concat(min_threshold_high_frames, ignore_index=True)
# Holds the maxima of the LOW thresholds, hence the name
max_threshold_low_per_icustay = pd.concat(max_threshold_low_frames, ignore_index=True)

# Merge data frames; only ICU stay/parameter combinations that have both a high and a low threshold remain
threshold_min_max_per_icustay = min_threshold_high_per_icustay.merge(max_threshold_low_per_icustay, on=['ICUSTAY_ID','ITEMID_VALUE'])
threshold_min_max_per_icustay = threshold_min_max_per_icustay[['ICUSTAY_ID', 'ITEMID_VALUE', 'THRESHOLD_HIGH_MIN', 'THRESHOLD_LOW_MAX']]

display(threshold_min_max_per_icustay)

In [None]:
# Identify threshold swap candidates per ICU stay/parameter combination:
# if the minimum high threshold is below the maximum low threshold, the two thresholds overlap
# at some point and the combination is considered for threshold swapping.
threshold_min_max_per_icustay['CROSS'] = threshold_min_max_per_icustay['THRESHOLD_HIGH_MIN'].lt(
    threshold_min_max_per_icustay['THRESHOLD_LOW_MAX'])

# Keep only the identifying columns of the crossing combinations
threshold_swap_candidates = threshold_min_max_per_icustay.loc[
    threshold_min_max_per_icustay['CROSS'], ['ICUSTAY_ID', 'ITEMID_VALUE']
].reset_index(drop=True)

display(threshold_swap_candidates)

In [None]:
import numpy as np
import pandas as pd
import pyarrow as pa

# Save threshold_swap_candidates as parquet file
# (columns ICUSTAY_ID and ITEMID_VALUE; read back in the section "Prepare data set for local threshold swap")
threshold_swap_candidates.to_parquet('../data/threshold_swap_candidates.parquet', engine='pyarrow')

### Prepare data set for local threshold swap

In [None]:
import pandas as pd
import pyarrow as pa

# Read chartevents_subset from parquet file
# (source of the threshold rows that are extracted below)
chartevents_subset = pd.read_parquet('../data/chartevents_subset.parquet', engine='pyarrow')

# Read threshold_swap_candidates from parquet file
# (ICU stay/parameter combinations identified in the previous section)
threshold_swap_candidates = pd.read_parquet('../data/threshold_swap_candidates.parquet', engine='pyarrow')

In [None]:
# The threshold_swap_candidates data frame contains only the ITEMIDs of the vital sign values, not the ITEMIDs of the associated thresholds.
# To facilitate the subsequent subsetting of the CHARTEVENTS data frame, every candidate is expanded into
# two rows, one with the ITEMID of its high threshold and one with the ITEMID of its low threshold.
# The result (threshold_swap_filter) is then used to filter the CHARTEVENTS data frame.

import pandas as pd

parameters = pd.DataFrame({
    'LABEL':            ['HR',      'NBPs',     'SpO2'],
    'VALUE':            [220045,    220179,     220277],
    'THRESHOLD_HIGH':   [220046,    223751,     223769],
    'THRESHOLD_LOW':    [220047,    223752,     223770]})

# Long-format lookup table: one row per (vital sign ITEMID, threshold ITEMID) pair
threshold_itemids = parameters.melt(
    id_vars=['VALUE'],
    value_vars=['THRESHOLD_HIGH', 'THRESHOLD_LOW'],
    var_name='THRESHOLD_TYPE',
    value_name='ITEMID',
).rename(columns={'VALUE': 'ITEMID_VALUE'})

# Join the candidates with the lookup table instead of appending per parameter in a loop
# (DataFrame.append was removed in pandas 2.0).
threshold_swap_filter = threshold_swap_candidates[['ICUSTAY_ID', 'ITEMID_VALUE']].merge(
    threshold_itemids[['ITEMID_VALUE', 'ITEMID']], on='ITEMID_VALUE')[['ICUSTAY_ID', 'ITEMID']]

# Sort to make it pretty (not important)
threshold_swap_filter = threshold_swap_filter.sort_values(by=['ICUSTAY_ID','ITEMID']).reset_index(drop=True)

display(threshold_swap_filter)

In [None]:
# Keep only the threshold rows of chartevents_subset that belong to a swap candidate
# (inner join on the two columns shared with threshold_swap_filter)
threshold_swap_data = chartevents_subset.merge(
    threshold_swap_filter,
    how='inner',
    on=['ICUSTAY_ID', 'ITEMID'],
)
display(threshold_swap_data)

In [None]:
import pandas as pd
import pyarrow as pa

# Save threshold_swap_data as parquet file
# (read back in the section "Perform local threshold swap")
threshold_swap_data.to_parquet('../data/threshold_swap_data.parquet', engine='pyarrow')

### Perform local threshold swap

In [None]:
import pandas as pd
import pyarrow as pa

# Read chartevents_subset from parquet file
chartevents_subset = pd.read_parquet('../data/chartevents_subset.parquet', engine='pyarrow')

# Read threshold_swap_data from parquet file
# (threshold rows of the swap candidates, prepared in the previous section)
threshold_swap_data = pd.read_parquet('../data/threshold_swap_data.parquet', engine='pyarrow')

In [None]:
import numpy as np
import pandas as pd

# ITEMIDs of each vital parameter (VALUE) and of its high/low alarm thresholds
parameters = pd.DataFrame({
    'LABEL':            ['HR',      'NBPs',     'SpO2'],
    'VALUE':            [220045,    220179,     220277],
    'THRESHOLD_HIGH':   [220046,    223751,     223769],
    'THRESHOLD_LOW':    [220047,    223752,     223770]})

thresholds_fixed_columns = ['ICUSTAY_ID', 'ITEMID_VALUE', 'CHARTTIME', 'THRESHOLD_LOW_FIXED', 'THRESHOLD_HIGH_FIXED']

# Results per ICU stay/parameter are collected in a list and concatenated once at the end
# (DataFrame.append was removed in pandas 2.0 and copies the whole frame on every call).
thresholds_fixed_frames = []

# Group once by ICU stay (in order of first appearance) instead of re-scanning the whole
# data frame for every ICU stay/parameter combination.
for icustay, icustay_rows in threshold_swap_data.groupby('ICUSTAY_ID', sort=False):

    for _, parameter in parameters.iterrows():

        threshold_high = (
            icustay_rows.loc[icustay_rows['ITEMID'] == parameter['THRESHOLD_HIGH'], ['CHARTTIME', 'VALUENUM']]
            .sort_values(by=['CHARTTIME'])
            .rename(columns={'VALUENUM': 'THRESHOLD_HIGH'})
        )

        threshold_low = (
            icustay_rows.loc[icustay_rows['ITEMID'] == parameter['THRESHOLD_LOW'], ['CHARTTIME', 'VALUENUM']]
            .sort_values(by=['CHARTTIME'])
            .rename(columns={'VALUENUM': 'THRESHOLD_LOW'})
        )

        # Pair high and low thresholds charted at the same time.
        # NOTE(review): timestamps at which only one of the two thresholds was charted are dropped by this inner join.
        thresholds = threshold_high.merge(threshold_low, on=['CHARTTIME'])

        # Change from each threshold to the chronologically following threshold of the same type
        # (NaN for the last timestamp, which therefore never qualifies as an exact swap)
        step_to_next_high = thresholds['THRESHOLD_HIGH'].shift(-1) - thresholds['THRESHOLD_HIGH']
        step_to_next_low = thresholds['THRESHOLD_LOW'].shift(-1) - thresholds['THRESHOLD_LOW']

        # A row is swapped back if the thresholds are crossed AND both thresholds change by the same
        # absolute amount towards the next timestamp (case 3b, exact swap).
        is_crossed = thresholds['THRESHOLD_HIGH'] < thresholds['THRESHOLD_LOW']
        is_exact_swap = step_to_next_high.abs() == step_to_next_low.abs()
        swapped = thresholds[is_crossed & is_exact_swap]

        if swapped.empty:
            continue

        # Swap back: the charted low threshold becomes the fixed high threshold and vice versa
        thresholds_fixed_frames.append(pd.DataFrame({
            'ICUSTAY_ID': icustay,
            'ITEMID_VALUE': parameter['VALUE'],
            'CHARTTIME': swapped['CHARTTIME'],
            'THRESHOLD_LOW_FIXED': swapped['THRESHOLD_HIGH'],
            'THRESHOLD_HIGH_FIXED': swapped['THRESHOLD_LOW'],
        }))

if thresholds_fixed_frames:
    thresholds_fixed = pd.concat(thresholds_fixed_frames, ignore_index=True)[thresholds_fixed_columns]
else:
    # No swap found at all: keep the expected columns so that downstream cells still work
    thresholds_fixed = pd.DataFrame(columns=thresholds_fixed_columns)

display(thresholds_fixed)

In [None]:
import pandas as pd
import pyarrow as pa

# Save thresholds_fixed as parquet file
# (swapped-back thresholds per ICUSTAY_ID, ITEMID_VALUE and CHARTTIME; read back in the section "Combine cleaned values and thresholds to chartevents_clean")
thresholds_fixed.to_parquet('../data/thresholds_fixed.parquet', engine='pyarrow')

## Combine cleaned values and thresholds to chartevents_clean

In [None]:
import pandas as pd
import pyarrow as pa

# Read chartevents_subset from parquet file
# (basis of chartevents_clean; includes the alarm threshold rows)
chartevents_subset = pd.read_parquet('../data/chartevents_subset.parquet', engine='pyarrow')

# Read chartevents_subset_multiple_values_only from parquet file
# (currently disabled: this data frame is not used in the combination below)
# chartevents_subset_multiple_values_only = pd.read_parquet('../data/chartevents_subset_multiple_values_only.parquet', engine='pyarrow')

# Read chartevents_subset_values_flagged from parquet file
# (CLEANING_FLAG per ICUSTAY_ID, ITEMID, CHARTTIME and VALUENUM)
chartevents_subset_values_flagged = pd.read_parquet('../data/chartevents_subset_values_flagged.parquet', engine='pyarrow')

# Read thresholds_fixed from parquet file
# (thresholds that were swapped back in the threshold cleaning step)
thresholds_fixed = pd.read_parquet('../data/thresholds_fixed.parquet', engine='pyarrow')

In [None]:
# Merge as follows
# In the main data frame, create a new column 'VALUENUM_CLEAN' that equals 'VALUENUM' values in the first step
# Next, set all 'VALUENUM_CLEAN' cells to NaN, where we identified a cleaning flag (above or below valid range) in the value cleaning step
# Next, replace all 'VALUENUM_CLEAN' cells with the fixed threshold value (high or low) as applicable.
# This means that we can use 'VALUENUM_CLEAN' in our future analysis without having to worry about the cleaning columns etc.

In [None]:
# Problem: I overlooked the fact that we are applying threshold swapping to chartevents_subset instead of chartevents_subset_multiple_values_only. 
# This can't be changed quickly since the threshold rows are not included in chartevents_subset_multiple_values_only.
# As a result, theoretically, there may be threshold values in the combined data set for which the associated parameter values have been removed because there was not more than one data point.
# With this in mind, please do not consider the following combination of the individual data sets as finished!

In [None]:
chartevents_clean = chartevents_subset.copy()

# Create new column 'VALUENUM_CLEAN' that equals 'VALUENUM' values
chartevents_clean['VALUENUM_CLEAN'] = chartevents_clean['VALUENUM']

# Attach the CLEANING_FLAG of the value cleaning step to every row.
# The flag table is reduced to one row per merge key first: if a measurement occurs k times in both
# tables, a plain left merge would turn it into k*k rows. The flag depends only on ITEMID and
# VALUENUM, so duplicated keys always carry the same flag and keeping the first one loses nothing.
flag_merge_keys = ['ICUSTAY_ID', 'ITEMID', 'CHARTTIME', 'VALUENUM']
flag_per_measurement = chartevents_subset_values_flagged[flag_merge_keys + ['CLEANING_FLAG']].drop_duplicates(subset=flag_merge_keys)
# validate='many_to_one' makes pandas raise if the right-hand keys are unexpectedly not unique
chartevents_clean = chartevents_clean.merge(flag_per_measurement, how='left', on=flag_merge_keys, validate='many_to_one')

# Set all 'VALUENUM_CLEAN' cells to NaN, where we identified a cleaning flag (above or below valid range) in the value cleaning step
is_outside_valid_range = chartevents_clean['CLEANING_FLAG'].isin(["Below valid value range", "Above valid value range"])
chartevents_clean.loc[is_outside_valid_range, 'VALUENUM_CLEAN'] = float('nan')

display(chartevents_clean)

In [None]:
# The thresholds_fixed data frame contains the ITEMID of the vital parameter (ITEMID_VALUE) but not
# the ITEMIDs of the associated thresholds. They are added here as ITEMID_THRESHOLD_HIGH and
# ITEMID_THRESHOLD_LOW so that the fixed thresholds can be matched with the rows of chartevents_clean.
import pandas as pd
import numpy as np

parameters = pd.DataFrame({
    'LABEL':            ['HR',      'NBPs',     'SpO2'],
    'VALUE':            [220045,    220179,     220277],
    'THRESHOLD_HIGH':   [220046,    223751,     223769],
    'THRESHOLD_LOW':    [220047,    223752,     223770]})

# Lookup from vital parameter ITEMID to its threshold ITEMIDs
threshold_itemids_by_value = parameters.set_index('VALUE')

# Plain column assignment keeps this cell re-runnable (DataFrame.insert raises if the column already exists).
# float64 matches the dtype of the ITEMID column read from CHARTEVENTS.csv; unknown ITEMID_VALUEs become NaN.
thresholds_fixed['ITEMID_THRESHOLD_HIGH'] = thresholds_fixed['ITEMID_VALUE'].map(threshold_itemids_by_value['THRESHOLD_HIGH']).astype('float64')
thresholds_fixed['ITEMID_THRESHOLD_LOW'] = thresholds_fixed['ITEMID_VALUE'].map(threshold_itemids_by_value['THRESHOLD_LOW']).astype('float64')

display(thresholds_fixed)

In [None]:
# Split thresholds_fixed into one data frame per threshold type, each shaped like the rows of
# chartevents_clean they will replace: ICUSTAY_ID, ITEMID (of the threshold), CHARTTIME, VALUENUM_CLEAN.
thresholds_fixed_high = thresholds_fixed[
    ['ICUSTAY_ID', 'ITEMID_THRESHOLD_HIGH', 'CHARTTIME', 'THRESHOLD_HIGH_FIXED']
].rename(columns={'ITEMID_THRESHOLD_HIGH': 'ITEMID', 'THRESHOLD_HIGH_FIXED': 'VALUENUM_CLEAN'})

thresholds_fixed_low = thresholds_fixed[
    ['ICUSTAY_ID', 'ITEMID_THRESHOLD_LOW', 'CHARTTIME', 'THRESHOLD_LOW_FIXED']
].rename(columns={'ITEMID_THRESHOLD_LOW': 'ITEMID', 'THRESHOLD_LOW_FIXED': 'VALUENUM_CLEAN'})

In [None]:
# Insert the swapped-back threshold values into VALUENUM_CLEAN and mark the affected rows in CLEANING_FLAG.
# Each fixed threshold is matched with the rows of chartevents_clean via one keyed merge per threshold
# type, replacing a row-by-row loop that scanned the whole data frame four times per fixed threshold
# (approx. 11min according to the original note).

threshold_key_columns = ['ICUSTAY_ID', 'ITEMID', 'CHARTTIME']

for fixed_thresholds, cleaning_flag in [
        (thresholds_fixed_high, "High threshold fixed by swap"),
        (thresholds_fixed_low, "Low threshold fixed by swap")]:

    # Nothing to insert for this threshold type
    if fixed_thresholds.empty:
        continue

    # One replacement value per key; keep='last' gives the same result as applying the rows one after another
    replacements = fixed_thresholds.drop_duplicates(subset=threshold_key_columns, keep='last')
    # Align key dtypes with chartevents_clean so that the merge compares like with like
    replacements = replacements.astype({column: chartevents_clean[column].dtype for column in threshold_key_columns})

    # Left merge on the keys only: row order and row count of chartevents_clean are preserved,
    # '_merge' tells which rows found a replacement.
    matched = chartevents_clean[threshold_key_columns].merge(
        replacements[threshold_key_columns + ['VALUENUM_CLEAN']],
        how='left', on=threshold_key_columns, validate='many_to_one', indicator=True)
    has_replacement = (matched['_merge'] == 'both').to_numpy()

    chartevents_clean.loc[has_replacement, 'VALUENUM_CLEAN'] = matched.loc[has_replacement, 'VALUENUM_CLEAN'].to_numpy()
    chartevents_clean.loc[has_replacement, 'CLEANING_FLAG'] = cleaning_flag


In [None]:
# Check: display all rows whose threshold value was replaced in the swap step
chartevents_clean[chartevents_clean.CLEANING_FLAG.isin(['High threshold fixed by swap','Low threshold fixed by swap'])]

In [None]:
import pandas as pd
import pyarrow as pa

# Save chartevents_clean as parquet file
# (chartevents_subset plus the columns VALUENUM_CLEAN and CLEANING_FLAG)
chartevents_clean.to_parquet('../data/chartevents_clean.parquet', engine='pyarrow')

In [None]:
# ToDos:
# Check whether merging the final chartevents_clean data frame works correctly
# See issue described above, regarding the multiple values stuff