# Analysis of CHARTEVENTS.csv Subset

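Aim: Create descriptive statistics for the CHARTEVENTS.csv subset, which is pre-filtered for the relevant ITEM IDs (measurements and alarm thresholds of the three relevant parameters: heart rate, blood pressure and oxygen saturation) and is purged of rows without an ICUSTAY_ID or with ERROR=true.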

In [None]:
import pandas as pd
import pyarrow as pa

# Load both pre-computed parquet files into pandas data frames (pyarrow engine):
# the filtered CHARTEVENTS rows and the table of ICU stays occurring in them
chartevents_subset, unique_ICU_stays = (
    pd.read_parquet(parquet_path, engine='pyarrow')
    for parquet_path in (
        './data/chartevents_subset.parquet',
        './data/unique_icustays_in_chartevents_subset.parquet',
    )
)

In [None]:
# Add new column with ITEMID_LABEL, which can be used for the legend of the plots
import numpy as np

# Human-readable, multi-line label for every ITEMID handled in this notebook
ITEMID_LABELS = {
    # Heart rate ITEMID_LABELs
    220045: 'Heart rate\n(bpm)',
    220046: 'Alarm threshold:\nHigh\nheart rate\n(bpm)',
    220047: 'Alarm threshold:\nLow\nheart rate\n(bpm)',
    # Blood pressure ITEMID_LABELs
    220179: 'Non-invasive\nsystolic\nblood pressure\n(mmHg)',
    223751: 'Alarm threshold:\nHigh systolic\nblood pressure\n(mmHg)',
    223752: 'Alarm threshold:\nLow systolic\nblood pressure\n(mmHg)',
    220180: 'Non-invasive\ndiastolic\nblood pressure\n(mmHg)',
    # O2 saturation ITEMID_LABELs
    220277: 'O2 saturation\npulseoxymetry\n(%)',
    223769: 'Alarm threshold:\nHigh\nO2 saturation\n(%)',
    223770: 'Alarm threshold:\nLow\nO2 saturation\n(%)',
}

# Map the labels in one step instead of inserting a float NaN column and overwriting it with
# strings: this avoids the incompatible-dtype assignment and, unlike DataFrame.insert, does not
# raise when the cell is executed a second time. ITEMIDs without an entry stay NaN.
chartevents_subset['ITEMID_LABEL'] = chartevents_subset['ITEMID'].map(ITEMID_LABELS)

In [None]:
# Preview the first five rows of the labelled CHARTEVENTS subset
chartevents_subset.head(n=5)

In [None]:
# Preview the first five rows of the ICU stay table
unique_ICU_stays.head(n=5)

## General Insights

### How many unique ICU stays exist?
There are 23,446 unique ICU stays.

In [None]:
# Number of entries in the ICUSTAY_ID column of the ICU stay table
len(unique_ICU_stays['ICUSTAY_ID'])

### How many measurements and thresholds exist per ITEM ID?

There are 154,751 to 2,761,436 data points per ITEM ID.

In [None]:
# Number of rows per ITEMID_LABEL, ordered from the rarest to the most frequent label
item_id_count = (
    chartevents_subset
    .groupby(['ITEMID_LABEL'])
    .size()
    .reset_index(name='Count')
    .sort_values(by=['Count'])
)

print(item_id_count)

In [None]:
import numpy as np

# Both helper columns are created with object dtype and plain column assignment. Inserting a
# float NaN column and then writing strings into it is an incompatible-dtype assignment, and
# DataFrame.insert raises if the cell is run again while the column already exists.
item_labels = item_id_count['ITEMID_LABEL']

# Add category column for viz: HIGH / LOW alarm threshold, or plain measurement
item_id_count['Category'] = pd.Series(np.nan, index=item_id_count.index, dtype=object)

item_id_count.loc[item_labels.str.contains('High'), 'Category'] = 'HIGH'
item_id_count.loc[item_labels.str.contains('Low'), 'Category'] = 'LOW'
# Only the alarm threshold labels contain a colon
item_id_count.loc[~item_labels.str.contains(':'), 'Category'] = 'Measurement'

# Add parameter column for viz: the vital sign a label belongs to
item_id_count['Parameter'] = pd.Series(np.nan, index=item_id_count.index, dtype=object)

# Lower-casing is needed here because the labels contain both 'Heart rate' and 'heart rate'
item_id_count.loc[item_labels.str.lower().str.contains('heart'), 'Parameter'] = 'Heart rate\n(bpm)'
item_id_count.loc[item_labels.str.contains('systolic'), 'Parameter'] = 'Non-invasive\nsystolic\nblood pressure\n(mmHg)'
item_id_count.loc[item_labels.str.contains('diastolic'), 'Parameter'] = 'Non-invasive\ndiastolic\nblood pressure\n(mmHg)'
item_id_count.loc[item_labels.str.contains('saturation'), 'Parameter'] = 'O2 saturation\npulseoxymetry\n(%)'

print(item_id_count)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Set variables
title = 'CHARTEVENTS: Frequency by ITEMID'
plotdata = item_id_count
xlabel = 'Parameter'
ylabel = 'Frequency'

# Actual plot: one bar per parameter, split by category (measurement / HIGH / LOW threshold).
# No error-bar keyword is passed: with the ten ITEMID labels defined in this notebook every
# (Parameter, Category) pair occurs exactly once, so a standard deviation carries no
# information, and the former `ci='sd'` keyword is deprecated in current seaborn releases.
g = sns.catplot(
    data=plotdata,  # use the data frame configured above instead of repeating its name
    kind='bar',
    x='Parameter',
    y='Count',
    hue='Category',
    palette=sns.color_palette('colorblind'),
    alpha=.6,
    height=6,
    legend=False)
g.despine(left=True)

g.set_axis_labels(xlabel, ylabel)
g.fig.suptitle(title)
# The grid's own legend is disabled above; a regular axes legend is drawn instead
plt.legend(loc='upper right')
plt.tight_layout()

plt.show()

### How many measurements and thresholds exist per ICU stay?

There are 2 to 89,387 data points per ICU stay.

In [None]:
# Number of rows (measurements and thresholds together) per ICU stay
icu_stay_count = (
    chartevents_subset
    .groupby(['ICUSTAY_ID'])
    .size()
    .reset_index(name='Count')
)

# Summary statistics of the per-stay counts
icu_stay_count['Count'].describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Set variables
title = 'CHARTEVENTS: Frequency by ICUSTAY_ID'
plotdata = icu_stay_count
xvalue = 'Count'
xlabel = 'Frequency'

# Actual plot: one point per ICU stay, positioned at its number of data points
sns.set_style('whitegrid')
ax = sns.stripplot(
    x=xvalue,
    data=plotdata,
    palette=sns.color_palette('colorblind'),
)
ax.set_title(title, fontsize=18)
ax.set_xlabel(xlabel, fontsize=16)

plt.show()

### How many measurements and thresholds exist per ICU stay and ITEM ID?

In general, there are 1 to 43,929 data points per ICU stay and ITEM ID. Especially for the heart rate and the oxygen saturation measurements, there are many data points for an ICU stay. After that, blood pressure was recorded next most frequently, with approximately up to 1,600 data points. The systolic and diastolic blood pressure distribution is very similar.

In [None]:
# Number of rows per (ITEMID_LABEL, ICU stay) pair, with integer stay IDs,
# ordered from the smallest to the largest count
item_id_icu_stay_count = (
    chartevents_subset
    .groupby(['ITEMID_LABEL', 'ICUSTAY_ID'])
    .size()
    .reset_index(name='Count')
    .astype({'ICUSTAY_ID': int})
    .sort_values(by=['Count'])
)

item_id_icu_stay_count['Count'].describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Set variables
title = 'CHARTEVENTS: Frequency by ITEMID and ICUSTAY_ID'
plotdata = item_id_icu_stay_count
xvalue = 'ITEMID_LABEL'
yvalue = 'Count'
xlabel = 'Item'
ylabel = 'Frequency'
# Left-to-right order of the items on the x axis, grouped by parameter
item_order = [
    'Heart rate\n(bpm)',
    'Alarm threshold:\nHigh\nheart rate\n(bpm)',
    'Alarm threshold:\nLow\nheart rate\n(bpm)',
    'O2 saturation\npulseoxymetry\n(%)',
    'Alarm threshold:\nLow\nO2 saturation\n(%)',
    'Alarm threshold:\nHigh\nO2 saturation\n(%)',
    'Non-invasive\nsystolic\nblood pressure\n(mmHg)',
    'Alarm threshold:\nLow systolic\nblood pressure\n(mmHg)',
    'Alarm threshold:\nHigh systolic\nblood pressure\n(mmHg)',
    'Non-invasive\ndiastolic\nblood pressure\n(mmHg)',
]

# Actual plot
# Style and figure size are set in ONE call: sns.set() resets every theme setting it is not
# given explicitly, so calling it after sns.set_style('whitegrid') reverted the style to
# seaborn's default and the whitegrid request was lost.
sns.set(style='whitegrid', rc={'figure.figsize': (20, 10)})
sns.stripplot(
    data=plotdata,
    x=xvalue,
    y=yvalue,
    palette=sns.color_palette('colorblind'),
    order=item_order)
plt.title(title, fontsize=18)
plt.xlabel(xlabel, fontsize=16)
plt.ylabel(ylabel, fontsize=16)
# Counts cannot be negative, so anchor the y axis at zero
plt.gca().set_ylim(bottom=0)

plt.show()

In [None]:
# Look at same stripplot again without extrema (heart rate and oxygen saturation):

import seaborn as sns
import matplotlib.pyplot as plt

# Set variables
title = 'CHARTEVENTS: Frequency by ITEMID and ICUSTAY_ID'
plotdata = item_id_icu_stay_count
xvalue = 'ITEMID_LABEL'
yvalue = 'Count'
xlabel = 'Item'
ylabel = 'Frequency'
# Same item order as in the full plot, but the heart rate and O2 saturation measurement
# labels are replaced by '' (which matches no label), so their slots stay empty
item_order = [
    '',
    'Alarm threshold:\nHigh\nheart rate\n(bpm)',
    'Alarm threshold:\nLow\nheart rate\n(bpm)',
    '',
    'Alarm threshold:\nLow\nO2 saturation\n(%)',
    'Alarm threshold:\nHigh\nO2 saturation\n(%)',
    'Non-invasive\nsystolic\nblood pressure\n(mmHg)',
    'Alarm threshold:\nLow systolic\nblood pressure\n(mmHg)',
    'Alarm threshold:\nHigh systolic\nblood pressure\n(mmHg)',
    'Non-invasive\ndiastolic\nblood pressure\n(mmHg)',
]

# Actual plot
# Style and figure size are set in ONE call: sns.set() resets every theme setting it is not
# given explicitly, so calling it after sns.set_style('whitegrid') reverted the style to
# seaborn's default and the whitegrid request was lost.
sns.set(style='whitegrid', rc={'figure.figsize': (20, 10)})
sns.stripplot(
    data=plotdata,
    x=xvalue,
    y=yvalue,
    palette=sns.color_palette('colorblind'),
    order=item_order)
plt.title(title, fontsize=18)
plt.xlabel(xlabel, fontsize=16)
plt.ylabel(ylabel, fontsize=16)
# Counts cannot be negative, so anchor the y axis at zero
plt.gca().set_ylim(bottom=0)

plt.show()

In [None]:
# Per-ICU-stay count statistics for the heart rate measurements
item_id_icu_stay_count.loc[item_id_icu_stay_count['ITEMID_LABEL'].eq('Heart rate\n(bpm)'), 'Count'].describe()

In [None]:
# Per-ICU-stay count statistics for the O2 saturation measurements
item_id_icu_stay_count.loc[item_id_icu_stay_count['ITEMID_LABEL'].eq('O2 saturation\npulseoxymetry\n(%)'), 'Count'].describe()

In [None]:
# Per-ICU-stay count statistics for the systolic blood pressure measurements
item_id_icu_stay_count.loc[item_id_icu_stay_count['ITEMID_LABEL'].eq('Non-invasive\nsystolic\nblood pressure\n(mmHg)'), 'Count'].describe()

In [None]:
# Per-ICU-stay count statistics for the diastolic blood pressure measurements
item_id_icu_stay_count.loc[item_id_icu_stay_count['ITEMID_LABEL'].eq('Non-invasive\ndiastolic\nblood pressure\n(mmHg)'), 'Count'].describe()

## Heart Rate Insights

For the heart rate, we currently assume values from 0 to 350 bpm to be meaningful.

### Heart Rate Measurements

In [None]:
# All heart rate measurement rows (ITEMID 220045) and a summary of their numeric values
HR_measurements = chartevents_subset.loc[chartevents_subset['ITEMID'].eq(220045)]
HR_measurements['VALUENUM'].describe()

In [None]:
# Distinct units of measurement recorded for the heart rate rows
HR_measurements['VALUEUOM'].unique()

Among the 2,761,436 heart rate measurements is at least one negative value (min = -88 bpm) and one too high value (max = 9,999,999 bpm). These appear to be implausible outliers. Let's consider several specific VALUENUM ranges.

In [None]:
# Heart rate measurements inside the plausible range (0 to 350 bpm, both bounds included)
HR_measurements_up_to_350 = HR_measurements.loc[HR_measurements['VALUENUM'].between(0, 350)]
# 2,761,422 values ranging from 0 to 280 bpm
HR_measurements_up_to_350['VALUENUM'].describe()

In [None]:
# How often each plausible heart rate value occurs, ordered by value
HR_measurements_up_to_350['VALUENUM'].value_counts().sort_index()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Set variables
title = 'CHARTEVENTS: Cleaned Heart Rate Measurements (0 - 280 bpm)'
xlabel = 'Value'
plotdata = HR_measurements_up_to_350
xvalue = 'VALUENUM'

# Actual plot: a slim box plot above a histogram with KDE, both on one shared x axis
sns.set_style('whitegrid')
colorblind = sns.color_palette('colorblind')
fig, (fig_box, fig_hist) = plt.subplots(
    nrows=2,
    sharex=True,
    gridspec_kw={'height_ratios': (.15, .85)},
    figsize=(10, 5),
)
fig.suptitle(title, fontweight='bold', color='black', fontsize=14, y=1)

# Upper panel: box plot without its own x label (the axis is shared with the histogram)
sns.boxplot(data=plotdata, x=xvalue, palette=colorblind, ax=fig_box)
fig_box.set_xlabel('')

# Lower panel: histogram with kernel density estimate
sns.histplot(data=plotdata, x=xvalue, kde=True, palette=colorblind, ax=fig_hist)
fig_hist.set_xlabel(xlabel, fontsize=12, labelpad=15)
fig_hist.set_ylabel('Count', fontsize=12, labelpad=15)

plt.show()

In [None]:
# Heart rate measurements above the 350 bpm plausibility limit, counted per value
HR_measurements_over_350 = HR_measurements.loc[HR_measurements['VALUENUM'].gt(350)]
# 12 outliers in range from 459 to 9,999,999 bpm
HR_measurements_over_350['VALUENUM'].value_counts().sort_index()

In [None]:
# Heart rate measurements below zero, counted per value
HR_measurements_negative = HR_measurements.loc[HR_measurements['VALUENUM'].lt(0)]
# two outliers
HR_measurements_negative['VALUENUM'].value_counts().sort_index()

Most of the 2,761,436 heart rate measurements are within the range currently assumed to be reasonable (0 to 350 bpm); the values inside this range actually span 0 to 280 bpm. There are just two negative outliers (-88 and -1 bpm) and one extremely high outlier (9,999,999 bpm). The remaining 11 measurements outside the assumed range (459 to 86,101 bpm) each occur only once.

### Heart Rate Thresholds

#### Heart Rate - LOW Thresholds

In [None]:
# All LOW heart rate alarm thresholds (ITEMID 220047) and a summary of their numeric values
HR_thresholds_low = chartevents_subset.loc[chartevents_subset['ITEMID'].eq(220047)]
HR_thresholds_low['VALUENUM'].describe()

Among the 215,805 LOW thresholds for the heart rate is at least one negative value (min = -50 bpm) and one too high value (max = 85,160 bpm). These appear to be implausible outliers. Let's consider several specific VALUENUM ranges.

In [None]:
# LOW heart rate thresholds inside the plausible range (0 to 350 bpm, both bounds included)
HR_thresholds_low_up_to_350 = HR_thresholds_low.loc[HR_thresholds_low['VALUENUM'].between(0, 350)]
# ranges from 0 to 300 bpm
HR_thresholds_low_up_to_350['VALUENUM'].describe()

In [None]:
# LOW heart rate thresholds below zero, counted per value
HR_thresholds_low_negative = HR_thresholds_low.loc[HR_thresholds_low['VALUENUM'].lt(0)]
# one outlier at -50 bpm
HR_thresholds_low_negative['VALUENUM'].value_counts().sort_index()

In [None]:
# LOW heart rate thresholds above the 350 bpm plausibility limit, counted per value
HR_thresholds_low_above_350 = HR_thresholds_low.loc[HR_thresholds_low['VALUENUM'].gt(350)]
# 53 outliers in range from 360 to 85,160 bpm
HR_thresholds_low_above_350['VALUENUM'].value_counts().sort_index()

Most of the 215,805 LOW thresholds for the heart rate are within the range currently assumed to be reasonable (0 to 350 bpm; the values inside it span 0 to 300 bpm). There is one negative value (-50 bpm) and there are 53 values that exceed this range (360 to 85,160 bpm). It may be necessary to keep some of these exceeding values, since e.g. the 360 bpm value lies only narrowly above the cut-off.

#### Heart Rate - HIGH Thresholds

In [None]:
# All HIGH heart rate alarm thresholds (ITEMID 220046) and a summary of their numeric values
HR_thresholds_high = chartevents_subset.loc[chartevents_subset['ITEMID'].eq(220046)]
HR_thresholds_high['VALUENUM'].describe()

Among the 215,658 HIGH thresholds for the heart rate are only positive values but some of them are quite high (max = 180,160 bpm). These appear to be implausible outliers. Let's consider several specific VALUENUM ranges.

In [None]:
# Keep only HIGH heart rate thresholds inside the plausible range (0 to 350 bpm, inclusive).
# The lower bound is applied explicitly, as for the measurements and the LOW thresholds,
# so that a negative value can never end up in the "cleaned" data.
HR_thresholds_high_up_to_350 = HR_thresholds_high[HR_thresholds_high['VALUENUM'].between(0, 350)]
HR_thresholds_high_up_to_350.VALUENUM.describe() # ranges from 0 to 300 bpm

In [None]:
# HIGH heart rate thresholds above the 350 bpm plausibility limit, counted per value
HR_thresholds_high_above_350 = HR_thresholds_high.loc[HR_thresholds_high['VALUENUM'].gt(350)]
# 103 outliers in range from 420 to 180,160 bpm
HR_thresholds_high_above_350['VALUENUM'].value_counts().sort_index()

Most of the 215,658 HIGH thresholds for the heart rate are within the range currently assumed to be reasonable (0 to 350 bpm; the values inside it span 0 to 300 bpm). There are 103 values that exceed this range (420 to 180,160 bpm). It may be necessary to keep some of these exceeding values, since e.g. there is a big gap 'after' 420 and since there is a cluster around 1200.

#### Heart Rate - Cleaned Threshold Visualization

In [None]:
import numpy as np

# Concat cleaned heart rate threshold data frames (LOW rows first, then HIGH rows)
cleaned_HR_thresholds = pd.concat([HR_thresholds_low_up_to_350, HR_thresholds_high_up_to_350])

# Add threshold type column for viz.
# The type is mapped directly from ITEMID instead of inserting a float NaN column and
# overwriting it with strings, which is an incompatible-dtype assignment.
cleaned_HR_thresholds['THRESHOLD_TYPE'] = cleaned_HR_thresholds['ITEMID'].map({
    220046: 'HIGH',
    220047: 'LOW',
})

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set variables
title = 'CHARTEVENTS: Cleaned Heart Rate Thresholds (0 - 300 bpm)'
xlabel = 'Threshold Value of Heart Rate (bpm)'
plotdata = cleaned_HR_thresholds
xvalue = 'VALUENUM'
stratify_by = 'THRESHOLD_TYPE'
# One colour per threshold type: the first two colours of the colourblind palette
threshold_palette = sns.color_palette('colorblind')[:2]

# Config figure: three panels side by side
sns.set_style('whitegrid')
fig, axs = plt.subplots(1, 3, figsize=(20, 5))
fig.suptitle(title, fontweight='bold', color= 'black', fontsize=14, y=1)

# Actual plots
sns.histplot(ax=axs[0], data=plotdata, x=xvalue, hue=stratify_by, palette=threshold_palette)
axs[0].set_title('Histogram (overlapping)', fontsize=12)
axs[0].set_xlabel(xlabel, fontsize=12)
axs[0].set_ylabel('Frequency', fontsize=12)

sns.histplot(ax=axs[1], data=plotdata, x=xvalue, hue=stratify_by, multiple='stack', palette=threshold_palette)
axs[1].set_title('Histogram (stacked)', fontsize=12)
axs[1].set_xlabel(xlabel, fontsize=12)
axs[1].set_ylabel('Frequency', fontsize=12)

sns.kdeplot(ax=axs[2], data=plotdata, x=xvalue, hue=stratify_by, palette=threshold_palette)
axs[2].set_title('Kernel Density Estimate (KDE)', fontsize=12)
axs[2].set_xlabel(xlabel, fontsize=12)
axs[2].set_ylabel('Density', fontsize=12)

# plt.show() takes no figure argument (its only parameter is the keyword `block`),
# so the figure must not be passed positionally
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import math

# Helper for the axis ticks: round a number up to the next multiple of ten
def roundup(x):
    """Return ``x`` rounded up to the nearest multiple of ten, as an ``int``."""
    tens = math.ceil(x / 10.0)
    return 10 * int(tens)

# Set variables
title = 'CHARTEVENTS: Cleaned Heart Rate Thresholds (0 - 300 bpm)'
xlabel = 'Threshold Value for Heart Rate (bpm)'
ylabel = 'Threshold Type'
plotdata = cleaned_HR_thresholds
xvalue = 'VALUENUM'
yvalue = 'THRESHOLD_TYPE'
xlimpadding = 5 # Padding added to the x axis limits (currently not used by the plotting code below)
xticks_steps = 30
xticks_lower = plotdata[xvalue].min() - (plotdata[xvalue].min() % 10) # rounds down the minimum xvalue to the nearest ten
# Rounds up the maximum xvalue to the nearest ten; one extra step is added because
# np.arange excludes its stop value
xticks_upper = roundup( plotdata[xvalue].max() ) + xticks_steps
xticks = np.arange(xticks_lower, xticks_upper, xticks_steps)

# Config figure: three stacked panels on one shared x axis
sns.set_style('whitegrid')
fig, axs = plt.subplots(
    3,
    1,
    figsize = (10, 15),
    sharex = True,
    dpi = 72 # e.g. 72 for screen, 300 for print
    )
fig.suptitle(title, fontweight='bold', color= 'black', fontsize=14, y=0.9)
fig.subplots_adjust(hspace = 0.1)

# Actual plots: the same data as strip, box and violin plot (top to bottom)
for ax, draw in zip(axs, (sns.stripplot, sns.boxplot, sns.violinplot)):
    draw(
        ax = ax,
        data = plotdata,
        x = xvalue,
        y = yvalue,
        palette = sns.color_palette('colorblind')
        )
    ax.set_xlabel('')
    ax.set_ylabel(ylabel, fontsize=12, labelpad=15)
    ax.set_xticks(xticks)
    # The flag is passed positionally: its keyword name `b` was renamed to `visible`
    # in newer matplotlib releases, so the keyword form is not portable
    ax.grid(True, which='both')
    ax.margins(.1)

# Only the bottom panel carries the x axis label
axs[2].set_xlabel(xlabel, fontsize=12, labelpad=15)

# plt.show() takes no figure argument, so the figure must not be passed positionally
plt.show()

## Oxygen Saturation Insights

For the oxygen saturation, we currently assume values from 0 to 100% to be meaningful.

### Oxygen Saturation Measurements

In [None]:
# All oxygen saturation measurement rows (ITEMID 220277) and a summary of their numeric values
OS_measurements = chartevents_subset.loc[chartevents_subset['ITEMID'].eq(220277)]
OS_measurements['VALUENUM'].describe()

In [None]:
# Distinct units of measurement recorded for the oxygen saturation rows
OS_measurements['VALUEUOM'].unique()

Among the 2,669,618 oxygen saturation measurements is at least one too high value (max = 6,363,333%). Let's consider several specific VALUENUM ranges.

In [None]:
# Keep only oxygen saturation measurements inside the plausible range (0 to 100%, inclusive).
# The lower bound is applied explicitly, matching the stated range and the threshold filters,
# so that a negative value can never end up in the "cleaned" data.
OS_measurements_up_to_100 = OS_measurements[OS_measurements['VALUENUM'].between(0, 100)]
OS_measurements_up_to_100.VALUENUM.describe() # 2,669,575 values ranging from 0 to 100%

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Set variables
title = 'CHARTEVENTS: Cleaned Oxygen Saturation Measurements (0 - 100%)'
xlabel = 'Value'
plotdata = OS_measurements_up_to_100
xvalue = 'VALUENUM'

# Actual plot: a slim box plot above a histogram with KDE, both on one shared x axis
sns.set_style('whitegrid')
colorblind = sns.color_palette('colorblind')
fig, (fig_box, fig_hist) = plt.subplots(
    nrows=2,
    sharex=True,
    gridspec_kw={'height_ratios': (.15, .85)},
    figsize=(10, 5),
)
fig.suptitle(title, fontweight='bold', color='black', fontsize=14, y=1)

# Upper panel: box plot without its own x label (the axis is shared with the histogram)
sns.boxplot(data=plotdata, x=xvalue, palette=colorblind, ax=fig_box)
fig_box.set_xlabel('')

# Lower panel: histogram with kernel density estimate
sns.histplot(data=plotdata, x=xvalue, kde=True, palette=colorblind, ax=fig_hist)
fig_hist.set_xlabel(xlabel, fontsize=12, labelpad=15)
fig_hist.set_ylabel('Frequency', fontsize=12, labelpad=15)

plt.show()

In [None]:
# Oxygen saturation measurements above 100%, counted per value
OS_measurements_over_100 = OS_measurements.loc[OS_measurements['VALUENUM'].gt(100)]
# 43 outliers in range from 110 to 6,363,333%
OS_measurements_over_100['VALUENUM'].value_counts().sort_index()

Most of the 2,669,618 oxygen saturation measurements are in the range currently assumed to be reasonable (0 to 100%). There are just 43 values exceeding this range (110 to 6,363,333%). It may be necessary to keep some of these exceeding values, since e.g. the 110, 198 and 220% were filtered out very narrowly in contrast to the other values.

### Oxygen Saturation Thresholds

#### Oxygen Saturation - LOW Thresholds

In [None]:
# All LOW oxygen saturation alarm thresholds (ITEMID 223770) and a summary of their numeric values
OS_thresholds_low = chartevents_subset.loc[chartevents_subset['ITEMID'].eq(223770)]
OS_thresholds_low['VALUENUM'].describe()

Among the 213,238 LOW thresholds for the oxygen saturation is at least one negative value (min = -92%) and one too high value (max = 90,100%). These appear to be implausible outliers. Let's consider several specific VALUENUM ranges.

In [None]:
# LOW oxygen saturation thresholds inside the plausible range (0 to 100%, both bounds included)
OS_thresholds_low_up_to_100 = OS_thresholds_low.loc[OS_thresholds_low['VALUENUM'].between(0, 100)]
# 213,200 values ranging from 0 to 100%
OS_thresholds_low_up_to_100['VALUENUM'].describe()

In [None]:
# LOW oxygen saturation thresholds below zero, counted per value
OS_thresholds_low_negative = OS_thresholds_low.loc[OS_thresholds_low['VALUENUM'].lt(0)]
# one outlier at -92%
OS_thresholds_low_negative['VALUENUM'].value_counts().sort_index()

In [None]:
# LOW oxygen saturation thresholds above 100%, counted per value
OS_thresholds_low_above_100 = OS_thresholds_low.loc[OS_thresholds_low['VALUENUM'].gt(100)]
# 37 outliers in range from 160 to 90,100%
OS_thresholds_low_above_100['VALUENUM'].value_counts().sort_index()

Most of the 213,238 LOW thresholds for the oxygen saturation are in the range currently assumed to be reasonable (0 to 100%). There is one negative value (-92%) and there are 37 values that exceed this range (160 to 90,100%). Some of these values could be due to a misplaced decimal point, e.g. 952% should probably be 95.2%.

#### Oxygen Saturation - HIGH Thresholds

In [None]:
# All HIGH oxygen saturation alarm thresholds (ITEMID 223769) and a summary of their numeric values
OS_thresholds_high = chartevents_subset.loc[chartevents_subset['ITEMID'].eq(223769)]
OS_thresholds_high['VALUENUM'].describe()

Among the 212,230 HIGH thresholds for the oxygen saturation are only positive values but some of them are quite high (max = 100,185%). These appear to be implausible outliers. Let's consider several specific VALUENUM ranges.

In [None]:
# Keep only HIGH oxygen saturation thresholds inside the plausible range (0 to 100%, inclusive).
# The lower bound is applied explicitly, as for the LOW thresholds, so that a negative value
# can never end up in the "cleaned" data.
OS_thresholds_high_up_to_100 = OS_thresholds_high[OS_thresholds_high['VALUENUM'].between(0, 100)]
OS_thresholds_high_up_to_100.VALUENUM.describe() # 211,685 values ranging from 0 to 100%

In [None]:
# HIGH oxygen saturation thresholds above 100% and a summary of their numeric values
OS_thresholds_high_above_100 = OS_thresholds_high.loc[OS_thresholds_high['VALUENUM'].gt(100)]
# 545 outliers in range from 101 to 100,185%
OS_thresholds_high_above_100['VALUENUM'].describe()

Most of the 212,230 HIGH thresholds for the oxygen saturation are in the range currently assumed to be reasonable (0 to 100%). There are 545 values that exceed this range (101 to 100,185%). Strangely enough, there are some values among them that occur much more frequently than the outliers before.

#### Oxygen Saturation - Cleaned Threshold Visualization

In [None]:
import numpy as np

# Concat cleaned oxygen saturation threshold data frames (LOW rows first, then HIGH rows)
cleaned_OS_thresholds = pd.concat([OS_thresholds_low_up_to_100, OS_thresholds_high_up_to_100])

# Add threshold type column for viz.
# The type is mapped directly from ITEMID instead of inserting a float NaN column and
# overwriting it with strings, which is an incompatible-dtype assignment.
cleaned_OS_thresholds['THRESHOLD_TYPE'] = cleaned_OS_thresholds['ITEMID'].map({
    223769: 'HIGH',
    223770: 'LOW',
})

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set variables
title = 'CHARTEVENTS: Cleaned Oxygen Saturation Thresholds (0 - 100%)'
xlabel = 'Threshold Value of Oxygen Saturation (%)'
plotdata = cleaned_OS_thresholds
xvalue = 'VALUENUM'
stratify_by = 'THRESHOLD_TYPE'
# One colour per threshold type: the first two colours of the colourblind palette
threshold_palette = sns.color_palette('colorblind')[:2]

# Config figure: three panels side by side
sns.set_style('whitegrid')
fig, axs = plt.subplots(1, 3, figsize=(20, 5))
fig.suptitle(title, fontweight='bold', color= 'black', fontsize=14, y=1)

# Actual plots
sns.histplot(ax=axs[0], data=plotdata, x=xvalue, hue=stratify_by, palette=threshold_palette)
axs[0].set_title('Histogram (overlapping)', fontsize=12)
axs[0].set_xlabel(xlabel, fontsize=12)
axs[0].set_ylabel('Frequency', fontsize=12)

sns.histplot(ax=axs[1], data=plotdata, x=xvalue, hue=stratify_by, multiple='stack', palette=threshold_palette)
axs[1].set_title('Histogram (stacked)', fontsize=12)
axs[1].set_xlabel(xlabel, fontsize=12)
axs[1].set_ylabel('Frequency', fontsize=12)

sns.kdeplot(ax=axs[2], data=plotdata, x=xvalue, hue=stratify_by, palette=threshold_palette)
axs[2].set_title('Kernel Density Estimate (KDE)', fontsize=12)
axs[2].set_xlabel(xlabel, fontsize=12)
axs[2].set_ylabel('Density', fontsize=12)

# plt.show() takes no figure argument (its only parameter is the keyword `block`),
# so the figure must not be passed positionally
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import math

# Helper for the axis ticks: round a number up to the next multiple of ten
def roundup(x):
    """Return ``x`` rounded up to the nearest multiple of ten, as an ``int``."""
    tens = math.ceil(x / 10.0)
    return 10 * int(tens)

# Set variables
title = 'CHARTEVENTS: Cleaned Oxygen Saturation Thresholds (0 - 100%)'
xlabel = 'Threshold Value for Oxygen Saturation (%)'
ylabel = 'Threshold Type'
plotdata = cleaned_OS_thresholds
xvalue = 'VALUENUM'
yvalue = 'THRESHOLD_TYPE'
xlimpadding = 5 # Padding added to the x axis limits (currently not used by the plotting code below)
xticks_steps = 10
xticks_lower = plotdata[xvalue].min() - (plotdata[xvalue].min() % 10) # rounds down the minimum xvalue to the nearest ten
# Rounds up the maximum xvalue to the nearest ten; one extra step is added because
# np.arange excludes its stop value
xticks_upper = roundup( plotdata[xvalue].max() ) + xticks_steps
xticks = np.arange(xticks_lower, xticks_upper, xticks_steps)

# Config figure: three stacked panels on one shared x axis
sns.set_style('whitegrid')
fig, axs = plt.subplots(
    3,
    1,
    figsize = (10, 15),
    sharex = True,
    dpi = 72 # e.g. 72 for screen, 300 for print
    )
fig.suptitle(title, fontweight='bold', color= 'black', fontsize=14, y=0.9)
fig.subplots_adjust(hspace = 0.1)

# Actual plots: the same data as strip, box and violin plot (top to bottom)
for ax, draw in zip(axs, (sns.stripplot, sns.boxplot, sns.violinplot)):
    draw(
        ax = ax,
        data = plotdata,
        x = xvalue,
        y = yvalue,
        palette = sns.color_palette('colorblind')
        )
    ax.set_xlabel('')
    ax.set_ylabel(ylabel, fontsize=12, labelpad=15)
    ax.set_xticks(xticks)
    # The flag is passed positionally: its keyword name `b` was renamed to `visible`
    # in newer matplotlib releases, so the keyword form is not portable
    ax.grid(True, which='both')
    ax.margins(.1)

# Only the bottom panel carries the x axis label
axs[2].set_xlabel(xlabel, fontsize=12, labelpad=15)

# plt.show() takes no figure argument, so the figure must not be passed positionally
plt.show()

## Non-invasive, Systolic Blood Pressure Insights

For the non-invasive, systolic blood pressure, we currently assume values from 0 to 375 mmHg to be meaningful.

### Non-invasive, Systolic Blood Pressure Measurements

In [None]:
# All non-invasive systolic blood pressure measurement rows (ITEMID 220179)
# and a summary of their numeric values
NBPs_measurements = chartevents_subset.loc[chartevents_subset['ITEMID'].eq(220179)]
NBPs_measurements['VALUENUM'].describe()

In [None]:
# Distinct units of measurement recorded for the systolic blood pressure rows
NBPs_measurements['VALUEUOM'].unique()

Among the 1,289,001 non-invasive, systolic blood pressure measurements is at least one negative (min = -69 mmHg) and one too high value (max = 141,146 mmHg). Let's consider several specific VALUENUM ranges.

In [None]:
# Systolic blood pressure measurements inside the plausible range (0 to 375 mmHg, both bounds included)
NBPs_measurements_up_to_375 = NBPs_measurements.loc[NBPs_measurements['VALUENUM'].between(0, 375)]
# 1,288,985 values ranging from 0 to 315 mmHg
NBPs_measurements_up_to_375['VALUENUM'].describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Set variables
title = 'CHARTEVENTS: Cleaned Systolic Blood Pressure Measurements (0 - 315 mmHg)'
xlabel = 'Value'
plotdata = NBPs_measurements_up_to_375
xvalue = 'VALUENUM'

# Actual plot: a slim box plot above a histogram with KDE, both on one shared x axis
sns.set_style('whitegrid')
colorblind = sns.color_palette('colorblind')
fig, (fig_box, fig_hist) = plt.subplots(
    nrows=2,
    sharex=True,
    gridspec_kw={'height_ratios': (.15, .85)},
    figsize=(10, 5),
)
fig.suptitle(title, fontweight='bold', color='black', fontsize=14, y=1)

# Upper panel: box plot without its own x label (the axis is shared with the histogram)
sns.boxplot(data=plotdata, x=xvalue, palette=colorblind, ax=fig_box)
fig_box.set_xlabel('')

# Lower panel: histogram with kernel density estimate
sns.histplot(data=plotdata, x=xvalue, kde=True, palette=colorblind, ax=fig_hist)
fig_hist.set_xlabel(xlabel, fontsize=12, labelpad=15)
fig_hist.set_ylabel('Frequency', fontsize=12, labelpad=15)

plt.show()

In [None]:
# Systolic blood pressure measurements below zero, counted per value
NBPs_measurements_negative = NBPs_measurements.loc[NBPs_measurements['VALUENUM'].lt(0)]
# one outlier at -69 mmHg
NBPs_measurements_negative['VALUENUM'].value_counts().sort_index()

In [None]:
# Systolic blood pressure measurements above the 375 mmHg plausibility limit, counted per value
NBPs_measurements_over_375 = NBPs_measurements.loc[NBPs_measurements['VALUENUM'].gt(375)]
# 15 outliers in range from 840 to 141,146.04 mmHg
NBPs_measurements_over_375['VALUENUM'].value_counts().sort_index()

Most of the 1,289,001 non-invasive, systolic blood pressure measurements are within the range currently assumed to be reasonable (0 to 375 mmHg; the values inside it span 0 to 315 mmHg). There is one negative outlier and there are 15 values that clearly exceed this range (840 to 141,146.04 mmHg).

### Non-invasive, Systolic Blood Pressure Thresholds

#### Non-invasive, Systolic Blood Pressure - LOW Thresholds

In [None]:
# All LOW systolic blood pressure alarm thresholds (ITEMID 223752)
# and a summary of their numeric values
NBPs_thresholds_low = chartevents_subset.loc[chartevents_subset['ITEMID'].eq(223752)]
NBPs_thresholds_low['VALUENUM'].describe()

Among the 154,863 LOW thresholds for the non-invasive, systolic blood pressure is at least one negative value (min = -10 mmHg) and one too high value (max = 95,160 mmHg). These appear to be implausible outliers. Let's consider several specific VALUENUM ranges.

In [None]:
# LOW systolic blood pressure thresholds inside the plausible range (0 to 375 mmHg, both bounds included)
NBPs_thresholds_low_up_to_375 = NBPs_thresholds_low.loc[NBPs_thresholds_low['VALUENUM'].between(0, 375)]
# 154,835 values ranging from 0 to 220 mmHg
NBPs_thresholds_low_up_to_375['VALUENUM'].describe()

In [None]:
# LOW systolic blood pressure thresholds below zero, counted per value
NBPs_thresholds_low_negative = NBPs_thresholds_low.loc[NBPs_thresholds_low['VALUENUM'].lt(0)]
# one outlier at -10 mmHg
NBPs_thresholds_low_negative['VALUENUM'].value_counts().sort_index()

In [None]:
# LOW systolic blood pressure thresholds above the 375 mmHg plausibility limit, counted per value
NBPs_thresholds_low_above_375 = NBPs_thresholds_low.loc[NBPs_thresholds_low['VALUENUM'].gt(375)]
# 27 outliers in range from 900 to 95,160 mmHg
NBPs_thresholds_low_above_375['VALUENUM'].value_counts().sort_index()

Most of the 154,863 LOW thresholds for the non-invasive, systolic blood pressure are within the range currently assumed to be reasonable (0 to 375 mmHg; the values inside it span 0 to 220 mmHg). There is one negative value (-10 mmHg) and there are 27 values that exceed this range (900 to 95,160 mmHg).

#### Non-invasive, Systolic Blood Pressure - HIGH Thresholds

In [None]:
# All HIGH systolic blood pressure alarm thresholds (ITEMID 223751)
# and a summary of their numeric values
NBPs_thresholds_high = chartevents_subset.loc[chartevents_subset['ITEMID'].eq(223751)]
NBPs_thresholds_high['VALUENUM'].describe()

Among the 154,751 HIGH thresholds for the non-invasive, systolic blood pressure are only positive values but some of them are quite high (max = 170,170 mmHg). These appear to be implausible outliers. Let's consider several specific VALUENUM ranges.

In [None]:
# Keep only HIGH systolic blood pressure thresholds inside the plausible range
# (0 to 375 mmHg, inclusive). The lower bound is applied explicitly, as for the measurements
# and the LOW thresholds, so that a negative value can never end up in the "cleaned" data.
NBPs_thresholds_high_up_to_375 = NBPs_thresholds_high[NBPs_thresholds_high['VALUENUM'].between(0, 375)]
NBPs_thresholds_high_up_to_375.VALUENUM.describe() # 154,680 values ranging from 0 to 300 mmHg

In [None]:
# HIGH systolic blood pressure thresholds above the 375 mmHg plausibility limit, counted per value
NBPs_thresholds_high_above_375 = NBPs_thresholds_high.loc[NBPs_thresholds_high['VALUENUM'].gt(375)]
# 71 outliers in range from 460 to 170,170 mmHg
NBPs_thresholds_high_above_375['VALUENUM'].value_counts().sort_index()

Most of the 154,751 HIGH thresholds for the non-invasive, systolic blood pressure are within the range currently assumed to be reasonable (0 to 375 mmHg; the values inside it span 0 to 300 mmHg). There are 71 values that exceed this range (460 to 170,170 mmHg).

#### Non-invasive, Systolic Blood Pressure - Cleaned Threshold Visualization

In [None]:
import numpy as np

# Concat cleaned systolic blood pressure threshold data frames (LOW rows first, then HIGH rows)
cleaned_NBPs_thresholds = pd.concat([NBPs_thresholds_low_up_to_375, NBPs_thresholds_high_up_to_375])

# Add threshold type column for viz.
# The type is mapped directly from ITEMID instead of inserting a float NaN column and
# overwriting it with strings, which is an incompatible-dtype assignment.
cleaned_NBPs_thresholds['THRESHOLD_TYPE'] = cleaned_NBPs_thresholds['ITEMID'].map({
    223751: 'HIGH',
    223752: 'LOW',
})

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set variables
title = 'CHARTEVENTS: Cleaned Systolic Blood Pressure Thresholds (0 - 220/300 mmHg)'
xlabel = 'Threshold Value of Systolic Blood Pressure (mmHg)'
plotdata = cleaned_NBPs_thresholds
xvalue = 'VALUENUM'
stratify_by = 'THRESHOLD_TYPE'
# One colour per threshold type: the first two colours of the colourblind palette
threshold_palette = sns.color_palette('colorblind')[:2]

# Config figure: three panels side by side
sns.set_style('whitegrid')
fig, axs = plt.subplots(1, 3, figsize=(20, 5))
fig.suptitle(title, fontweight='bold', color= 'black', fontsize=14, y=1)

# Actual plots
sns.histplot(ax=axs[0], data=plotdata, x=xvalue, hue=stratify_by, palette=threshold_palette)
axs[0].set_title('Histogram (overlapping)', fontsize=12)
axs[0].set_xlabel(xlabel, fontsize=12)
axs[0].set_ylabel('Frequency', fontsize=12)

sns.histplot(ax=axs[1], data=plotdata, x=xvalue, hue=stratify_by, multiple='stack', palette=threshold_palette)
axs[1].set_title('Histogram (stacked)', fontsize=12)
axs[1].set_xlabel(xlabel, fontsize=12)
axs[1].set_ylabel('Frequency', fontsize=12)

sns.kdeplot(ax=axs[2], data=plotdata, x=xvalue, hue=stratify_by, palette=threshold_palette)
axs[2].set_title('Kernel Density Estimate (KDE)', fontsize=12)
axs[2].set_xlabel(xlabel, fontsize=12)
axs[2].set_ylabel('Density', fontsize=12)

# plt.show() takes no figure argument (its only parameter is the keyword `block`),
# so the figure must not be passed positionally
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import math

# Helper for the axis ticks: round a number up to the next multiple of ten
def roundup(x):
    """Return ``x`` rounded up to the nearest multiple of ten, as an ``int``."""
    tens = math.ceil(x / 10.0)
    return 10 * int(tens)

# Set variables
title = 'CHARTEVENTS: Cleaned Systolic Blood Pressure Thresholds (0 - 220/300 mmHg)'
xlabel = 'Threshold Value for Systolic Blood Pressure (mmHg)'
ylabel = 'Threshold Type'
plotdata = cleaned_NBPs_thresholds
xvalue = 'VALUENUM'
yvalue = 'THRESHOLD_TYPE'
xlimpadding = 5 # Padding added to the x axis limits (currently not used by the plotting code below)
xticks_steps = 20
xticks_lower = plotdata[xvalue].min() - (plotdata[xvalue].min() % 10) # rounds down the minimum xvalue to the nearest ten
# Rounds up the maximum xvalue to the nearest ten; one extra step is added because
# np.arange excludes its stop value
xticks_upper = roundup( plotdata[xvalue].max() ) + xticks_steps
xticks = np.arange(xticks_lower, xticks_upper, xticks_steps)

# Config figure: three stacked panels on one shared x axis
sns.set_style('whitegrid')
fig, axs = plt.subplots(
    3,
    1,
    figsize = (10, 15),
    sharex = True,
    dpi = 72 # e.g. 72 for screen, 300 for print
    )
fig.suptitle(title, fontweight='bold', color= 'black', fontsize=14, y=0.9)
fig.subplots_adjust(hspace = 0.1)

# Actual plots: the same data as strip, box and violin plot (top to bottom)
for ax, draw in zip(axs, (sns.stripplot, sns.boxplot, sns.violinplot)):
    draw(
        ax = ax,
        data = plotdata,
        x = xvalue,
        y = yvalue,
        palette = sns.color_palette('colorblind')
        )
    ax.set_xlabel('')
    ax.set_ylabel(ylabel, fontsize=12, labelpad=15)
    ax.set_xticks(xticks)
    # The flag is passed positionally: its keyword name `b` was renamed to `visible`
    # in newer matplotlib releases, so the keyword form is not portable
    ax.grid(True, which='both')
    ax.margins(.1)

# Only the bottom panel carries the x axis label
axs[2].set_xlabel(xlabel, fontsize=12, labelpad=15)

# plt.show() takes no figure argument, so the figure must not be passed positionally
plt.show()