# Analysis of Sampling Rate

Aim: Analyze sampling rates by ICUSTAY ID and Item ID to generate an overview.

## Load and Prepare Data

In [None]:
import pandas as pd
import pyarrow as pa

# Load the two parquet inputs of this notebook into pandas data frames:
# the chart-event subset and the ICU stays that occur in it.
chartevents_subset = pd.read_parquet(
    './data/chartevents_subset.parquet',
    engine='pyarrow',
)
unique_icu_stays = pd.read_parquet(
    './data/unique_icustays_in_chartevents_subset.parquet',
    engine='pyarrow',
)

In [None]:
# ICU stay ids that take part in the analysis
icustayid_filter = unique_icu_stays['ICUSTAY_ID']

# Keep only chart events whose ICUSTAY_ID is in the selection; work on a copy
# so later column changes do not touch chartevents_subset.
is_selected_stay = chartevents_subset['ICUSTAY_ID'].isin(icustayid_filter)
sampling_rate_data = chartevents_subset.loc[is_selected_stay].copy()

In [None]:
# The sampling rate is analysed for measured values only, not for thresholds,
# so restrict the data to the item ids that refer to values.
itemids_for_values_filter = [220045, 220179, 220277]
refers_to_value = sampling_rate_data['ITEMID'].isin(itemids_for_values_filter)
sampling_rate_data = sampling_rate_data.loc[refers_to_value].copy()

# Empty frame meant to hold the sampling rate per selected ICU stay and item id.
# NOTE(review): this placeholder is replaced by the merge result further down,
# whose column names differ (CHARTTIME_MIN / CHARTTIME_MAX / VALUENUM_COUNT).
sampling_rates_for_icustay_itemid = pd.DataFrame(
    columns=[
        "ICUSTAY_ID",
        "ITEMID",
        "FIRST_TSP",
        "LAST_TSP",
        "ICUSTAY_DURATION(h)",
        "N_MEASUREMENTS",
        "SAMPLING_RATE",
    ]
)

## Generate Data Frame with Sampling Rate

In [None]:
# Earliest CHARTTIME per (ICUSTAY_ID, ITEMID) combination
sampling_rate_data_min = (
    sampling_rate_data
    .groupby(['ICUSTAY_ID', 'ITEMID'])['CHARTTIME']
    .min()
)

# Turn the grouped series into a flat frame with the value column
# named CHARTTIME_MIN and the group keys as ordinary columns.
sampling_rate_data_min_df = (
    sampling_rate_data_min
    .rename('CHARTTIME_MIN')
    .reset_index()
)

# Number of (ICUSTAY_ID, ITEMID) combinations
len(sampling_rate_data_min_df)

In [None]:
# Latest CHARTTIME per (ICUSTAY_ID, ITEMID) combination
sampling_rate_data_max = (
    sampling_rate_data
    .groupby(['ICUSTAY_ID', 'ITEMID'])['CHARTTIME']
    .max()
)

# Turn the grouped series into a flat frame with the value column
# named CHARTTIME_MAX and the group keys as ordinary columns.
sampling_rate_data_max_df = (
    sampling_rate_data_max
    .rename('CHARTTIME_MAX')
    .reset_index()
)

# Number of (ICUSTAY_ID, ITEMID) combinations
len(sampling_rate_data_max_df)

In [None]:
# Number of measurements: for every (ICUSTAY_ID, ITEMID) combination, count the
# non-null VALUENUM entries and expose the result as VALUENUM_COUNT.
sampling_rate_data_count = (
    sampling_rate_data
    .groupby(['ICUSTAY_ID', 'ITEMID'])['VALUENUM']
    .count()
    .rename('VALUENUM_COUNT')
    .reset_index()
)
display(sampling_rate_data_count)

In [None]:
# Combine the per-(ICUSTAY_ID, ITEMID) minimum timestamp, maximum timestamp and
# value count into a single frame, joined on ICUSTAY_ID and ITEMID.
sampling_rates_for_icustay_itemid = pd.merge(
    sampling_rate_data_min_df,
    sampling_rate_data_max_df,
    how='left',
    on=['ICUSTAY_ID', 'ITEMID'],
)
sampling_rates_for_icustay_itemid = pd.merge(
    sampling_rates_for_icustay_itemid,
    sampling_rate_data_count,
    how='left',
    on=['ICUSTAY_ID', 'ITEMID'],
)

# Row count of the merged frame. The previous version took len() of
# `min_max_charttime_for_icustay_itemid`, a name that is never defined in this
# notebook and therefore raised a NameError on a fresh kernel.
len(sampling_rates_for_icustay_itemid)

In [None]:
import datetime as dt

# Time between the first and the last measurement of each
# (ICUSTAY_ID, ITEMID) combination, stored as timedelta, seconds and hours.
charttime_span = (
    sampling_rates_for_icustay_itemid['CHARTTIME_MAX']
    - sampling_rates_for_icustay_itemid['CHARTTIME_MIN']
)
sampling_rates_for_icustay_itemid['ICUSTAY_DURATION'] = charttime_span
sampling_rates_for_icustay_itemid['ICUSTAY_DURATION(s)'] = charttime_span.dt.total_seconds()

# Whole hours only: the fractional part of an hour is discarded (floor division).
sampling_rates_for_icustay_itemid['ICUSTAY_DURATION(h)'] = (
    sampling_rates_for_icustay_itemid['ICUSTAY_DURATION(s)'] // 3600
)

In [None]:
# Remove the intermediate duration columns; only ICUSTAY_DURATION(h) is kept.
sampling_rates_for_icustay_itemid = sampling_rates_for_icustay_itemid.drop(
    ['ICUSTAY_DURATION', 'ICUSTAY_DURATION(s)'],
    axis='columns',
)

In [None]:
import numpy as np

# Sampling rate = number of values per whole hour of observation.
# When the duration is 0 whole hours the division is skipped and the plain
# value count is used as the rate.
n_values = sampling_rates_for_icustay_itemid['VALUENUM_COUNT']
duration_h = sampling_rates_for_icustay_itemid['ICUSTAY_DURATION(h)']
sampling_rates_for_icustay_itemid['SAMPLING_RATE'] = np.where(
    duration_h == 0,
    n_values,
    n_values / duration_h,
)

         

In [None]:
import pandas as pd
import pyarrow as pa
import numpy as np
# Persist the sampling-rate frame as a parquet file
sampling_rates_for_icustay_itemid.to_parquet(
    './data/sampling_rates_for_icustay_itemid.parquet',
    engine='pyarrow',
)

## Sampling Rate - Visualizations

In [None]:
import pandas as pd
import pyarrow as pa

# Load the stored sampling rates from the parquet file into a pandas data frame
sampling_rates_for_icustay_itemid = pd.read_parquet(
    './data/sampling_rates_for_icustay_itemid.parquet',
    engine='pyarrow',
)

In [None]:
# One subset of the sampling-rate frame per item id
itemid_of_row = sampling_rates_for_icustay_itemid['ITEMID']
sampling_rates_for_icustay_HR = sampling_rates_for_icustay_itemid[itemid_of_row == 220045]
sampling_rates_for_icustay_O2 = sampling_rates_for_icustay_itemid[itemid_of_row == 220277]
sampling_rates_for_icustay_NBP = sampling_rates_for_icustay_itemid[itemid_of_row == 220179]

## Plot the sampling rate by Item Id

In [None]:
# Imports
import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of the SAMPLING_RATE distribution, one box per ITEMID.

# Set variables
title = "Sampling Rate by Item Id"
xlabel = "Item Id"
ylabel = "Avg. # of samples obtained in 1 hour"
plotdata = sampling_rates_for_icustay_itemid
xvalue = "ITEMID"
yvalue = "SAMPLING_RATE"

# Config figure
sns.set_style("whitegrid")
fig, ax = plt.subplots(
    figsize = (10, 5), 
    dpi = 72 # e.g. 72 for screen, 300 for print
    )
ax = sns.boxplot( # Insert on of: sns.stripplot , sns.boxplot , sns.violinplot
    data = plotdata, 
    x = xvalue,
    y = yvalue, # Comment out if no stratification is to be performed based on yvalue
    palette = sns.color_palette("colorblind"),
    ax = ax # Draw on the axes created above instead of relying on the implicit current axes
    )
ax.set_title(title, fontweight='bold', color= 'black', fontsize=14, y=1.05)
ax.set_xlabel(xlabel, fontsize=12, labelpad=15)
ax.set_ylabel(ylabel, fontsize=12, labelpad=15) # Comment out if no stratification is to be performed based on yvalue
# The on/off switch is passed positionally: the former `b=` keyword was renamed
# to `visible=` and is no longer accepted by recent Matplotlib releases.
ax.grid(True, which='both')
ax.margins(.1)

# Plot figure
# plt.show() takes no figure argument (its only parameter is `block`).
plt.show()

In [None]:
# Descriptive statistics of the subset for item id 220045 (HR)
sampling_rates_for_icustay_HR.describe()

In [None]:
# Descriptive statistics of the subset for item id 220179 (NBP)
sampling_rates_for_icustay_NBP.describe()

In [None]:
# Descriptive statistics of the subset for item id 220277 (O2)
sampling_rates_for_icustay_O2.describe()

## Validate Data

In [None]:
# Check ICU stays where we see NaN values.
# The sampling-rate frame stores the first timestamp in CHARTTIME_MIN; it has no
# FIRST_TSP column, so selecting 'FIRST_TSP' raised a KeyError.
missing_first_timestamp = sampling_rates_for_icustay_itemid['CHARTTIME_MIN'].isnull()
sampling_rate_nan = sampling_rates_for_icustay_itemid[missing_first_timestamp]

# Number of affected rows per ICU stay
sampling_rate_nan['ICUSTAY_ID'].value_counts()
# NOTE(review): an earlier run recorded 306 unique ICU stays for which at least
# one parameter is missing - confirm this figure against the current output.



In [None]:
# Pick the rows of a single ICU stay (204787) and a single item id (220179)
# out of the filtered chart events for a closer look.
is_stay_204787 = sampling_rate_data['ICUSTAY_ID'] == 204787
is_item_220179 = sampling_rate_data['ITEMID'] == 220179
selected_icustay = sampling_rate_data[is_stay_204787 & is_item_220179]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Step-line plot of VALUENUM over CHARTTIME for the rows in selected_icustay.

# Set variables
# The title names ICU stay 204787, the stay that selected_icustay is filtered on
# (the previous title referred to stay 253126, which is not the plotted data).
title = "History of Non Invasive Blood Pressure systolic of ICU stay 204787"
xlabel = "Time"
ylabel = "mmHg"
plotdata = selected_icustay
xvalue = "CHARTTIME"
yvalue = "VALUENUM"
#huevalue = "ITEMID_LABEL"

# Config figure
sns.set_style("whitegrid")
fig, ax = plt.subplots(
    figsize = (11, 5), 
    dpi = 72 # e.g. 72 for screen, 300 for print
    )
ax = sns.lineplot(
    data = plotdata, 
    x = xvalue,
    y = yvalue, 
    #hue = huevalue,
    #style= huevalue,
    drawstyle = 'steps-post', # Interpolate missing values by using the last available value
    markers = ['p','^','v'],
    markersize = 5,
    dashes = False,
    #palette = [sns.color_palette("colorblind")[0],sns.color_palette("colorblind")[1],sns.color_palette("colorblind")[2]]
    ax = ax # Draw on the axes created above instead of relying on the implicit current axes
    )

# NOTE(review): presumably the legend only has entries once the hue/style lines
# above are re-enabled - confirm whether this call is still wanted.
plt.legend(title = None, bbox_to_anchor=(1.02, 0.3), loc='upper left', borderaxespad=0)
ax.set_title(title, fontweight='bold', color= 'black', fontsize=14, y=1.05)
ax.set_xlabel(xlabel, fontsize=12, labelpad=15)
ax.set_ylabel(ylabel, fontsize=12, labelpad=15)
plt.xticks(rotation = 90)

# Plot figure
# plt.show() takes no figure argument (its only parameter is `block`).
plt.show()