# Analysis of Sampling Rate for Chunks

Aim: Analyze the sampling rate (number of recorded values per hour) of each chunk id to get an overview of how densely the parameters are sampled.

## Load and Prepare Data

In [None]:
import pandas as pd
import pyarrow as pa

# Load the chart event values (already annotated with chunk ids) from the
# parquet file into a pandas data frame
sampling_rate_data = pd.read_parquet(
    './data/chartevent_subset_values_with_chunkid_65.parquet',
    engine='pyarrow',
)

In [None]:
# The sampling rate analysis is conducted on the values only, not on thresholds.
# Keep just the item ids that refer to values. This filter only matters once the
# chunk data also holds thresholds (not yet in there); for now the chunk data
# only consists of parameters.
itemids_for_values_filter = [220045, 220179, 220277]
sampling_rate_data = sampling_rate_data.loc[
    sampling_rate_data['ITEMID'].isin(itemids_for_values_filter)
].copy()


In [None]:
# Use the shorter column name CHUNK_ID for the filled chunk id from here on
sampling_rate_data = sampling_rate_data.rename(
    {"CHUNK_ID_FILLED": "CHUNK_ID"},
    axis="columns",
)

In [None]:
# Show how many rows belong to each chunk id
sampling_rate_data['CHUNK_ID'].value_counts()

## Generate Data Frame with Sampling Rate

In [None]:
# Earliest CHARTTIME per chunk id
sampling_rate_data_min = sampling_rate_data.groupby('CHUNK_ID')['CHARTTIME'].min()
# Convert the series into a data frame with the columns CHUNK_ID and CHARTTIME_MIN
sampling_rate_data_min_df = (
    sampling_rate_data_min
    .rename('CHARTTIME_MIN')
    .reset_index()
)
# One row per chunk id
len(sampling_rate_data_min_df)

In [None]:
# Latest CHARTTIME per chunk id
sampling_rate_data_max = sampling_rate_data.groupby('CHUNK_ID')['CHARTTIME'].max()
# Convert the series into a data frame with the columns CHUNK_ID and CHARTTIME_MAX
sampling_rate_data_max_df = (
    sampling_rate_data_max
    .rename('CHARTTIME_MAX')
    .reset_index()
)
# One row per chunk id
len(sampling_rate_data_max_df)

In [None]:
# Number of measurements per chunk:
# for each CHUNK_ID, count the available (non-null) VALUENUM entries and
# store the result as VALUENUM_COUNT
sampling_rate_data_count = (
    sampling_rate_data
    .groupby('CHUNK_ID')['VALUENUM']
    .count()
    .rename('VALUENUM_COUNT')
    .reset_index()
)
display(sampling_rate_data_count)

In [None]:
# Combine min timestamp, max timestamp and measurement count per CHUNK_ID
# (left joins, starting from the min-timestamp frame)
sampling_rates_for_chunkid = (
    sampling_rate_data_min_df
    .merge(sampling_rate_data_max_df, how='left', on='CHUNK_ID')
    .merge(sampling_rate_data_count, how='left', on='CHUNK_ID')
)
len(sampling_rates_for_chunkid)

In [None]:
import datetime as dt
# Duration of each chunk: time between its first and last CHARTTIME
sampling_rates_for_chunkid['CHUNKID_DURATION'] = (
    sampling_rates_for_chunkid['CHARTTIME_MAX']
    - sampling_rates_for_chunkid['CHARTTIME_MIN']
)
# Same duration expressed in seconds
sampling_rates_for_chunkid['CHUNKID_DURATION(s)'] = (
    sampling_rates_for_chunkid['CHUNKID_DURATION'].dt.total_seconds()
)
# Duration in WHOLE hours: the seconds are floor-divided by 3600, so any chunk
# shorter than one hour ends up with a duration of 0.
# NOTE(review): flooring overstates the hourly rate for chunks that are not an
# exact number of hours long - confirm this is intended.
sampling_rates_for_chunkid['CHUNKID_DURATION(h)'] = (
    sampling_rates_for_chunkid['CHUNKID_DURATION(s)'] // 3600
)

In [None]:
# Only the duration in hours is needed from here on; remove the helper columns
sampling_rates_for_chunkid = sampling_rates_for_chunkid.drop(
    ['CHUNKID_DURATION', 'CHUNKID_DURATION(s)'],
    axis='columns',
)

In [None]:
import numpy as np
# Sampling rate = number of values per hour of chunk duration.
# Where the duration in hours is 0, the plain value count is used instead of
# dividing by zero.
sampling_rates_for_chunkid['SAMPLING_RATE'] = np.where(
    sampling_rates_for_chunkid['CHUNKID_DURATION(h)'].eq(0),
    sampling_rates_for_chunkid['VALUENUM_COUNT'],
    sampling_rates_for_chunkid['VALUENUM_COUNT'].div(
        sampling_rates_for_chunkid['CHUNKID_DURATION(h)']
    ),
)

         

In [None]:
# Attach ICUSTAY_ID and ITEMID to the per-chunk sampling rates:
# build the distinct (ICUSTAY_ID, ITEMID, CHUNK_ID) combinations and left-join
# the sampling rates onto them via CHUNK_ID
icustay_and_itemid_for_chunk = (
    sampling_rate_data[['ICUSTAY_ID', 'ITEMID', 'CHUNK_ID']].drop_duplicates()
)
sampling_rates_for_chunkid = icustay_and_itemid_for_chunk.merge(
    sampling_rates_for_chunkid,
    how='left',
    on='CHUNK_ID',
)

In [None]:
import pandas as pd
import pyarrow as pa
import numpy as np
# Persist the sampling rates per chunk id as a parquet file
sampling_rates_for_chunkid.to_parquet(
    './data/sampling_rates_for_chunkid.parquet',
    engine='pyarrow',
)

## Sampling Rate - Visualizations

In [None]:
import pandas as pd
import pyarrow as pa

# Load the previously saved sampling rates per chunk id into a pandas data frame
sampling_rates_for_chunkid = pd.read_parquet(
    './data/sampling_rates_for_chunkid.parquet',
    engine='pyarrow',
)

In [None]:
# One subset per item id (220045 -> HR, 220277 -> O2, 220179 -> NBP)
sampling_rates_for_chunkid_HR = sampling_rates_for_chunkid.loc[
    sampling_rates_for_chunkid['ITEMID'].eq(220045)
]
sampling_rates_for_chunkid_O2 = sampling_rates_for_chunkid.loc[
    sampling_rates_for_chunkid['ITEMID'].eq(220277)
]
sampling_rates_for_chunkid_NBP = sampling_rates_for_chunkid.loc[
    sampling_rates_for_chunkid['ITEMID'].eq(220179)
]

## Plot the sampling rate by Item Id

In [None]:
# Box plot of the sampling rate (y) for each item id (x).

# Imports
import seaborn as sns
import matplotlib.pyplot as plt

# Set variables
title = "Sampling Rate by Chunk Id"
xlabel = "Item Id"
ylabel = "Avg. # of samples obtained in 1 hour"
plotdata = sampling_rates_for_chunkid
xvalue = "ITEMID"
yvalue = "SAMPLING_RATE"

# Config figure
sns.set_style("whitegrid")
fig, ax = plt.subplots(
    figsize = (10, 5), 
    dpi = 72 # e.g. 72 for screen, 300 for print
    )
ax = sns.boxplot( # Insert one of: sns.stripplot , sns.boxplot , sns.violinplot
    data = plotdata, 
    x = xvalue,
    y = yvalue, # Comment out if no stratification is to be performed based on yvalue
    palette = sns.color_palette("colorblind"),
    ax = ax # Draw explicitly on the axes created above
    )
ax.set_title(title, fontweight='bold', color= 'black', fontsize=14, y=1.05)
ax.set_xlabel(xlabel, fontsize=12, labelpad=15)
ax.set_ylabel(ylabel, fontsize=12, labelpad=15) # Comment out if no stratification is to be performed based on yvalue
# Pass the on/off flag positionally: the keyword was renamed from `b` to
# `visible` in Matplotlib, so `b=True` fails on current versions while the
# positional form works on old and new versions alike.
ax.grid(True, which='both')
ax.margins(.1)

# Plot figure
# pyplot.show() takes no figure argument (its only parameter is the
# keyword-only `block`), so it is called without arguments.
plt.show()

In [None]:
# Summary statistics of the numeric columns for the item id 220045 subset (HR)
sampling_rates_for_chunkid_HR.describe()

In [None]:
# Summary statistics of the numeric columns for the item id 220179 subset (NBP)
sampling_rates_for_chunkid_NBP.describe()

In [None]:
# Summary statistics of the numeric columns for the item id 220277 subset (O2)
sampling_rates_for_chunkid_O2.describe()

## Further Analysis


### Analyze # of Chunk Ids per ICUSTAY_ID

In [None]:
import pandas as pd
import pyarrow as pa

# Load the chart event values with chunk ids from the parquet file and use the
# shorter column name CHUNK_ID for the filled chunk id
chunk_analysis_data = pd.read_parquet(
    './data/chartevent_subset_values_with_chunkid_65.parquet',
    engine='pyarrow',
).rename({"CHUNK_ID_FILLED": "CHUNK_ID"}, axis="columns")

In [None]:
# Count the distinct chunk ids of every ICU stay:
# reduce to unique (ICUSTAY_ID, CHUNK_ID) pairs, then count the chunk ids per stay
chunk_count_by_icustay = (
    chunk_analysis_data[['ICUSTAY_ID', 'CHUNK_ID']]
    .drop_duplicates()
    .groupby('ICUSTAY_ID')['CHUNK_ID']
    .count()
    .rename('CHUNK_ID_COUNT')
    .reset_index()
)
display(chunk_count_by_icustay)

In [None]:
# Summary statistics of the numeric columns of the per-stay chunk counts
chunk_count_by_icustay.describe()

### Analyze # of Chunk Ids per ICUSTAY_ID, stratified by ITEMID

In [None]:
# Count the distinct chunk ids for every (ICU stay, item id) combination:
# reduce to unique (ICUSTAY_ID, ITEMID, CHUNK_ID) triples, then count the
# chunk ids per stay and item id
chunk_count_by_icustay_itemid = (
    chunk_analysis_data[['ICUSTAY_ID', 'ITEMID', 'CHUNK_ID']]
    .drop_duplicates()
    .groupby(['ICUSTAY_ID', 'ITEMID'])['CHUNK_ID']
    .count()
    .rename('CHUNK_ID_COUNT')
    .reset_index()
)
display(chunk_count_by_icustay_itemid)


In [None]:
# Cast ITEMID to string - presumably so that the box plot below treats the item
# ids as categories instead of numbers (NOTE(review): confirm intent)
chunk_count_by_icustay_itemid['ITEMID'] = chunk_count_by_icustay_itemid['ITEMID'].astype(str)


In [None]:
# Horizontal box plot of the number of chunks per ICU stay (x) for each item id (y).

# Imports
import seaborn as sns
import matplotlib.pyplot as plt

# Set variables
title = "Chunk Analysis by ICUSTAY and ITEMID"
xlabel = "Count of Chunks per ICUSTAY"
ylabel = "Item Id"
plotdata = chunk_count_by_icustay_itemid
xvalue = "CHUNK_ID_COUNT"
yvalue = "ITEMID"

# Config figure
sns.set_style("whitegrid")
fig, ax = plt.subplots(
    figsize = (10, 5), 
    dpi = 72 # e.g. 72 for screen, 300 for print
    )
ax = sns.boxplot( # Insert one of: sns.stripplot , sns.boxplot , sns.violinplot
    data = plotdata, 
    x = xvalue,
    y = yvalue, # Comment out if no stratification is to be performed based on yvalue
    palette = sns.color_palette("colorblind"),
    ax = ax # Draw explicitly on the axes created above
    )
ax.set_title(title, fontweight='bold', color= 'black', fontsize=14, y=1.05)
ax.set_xlabel(xlabel, fontsize=12, labelpad=15)
ax.set_ylabel(ylabel, fontsize=12, labelpad=15) # Comment out if no stratification is to be performed based on yvalue
# Pass the on/off flag positionally: the keyword was renamed from `b` to
# `visible` in Matplotlib, so `b=True` fails on current versions while the
# positional form works on old and new versions alike.
ax.grid(True, which='both')
ax.margins(.1)

# Plot figure
# pyplot.show() takes no figure argument (its only parameter is the
# keyword-only `block`), so it is called without arguments.
plt.show()

In [None]:
# Mean number of chunks per ICU stay, separately for each item id
chunk_count_by_icustay_itemid_avg = (
    chunk_count_by_icustay_itemid
    .groupby('ITEMID')['CHUNK_ID_COUNT']
    .agg('mean')
)
chunk_count_by_icustay_itemid_avg

In [None]:
# Median number of chunks per ICU stay, separately for each item id
chunk_count_by_icustay_itemid_median = (
    chunk_count_by_icustay_itemid
    .groupby('ITEMID')['CHUNK_ID_COUNT']
    .agg('median')
)
chunk_count_by_icustay_itemid_median

In [None]:
# Summary statistics of the chunk count per ICU stay, separately for each item id
chunk_count_by_icustay_itemid_describe = chunk_count_by_icustay_itemid.groupby(['ITEMID'])['CHUNK_ID_COUNT'].describe()
chunk_count_by_icustay_itemid_describe