# Analysis of Difference between Measurements

### Aim: Provide an overview of difference between measurements by ICUSTAY & ITEMID to create a histogram
The histogramm shall be used to identify the need for chunking and determine possible chunking rules

## Add Timestamp of next Measurement as column to row of current measurement

In [None]:
import pandas as pd
import pyarrow as pa

# Read chartevents_subset from parquet file to pandas data frame
chartevents_subset = pd.read_parquet('./data/chartevents_subset.parquet', engine='pyarrow')
unique_icu_stays = pd.read_parquet('./data/unique_icustays_in_chartevents_subset.parquet', engine='pyarrow')

In [None]:
#select ICU_stay ids for analysis
# to test performance: select subset of unique_icu_stays
#unique_icu_stays_subset = unique_icu_stays[:1000]
icustayid_filter = unique_icu_stays.ICUSTAY_ID

# Filter by ICU_stay
chunk_analysis_data = chartevents_subset[chartevents_subset.ICUSTAY_ID.isin(icustayid_filter)].copy()

In [None]:
# Sampling Rate Analysis is only being conducted on the values, not thresholds
# Filter for item ids that refer to value
itemids_for_values_filter = [220045, 220179, 220277]
chunk_analysis_data = chunk_analysis_data[chunk_analysis_data.ITEMID.isin(itemids_for_values_filter)].copy()
len(chunk_analysis_data)



In [None]:

# Idea: Keep chunk_analysis_data as is, only add a new column that holds the next timestamp, the difference can then be performed outside the loop
chunk_analysis_data['CHARTTIME_NEXT'] = chunk_analysis_data.groupby(['ICUSTAY_ID','ITEMID'])['CHARTTIME'].shift(-1)

In [None]:
len(chunk_analysis_data)

### Quick Validation 

In [None]:
#Count NaN values - we should have one Nan value for each existing ICUSTAY_ID & ITEMID
# 100 ICUSTAYs *3 ITEMIDS (if all three ITEMID exit)
len(chunk_analysis_data.groupby(['ICUSTAY_ID','ITEMID']).size())
# we would assume to see that many Nans

In [None]:
chunk_analysis_data.CHARTTIME_NEXT.isna().sum()
#correct

## Calculate Difference between Timestamps

In [None]:
chunk_analysis_data['DIF_CHARTTIME_NEXT'] = chunk_analysis_data['CHARTTIME_NEXT']-chunk_analysis_data['CHARTTIME']
chunk_analysis_data['DIF_CHARTTIME_NEXT_S'] = chunk_analysis_data['DIF_CHARTTIME_NEXT'].dt.total_seconds()
chunk_analysis_data['DIF_CHARTTIME_NEXT_MIN'] = divmod(chunk_analysis_data['DIF_CHARTTIME_NEXT_S'], 60)[0]

In [None]:
chunk_analysis_data.head()

In [None]:
#drop timedelta column as it can not be saved n parquet
chunk_analysis_data = chunk_analysis_data.drop(columns='DIF_CHARTTIME_NEXT')

In [None]:
# Save as parquet file
pd.DataFrame(chunk_analysis_data).to_parquet('./data/chunk_analysis_data.parquet', engine='pyarrow')

In [None]:
import pandas as pd
import pyarrow as pa
chunk_analysis_data = pd.read_parquet('./data/chunk_analysis_data.parquet', engine='pyarrow')


## Visualization - Create Histogram of Difference between Timestamps

In [None]:
# create subsets for item Ids
chunk_analysis_data_HR = chunk_analysis_data[chunk_analysis_data['ITEMID'] == 220045]
chunk_analysis_data_O2 = chunk_analysis_data[chunk_analysis_data['ITEMID'] == 220277]
chunk_analysis_data_NBP = chunk_analysis_data[chunk_analysis_data['ITEMID'] == 220179]

### Analysis of Difference between Measurements - General

In [None]:
pd.set_option('display.float_format', '{:.2f}'.format)
chunk_analysis_data["DIF_CHARTTIME_NEXT_MIN"].describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Set variables
title = "Analysis of Difference between Measurements"
xlabel = "Difference between Measurements (min)"
plotdata = chunk_analysis_data
xvalue = "DIF_CHARTTIME_NEXT_MIN"

sns.set_style("whitegrid")
fig, (fig_box, fig_hist) = plt.subplots(
    2, 
    sharex=True, 
    gridspec_kw={"height_ratios": (.15, .85)}, 
    figsize=(10, 5)
    )
fig.suptitle(title, fontweight='bold', color= 'black', fontsize=14, y=1)
sns.boxplot(ax=fig_box, data=plotdata, x=xvalue, palette=sns.color_palette("colorblind"))
fig_box.set(xlabel="")
sns.histplot(ax=fig_hist, data=plotdata, x=xvalue, kde=True, palette=sns.color_palette("colorblind"))
fig_hist.set_xlabel(xlabel, fontsize=12, labelpad=15)
fig_hist.set_ylabel("Count", fontsize=12, labelpad=15)

# Save plot as PNG file
path = './plots/'
timestamp = dt.datetime.today().strftime('%Y-%m-%dT%H-%M-%S')
plt.savefig(str(path + 'analysis_of_difference_between_measurementsplot_300-DPI_' + timestamp + '.png'), dpi=300, bbox_inches='tight')

plt.show()

### Analysis of Difference between Measurements - General by ITEMID

In [None]:
chunk_analysis_data_HR["DIF_CHARTTIME_NEXT_MIN"].describe()

In [None]:
chunk_analysis_data_O2["DIF_CHARTTIME_NEXT_MIN"].describe()

In [None]:
chunk_analysis_data_NBP["DIF_CHARTTIME_NEXT_MIN"].describe()

### Analysis of Differences between Measurements within 75% 

In [None]:
chunk_analysis_data_Q3 = chunk_analysis_data[chunk_analysis_data["DIF_CHARTTIME_NEXT_MIN"] <= 60]

In [None]:
# Histogram and Boxplott - Cross ITEMID
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt

# Set variables
title = "Analysis of Difference between Measurements - Q3"
xlabel = "Difference between Measurements (min)"
plotdata = chunk_analysis_data_Q3
xvalue = "DIF_CHARTTIME_NEXT_MIN"

sns.set_style("whitegrid")
fig, (fig_box, fig_hist) = plt.subplots(
    2, 
    sharex=True, 
    gridspec_kw={"height_ratios": (.15, .85)}, 
    figsize=(10, 5)
    )
fig.suptitle(title, fontweight='bold', color= 'black', fontsize=14, y=1)
sns.boxplot(ax=fig_box, data=plotdata, x=xvalue, palette=sns.color_palette("colorblind"))
fig_box.set(xlabel="")
sns.histplot(ax=fig_hist, data=plotdata, x=xvalue, kde=True, palette=sns.color_palette("colorblind"))
fig_hist.set_xlabel(xlabel, fontsize=12, labelpad=15)
fig_hist.set_ylabel("Count", fontsize=12, labelpad=15)

# Save plot as PNG file
path = './plots/'
timestamp = dt.datetime.today().strftime('%Y-%m-%dT%H-%M-%S')
plt.savefig(str(path + 'analysis_of_difference_between_measurementsplot_Q3_300-DPI_' + timestamp + '.png'), dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Histograms - Stratified by ITEMID
import matplotlib.pyplot as plt
import seaborn as sns

# Set variables
title = "Analysis of Difference between Measurements - Q3"
xlabel = "Difference between Measurements (min)"
plotdata = chunk_analysis_data_Q3
xvalue = "DIF_CHARTTIME_NEXT_MIN"
stratify_by = "ITEMID"

sns.set_style("whitegrid")
fig, axs = plt.subplots(1, 3, figsize=(20, 5))
fig.suptitle(title, fontweight='bold', color= 'black', fontsize=14, y=1)

sns.histplot(ax=axs[0], data=plotdata, x=xvalue, hue=stratify_by, palette=[sns.color_palette("colorblind")[0],sns.color_palette("colorblind")[1],sns.color_palette("colorblind")[2]])
axs[0].set_title("Histogram (overlapping)", fontsize=12)
axs[0].set_xlabel(xlabel, fontsize=12)
axs[0].set_ylabel("Count", fontsize=12)

sns.histplot(ax=axs[1], data=plotdata, x=xvalue, hue=stratify_by, multiple="stack", palette=[sns.color_palette("colorblind")[0],sns.color_palette("colorblind")[1],sns.color_palette("colorblind")[2]])
axs[1].set_title("Histogram (stacked)", fontsize=12)
axs[1].set_xlabel(xlabel, fontsize=12)
axs[1].set_ylabel("Count", fontsize=12)

sns.kdeplot(ax=axs[2], data=plotdata, x=xvalue, hue=stratify_by, palette=[sns.color_palette("colorblind")[0],sns.color_palette("colorblind")[1],sns.color_palette("colorblind")[2]])
axs[2].set_title("Kernel Density Estimate (KDE)", fontsize=12)
axs[2].set_xlabel(xlabel, fontsize=12)
axs[2].set_ylabel("Density", fontsize=12)

# Save plot as PNG file
path = './plots/'
timestamp = dt.datetime.today().strftime('%Y-%m-%dT%H-%M-%S')
plt.savefig(str(path + 'analysis_of_difference_between_measurementsplot_Q3_by_ITEMID_300-DPI_' + timestamp + '.png'), dpi=300, bbox_inches='tight')

plt.show(fig)

In [None]:
# Analyze observable secons peak <10min
chunk_analysis_data_10 = chunk_analysis_data[chunk_analysis_data["DIF_CHARTTIME_NEXT_MIN"] <=10]
chunk_analysis_data_10["DIF_CHARTTIME_NEXT_MIN"].value_counts()
# Second peak (next to peak at 60 min) occurs at 1 min

### Analysis of Differences between Measurements above 75%

In [None]:
chunk_analysis_data_above_Q3 = chunk_analysis_data[chunk_analysis_data["DIF_CHARTTIME_NEXT_MIN"] > 60]

In [None]:
chunk_analysis_data_above_Q3["DIF_CHARTTIME_NEXT_MIN"].describe()
# We could derive another rule - e.g at 120min

In [None]:
# Histograms - Stratified by ITEMID
import matplotlib.pyplot as plt
import seaborn as sns

# Set variables
title = "Analysis of Difference between Measurements - > Q3"
xlabel = "Difference between Measurements (min)"
plotdata = chunk_analysis_data_above_Q3
xvalue = "DIF_CHARTTIME_NEXT_MIN"
stratify_by = "ITEMID"

sns.set_style("whitegrid")
fig, axs = plt.subplots(1, 3, figsize=(20, 5))
fig.suptitle(title, fontweight='bold', color= 'black', fontsize=14, y=1)

sns.histplot(ax=axs[0], data=plotdata, x=xvalue, hue=stratify_by, palette=[sns.color_palette("colorblind")[0],sns.color_palette("colorblind")[1],sns.color_palette("colorblind")[2]])
axs[0].set_title("Histogram (overlapping)", fontsize=12)
axs[0].set_xlabel(xlabel, fontsize=12)
axs[0].set_ylabel("Count", fontsize=12)

sns.histplot(ax=axs[1], data=plotdata, x=xvalue, hue=stratify_by, multiple="stack", palette=[sns.color_palette("colorblind")[0],sns.color_palette("colorblind")[1],sns.color_palette("colorblind")[2]])
axs[1].set_title("Histogram (stacked)", fontsize=12)
axs[1].set_xlabel(xlabel, fontsize=12)
axs[1].set_ylabel("Count", fontsize=12)

sns.kdeplot(ax=axs[2], data=plotdata, x=xvalue, hue=stratify_by, palette=[sns.color_palette("colorblind")[0],sns.color_palette("colorblind")[1],sns.color_palette("colorblind")[2]])
axs[2].set_title("Kernel Density Estimate (KDE)", fontsize=12)
axs[2].set_xlabel(xlabel, fontsize=12)
axs[2].set_ylabel("Density", fontsize=12)

# Save plot as PNG file
path = './plots/'
timestamp = dt.datetime.today().strftime('%Y-%m-%dT%H-%M-%S')
plt.savefig(str(path + 'analysis_of_difference_between_measurementsplot_above_Q3_by_ITEMID_300-DPI_' + timestamp + '.png'), dpi=300, bbox_inches='tight')

plt.show(fig)

### Analyze amount of ICUSTAY_IDs that would be affected by a chunk rule 

First: Chunk after 60 min

In [None]:
len(chunk_analysis_data_above_Q3["ICUSTAY_ID"].value_counts())

Second: Chunk after 120 min

In [None]:
chunk_analysis_data_above_120 = chunk_analysis_data[chunk_analysis_data["DIF_CHARTTIME_NEXT_MIN"] > 120]
len(chunk_analysis_data_above_120["ICUSTAY_ID"].value_counts())

## Apply Chunking Rule
Generate Chunk IDs

In [1]:
import pandas as pd
import pyarrow as pa
chunk_analysis_data = pd.read_parquet('./data/chunk_analysis_data.parquet', engine='pyarrow')
chunking_dif = 60

In [None]:
# by ICUSTAY/ID - check if dif to next measurement is >chunking dif,
# if dif to next measurement is >chunking dif
# gebe allen reihen, die seit dem letzten chunk oder beginn durchgelaufen sind, sowie der aktuellen die selbe chunking id

In [None]:
# select all rows where di to next measurement is >chunking dif,
# assign a unique chunking ID to these rows
#merge back to all rows
#create initial chunk id that holds until merged chunnk id occurs
#duplicate chunking id until new chunking id occurs - Logic only possible with sorted data

In [2]:
# select all rows where di to next measurement is >chunking dif
chunk_data = chunk_analysis_data[chunk_analysis_data["DIF_CHARTTIME_NEXT_MIN"] > chunking_dif]

In [3]:
# assign a unique chunking ID to these rows
chunk_data["CHUNK_ID"] = chunk_data.ICUSTAY_ID.map(str) + "_" + chunk_data.ITEMID.map(str) + "_" + chunk_data.CHARTTIME.map(str)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [4]:
# check uniqueness - can only be violated if multiple measurements for that itemid/icustayid occured at the same charttime
print(len(chunk_data["CHUNK_ID"].value_counts()))
print(len(chunk_data))
# uiqueness for this data set is given

323624
323624


In [5]:
chunk_data_subset = chunk_data["CHUNK_ID"]

In [6]:
#merge back to all rows
#no we have a data set that has a chunk_id at the beginning of each
chunk_data_merged = pd.merge(chunk_analysis_data, chunk_data_subset,  how='left', left_index=True, right_index=True )

In [7]:
# change sorting structure -  turn ITEMID and CHARTTIME around
chunk_data_merged = chunk_data_merged.sort_values(by=['ICUSTAY_ID', 'ITEMID','CHARTTIME'])

In [8]:
# Assign Chunk ID to first measurement of   ICUSTAY_ID/TEMID in case it does not already exist
chunk_data_min = chunk_data_merged.groupby(['ICUSTAY_ID','ITEMID'])['CHARTTIME'].min()

In [9]:
chunk_data_min_df = chunk_data_min.to_frame()

In [10]:
chunk_data_min_df.reset_index(inplace=True)

In [11]:
# for each first charttime (by ICUSTAYID/ITEEMID) create a chunk ID
chunk_data_min_df["CHUNK_ID_MIN"] = chunk_data_min_df.ICUSTAY_ID.map(str) + "_" + chunk_data_min_df.ITEMID.map(str) + "_" + chunk_data_min_df.CHARTTIME.map(str)


In [12]:
# merge that back so we have a chunk id for each first Measurement (by ICUSTAYID/TEMID)
# what about cases where we alrady have a chunk idea? (because after first measurement it took more than e.g. 60 min?) - CHUNKID would look the same
chunk_data_merged_2 = pd.merge(chunk_data_merged, chunk_data_min_df,  how='left', on=['ICUSTAY_ID','ITEMID','CHARTTIME'])

In [13]:
import numpy as np
# wenn chunkIdMin nicht Nan,schreib chunk_id_min in chunk_id
chunk_data_merged_2['CHUNK_ID'] = np.where(chunk_data_merged_2['CHUNK_ID_MIN'].notnull(), chunk_data_merged_2['CHUNK_ID_MIN'], chunk_data_merged_2['CHUNK_ID'])

In [14]:
chunk_data_merged_2 = chunk_data_merged_2.drop(columns='CHUNK_ID_MIN')

In [17]:
# fill all cells with previous chunk id, until new chunk idea occurs
#pre-requisite: data is sorted by ICUSTAY_ID & ITEMID
chunk_data_merged_2['CHUNK_ID_FILLEDD'] = chunk_data_merged_2['CHUNK_ID'].fillna(method='ffill')

In [19]:
# Save as parquet file
pd.DataFrame(chunk_data_merged_2).to_parquet('./data/chartevent_subset_values_with_chunkid.parquet', engine='pyarrow')