# Expand Chartevents by Information about Time Difference to Next Measurement

The new data frame will be used to further analyze the time differences between consecutive measurements and to derive possible chunking rules.
### Load and Prepare Data

In [None]:
import pandas as pd
import pyarrow as pa
import datetime as dt

# Load the two parquet inputs into pandas data frames:
# the chart-event subset and the companion file of unique ICU stays (named after that subset)
chartevents_subset = pd.read_parquet(
    './data/chartevents_subset.parquet',
    engine='pyarrow',
)
unique_icu_stays = pd.read_parquet(
    './data/unique_icustays_in_chartevents_subset.parquet',
    engine='pyarrow',
)

In [None]:
# ICU stays relevant for the analysis: only those that appear for the analyzed ITEMIDs.
# The ID series is kept in its own variable so the filter criterion stays inspectable.
icustayid_filter = unique_icu_stays['ICUSTAY_ID']

# Keep every chart event whose ICUSTAY_ID occurs in the filter; copy() detaches the
# result from chartevents_subset so later column assignments do not touch the source.
chunk_analysis_data = chartevents_subset.loc[
    chartevents_subset['ICUSTAY_ID'].isin(icustayid_filter)
].copy()

In [None]:
# The sampling-rate analysis looks at the measured values only, not at thresholds,
# so keep just the ITEMIDs that refer to a parameter value:
#   220045 = Heart Rate | 220179 = NBP | 220277 = O2
itemids_for_values_filter = [220045, 220179, 220277]
chunk_analysis_data = chunk_analysis_data.loc[
    chunk_analysis_data['ITEMID'].isin(itemids_for_values_filter)
].copy()

# Number of rows left after filtering (shown as the cell output)
len(chunk_analysis_data)


### Create New Data Frame Columns for Analysis

Example of relevant resulting data frame columns:

ICUSTAY_ID  |  ITEMID  | CHARTTIME           | VALUENUM | **CHARTTIME_NEXT**         | **DIF_CHARTTIME_NEXT_MIN**

20221       |   220045 | 2181-11-25T19:06:00 | 115      | 2181-11-25T19:16:00 | 10

20221       |   220045 | 2181-11-25T19:16:00 | 113      | 2181-11-25T20:00:00 | 44

Add Timestamp of Next Measurement as Column to Row of Current Measurement - CHARTTIME_NEXT 


In [None]:
# Idea: keep chunk_analysis_data as is and only add a column holding the timestamp of the
# next measurement of the same ICUSTAY_ID / ITEMID series; the time difference is then a
# plain vectorized column subtraction.
#
# shift(-1) returns the value of the *following row* within each group, so the rows are
# ordered by CHARTTIME first. Without the sort, input that is not already chronological
# would silently pair a measurement with the wrong successor (and could produce negative
# differences). mergesort is stable, so rows with identical timestamps keep their order.
#
# The assignment realigns the shifted values on the index, which leaves the row order of
# chunk_analysis_data itself untouched.
# NOTE(review): index alignment assumes unique index labels - confirm for this parquet export.
chunk_analysis_data['CHARTTIME_NEXT'] = (
    chunk_analysis_data
    .sort_values('CHARTTIME', kind='mergesort')
    .groupby(['ICUSTAY_ID', 'ITEMID'])['CHARTTIME']
    .shift(-1)
)

In [None]:
# Quick validation of the new CHARTTIME_NEXT column.
# Total number of rows in the analysis frame
print(chunk_analysis_data.shape[0])

# Number of existing ICUSTAY_ID - ITEMID combinations. The last measurement of each
# combination has no successor, so the NaN count of CHARTTIME_NEXT displayed below
# should equal this number.
print(chunk_analysis_data.groupby(['ICUSTAY_ID', 'ITEMID']).size().shape[0])
chunk_analysis_data['CHARTTIME_NEXT'].isna().sum()

Calculate Difference between Timestamps - DIF_CHARTTIME_NEXT_MIN

In [None]:
# Time until the next measurement of the same series, in three representations:
#   DIF_CHARTTIME_NEXT      - difference CHARTTIME_NEXT - CHARTTIME as a timedelta
#   DIF_CHARTTIME_NEXT_S    - the same difference in seconds
#   DIF_CHARTTIME_NEXT_MIN  - the difference in whole minutes, rounded down
# Rows without a next measurement carry a missing value in all three columns.
chunk_analysis_data['DIF_CHARTTIME_NEXT'] = chunk_analysis_data['CHARTTIME_NEXT'] - chunk_analysis_data['CHARTTIME']
chunk_analysis_data['DIF_CHARTTIME_NEXT_S'] = chunk_analysis_data['DIF_CHARTTIME_NEXT'].dt.total_seconds()
# Floor division yields the whole minutes directly; divmod() would additionally build a
# remainder Series of the same length that is never used.
chunk_analysis_data['DIF_CHARTTIME_NEXT_MIN'] = chunk_analysis_data['DIF_CHARTTIME_NEXT_S'] // 60

In [None]:
# Preview the first rows including the newly added columns
chunk_analysis_data.head(5)

In [None]:
# Compact view with only the columns needed to inspect the time differences
chunk_analysis_data_reduced = chunk_analysis_data.loc[
    :,
    ['ICUSTAY_ID', 'ITEMID', 'CHARTTIME', 'VALUENUM', 'VALUEUOM', 'CHARTTIME_NEXT', 'DIF_CHARTTIME_NEXT_MIN'],
]
chunk_analysis_data_reduced.head(5)

In [None]:
# Remove the raw timedelta column before saving (per the original note, it cannot be
# saved in parquet); the difference remains available in seconds and minutes.
chunk_analysis_data = chunk_analysis_data.drop(columns=['DIF_CHARTTIME_NEXT'])

In [None]:
# Persist the expanded data frame as a parquet file
chunk_analysis_data.to_parquet('./data/chunk_analysis_data.parquet', engine='pyarrow')