# Perform Chunking
Based on the visual analysis, we derived two possible chunking options:
* Chunk after 60 min difference to previous timestamp
* Chunk after 120 min difference to previous timestamp

After the discussion with the teaching team, we decided to **chunk after 65 min difference to the previous timestamp** with the possibility to adapt that in future

## Add Timestamp of previous measurement and difference between timestamps to dataset

In [2]:
import pandas as pd
import pyarrow as pa

# Read chartevents_subset from parquet file to pandas data frame
chartevents_subset = pd.read_parquet('./data/chartevents_subset.parquet', engine='pyarrow')
unique_icu_stays = pd.read_parquet('./data/unique_icustays_in_chartevents_subset.parquet', engine='pyarrow')

In [3]:
#select ICU_stay ids for analysis
icustayid_filter = unique_icu_stays.ICUSTAY_ID

# Filter by ICU_stay
chunk_analysis_data = chartevents_subset[chartevents_subset.ICUSTAY_ID.isin(icustayid_filter)].copy()

In [4]:
# Sampling Rate Analysis is only being conducted on the values, not thresholds
# Filter for item ids that refer to value
itemids_for_values_filter = [220045, 220179, 220277]
chunk_analysis_data = chunk_analysis_data[chunk_analysis_data.ITEMID.isin(itemids_for_values_filter)].copy()
len(chunk_analysis_data)

6720055

In [5]:
# Idea: Keep chunk_analysis_data as is, only add a new column that holds the previous timestamp, the difference can then be performed outside the loop
# Prerequisite: Sorted Data by ICUSTAY_ID,ITEMID,CHARTTIME
chunk_analysis_data['CHARTTIME_PREV'] = chunk_analysis_data.groupby(['ICUSTAY_ID','ITEMID'])['CHARTTIME'].shift(1)

In [6]:
# Calculate difference between timestamps
chunk_analysis_data['DIF_CHARTTIME_PREV'] = chunk_analysis_data['CHARTTIME']-chunk_analysis_data['CHARTTIME_PREV']
chunk_analysis_data['DIF_CHARTTIME_PREV_S'] = chunk_analysis_data['DIF_CHARTTIME_PREV'].dt.total_seconds()
chunk_analysis_data['DIF_CHARTTIME_PREV_MIN'] = divmod(chunk_analysis_data['DIF_CHARTTIME_PREV_S'], 60)[0]

In [7]:
chunk_analysis_data.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ITEMID,CHARTTIME,STORETIME,CGID,VALUE,VALUENUM,VALUEUOM,WARNING,ERROR,RESULTSTATUS,STOPPED,CHARTTIME_PREV,DIF_CHARTTIME_PREV,DIF_CHARTTIME_PREV_S,DIF_CHARTTIME_PREV_MIN
0,14005075.0,55973.0,152234.0,200001.0,220045.0,2181-11-25 19:06:00,2181-11-25 19:17:00,20622.0,115,115.0,bpm,0.0,0.0,,,NaT,NaT,,
1,14005077.0,55973.0,152234.0,200001.0,220179.0,2181-11-25 19:08:00,2181-11-25 19:17:00,20622.0,113,113.0,mmHg,0.0,0.0,,,NaT,NaT,,
3,14005080.0,55973.0,152234.0,200001.0,220277.0,2181-11-25 19:14:00,2181-11-25 19:17:00,20622.0,94,94.0,%,0.0,0.0,,,NaT,NaT,,
10,14005090.0,55973.0,152234.0,200001.0,220045.0,2181-11-25 19:16:00,2181-11-25 19:16:00,20622.0,114,114.0,bpm,0.0,0.0,,,2181-11-25 19:06:00,0 days 00:10:00,600.0,10.0
11,14005092.0,55973.0,152234.0,200001.0,220277.0,2181-11-25 19:16:00,2181-11-25 19:16:00,20622.0,95,95.0,%,0.0,0.0,,,2181-11-25 19:14:00,0 days 00:02:00,120.0,2.0


## Apply Chunking Rule


In [8]:
chunking_dif = 65

In [9]:
# reduce data to relevant columns to make validation easier
#chunk_analysis_data = chunk_analysis_data[['ICUSTAY_ID','ITEMID','CHARTTIME','VALUENUM','VALUEUOM','CHARTTIME_PREV','DIF_CHARTTIME_PREV_MIN']]

In [10]:
# select all rows where dif to prev measurement is >chunking dif
chunk_data = chunk_analysis_data[chunk_analysis_data["DIF_CHARTTIME_PREV_MIN"] > chunking_dif]

In [11]:
# assign a unique chunking ID to these rows
chunk_data["CHUNK_ID"] = chunk_data.ICUSTAY_ID.map(str) + "_" + chunk_data.ITEMID.map(str) + "_" + chunk_data.CHARTTIME.map(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [12]:
# check uniqueness - can only be violated if multiple measurements for that itemid/icustayid occured at the same charttime
print(len(chunk_data["CHUNK_ID"].value_counts()))
print(len(chunk_data))
# uiqueness for this data set is given

237226
237226


In [13]:
# only keep chunkid and index
chunk_data_subset = chunk_data["CHUNK_ID"]

In [14]:
#merge back to all rows via index
#no we have a data set that has a chunk_id at the beginning of each measurement that was conducted later than the chunking rule allows
chunk_data_merged = pd.merge(chunk_analysis_data, chunk_data_subset,  how='left', left_index=True, right_index=True )

In [15]:
# change sorting structure -  turn ITEMID and CHARTTIME around
chunk_data_merged = chunk_data_merged.sort_values(by=['ICUSTAY_ID', 'ITEMID','CHARTTIME'])

In [16]:
# Assign Chunk ID to first measurement of   ICUSTAY_ID/TEMID in case it does not already exist
# Calculate min timestamp
chunk_data_min = chunk_data_merged.groupby(['ICUSTAY_ID','ITEMID'])['CHARTTIME'].min()
chunk_data_min_df = chunk_data_min.to_frame()
chunk_data_min_df.reset_index(inplace=True)

# for each first charttime (by ICUSTAYID/ITEEMID) create a chunk ID
chunk_data_min_df["CHUNK_ID_MIN"] = chunk_data_min_df.ICUSTAY_ID.map(str) + "_" + chunk_data_min_df.ITEMID.map(str) + "_" + chunk_data_min_df.CHARTTIME.map(str)

In [17]:
# merge that back so we have a chunk id for each first Measurement (by ICUSTAYID/TEMID)
chunk_data_merged_2 = pd.merge(chunk_data_merged, chunk_data_min_df,  how='left', on=['ICUSTAY_ID','ITEMID','CHARTTIME'])

In [18]:
import numpy as np
# if chunkIdMin not Nan,write chunk_id_min in chunk_id
# #no we have a data set that has a chunk_id at the beginning of each measurement that was conducted later than the chunking rule allows as well as an initial chunk id
chunk_data_merged_2['CHUNK_ID'] = np.where(chunk_data_merged_2['CHUNK_ID_MIN'].notnull(), chunk_data_merged_2['CHUNK_ID_MIN'], chunk_data_merged_2['CHUNK_ID'])

In [19]:
chunk_data_merged_2 = chunk_data_merged_2.drop(columns='CHUNK_ID_MIN')

In [20]:
# fill all cells with previous chunk id, until new chunk idea occurs
#pre-requisite: data is sorted by ICUSTAY_ID & ITEMID
chunk_data_merged_2['CHUNK_ID_FILLED'] = chunk_data_merged_2['CHUNK_ID'].fillna(method='ffill')

In [21]:
# remove columns that are obsolete now - only kept for validation purpose in previous steps
chunk_data_merged_2 = chunk_data_merged_2.drop(columns='CHUNK_ID')
chunk_data_merged_2.rename(columns={"CHUNK_ID_FILLED":"CHUNK_ID"})

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ITEMID,CHARTTIME,STORETIME,CGID,VALUE,VALUENUM,VALUEUOM,WARNING,ERROR,RESULTSTATUS,STOPPED,CHARTTIME_PREV,DIF_CHARTTIME_PREV,DIF_CHARTTIME_PREV_S,DIF_CHARTTIME_PREV_MIN,CHUNK_ID
0,14005075.0,55973.0,152234.0,200001.0,220045.0,2181-11-25 19:06:00,2181-11-25 19:17:00,20622.0,115,115.0,bpm,0.0,0.0,,,NaT,NaT,,,200001.0_220045.0_2181-11-25 19:06:00
1,14005090.0,55973.0,152234.0,200001.0,220045.0,2181-11-25 19:16:00,2181-11-25 19:16:00,20622.0,114,114.0,bpm,0.0,0.0,,,2181-11-25 19:06:00,0 days 00:10:00,600.0,10.0,200001.0_220045.0_2181-11-25 19:06:00
2,14005105.0,55973.0,152234.0,200001.0,220045.0,2181-11-25 20:00:00,2181-11-25 22:02:00,21108.0,113,113.0,bpm,0.0,0.0,,,2181-11-25 19:16:00,0 days 00:44:00,2640.0,44.0,200001.0_220045.0_2181-11-25 19:06:00
3,14005111.0,55973.0,152234.0,200001.0,220045.0,2181-11-25 21:00:00,2181-11-25 22:02:00,21108.0,108,108.0,bpm,0.0,0.0,,,2181-11-25 20:00:00,0 days 01:00:00,3600.0,60.0,200001.0_220045.0_2181-11-25 19:06:00
4,14005117.0,55973.0,152234.0,200001.0,220045.0,2181-11-25 22:00:00,2181-11-25 22:02:00,21108.0,110,110.0,bpm,0.0,0.0,,,2181-11-25 21:00:00,0 days 01:00:00,3600.0,60.0,200001.0_220045.0_2181-11-25 19:06:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6720050,20563946.0,69587.0,158288.0,299998.0,220277.0,2181-07-07 11:00:00,2181-07-07 11:15:00,18462.0,98,98.0,%,0.0,0.0,,,2181-07-07 10:00:00,0 days 01:00:00,3600.0,60.0,299998.0_220277.0_2181-07-06 20:00:00
6720051,20563951.0,69587.0,158288.0,299998.0,220277.0,2181-07-07 12:00:00,2181-07-07 13:37:00,18462.0,96,96.0,%,0.0,0.0,,,2181-07-07 11:00:00,0 days 01:00:00,3600.0,60.0,299998.0_220277.0_2181-07-06 20:00:00
6720052,20563960.0,69587.0,158288.0,299998.0,220277.0,2181-07-07 13:00:00,2181-07-07 13:37:00,18462.0,91,91.0,%,0.0,0.0,,,2181-07-07 12:00:00,0 days 01:00:00,3600.0,60.0,299998.0_220277.0_2181-07-06 20:00:00
6720053,20563966.0,69587.0,158288.0,299998.0,220277.0,2181-07-07 14:00:00,2181-07-07 14:47:00,18462.0,96,96.0,%,0.0,0.0,,,2181-07-07 13:00:00,0 days 01:00:00,3600.0,60.0,299998.0_220277.0_2181-07-06 20:00:00


In [23]:
chunk_data_merged_2 = chunk_data_merged_2.drop(columns='DIF_CHARTTIME_PREV')

In [24]:
# Save as parquet file
pd.DataFrame(chunk_data_merged_2).to_parquet('./data/chartevent_subset_values_with_chunkid_' + str(chunking_dif) + '.parquet', engine='pyarrow')