# Resample cleaned chartevents chunks

In [None]:
import pandas as pd
import pyarrow as pa

# Load the chartevents table (clean values and thresholds with chunk IDs, per its file name)
# from the parquet file using the pyarrow engine.
chartevents_clean_values_and_thresholds_with_chunkid_65 = pd.read_parquet(
    '../data/chartevents_clean_values_and_thresholds_with_chunkid_65.parquet',
    engine='pyarrow',
)

In [None]:
# Keep only the rows whose ITEMID is one of the relevant value items (values only, no thresholds)
# and only the columns that the resampling step works with.
value_itemids = [220045, 220179, 220277]
resampling_columns = ['CHUNK_ID_FILLED_TH', 'ICUSTAY_ID', 'ITEMID', 'CHARTTIME', 'VALUENUM_CLEAN']
is_value_row = chartevents_clean_values_and_thresholds_with_chunkid_65['ITEMID'].isin(value_itemids)

# Row and column selection happen in a single .loc call; the result is then ordered
# by chunk, ICU stay, item and finally chart time.
chartevents_to_be_resampled = (
    chartevents_clean_values_and_thresholds_with_chunkid_65
    .loc[is_value_row, resampling_columns]
    .sort_values(by=['CHUNK_ID_FILLED_TH', 'ICUSTAY_ID', 'ITEMID', 'CHARTTIME'])
)

display(chartevents_to_be_resampled)

In [None]:
# Resampling of VALUENUM_CLEAN with a frequency of 60min (1 hour), using the median of the values when downsampling.
# Rows are grouped by chunk, ICU stay and item; within each group CHARTTIME is binned into
# 1-hour intervals by pd.Grouper and every bin is reduced to the median of its VALUENUM_CLEAN values.
# The frequency is spelled with the lower-case alias '1h': pandas >= 2.2 deprecates the
# upper-case 'H' alias and emits a FutureWarning for it.
# See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Grouper.html for further info
# Maybe interesting read: https://benalexkeen.com/resampling-time-series-data-with-pandas/

hourly_bins = pd.Grouper(key='CHARTTIME', freq='1h')

chartevents_resampled = (
    chartevents_to_be_resampled
    .groupby(['CHUNK_ID_FILLED_TH', 'ICUSTAY_ID', 'ITEMID', hourly_bins])['VALUENUM_CLEAN']
    .median()
    .reset_index()  # turn the group keys back into regular columns
)

display(chartevents_resampled)

In [None]:
import pandas as pd
import pyarrow as pa

# Write the resampled frame to a parquet file using the pyarrow engine
chartevents_resampled.to_parquet(
    '../data/chartevents_clean_values_and_thresholds_with_chunkid_65_resampled.parquet',
    engine='pyarrow',
)

In [None]:
import pandas as pd
import pyarrow as pa

# Load the resampled frame back from its parquet file using the pyarrow engine
chartevents_resampled = pd.read_parquet(
    '../data/chartevents_clean_values_and_thresholds_with_chunkid_65_resampled.parquet',
    engine='pyarrow',
)

## Validate resampling

Ongoing, not complete

In [None]:
# Pick one chunk (identified by its CHUNK_ID_FILLED_TH value) out of the
# not-yet-resampled data so the resampling can be inspected on a small example.
test_chunk_id = '260223.0_220045.0_2156-07-22 06:49:00'
belongs_to_test_chunk = chartevents_to_be_resampled['CHUNK_ID_FILLED_TH'] == test_chunk_id
dummy = chartevents_to_be_resampled.loc[belongs_to_test_chunk]
display(dummy)

In [None]:
# Applying resampling to a single chunk for test purposes:
# group by chunk, ICU stay and item, bin CHARTTIME into 1-hour intervals and take the
# median of VALUENUM_CLEAN per bin.
# The lower-case alias '1h' is used because pandas >= 2.2 deprecates the upper-case 'H'
# alias and emits a FutureWarning for it.
dummy_hourly_bins = pd.Grouper(key='CHARTTIME', freq='1h')
dummy_resampled = (
    dummy
    .groupby(['CHUNK_ID_FILLED_TH', 'ICUSTAY_ID', 'ITEMID', dummy_hourly_bins])['VALUENUM_CLEAN']
    .median()
    .reset_index()  # turn the group keys back into regular columns
)
display(dummy_resampled)

In [None]:
import numpy as np

# Time span covered by the test chunk (earliest to latest CHARTTIME), expressed both as a
# time difference and as a number of hours, printed next to the row count of the resampled chunk.
earliest_charttime = dummy.CHARTTIME.min()
latest_charttime = dummy.CHARTTIME.max()
delta = latest_charttime - earliest_charttime

one_hour = np.timedelta64(1, 'h')
delta_in_hours = delta / one_hour  # dividing by a one-hour unit gives the span in hours

lenght_resampled = len(dummy_resampled)

print("Difference", delta)
print("Difference in hours", delta_in_hours)
print("Length", lenght_resampled)

In [None]:
# Quick and dirty plot for testing purposes
import seaborn as sns
# Set the figure size, then draw the original (not resampled) test chunk:
# VALUENUM_CLEAN over CHARTTIME, one line per ITEMID, with a marker on every data point.
raw_plot_settings = {"figure.figsize": (15, 5)}
sns.set(rc=raw_plot_settings)
sns.lineplot(
    data=dummy,
    x="CHARTTIME",
    y="VALUENUM_CLEAN",
    hue="ITEMID",
    marker="o",
    markersize=5,
)

In [None]:
# Quick and dirty plot for testing purposes
import seaborn as sns
# Set the figure size, then draw the resampled test chunk:
# VALUENUM_CLEAN over CHARTTIME, one line per ITEMID, with a marker on every data point.
resampled_plot_settings = {"figure.figsize": (15, 5)}
sns.set(rc=resampled_plot_settings)
sns.lineplot(
    data=dummy_resampled,
    x="CHARTTIME",
    y="VALUENUM_CLEAN",
    hue="ITEMID",
    marker="o",
    markersize=5,
)