# Resample cleaned chartevents chunks

In [None]:
import pandas as pd
import pyarrow as pa

# Load the cleaned chartevents (values and thresholds, with chunk IDs) from parquet.
chartevents_clean_values_and_thresholds_with_chunkid_65 = pd.read_parquet(
    '../data/chartevents_clean_values_and_thresholds_with_chunkid_65.parquet',
    engine='pyarrow',
)

In [None]:
# Keep only the rows of the three relevant ITEMIDs (only values, no thresholds)
# and restrict the frame to the columns needed for resampling.
chartevents_to_be_resampled = chartevents_clean_values_and_thresholds_with_chunkid_65.loc[
    chartevents_clean_values_and_thresholds_with_chunkid_65['ITEMID'].isin([220045, 220179, 220277]),
    ['CHUNK_ID_FILLED_TH', 'ICUSTAY_ID', 'ITEMID', 'CHARTTIME', 'VALUENUM_CLEAN'],
].copy()

# Order the rows by chunk, ICU stay, item and chart time.
chartevents_to_be_resampled = chartevents_to_be_resampled.sort_values(
    by=['CHUNK_ID_FILLED_TH', 'ICUSTAY_ID', 'ITEMID', 'CHARTTIME']
)

display(chartevents_to_be_resampled)

In [None]:
# Downsample VALUENUM_CLEAN into 1-hour bins (60 min) per
# (CHUNK_ID_FILLED_TH, ICUSTAY_ID, ITEMID). Two variants are produced from the same
# grouping: one aggregating each bin with the median, one with the mean.
# The lowercase alias '1h' is used because the uppercase 'H' offset alias is
# deprecated in recent pandas releases (2.2+); both denote the same hourly frequency.
# See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Grouper.html for further info
# Maybe interesting read: https://benalexkeen.com/resampling-time-series-data-with-pandas/

chartevents_resampled_median = chartevents_to_be_resampled.groupby(
    ['CHUNK_ID_FILLED_TH', 'ICUSTAY_ID', 'ITEMID', pd.Grouper(key='CHARTTIME', freq='1h')]
)['VALUENUM_CLEAN'].median().reset_index()

chartevents_resampled_mean = chartevents_to_be_resampled.groupby(
    ['CHUNK_ID_FILLED_TH', 'ICUSTAY_ID', 'ITEMID', pd.Grouper(key='CHARTTIME', freq='1h')]
)['VALUENUM_CLEAN'].mean().reset_index()

In [None]:
# Show the median-resampled frame for visual inspection.
display(chartevents_resampled_median)

In [None]:
# Show the mean-resampled frame for visual inspection.
display(chartevents_resampled_mean)

In [None]:
import pandas as pd
import pyarrow as pa

# Persist both resampled frames (median and mean variant) as parquet files.
chartevents_resampled_median.to_parquet(
    '../data/chartevents_clean_values_and_thresholds_with_chunkid_65_resampled_median.parquet',
    engine='pyarrow',
)
chartevents_resampled_mean.to_parquet(
    '../data/chartevents_clean_values_and_thresholds_with_chunkid_65_resampled_mean.parquet',
    engine='pyarrow',
)

In [None]:
import pandas as pd
import pyarrow as pa

# Load the median- and mean-resampled frames back from their parquet files.
chartevents_resampled_median = pd.read_parquet(
    '../data/chartevents_clean_values_and_thresholds_with_chunkid_65_resampled_median.parquet',
    engine='pyarrow',
)
chartevents_resampled_mean = pd.read_parquet(
    '../data/chartevents_clean_values_and_thresholds_with_chunkid_65_resampled_mean.parquet',
    engine='pyarrow',
)

## Create Resampling Plots for Weekly 

In [None]:
import pandas as pd
import pyarrow as pa

# Frame before resampling (cleaned values and thresholds with chunk IDs).
chartevents_before = pd.read_parquet(
    '../data/chartevents_clean_values_and_thresholds_with_chunkid_65.parquet',
    engine='pyarrow',
)
# Frames after resampling, one per aggregation method.
chartevents_resampled_median = pd.read_parquet(
    '../data/chartevents_clean_values_and_thresholds_with_chunkid_65_resampled_median.parquet',
    engine='pyarrow',
)
chartevents_resampled_mean = pd.read_parquet(
    '../data/chartevents_clean_values_and_thresholds_with_chunkid_65_resampled_mean.parquet',
    engine='pyarrow',
)

In [None]:
# Pick one chunk and one ITEMID to compare before/after resampling.
chunkid = '226799.0_220045.0_2153-07-31 08:10:00'
# Other chunk IDs that can be plugged in instead:
# '260223.0_220045.0_2156-07-22 06:49:00'
# '203164.0_220045.0_2178-02-08 03:00:00'
# '296490.0_220045.0_2192-09-26 23:51:00'
# '226799.0_220045.0_2153-07-31 08:10:00'
itemid = 220045

# Values of the chosen chunk/item before resampling, ordered by time.
before = chartevents_before.loc[
    (chartevents_before['CHUNK_ID_FILLED_TH'] == chunkid) & (chartevents_before['ITEMID'] == itemid),
    ['CHARTTIME', 'VALUENUM_CLEAN'],
].sort_values(by=['CHARTTIME'])

# Median-resampled values of the same chunk/item: value column renamed to MEDIAN,
# CHARTTIME moved into the index.
after_median = (
    chartevents_resampled_median.loc[
        (chartevents_resampled_median['CHUNK_ID_FILLED_TH'] == chunkid)
        & (chartevents_resampled_median['ITEMID'] == itemid),
        ['CHARTTIME', 'VALUENUM_CLEAN'],
    ]
    .sort_values(by=['CHARTTIME'])
    .rename(columns={"VALUENUM_CLEAN": "MEDIAN"})
    .set_index('CHARTTIME')
)

# Mean-resampled values of the same chunk/item: value column renamed to MEAN,
# CHARTTIME moved into the index.
after_mean = (
    chartevents_resampled_mean.loc[
        (chartevents_resampled_mean['CHUNK_ID_FILLED_TH'] == chunkid)
        & (chartevents_resampled_mean['ITEMID'] == itemid),
        ['CHARTTIME', 'VALUENUM_CLEAN'],
    ]
    .sort_values(by=['CHARTTIME'])
    .rename(columns={"VALUENUM_CLEAN": "MEAN"})
    .set_index('CHARTTIME')
)

In [None]:
# Put the MEDIAN and MEAN columns side by side, aligned on the CHARTTIME index.
merged_resampling_methods = pd.concat([after_median, after_mean], axis=1)

# Reshape to long format for the seaborn plot: one row per (CHARTTIME, method),
# with the method name in RESAMPLING_METHOD and its value in VALUENUM_CLEAN.
after = (
    merged_resampling_methods
    .reset_index()
    .melt(id_vars='CHARTTIME')
    .sort_values(by=['CHARTTIME'])
    .rename(columns={"variable": "RESAMPLING_METHOD", "value": "VALUENUM_CLEAN"})
    .reset_index(drop=True)
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt

sns.set_style("whitegrid")

# Two stacked panels sharing the time axis: values before resampling on top,
# median/mean resampled values below.
fig, axs = plt.subplots(
    2,
    1,
    figsize=(15, 8),
    sharex=True,
    dpi=72,  # e.g. 72 for screen, 300 for print
)
fig.suptitle("CHUNK ID: " + str(chunkid), fontweight='bold', color='black', fontsize=14, y=0.95)
# fig.subplots_adjust(hspace = 0.6)

# Common y-range for both panels: range of the values before resampling plus a margin of 5.
# Series.min()/.max() skip NaN, whereas the builtin min()/max() return order-dependent
# results as soon as a NaN is present in the column.
ylimits = (before.VALUENUM_CLEAN.min() - 5, before.VALUENUM_CLEAN.max() + 5)

colorblind_palette = sns.color_palette("colorblind")

ax = sns.lineplot(
    ax=axs[0],
    data=before,
    x="CHARTTIME",
    y="VALUENUM_CLEAN",
    drawstyle='steps-post',  # Interpolate missing values by using the last available value
    marker="o",
    markersize=5,
    dashes=False,
    # No hue variable here, so a palette would be ignored; set the line color directly.
    color=colorblind_palette[0],
)
axs[0].set_ylim(ylimits)
axs[0].set_title("Heart Rate - Before Resampling", fontweight='bold', color='black', fontsize=12, y=1.025)
axs[0].set_xlabel("Time", fontsize=12, labelpad=15)
axs[0].set_ylabel("Beats per minute", fontsize=12, labelpad=15)
axs[0].tick_params(axis="x", rotation=90)

ax = sns.lineplot(
    ax=axs[1],
    data=after,
    x="CHARTTIME",
    y="VALUENUM_CLEAN",
    hue="RESAMPLING_METHOD",
    drawstyle='steps-post',  # Interpolate missing values by using the last available value
    marker="o",
    markersize=5,
    dashes=False,
    palette=[colorblind_palette[0], colorblind_palette[1]],
)
# Place the legend outside the axes, to the right of the lower panel.
axs[1].legend(loc='center left', bbox_to_anchor=(1, 0.5))
axs[1].set_ylim(ylimits)
axs[1].set_title("Heart Rate - After Resampling", fontweight='bold', color='black', fontsize=12, y=1.025)
axs[1].set_xlabel("Time", fontsize=12, labelpad=15)
axs[1].set_ylabel("Beats per minute", fontsize=12, labelpad=15)
axs[1].tick_params(axis="x", rotation=90)

# Save the figure under a timestamped file name so earlier plots are not overwritten.
path = '../plots_week07/'
timestamp = dt.datetime.today().strftime('%Y-%m-%dT%H-%M-%S')
fig.savefig(path + '_resampling_plot_' + timestamp + '.png', dpi=300, bbox_inches='tight')

## Validate resampling

Work in progress: this validation is not yet complete.

In [None]:
# Selection of a single chunk for testing purposes.
# The boolean mask has to index the frame; wrapping the comparison in a list literal
# would only yield a one-element list holding the mask, not the selected rows.
dummy = chartevents_to_be_resampled[
    chartevents_to_be_resampled.CHUNK_ID_FILLED_TH == '260223.0_220045.0_2156-07-22 06:49:00'
]
display(dummy)

In [None]:
# Applying resampling to a single chunk for test purposes: 1-hour bins per
# (CHUNK_ID_FILLED_TH, ICUSTAY_ID, ITEMID), aggregated with the median.
# The lowercase alias '1h' is used because the uppercase 'H' offset alias is
# deprecated in recent pandas releases (2.2+); both denote the same hourly frequency.
dummy_resampled = dummy.groupby(
    ['CHUNK_ID_FILLED_TH', 'ICUSTAY_ID', 'ITEMID', pd.Grouper(key='CHARTTIME', freq='1h')]
)['VALUENUM_CLEAN'].median().reset_index()
display(dummy_resampled)

In [None]:
import numpy as np

# Time span between the earliest and the latest CHARTTIME of the test chunk.
delta = dummy['CHARTTIME'].max() - dummy['CHARTTIME'].min()
# Same span expressed as a number of hours.
delta_in_hours = delta / np.timedelta64(1, 'h')
# Number of rows in the resampled test chunk.
lenght_resampled = dummy_resampled.shape[0]

print("Difference", delta)
print("Difference in hours", delta_in_hours)
print("Length", lenght_resampled)

In [None]:
# Quick and dirty plot for testing purposes
import seaborn as sns
# Test chunk before resampling, one line per ITEMID.
sns.set(rc={"figure.figsize": (15, 5)})
sns.lineplot(
    data=dummy,
    x="CHARTTIME",
    y="VALUENUM_CLEAN",
    hue="ITEMID",
    marker="o",
    markersize=5,
)

In [None]:
# Quick and dirty plot for testing purposes
import seaborn as sns
# Test chunk after resampling, one line per ITEMID.
sns.set(rc={"figure.figsize": (15, 5)})
sns.lineplot(
    data=dummy_resampled,
    x="CHARTTIME",
    y="VALUENUM_CLEAN",
    hue="ITEMID",
    marker="o",
    markersize=5,
)