In [None]:
import pandas as pd
import pyarrow as pa

# Load the chart events from the parquet file into a pandas data frame (pyarrow engine)
chartevents_subset = pd.read_parquet(
    '../data/chartevents_clean_values_and_thresholds_with_chunkid_65_resampled.parquet',
    engine='pyarrow',
)

In [None]:
# Analysis configuration
PARAMETER = 220045  # ITEMID whose measurement series are analysed
CHUNKS = [
    '296490.0_220045.0_2192-09-26 23:51:00',
    '260223.0_220045.0_2156-07-22 06:49:00',
]  # CHUNK_ID_FILLED_TH values to include

# Window sizes, counted in data points.
# NOTE(review): the durations below assume one data point every 5 minutes — confirm against the
# sampling interval of the resampled input file.
TRAIN = 60  # 60 * 5 min = 5 hours of training
TEST = 12  # 12 * 5 min = 1 hour of testing
STEP = 6  # move 6 * 5 min = 0.5 hours per step

In [None]:
# Keep only the rows of the selected PARAMETER that belong to one of the selected CHUNKS,
# restricted to the columns needed below
is_selected_parameter = chartevents_subset["ITEMID"] == PARAMETER
is_selected_chunk = chartevents_subset.CHUNK_ID_FILLED_TH.isin(CHUNKS)
arima_data = chartevents_subset[is_selected_parameter & is_selected_chunk][
    ['CHUNK_ID_FILLED_TH', 'CHARTTIME', 'ITEMID', 'VALUENUM_CLEAN']
]

In [None]:
# Keep only chunks with at least TRAIN + TEST values, i.e. enough for one full train/test window
all_chunks = arima_data.CHUNK_ID_FILLED_TH.value_counts()
has_enough_values = all_chunks >= (TRAIN + TEST)
relevant_chunks = all_chunks.loc[has_enough_values].index
arima_data = arima_data[arima_data.CHUNK_ID_FILLED_TH.isin(relevant_chunks)]

In [None]:
# Add a column holding the time elapsed (in hours) since the first measurement of the same chunk
import numpy as np

# Earliest CHARTTIME of each chunk, broadcast back to every row of that chunk
first_charttime_per_row = arima_data.groupby('CHUNK_ID_FILLED_TH')['CHARTTIME'].transform('min')
arima_data['HOURS_SINCE_FIRST'] = (arima_data['CHARTTIME'] - first_charttime_per_row) / np.timedelta64(1, 'h')


In [None]:
# Build one value series per chunk, indexed by HOURS_SINCE_FIRST and sorted by that index.
# Conceptually each series is one row of a table whose columns are the HOURS_SINCE_FIRST values:
# index      | 1  | 2  | 3   ...
# firstChunk | 89 | 93 | 102 ...
# secondChunk| 77 | 81 | 90  ...
measurements = [
    arima_data[arima_data.CHUNK_ID_FILLED_TH == chunk]
    .set_index('HOURS_SINCE_FIRST')
    .sort_index()['VALUENUM_CLEAN']
    for chunk in relevant_chunks
]


In [None]:
# Create multiple (overlapping) train & test windows for each chunk.
# Result: one row per window with its SUB_CHUNK_ID, the training series (TRAIN_LIST) and the
# directly following test series (TEST_LIST); the frame is indexed by SUB_CHUNK_ID.
window_rows = []

for i, measurement in enumerate(measurements):
    # For every start position of a new train/test section of this chunk, move on by STEP points.
    # The "+ 1" keeps the last complete window, so a series of exactly TRAIN + TEST points
    # (which passes the length filter) still yields one window.
    last_start = len(measurement) - (TRAIN + TEST)
    for start in range(0, last_start + 1, STEP):
        # The separator keeps IDs unambiguous (chunk 1 / start 12 vs. chunk 11 / start 2).
        sub_chunk_id = f"{i}_{start}"
        # .iloc slices by position. Plain [] slicing on the float HOURS_SINCE_FIRST index would be
        # label-based and inclusive, giving windows of the wrong size that overlap at the boundary.
        window_rows.append({
            "SUB_CHUNK_ID": sub_chunk_id,
            "TRAIN_LIST": measurement.iloc[start : start + TRAIN],
            "TEST_LIST": measurement.iloc[start + TRAIN : start + TRAIN + TEST],
        })

# Build the frame once from all collected rows (DataFrame.append was removed in pandas 2.0
# and copying the frame on every iteration is quadratic).
chunk_with_test_train = pd.DataFrame(
    window_rows,
    index=[row["SUB_CHUNK_ID"] for row in window_rows],
    columns=["SUB_CHUNK_ID", "TRAIN_LIST", "TEST_LIST"],
)

In [None]:
# Fit one ARIMA model per train/test window and record, for the observed test values and for the
# forecast, whether the window stays above the threshold of 120.
from progressbar import progressbar
import pmdarima as pm

condition = []   # per window: observed test values all above 120
prediction = []  # per window: forecast values all above 120

# Kept for inspection only; it holds the number of rows per SUB_CHUNK_ID and must not drive the loop.
all_sub_chunk_ids = chunk_with_test_train.SUB_CHUNK_ID.value_counts()

# Iterate over the window rows themselves, so every row is processed exactly once regardless of
# how the frame is indexed.
for train_list, test_list in zip(chunk_with_test_train["TRAIN_LIST"], chunk_with_test_train["TEST_LIST"]):
    arima = pm.auto_arima(train_list)
    forecast = arima.predict(TEST)  # forecast TEST steps ahead

    # min(...) > 120 is True only if every value of the window exceeds 120.
    # NOTE(review): confirm that "all values above" (min) rather than "any value above" (max) is intended.
    condition.append(min(test_list) > 120)
    prediction.append(min(forecast) > 120)

In [None]:
# Tally the confusion matrix from the paired (observed, predicted) flags
outcomes = [(bool(observed), bool(predicted)) for observed, predicted in zip(condition, prediction)]

tp = outcomes.count((True, True))    # observed and predicted
fn = outcomes.count((True, False))   # observed but not predicted
fp = outcomes.count((False, True))   # predicted but not observed
tn = outcomes.count((False, False))  # neither observed nor predicted

In [None]:
# Report the confusion-matrix counts and the metrics derived from them
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is 0 (metric undefined)."""
    return numerator / denominator if denominator else float("nan")


print(f"TP = {tp}")
print(f"TN = {tn}")
print(f"FP = {fp}")
print(f"FN = {fn}")
print()
# Each ratio is undefined when its denominator is 0 (e.g. no positive windows at all);
# print nan in that case instead of raising ZeroDivisionError.
print(f"Sens = {_safe_ratio(tp, tp + fn)} (recall)")
print(f"Spec = {_safe_ratio(tn, tn + fp)}")
print(f"PPV  = {_safe_ratio(tp, tp + fp)} (precision)")

## ARIMA Forecasting for Single Time Series

Perform ARIMA analysis and prediction for a single manually selected time series, i.e. a single chunk and thus a single parameter.

The following is an updated and annotated version of the steps performed above. I did not want to overwrite these. In the next step, we should consolidate so that we only use one version.

In [None]:
import pandas as pd
import pyarrow as pa

# Load the chart events from the parquet file into a pandas data frame (pyarrow engine)
chartevents_subset = pd.read_parquet(
    '../data/chartevents_clean_values_and_thresholds_with_chunkid_65_resampled.parquet',
    engine='pyarrow',
)

In [None]:
# Analysis configuration
PARAMETER = 220045  # ITEMID whose measurement series are analysed
CHUNKS = [
    '296490.0_220045.0_2192-09-26 23:51:00',
    '260223.0_220045.0_2156-08-06 17:46:00',
]  # CHUNK_ID_FILLED_TH values to include
# Alternative selecting only a single chunk:
# ['296490.0_220045.0_2192-09-26 23:51:00']

# Window sizes, counted in data points (sampling rate of 1 data point per hour)
TRAIN = 60  # 60 * 1 h = 60 hour training period
TEST = 12  # 12 * 1 h = 12 hour testing period
STEP = 6  # move 6 * 1 h = 6 hours per step

In [None]:
# Restrict the chart events to the selected PARAMETER and CHUNKS, keeping only the columns used below
selected_columns = ['CHUNK_ID_FILLED_TH', 'CHARTTIME', 'ITEMID', 'VALUENUM_CLEAN']
is_selected_parameter = chartevents_subset["ITEMID"] == PARAMETER
is_selected_chunk = chartevents_subset.CHUNK_ID_FILLED_TH.isin(CHUNKS)
arima_data = chartevents_subset.loc[is_selected_parameter & is_selected_chunk, selected_columns]
display(arima_data)

In [None]:
# Keep only chunks with at least TRAIN + TEST values, i.e. enough to train and test the model
all_chunks_value_count = arima_data.CHUNK_ID_FILLED_TH.value_counts()
has_enough_values = all_chunks_value_count >= (TRAIN + TEST)
chunkid_filter = all_chunks_value_count.loc[has_enough_values].index
arima_data = arima_data.loc[arima_data.CHUNK_ID_FILLED_TH.isin(chunkid_filter)]
display(arima_data)

In [None]:
# Add the MINUTES_SINCE_FIRST_RECORD column: time elapsed (in minutes) since the first timestamp
# of the same measurement series (chunk).
import numpy as np

# Earliest CHARTTIME of each chunk, broadcast back to every row of that chunk
first_charttime_per_row = arima_data.groupby('CHUNK_ID_FILLED_TH')['CHARTTIME'].transform('min')
arima_data['MINUTES_SINCE_FIRST_RECORD'] = (arima_data['CHARTTIME'] - first_charttime_per_row) / np.timedelta64(1, 'm')
# Alternative for hours instead of minutes
# arima_data['HOURS_SINCE_FIRST_RECORD'] = arima_data.groupby('CHUNK_ID_FILLED_TH')['CHARTTIME'].transform(lambda x: (x - x.min())/np.timedelta64(1,'h'))
display(arima_data)

In [None]:
# Change the data structure:
# build a list with one pandas Series per chunk. Each series holds the measured values of the
# chunk, indexed by MINUTES_SINCE_FIRST_RECORD and sorted by that index.
# The data is transposed, so to speak: the MINUTES_SINCE_FIRST_RECORD values that were in rows
# now act as 'columns' (not literally; they form the index of each series).

# MINUTES_SINCE_FIRST_RECORD  |     0 |    60 |   120 | ...
# ----------------------------------------------------- ...
# firstChunk                  |  95.0 |  90.5 |  91.0 | ...
# secondChunk                 | 110.5 | 108.0 | 110.0 | ...
# ...

list_of_chunk_value_series = [
    arima_data[arima_data.CHUNK_ID_FILLED_TH == chunkid]
    .set_index('MINUTES_SINCE_FIRST_RECORD')
    .sort_index()['VALUENUM_CLEAN']
    for chunkid in chunkid_filter
]


In [None]:
# The step of creating multiple test & training sets per measurement series may be skipped,
# as we currently see no use for it in the context of ARIMA.

# Create multiple (overlapping) test & training sets per chunk value series.
# Result: one row per window with its SUB_CHUNK_ID, the training series (TRAIN_LIST) and the
# directly following test series (TEST_LIST); the frame is indexed by SUB_CHUNK_ID.
window_rows = []

for i, chunk_value_series in enumerate(list_of_chunk_value_series):

    # For every start position of a new train/test section of this chunk, move on by STEP points.
    # The "+ 1" keeps the last complete window, so a series of exactly TRAIN + TEST points
    # (which passes the length filter) still yields one window.
    last_start = len(chunk_value_series) - (TRAIN + TEST)
    for start in range(0, last_start + 1, STEP):

        # The separator keeps IDs unambiguous (chunk 1 / start 12 vs. chunk 11 / start 2).
        sub_chunk_id = f"{i}_{start}"
        # .iloc slices by position. Plain [] slicing on the float MINUTES_SINCE_FIRST_RECORD index
        # would be label-based (e.g. [0:60] selects the labels 0 to 60 minutes, not 60 data points).
        window_rows.append({
            "SUB_CHUNK_ID": sub_chunk_id,
            "TRAIN_LIST": chunk_value_series.iloc[start : start + TRAIN],
            "TEST_LIST": chunk_value_series.iloc[start + TRAIN : start + TRAIN + TEST],
        })

# Build the frame once from all collected rows (DataFrame.append was removed in pandas 2.0
# and copying the frame on every iteration is quadratic).
chunk_value_series_with_test_and_train = pd.DataFrame(
    window_rows,
    index=[row["SUB_CHUNK_ID"] for row in window_rows],
    columns=["SUB_CHUNK_ID", "TRAIN_LIST", "TEST_LIST"],
)

display(chunk_value_series_with_test_and_train)

In [None]:
# Conduct ARIMA for a single time series

# Resources used:
# https://kanoki.org/2020/04/30/time-series-analysis-and-forecasting-with-arima-python/
# https://medium.com/@josemarcialportilla/using-python-and-auto-arima-to-forecast-seasonal-time-series-90877adff03c
# https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.auto_arima.html

from pmdarima.arima import auto_arima

# Select a single time series: the first chunk's values as a one-column data frame
data = pd.DataFrame(list_of_chunk_value_series[0])

# Search settings for auto_arima.
# Assumption (to be checked): our value series are not seasonal, hence seasonal=False and m=1.
search_settings = dict(
    start_p=1, start_q=1,
    max_p=3, max_q=3,
    m=1, start_P=0, seasonal=False,
    d=1, D=1,
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
    stepwise=True,
)

# Find the best-fitting ARIMA model for the univariate time series.
# NOTE(review): the search sees the complete series, including the rows that are later held out
# as test data — consider running it on the training split only.
auto_arima_model = auto_arima(data, **search_settings)

print(auto_arima_model.aic())

In [None]:
# Train/test split: the first 80 % of the rows are used for training, the remaining rows for testing
split_position = round(len(data) * 80 / 100)  # row position at 80 % of the total number of rows
train, test = data.iloc[:split_position], data.iloc[split_position:]

print("Total data length:", len(data))
print("Train data length:", len(train))
print("Test data length:", len(test))

In [None]:
# Train the model: fit the model object returned by auto_arima on the training split only.
# As the last expression of the cell, the return value of fit() is displayed.
auto_arima_model.fit(train)

In [None]:
# Forecast as many steps ahead as there are rows in the test split
forecast = auto_arima_model.predict(len(test))
# This returns an array of predictions:
# NOTE(review): the exact return type may depend on the pmdarima version — confirm.
print(forecast)

In [None]:
# Reorganize the predictions: put the forecast into a data frame aligned with the test rows and
# concatenate it column-wise with the observed test values.
import numpy as np

# Pass the raw values so they are assigned to the test index by position. If predict() hands back
# an indexed pandas object, the DataFrame constructor would reindex it against test.index instead
# and produce an all-NaN column.
forecast = pd.DataFrame(np.asarray(forecast), index=test.index, columns=['VALUENUM_PREDICTION'])

evaluation_data = pd.concat([test, forecast], axis=1)
display(evaluation_data)

In [None]:
# Quick and dirty plot for testing purposes: observed test values vs. forecast
import seaborn as sns

sns.set(rc={"figure.figsize":(15, 5)})

# Reshape to long format (one row per index value and variable), as needed for seaborn's hue
evaluation_long = evaluation_data.reset_index().melt(id_vars='MINUTES_SINCE_FIRST_RECORD')

sns.lineplot(
    data=evaluation_long,
    x="MINUTES_SINCE_FIRST_RECORD",
    y="value",
    hue="variable",
    marker="o",
    markersize=5,
)

In [None]:
# Quick and dirty plot including the training period
import seaborn as sns

# NOTE(review): train and test are both slices of the same frame and therefore carry the same
# column name, so the combined frame has duplicate column labels — verify the plot shows what is intended.
evaluation_data_with_train = pd.concat([train, test, forecast], axis=1)

sns.set(rc={"figure.figsize":(15, 5)})

# Reshape to long format (one row per index value and variable), as needed for seaborn's hue
evaluation_with_train_long = pd.melt(
    evaluation_data_with_train.reset_index(),
    id_vars='MINUTES_SINCE_FIRST_RECORD',
)

sns.lineplot(
    data=evaluation_with_train_long,
    x="MINUTES_SINCE_FIRST_RECORD",
    y="value",
    hue="variable",
    marker="o",
    markersize=5,
)

In [None]:
# Observation: the forecast quality is poor.
# The most likely cause is still unclear. As a next step, I will retry with the auto_arima default settings.

In [None]:
# Retry with the auto_arima default settings
import numpy as np
from pmdarima.arima import auto_arima

# Find the best-fitting ARIMA model using the training split only, so that the held-out test rows
# influence neither the model selection nor the fitted parameters. auto_arima returns the selected
# model already fitted to the data it was given, so no separate fit(train) call is needed.
auto_arima_model = auto_arima(train)
print(auto_arima_model.aic())

# Evaluate: forecast as many steps ahead as there are test rows
forecast = auto_arima_model.predict(len(test))
# Pass the raw values so they are assigned to the test index by position. If predict() hands back
# an indexed pandas object, the DataFrame constructor would reindex it against test.index instead
# and produce an all-NaN column.
forecast = pd.DataFrame(np.asarray(forecast), index=test.index, columns=['VALUENUM_PREDICTION'])
evaluation_data = pd.concat([test, forecast], axis=1)

import seaborn as sns
sns.set(rc={"figure.figsize":(15, 5)})
sns.lineplot(
    data=pd.melt(evaluation_data.reset_index(),'MINUTES_SINCE_FIRST_RECORD'), # Reshape data frame for seaborn
    x="MINUTES_SINCE_FIRST_RECORD",
    y="value",
    hue="variable",
    marker="o",
    markersize = 5
    )

In [None]:
# Observation: the forecast quality is still poor with the default settings.