# ARIMA

## Data Preparation

In [2]:
import pandas as pd
import pyarrow as pa

# Load the chart events subset from its parquet file into a pandas DataFrame (pyarrow backend).
chartevents_subset = pd.read_parquet(
    path='./data/chartevents_clean_values_and_thresholds_with_chunkid_65_resampled.parquet',
    engine='pyarrow',
)

In [3]:
# Selection: the ITEMID value and the chunk id(s) that the later cells filter on.
PARAMETER = 220045
CHUNKS = [
    '296490.0_220045.0_2192-09-26 23:51:00',
]

# Sliding-window configuration, counted in data points.
# The sampling rate is 1 data point per hour, so one unit corresponds to one hour.
# TODO: test different values in the future, e.g. a longer training set.
TRAIN = 12  # training window: 12 * 1 h = 12 hours
TEST = 2    # testing window:   2 * 1 h =  2 hours
STEP = 1    # shift per step:   1 * 1 h =  1 hour

In [4]:
# Restrict the chart events to the configured PARAMETER and CHUNKS,
# keeping only the columns used by the following cells.
is_selected_parameter = chartevents_subset["ITEMID"] == PARAMETER
is_selected_chunk = chartevents_subset["CHUNK_ID_FILLED_TH"].isin(CHUNKS)
arima_data = chartevents_subset.loc[
    is_selected_parameter & is_selected_chunk,
    ['CHUNK_ID_FILLED_TH', 'CHARTTIME', 'ITEMID', 'VALUENUM_CLEAN'],
]
display(arima_data)

Unnamed: 0,CHUNK_ID_FILLED_TH,CHARTTIME,ITEMID,VALUENUM_CLEAN
4918097,296490.0_220045.0_2192-09-26 23:51:00,2192-09-26 23:00:00,220045.0,95.0
4918098,296490.0_220045.0_2192-09-26 23:51:00,2192-09-27 00:00:00,220045.0,90.5
4918099,296490.0_220045.0_2192-09-26 23:51:00,2192-09-27 01:00:00,220045.0,91.0
4918100,296490.0_220045.0_2192-09-26 23:51:00,2192-09-27 02:00:00,220045.0,91.0
4918101,296490.0_220045.0_2192-09-26 23:51:00,2192-09-27 03:00:00,220045.0,85.0
...,...,...,...,...
4918829,296490.0_220045.0_2192-09-26 23:51:00,2192-10-27 11:00:00,220045.0,97.0
4918830,296490.0_220045.0_2192-09-26 23:51:00,2192-10-27 12:00:00,220045.0,90.0
4918831,296490.0_220045.0_2192-09-26 23:51:00,2192-10-27 13:00:00,220045.0,88.0
4918832,296490.0_220045.0_2192-09-26 23:51:00,2192-10-27 14:00:00,220045.0,88.0


In [5]:
# Keep only chunks that contain at least TRAIN + TEST values,
# i.e. enough data points for one complete training and testing window.
all_chunks_value_count = arima_data['CHUNK_ID_FILLED_TH'].value_counts()
chunkid_filter = all_chunks_value_count.loc[all_chunks_value_count.ge(TRAIN + TEST)].index
arima_data = arima_data.loc[arima_data['CHUNK_ID_FILLED_TH'].isin(chunkid_filter)]
display(arima_data)

Unnamed: 0,CHUNK_ID_FILLED_TH,CHARTTIME,ITEMID,VALUENUM_CLEAN
4918097,296490.0_220045.0_2192-09-26 23:51:00,2192-09-26 23:00:00,220045.0,95.0
4918098,296490.0_220045.0_2192-09-26 23:51:00,2192-09-27 00:00:00,220045.0,90.5
4918099,296490.0_220045.0_2192-09-26 23:51:00,2192-09-27 01:00:00,220045.0,91.0
4918100,296490.0_220045.0_2192-09-26 23:51:00,2192-09-27 02:00:00,220045.0,91.0
4918101,296490.0_220045.0_2192-09-26 23:51:00,2192-09-27 03:00:00,220045.0,85.0
...,...,...,...,...
4918829,296490.0_220045.0_2192-09-26 23:51:00,2192-10-27 11:00:00,220045.0,97.0
4918830,296490.0_220045.0_2192-09-26 23:51:00,2192-10-27 12:00:00,220045.0,90.0
4918831,296490.0_220045.0_2192-09-26 23:51:00,2192-10-27 13:00:00,220045.0,88.0
4918832,296490.0_220045.0_2192-09-26 23:51:00,2192-10-27 14:00:00,220045.0,88.0


In [6]:
# Create a new HOURS_SINCE_FIRST_RECORD column containing the time (in hours) that has passed
# since the first timestamp of the respective chunk (measurement series).
# For minutes instead of hours, divide by np.timedelta64(1, 'm') and name the column
# MINUTES_SINCE_FIRST_RECORD.
import numpy as np

# Earliest CHARTTIME of each chunk, broadcast back to every row of that chunk.
first_charttime_per_chunk = arima_data.groupby('CHUNK_ID_FILLED_TH')['CHARTTIME'].transform('min')

# assign() returns a new frame instead of writing into the filtered frame in place,
# which avoids pandas' SettingWithCopyWarning.
arima_data = arima_data.assign(
    HOURS_SINCE_FIRST_RECORD=(arima_data['CHARTTIME'] - first_charttime_per_chunk) / np.timedelta64(1, 'h')
)
display(arima_data)

Unnamed: 0,CHUNK_ID_FILLED_TH,CHARTTIME,ITEMID,VALUENUM_CLEAN,MINUTES_SINCE_FIRST_RECORD
4918097,296490.0_220045.0_2192-09-26 23:51:00,2192-09-26 23:00:00,220045.0,95.0,0.0
4918098,296490.0_220045.0_2192-09-26 23:51:00,2192-09-27 00:00:00,220045.0,90.5,60.0
4918099,296490.0_220045.0_2192-09-26 23:51:00,2192-09-27 01:00:00,220045.0,91.0,120.0
4918100,296490.0_220045.0_2192-09-26 23:51:00,2192-09-27 02:00:00,220045.0,91.0,180.0
4918101,296490.0_220045.0_2192-09-26 23:51:00,2192-09-27 03:00:00,220045.0,85.0,240.0
...,...,...,...,...,...
4918829,296490.0_220045.0_2192-09-26 23:51:00,2192-10-27 11:00:00,220045.0,97.0,43920.0
4918830,296490.0_220045.0_2192-09-26 23:51:00,2192-10-27 12:00:00,220045.0,90.0,43980.0
4918831,296490.0_220045.0_2192-09-26 23:51:00,2192-10-27 13:00:00,220045.0,88.0,44040.0
4918832,296490.0_220045.0_2192-09-26 23:51:00,2192-10-27 14:00:00,220045.0,88.0,44100.0


In [7]:
# Shrink the data set to a small sample so that the script can be tested first.
# Only the first 15 rows are kept. With a TRAIN of 12, a TEST of 2 and a STEP of 1 we expect
# two training sets and two test sets; expressed as row positions:
#   first window:  train = 0:11 ; test = 12:13
#   second window: train = 1:12 ; test = 13:14
arima_data = arima_data.iloc[:15]

In [9]:
**Old Coding** - Change cell type to Code when needed
### Change data structure
### Create a list containing one element for each chunk, which are of type pandas series.
### Each of these series includes the measured values of the chunk with the MINUTES_SINCE_FIRST_RECORD as index.
### The data structure is transposed, so to speak, so that the MINUTES_SINCE_FIRST_RECORD that were previously in rows now serve as 'columns' (not literally; they are in the index of the series).

### MINUTES_SINCE_FIRST_RECORD  |     0 |    60 |   120 | ...
### ----------------------------------------------------- ...
### firstChunk                  |  95.0 |  90.5 |  91.0 | ...
### secondChunk                 | 110.5 | 108.0 | 110.0 | ...
### ...

### Set up list that will contain the chunk value series transformed as described above.
### NOTE(review): this legacy cell indexes by 'MINUTES_SINCE_FIRST_RECORD', while the active cell
### in this notebook creates 'HOURS_SINCE_FIRST_RECORD' - re-enable the minutes column (or adapt
### the column name below) before switching this cell back to Code.
list_of_chunk_value_series = []

### One series per chunk id that passed the minimum-length filter.
for chunkid in chunkid_filter:

    ### Work on a copy of the chunk's rows so the in-place operations below do not touch arima_data.
    chunk_value_series = arima_data[arima_data.CHUNK_ID_FILLED_TH == chunkid].copy()
    ### Use the elapsed time as index and order the rows by it.
    chunk_value_series.set_index('MINUTES_SINCE_FIRST_RECORD', inplace=True)
    chunk_value_series.sort_index(inplace=True)    
    ### Keep only the measured values (as a pandas series) for this chunk.
    list_of_chunk_value_series.append(chunk_value_series['VALUENUM_CLEAN'])

In [None]:
# Needed Adaption for following cell:
# Change list_of_chunk_value_series from List to Dictionary
# The CHUNK_ID is used as key and in this step one key holds three series: the vital parameter series, the low threshold series and the high threshold series. They need the same "sampling rate" - so that the high threshold with index 0 is the high threshold that applies at the time of the vital parameter with index 0 

# Vital parameter Series:
# index                       |     0 |    1  |   2   | ...
# ----------------------------------------------------- ...
# firstChunk - Vital Parameter|  95.0 |  90.5 |  91.0 | ...

# Threshold High Series:
# index                       |     0 |    1  |   2   | ...
# ----------------------------------------------------- ...
# firstChunk - Th. High       |  120.0 |  120.0 |  110.0 | ...

# Threshold Low Series:
# index                       |     0 |    1  |   2   | ...
# ----------------------------------------------------- ...
# firstChunk - Th. Low        |  70.0 |  70.0 |  60.0 | ...

# Create a list containing one element for each chunk, which are of type pandas series.
# Each of these series includes the measured values of the chunk.
# The index can be used to regain the HOURS_SINCE_FIRST_RECORD information when the sampling rate is known - as we now have 1 measurement per hour, we can use the index to derive that info.
# The data structure is transposed, so to speak, so that the index serves as 'column' (not literally; they are in the index of the series).

# index                       |     0 |    1  |   2   | ...
# ----------------------------------------------------- ...
# firstChunk                  |  95.0 |  90.5 |  91.0 | ...
# secondChunk                 | 110.5 | 108.0 | 110.0 | ...
# ...

# Build the list of chunk value series described above: for every chunk id in chunkid_filter,
# take that chunk's VALUENUM_CLEAN values and renumber them 0, 1, 2, ... (old row labels are dropped).
list_of_chunk_value_series = [
    arima_data.loc[arima_data['CHUNK_ID_FILLED_TH'] == chunkid, 'VALUENUM_CLEAN'].reset_index(drop=True)
    for chunkid in chunkid_filter
]


In [None]:
# Needed Adaption for following cell:
# Change chunk_value_series_with_test_and_train from List to Dictionary
# The CHUNK_ID is used as key and in this step one key holds two series: The train_list and the test_list. As long as the index in train and test list is kept, we can still refer to the difference to the first measurement (as long as sampling rate is one hour)

In [13]:
# Create multiple test & train sets for each chunk to iteratively predict the next x measurements.
#
# For every chunk series a window of TRAIN values followed by TEST values is moved forward by STEP
# positions until no complete window fits any more. Each window becomes one row holding
#   SUB_CHUNK_ID - str(position of the chunk in list_of_chunk_value_series) + str(window start)
#   TRAIN_LIST   - pandas series with the TRAIN training values (original positions kept as index)
#   TEST_LIST    - pandas series with the TEST values that directly follow the training values
# The rows are labelled with their SUB_CHUNK_ID.
#
# The rows are collected in a plain list and the DataFrame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copies the whole frame on every call.
#
# NOTE(review): str(i) + str(start) is ambiguous once either number has more than one digit
# (i=1, start=11 and i=11, start=1 both give "111"). The format is kept unchanged here so existing
# ids stay stable - consider adding a separator.
sub_chunk_records = []
for i, chunk_value_series in enumerate(list_of_chunk_value_series):
    last_start = len(chunk_value_series) - (TRAIN + TEST)
    for start in range(0, last_start + 1, STEP):
        train_end = start + TRAIN
        sub_chunk_records.append({
            "SUB_CHUNK_ID": str(i) + str(start),
            # .iloc makes the positional slicing explicit.
            "TRAIN_LIST": chunk_value_series.iloc[start:train_end],
            "TEST_LIST": chunk_value_series.iloc[train_end:train_end + TEST],
        })

chunk_value_series_with_test_and_train = pd.DataFrame(
    sub_chunk_records,
    index=[record["SUB_CHUNK_ID"] for record in sub_chunk_records],
    columns=["SUB_CHUNK_ID", "TRAIN_LIST", "TEST_LIST"],
)

Unnamed: 0,SUB_CHUNK_ID,TRAIN_LIST,TEST_LIST
0,0,0 95.0 1 90.5 2 91.0 3 91.0 4 ...,"12 75.0 13 75.5 Name: VALUENUM_CLEAN, dt..."
1,1,1 90.5 2 91.0 3 91.0 4 85.0 5 ...,"13 75.5 14 74.0 Name: VALUENUM_CLEAN, dt..."


In [None]:
# Needed Adaption for following cell:
# Currently we only have a true values list and a predictions list. But we are not interested in whether the prediction is exactly the true value. We want to see if the prediction value also triggers an alarm if the true value does. Therefore we need the threshold values that apply at the time of the respective prediction/true value.
# A final version should hold the following information, which can be traced back to a specific Chunk ID:
# * List of true values (vital parameters in test list)
# * List of Threshold High (for the time at which the predictions take place)
# * List of Threshold Low (for the time at which the predictions take place)
# * Arima Predictions (the predictions for the true values based on the train values)

# Our thoughts:
# Currently prediction looks as follows (two columns as TEST is 2; two rows as two chunk_value_series are created for our chunk (containing 15 values)):

#   | 0                                         | 1
# 0 | first prediction for chunk_value_series 1 | second prediction for chunk_value_series 1
# 1 | first prediction for chunk_value_series 2 | second prediction for chunk_value_series 2

# We wanted to add the last index of the train_list and the CHUNK_ID in a nested way to these predictions so that we can trace them back to the thresholds that apply at the time of the prediction

#   | CHUNK_ID | Time ref. | 0                                 | 1
# 0 |  xxxx    | 11        | 1st pred. for chunk_value_series 1| 2nd pred. for chunk_value_series 1 
# 1 |  xxxx    | 12        | 1st pred. for chunk_value_series 2| 2nd pred. for chunk_value_series 2



In [15]:
# Conduct Arima
# For every train/test window: fit an auto-ARIMA model on the training values, forecast TEST steps
# ahead and store the forecast next to the true test values.
# Afterwards true_values[k] and prediction[k] belong to the k-th row of
# chunk_value_series_with_test_and_train.
from progressbar import progressbar  # NOTE(review): not used in this cell - kept in case other cells rely on it
import pmdarima as pm

true_values = []
prediction = []

# Iterate over the actual ids in row order. Iterating over SUB_CHUNK_ID.value_counts() would
# yield the counts, not the ids.
all_sub_chunk_ids = chunk_value_series_with_test_and_train["SUB_CHUNK_ID"].tolist()

for position, sub_chunk_id in enumerate(all_sub_chunk_ids):
    # The frame is labelled by SUB_CHUNK_ID (strings), so rows are addressed by position via .iloc
    # instead of relying on the deprecated integer-key fallback of Series[...].
    train_series = chunk_value_series_with_test_and_train["TRAIN_LIST"].iloc[position]
    arima = pm.auto_arima(train_series)
    forecast = arima.predict(TEST)

    # Drop the original positions so the true values are a plain array like the other entries.
    test_list = chunk_value_series_with_test_and_train["TEST_LIST"].iloc[position]
    test_list = test_list.reset_index(drop=True).to_numpy()

    true_values.append(test_list)
    prediction.append(forecast)

Arima from previous version:
# Archived auto_arima call from a previous version, kept for reference.
# NOTE(review): neither `auto_arima` nor `data` is bound in the visible cells (only `pm` is
# imported), so this needs e.g. pm.auto_arima and a training series before it can run.
# NOTE(review): start_P and D look like seasonal settings while seasonal=False is passed -
# confirm against the pmdarima documentation whether they have any effect here.
auto_arima_model = auto_arima(data, start_p=1, start_q=1,
                            max_p=3, max_q=3, m=1,
                            start_P=0, seasonal=False,
                            d=1, D=1, trace=True,
                            error_action='ignore',  
                            suppress_warnings=True, 
                            stepwise=True)