# Collection of Miscellaneous Scripts Needed for Prediction with RNNModels

## Extract Chunk IDs for Prediction with 20% of Chunks

This extraction is needed for the comparison with the ARIMA(X) approach when only 20% of the chunks are predicted. It assumes that the prediction series have the following naming convention: `pred_series_{parameter}_{n_chunks}.pickle`.

In [None]:
import os
import pickle5 as pickle

# Directory that holds the pickled prediction series of the 20% runs. The
# extracted chunk ID lists are written back into the same directory.
path = '../../data/chunk_ids/20_percent'

for file in os.listdir(path):
    # Only regular files named `pred_series_{parameter}_{n_chunks}.pickle` are
    # inputs; this also skips the `chunk_ids_*` files written further below.
    if not (os.path.isfile(os.path.join(path, file)) and file.startswith('pred_series')):
        continue

    # Load current prediction series (the context manager closes the handle
    # even if unpickling raises)
    with open(f'{path}/{file}', 'rb') as current_pred_series_f:
        current_pred_series = pickle.load(current_pred_series_f)

    # Extract substrings and current chunk ID list
    name_parts = file.split('_')
    parameter = name_parts[2]
    n_chunks = name_parts[3].split('.')[0]  # Drop the file extension
    current_chunk_ids = list(current_pred_series.keys())

    print(f'{parameter} with {n_chunks} chunks: {len(current_chunk_ids)} chunks for prediction')

    # Save current chunk ID list
    with open(f'{path}/chunk_ids_{parameter}_{n_chunks}.pickle', 'wb') as current_chunk_ids_f:
        pickle.dump(current_chunk_ids, current_chunk_ids_f, protocol=pickle.HIGHEST_PROTOCOL)

## Extract Chunk IDs for Prediction with All Chunks and Verify Generated Chunk IDs

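This extraction is performed to check whether the same chunks are considered in the ARIMA(X) and the RNNModel approach when all chunks are predicted (i.e., when five different 20% subsets are predicted with the RNNModel approach). It assumes that the prediction series have the following naming convention: `pred_series_{parameter}_{n_chunks}_{window}.pickle`.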

In [None]:
from collections import defaultdict
import os
import pickle5 as pickle

# Directory that holds one pickled prediction series per parameter, chunk
# count and window. The combined chunk ID lists are written back into it.
path = '../../data/chunk_ids/all'

# Maps '{parameter}_{n_chunks}' to the chunk IDs collected over all windows
chunk_ids = defaultdict(list)

for file in os.listdir(path):
    # Only regular files whose name starts with 'pred_series' are inputs; this
    # also skips the `chunk_ids_*` files written further below.
    if not (os.path.isfile(os.path.join(path, file)) and file.startswith('pred_series')):
        continue

    # Load current prediction series (the context manager closes the handle
    # even if unpickling raises)
    with open(f'{path}/{file}', 'rb') as current_pred_series_f:
        current_pred_series = pickle.load(current_pred_series_f)

    # Extract substrings; the trailing window part of the file name is not
    # needed because the chunk IDs of all windows are merged per key
    name_parts = file.split('_')
    parameter = name_parts[2]
    n_chunks = name_parts[3]

    # Add partial list of chunk IDs to dict (the defaultdict creates the
    # empty list on first access)
    current_chunk_ids = list(current_pred_series.keys())
    chunk_ids[f'{parameter}_{n_chunks}'].extend(current_chunk_ids)

# Save the combined list of every expected parameter / chunk count combination
for parameter in ['HR', 'BP', 'O2']:
    for n_chunks in [1000]:
        with open(f'{path}/chunk_ids_{parameter}_{n_chunks}.pickle', 'wb') as current_chunk_ids_f:
            pickle.dump(chunk_ids[f'{parameter}_{n_chunks}'], current_chunk_ids_f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import pandas as pd

# Check if combined chunk IDs match expected ones.
# NOTE: relies on `path` and `pickle` from the previous cell.
for parameter in ['HR', 'BP', 'O2']:
    for n_chunks in [1000]:

        # Extract list with chunk IDs from prediction
        with open(f'{path}/chunk_ids_{parameter}_{n_chunks}.pickle', 'rb') as current_chunk_ids_f:
            current_chunk_ids_pred = pickle.load(current_chunk_ids_f)

        # Load the resampled chunks. The file name uses the lower-case
        # parameter, matching the "Analyze ValueError" cell that reads the
        # same files; this keeps the lookup working on case-sensitive file systems.
        resampled_chunks = pd.read_parquet(
            f'../../data/resampling/resample_output_{parameter.lower()}_first{n_chunks}.parquet',
            engine='pyarrow')

        # Extract list with expected chunk IDs: every chunk with more than 12
        # rows. One counting pass replaces re-filtering the frame per chunk ID.
        chunk_lengths = resampled_chunks['CHUNK_ID_FILLED_TH'].value_counts()
        current_chunk_ids_original = chunk_lengths[chunk_lengths > 12].index.tolist()

        # Inform if chunk IDs from prediction don't match expected ones
        # (compared as sets, so order and duplicates are ignored)
        if set(current_chunk_ids_pred) != set(current_chunk_ids_original):
            print(f'There are different chunk IDs than expected for {parameter} with {n_chunks} chunks')

## Analyze ValueError

There were ValueErrors during the execution of the O2 runs with 1,000 chunks and of all runs with 15,000 chunks. They were thrown in the confusion matrix generation and led to predictions full of NaNs. Their origin lay in our resampling of the chunks, in which a very small number of individual data points were missing and were therefore filled in by Darts with NaN values by default. The following code cell only includes the final extraction of the chunk IDs where values were missing.

Note: It does not matter which resampling method is investigated, as all of them deal with the same chunk IDs. We picked the MEDIAN method at random.

In [3]:
from darts import TimeSeries
import pandas as pd

# For every run configuration, list the chunk IDs whose hourly Darts series
# contains NaN values.
for n_chunks in [1000, 15000]:
    for parameter in ['hr', 'bp', 'o2']:

        resampled_chunks = pd.read_parquet(f'../../data/resampling/resample_output_{parameter}_first{n_chunks}.parquet',
                                           engine='pyarrow')

        # Extract relevant (= minimal length 13) chunks. A single groupby pass
        # replaces re-filtering the whole frame per chunk ID; sort=False keeps
        # the chunk IDs in order of first appearance.
        relevant_series = dict()

        for chunk_id, current_series in resampled_chunks.groupby('CHUNK_ID_FILLED_TH', sort=False):
            if len(current_series) > 12:
                relevant_series[chunk_id] = TimeSeries.from_dataframe(
                    df=current_series,
                    time_col='CHARTTIME',
                    value_cols=['VITAL_PARAMTER_VALUE_MEDIAN_RESAMPLING'],
                    freq='H')

        # Look for chunks with NaN values (missing values are filled by Darts per default).
        # The series holds only the single value column, so testing the whole
        # DataFrame is the same as testing that column.
        chunk_ids_with_nan = [
            chunk_id
            for chunk_id, series in relevant_series.items()
            if series.pd_dataframe().isnull().values.any()
        ]

        print(f'Chunk IDs with missing values for {parameter.upper()} with {n_chunks} chunks: \n{chunk_ids_with_nan}\n')

Chunk IDs with missing values for HR with 1000 chunks: 
[] 

Chunk IDs with missing values for BP with 1000 chunks: 
[] 

Chunk IDs with missing values for O2 with 1000 chunks: 
['200238.0_220277.0_2117-04-22 21:31:00'] 

Chunk IDs with missing values for HR with 15000 chunks: 
['203781.0_220045.0_2195-07-29 08:00:00', '214944.0_220045.0_2115-10-28 15:16:00', '217172.0_220045.0_2121-06-25 17:39:00', '218982.0_220045.0_2162-10-16 07:01:00', '224573.0_220045.0_2108-12-25 19:04:00'] 

Chunk IDs with missing values for BP with 15000 chunks: 
['200944.0_220179.0_2110-08-28 23:59:00', '203781.0_220179.0_2195-07-30 19:14:00'] 

Chunk IDs with missing values for O2 with 15000 chunks: 
['200238.0_220277.0_2117-04-22 21:31:00', '201821.0_220277.0_2193-01-14 15:59:00', '203781.0_220277.0_2195-07-29 08:00:00', '212983.0_220277.0_2146-03-31 12:06:00', '213474.0_220277.0_2168-01-31 15:59:00'] 

