# Analysis of Predictions Produced on Chunk Level with RNNModel Class by Darts

This script analyzes all pickle files in `./data/{approach}/{n_chunks}_chunks/{style}/` whose names start with `confusion_matrix_chunks`, i.e. all chunk-level results. At the moment, the paths are adapted for local execution.

## Merge and Adjust Chunk Level Results

The following analysis steps are only performed for one chunk-specific matrix file.

### Define Variables to Adjust

In [None]:
# Location of the results: both values are substituted into the data and plot paths
style = 'all'
n_chunks = 1000

# Model whose predictions are analyzed: all four values are part of the matrix and plot file names
model_type = 'LSTM'
version = 'normal'
endogenous_input = 'MEDIAN'
parameter = 'o2'

# Index of the window of chunks to predict (suffix `_window{window_idx}` of the loaded pickle files)
window_idx = 0

# Input length that is added to the number of iterations to obtain the chunk length in the correlation plot
input_length = 12

### Print One Chunk Level Matrix

In [None]:
from IPython.display import display
import pickle5 as pickle

# Define path to all chunk level matrices produced by prediction
path_to_chunk_matrices = f'../../data/darts/{n_chunks}_chunks/{style}'

# Compose file name of the chunk-specific matrix from model and window variables
chunks_matrix_path = (f'{path_to_chunk_matrices}/confusion_matrix_chunks_{model_type}_{parameter}_{endogenous_input}_'
                      f'{version}_window{window_idx}.pickle')

# Read chunk-specific matrix
# Note: The context manager closes the file even if unpickling fails
# NOTE(review): Unpickling can execute arbitrary code - only load files produced by the own prediction pipeline
with open(chunks_matrix_path, 'rb') as chunks_matrix_f:
    chunks_matrix = pickle.load(chunks_matrix_f)

# Show chunk-specific matrix
display(chunks_matrix)

### Add Metrics to Each Chunk

In [None]:
import numpy as np

def _safe_ratio(dividend, divisor):
    """Divide two Series element-wise and return NaN wherever the divisor is zero."""
    # Zeros in the divisor are converted to NaN before division (any value divided by NaN gives NaN)
    return dividend.div(divisor.where(divisor != 0, np.nan))


# Sums of the confusion matrix cells that serve as divisors
fp_tn_divisor = chunks_matrix['FP'] + chunks_matrix['TN']
fn_tp_divisor = chunks_matrix['FN'] + chunks_matrix['TP']

# Rates relative to FP + TN
chunks_matrix['TNR'] = _safe_ratio(chunks_matrix['TN'], fp_tn_divisor)
chunks_matrix['FPR'] = _safe_ratio(chunks_matrix['FP'], fp_tn_divisor)  # 1 - TNR

# Rates relative to FN + TP
chunks_matrix['TPR'] = _safe_ratio(chunks_matrix['TP'], fn_tp_divisor)
chunks_matrix['FNR'] = _safe_ratio(chunks_matrix['FN'], fn_tp_divisor)  # 1 - TPR

# F1 score: TP / (TP + 0.5 * (FP + FN))
chunks_matrix['F1S'] = _safe_ratio(chunks_matrix['TP'],
                                   chunks_matrix['TP'] + 0.5 * (chunks_matrix['FP'] + chunks_matrix['FN']))

# Accuracy: (TN + TP) / (FP + TN + FN + TP)
chunks_matrix['ACC'] = _safe_ratio(chunks_matrix['TN'] + chunks_matrix['TP'], fp_tn_divisor + fn_tp_divisor)

# Round all metrics to 4 decimal places
# Note: Python's built-in round() is applied per value (kept from the original implementation)
decimals = 4
for col in ['FPR', 'TPR', 'FNR', 'TNR', 'ACC', 'F1S']:
    chunks_matrix[col] = chunks_matrix[col].apply(lambda x: round(x, decimals))

# Sort columns for similarity with model level matrices
# Note: copy() makes the selection an independent data frame, so columns can be added to it later on without
# pandas raising a SettingWithCopyWarning
chunks_matrix = chunks_matrix[['CHUNK_ID', 'SCALED', 'PARAMETER', 'MODEL', 'ENDOGENOUS', 'EXOGENOUS', 'FIRST_FORECAST',
                               'ALARM_TYPE', 'FP', 'TP', 'FN', 'TN', 'FPR', 'TPR', 'FNR', 'TNR', 'ACC', 'F1S',
                               'N_HIGH_ALARMS', 'N_LOW_ALARMS', 'N_ITERATIONS']].copy()

# Show complemented chunk level matrix for one chunk
display(chunks_matrix)

## Visualization of Chunk Level Results

### Plot Correlation Between Chunk Length and F1 Score / Specificity (TNR) of Chunk

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Add column for chunk length to all chunks of matrix
# Note: assign() returns a new data frame, which avoids a SettingWithCopyWarning if the matrix is a column selection
chunks_matrix = chunks_matrix.assign(LENGTH=chunks_matrix['N_ITERATIONS'] + input_length)


def _plot_metric_by_length(ax, chunks, alarm_type, metric, color):
    """Scatter the metric of all chunks with the given alarm type against their chunk length on the given axes."""
    # Extract metric and length of the chunks belonging to the alarm type
    selected_chunks = chunks.loc[chunks['ALARM_TYPE'] == alarm_type, [metric, 'LENGTH']]

    # Note: If the mean value of the metric per length is plotted instead (convert to float, then
    # groupby('LENGTH').mean()), lines can be drawn again (with default of linestyle parameter)

    # Reset indices to make access via column names possible again
    selected_chunks = selected_chunks.reset_index(level=0, drop=True)

    # Draw one marker per chunk without connecting lines
    ax.plot('LENGTH',
            metric,
            data=selected_chunks,
            marker='o',
            color=color,
            linestyle='None')
    ax.set_title(f'{metric} Regarding {alarm_type} Thresholds', fontsize=10)
    ax.set_xlabel('Chunk Length', fontsize=8)
    ax.set_ylabel(f'{metric} of Chunk', fontsize=8)
    ax.set_ylim(bottom=0, top=1.1)


# Define background color and colors once, as they are identical for all figures
sns.set_style('whitegrid')
palette = sns.color_palette('colorblind')

for metric in ['F1S', 'TNR']:

    # Define subplots and suptitle
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))
    fig.suptitle(f'Correlation of Chunk Length and {metric} of Chunk ({style.replace("_", " ").upper()})', fontsize=14)

    # Left plot (high threshold analysis) is hidden for MIN input
    # NOTE(review): presumably because MIN series are only compared with low thresholds - confirm
    if endogenous_input == 'MIN':
        ax1.set_visible(False)
        ax2.set_position([1, 0.2, 0.05, 0.2])
    else:
        _plot_metric_by_length(ax1, chunks_matrix, 'High', metric, palette[0])

    # Right plot (low threshold analysis) is hidden for MAX input
    # NOTE(review): presumably because MAX series are only compared with high thresholds - confirm
    if endogenous_input == 'MAX':
        ax2.set_visible(False)
        ax1.set_position([0, 0.2, 0.05, 0.2])
    else:
        _plot_metric_by_length(ax2, chunks_matrix, 'Low', metric, palette[1])

    # Improve layout and save figure
    fig.tight_layout()
    fig.show()
    fig.savefig(f'../../plots/darts/{n_chunks}_chunks/{style}/rnn_results_correlation_chunk_length_and_{metric}_{model_type}_{parameter}_'
                f'{endogenous_input}_{version}.png', dpi=72)

### Time-Series Plot of Chunk with Prediction

Note: `chunk_ids_plotting` has to be adjusted manually.

In [None]:
# Report the total number of chunks contained in the matrix
print(f'Original amount of chunks: {len(chunks_matrix)}\n')

# Keep only chunks for which both FPR and F1 score are available (i.e. not NaN)
has_fpr = chunks_matrix['FPR'].notnull()
has_f1s = chunks_matrix['F1S'].notnull()
interesting_chunks = chunks_matrix[has_fpr & has_f1s]
print(f'Amount of interesting chunks: {len(interesting_chunks)}\n')

# List ID, metrics and alarm counts of the remaining chunks
columns_to_show = ['CHUNK_ID', 'FPR', 'TPR', 'FNR', 'TNR', 'ACC', 'F1S', 'N_HIGH_ALARMS', 'N_LOW_ALARMS']
print(interesting_chunks[columns_to_show])

In [None]:
from darts import TimeSeries
import pandas as pd

chunk_ids_plotting = ['200098.0_220277.0_2136-03-27 12:00:00', '200061.0_220277.0_2134-01-24 14:15:00']

# Read resampled original series of all chunks once, as the file does not depend on the chunk to plot
resampled_chunks = pd.read_parquet(f'../../data/resampling/resample_output_{parameter}_first{n_chunks}.parquet',
                                   engine='pyarrow')

# Define background color once for all figures
sns.set_style('whitegrid')

for chunk_id in chunk_ids_plotting:

    # Format chunk IDs into Windows format that have to be used when loading from or saving to Windows machine
    chunk_id_win10 = chunk_id.replace(':', '%3A')

    # Extract predicted series of chunk
    # Note: The context manager closes the file even if unpickling fails
    # NOTE(review): Unpickling can execute arbitrary code - only load files produced by the own prediction pipeline
    with open(f'../../data/darts/{n_chunks}_chunks/{style}/{model_type}/{parameter}/{endogenous_input}/'
              f'05_prediction_{chunk_id_win10}_{version}_window{window_idx}.pickle', 'rb') as prediction_chunk_f:
        prediction_chunk = pickle.load(prediction_chunk_f)

    # Convert predicted series of chunk to TimeSeries object
    prediction_chunk = TimeSeries.from_dataframe(
        df=prediction_chunk,
        time_col='Time',
        value_cols=['Value'],
        freq='H')

    # Extract original series of chunk
    original_chunk = resampled_chunks[resampled_chunks['CHUNK_ID_FILLED_TH'] == chunk_id]

    # Convert original series of chunk to TimeSeries object
    original_chunk = TimeSeries.from_dataframe(
        df=original_chunk,
        time_col='CHARTTIME',
        value_cols=[f'VITAL_PARAMTER_VALUE_{endogenous_input}_RESAMPLING'],
        freq='H')

    # Actual plot
    plt.figure(figsize=(8, 5))
    original_chunk.plot(label=f'{parameter.upper()} - actual')
    prediction_chunk.plot(label=f'{parameter.upper()} - predicted')

    # Adjust texts of plot
    plt.legend()
    plt.suptitle(f'Prediction of {parameter.upper()} with {n_chunks} Chunks, {endogenous_input} Input, and {model_type} '
                 f'\nModel ({style.replace("_", " ").upper()})', fontweight='bold')
    plt.xlabel('Time')
    plt.ylabel('Value')

    # Save figure before showing it
    # Note: plt.show() may release the current figure, in which case a subsequent plt.savefig() writes an empty image
    plt.savefig(f'../../plots/darts/{n_chunks}_chunks/{style}/rnn_results_prediction_{model_type}_{parameter}_{endogenous_input}_'
                f'{chunk_id_win10}_{version}.png', dpi=72)
    plt.show()