# Analysis of Predictions Produced with Chunks

This script analyzes all pickle files in `./data/darts/{n_chunks}_chunks/` whose names start with `confusion_matrix`, i.e. all model-level and all chunk-level confusion matrices. At the moment, the paths are set up for local execution.

## Analysis of Model-level Matrices

### Extract All Generated Model-level Matrices

In [None]:
import os
import pandas as pd
import pickle5 as pickle

# Define number of chunks (adjust as needed); it selects the `{n_chunks}_chunks` data directory below
n_chunks = 1000

# Define path to all model-level matrices produced by prediction
path_to_model_matrices = f'../../data/darts/{n_chunks}_chunks'

# Collect pickle file names of model-level matrices.
# Sorted because os.listdir() returns entries in arbitrary order and the concatenation order below
# should be reproducible between runs.
model_matrix_filenames = sorted(
    entry for entry in os.listdir(path_to_model_matrices)
    if os.path.isfile(os.path.join(path_to_model_matrices, entry))
    and entry.startswith('confusion_matrix_models')
    and entry.endswith('.pickle')
)

# Empty template frame that fixes the column order of the result matrix
result_matrix_models = pd.DataFrame(columns=['ID', 'PARAMETER', 'MODEL', 'ENDOGENOUS', 'EXOGENOUS', 'FORECAST_TYPE',
                                             'FIRST_FORECAST', 'ALARM_TYPE', 'FP', 'TP', 'FN', 'TN', 'N_CHUNKS',
                                             'N_ITERATIONS'])

# Read all matrices first; the context manager closes each file even if unpickling fails.
# NOTE: pickle.load() can execute arbitrary code - only point this at files produced by our own prediction runs.
model_matrices = []
for filename in model_matrix_filenames:
    with open(f'{path_to_model_matrices}/{filename}', 'rb') as current_matrix_f:
        model_matrices.append(pickle.load(current_matrix_f))

# Concat all found matrices into result matrix in a single call
# (concatenating inside the loop would re-copy the growing frame for every file)
result_matrix_models = pd.concat([result_matrix_models, *model_matrices])

# Sort result matrix for better readability
result_matrix_models.sort_values(by=['PARAMETER', 'MODEL', 'ENDOGENOUS'], inplace=True)

# Show result matrix
print(result_matrix_models)

### Add Accuracy Metrics and Save as Parquet File

In [None]:
# Calculate metrics (see https://en.wikipedia.org/wiki/Sensitivity_and_specificity for more information)
false_pos = result_matrix_models['FP']
true_pos = result_matrix_models['TP']
false_neg = result_matrix_models['FN']
true_neg = result_matrix_models['TN']

# Rates relative to all actual negatives (FP + TN) and all actual positives (TP + FN)
result_matrix_models['FPR'] = false_pos / (false_pos + true_neg)
result_matrix_models['TPR'] = true_pos / (true_pos + false_neg)
result_matrix_models['FNR'] = false_neg / (true_pos + false_neg)
result_matrix_models['TNR'] = true_neg / (false_pos + true_neg)

# Accuracy: share of correct predictions among all predictions
result_matrix_models['ACC'] = (true_pos + true_neg) / (true_pos + false_neg + false_pos + true_neg)
# F1 score: TP / (TP + 0.5 * (FP + FN))
result_matrix_models['F1S'] = true_pos / (true_pos + 0.5 * (false_pos + false_neg))

# Round all floats to 4 decimal places
# Note: the values are rounded element-wise with the built-in round(); the original author observed that
# the column-level round() did not work here (presumably because of the column dtype - TODO confirm)
decimals = 4
for col in ['FPR', 'TPR', 'FNR', 'TNR', 'ACC', 'F1S']:
    result_matrix_models[col] = result_matrix_models[col].apply(lambda x: round(x, decimals))

# Move cols to end for similarity with ARIMA results and reset index
column_order = ['ID', 'PARAMETER', 'MODEL', 'ENDOGENOUS', 'EXOGENOUS', 'FORECAST_TYPE',
                'FIRST_FORECAST', 'ALARM_TYPE', 'FP', 'TP', 'FN', 'TN', 'FPR', 'TPR',
                'FNR', 'TNR', 'ACC', 'F1S', 'N_CHUNKS', 'N_ITERATIONS']
result_matrix_models = result_matrix_models[column_order].reset_index(drop=True)

# Show complemented result matrix
print(result_matrix_models)

# Save result matrix as parquet next to the input matrices.
# The directory is derived from n_chunks so that the output follows the input when n_chunks is changed.
result_matrix_models.to_parquet(f'../../data/darts/{n_chunks}_chunks/result_matrix_models.parquet', engine='pyarrow')

### Plot Ratios

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# "Group" result matrix by prefix of ID
plotdata = result_matrix_models.replace(['_H', '_L', '_B'], ['', '', ''], regex=True)

# Create subplots (one row per ratio, one column per parameter)
sns.set_style('whitegrid')
fig, axs = plt.subplots(
    nrows=4,
    ncols=2, # TODO: adjust if O2 is added
    figsize=(15, 13),
    dpi=72
    )

# Define y-limits (shared by all parameters so that the columns of a row are comparable)
fpr_ylimits = [0, max(result_matrix_models.FPR)]
tpr_ylimits = [0, max(result_matrix_models.TPR)]
fnr_ylimits = [0, max(result_matrix_models.FNR)]
tnr_ylimits = [0, max(result_matrix_models.TNR)]

# Ratio plotted in each subplot row, together with its y-limits (top to bottom)
ratio_rows = [
    ('FPR', fpr_ylimits),
    ('TPR', tpr_ylimits),
    ('FNR', fnr_ylimits),
    ('TNR', tnr_ylimits),
]

# Actual plots
# TODO: execute again with O2
for i, parameter in enumerate(['HR', 'BP']):
    # Rows and bar order are identical for all ratios of one parameter
    parameter_data = plotdata[plotdata.PARAMETER == parameter]
    id_order = [f'{parameter}_R_01', f'{parameter}_R_02', f'{parameter}_R_03']

    for row, (ratio, ylimits) in enumerate(ratio_rows):
        sns.barplot(
            ax=axs[row, i],
            data=parameter_data,
            x='ID',
            y=ratio,
            hue='MODEL',
            palette=sns.color_palette('colorblind'),
            ci=None,
            order=id_order)
        axs[row, i].set_ylim(ylimits)

    # Only the top subplot of each column carries the parameter name as title
    axs[0, i].set_title(str(parameter), fontweight='bold', color= 'black', fontsize=14)

# plt.show() displays all open figures; its only parameter is the keyword-only `block`,
# so the figure must not be passed as an argument
plt.show()

### Plot Accuracy and F1 Score

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# "Group" result matrix by prefix of ID
plotdata = result_matrix_models.replace(['_H', '_L', '_B'], ['', '', ''], regex=True)

# Create subplots (one row per score, one column per parameter)
sns.set_style('whitegrid')
fig, axs = plt.subplots(
    nrows=2,
    ncols=2, # TODO: adjust if O2 is added
    figsize=(15, 7),
    dpi=72
    )

# Define y-limits (shared by all parameters so that the columns of a row are comparable)
acc_ylimits = [0, max(result_matrix_models.ACC)]
f1s_ylimits = [0, max(result_matrix_models.F1S)]

# Score plotted in each subplot row, together with its y-limits (top to bottom)
score_rows = [
    ('ACC', acc_ylimits),
    ('F1S', f1s_ylimits),
]

# Actual plot
# TODO: execute again with O2
for i, parameter in enumerate(['HR', 'BP']):
    # Rows and bar order are identical for both scores of one parameter
    parameter_data = plotdata[plotdata.PARAMETER == parameter]
    id_order = [f'{parameter}_R_01', f'{parameter}_R_02', f'{parameter}_R_03']

    for row, (score, ylimits) in enumerate(score_rows):
        sns.barplot(
            ax=axs[row, i],
            data=parameter_data,
            x='ID',
            y=score,
            hue='MODEL',
            palette=sns.color_palette('colorblind'),
            ci=None,
            order=id_order)
        axs[row, i].set_ylim(ylimits)

    # Only the top subplot of each column carries the parameter name as title
    axs[0, i].set_title(str(parameter), fontweight='bold', color= 'black', fontsize=14)

# plt.show() displays all open figures; its only parameter is the keyword-only `block`,
# so the figure must not be passed as an argument
plt.show()

## Analysis of Chunk-level Matrices

### Print Results of Selected Chunk(s)

In [None]:
import pickle5 as pickle

# TODO: as soon as we have decided on a uniform layout, complement chunk-level matrices

# => Proposal: change
#              ['CHUNK_ID', 'ALARM_TYPE', 'N_ITERATIONS', 'FP', 'TP', 'FN', 'TN']
#              to
#              ['CHUNK_ID', 'PARAMETER', 'MODEL', 'ENDOGENOUS', 'EXOGENOUS', 'FORECAST_TYPE', 'FIRST_FORECAST',
#               'ALARM_TYPE', 'FP', 'TP', 'FN', 'TN', 'N_ITERATIONS']
#              and insert metrics in analysis script

# Define parameters to adjust; together they select which chunk-level matrix file is read
n_chunks = 1000
model_type = 'RNN'
parameter = 'hr'
endogenous_input = 'MEDIAN'

# Define path to all chunk-level matrices produced by prediction
path_to_chunk_matrices = f'../../data/darts/{n_chunks}_chunks'

# Read chunk-specific matrix; the context manager closes the file even if unpickling fails.
# NOTE: pickle.load() can execute arbitrary code - only point this at files produced by our own prediction runs.
chunk_matrix_filename = f'confusion_matrix_chunks_{model_type}_{parameter}_{endogenous_input}.pickle'
with open(f'{path_to_chunk_matrices}/{chunk_matrix_filename}', 'rb') as chunks_matrix_f:
    chunks_matrix = pickle.load(chunks_matrix_f)

# Show chunk-specific matrix
#print(chunks_matrix)

# Show one chunk result
print(chunks_matrix[chunks_matrix['CHUNK_ID'] == '200033.0_220045.0_2198-08-07 19:53:00'])

### Plot Correlation Between Chunk Length and Chunk Accuracy

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Define input length to adjust
input_length = 12

# Add accuracy column to all chunks of matrix (share of correct predictions among all predictions)
chunks_matrix['ACC'] = (chunks_matrix['TP'] + chunks_matrix['TN']) / \
                      (chunks_matrix['TP'] + chunks_matrix['FN'] + chunks_matrix['FP'] + chunks_matrix['TN'])

# Add column for chunk length to all chunks of matrix
chunks_matrix['LENGTH'] = chunks_matrix['N_ITERATIONS'] + input_length

# Extract chunks for high and low analysis plot
high_chunks = chunks_matrix[chunks_matrix['ALARM_TYPE'] == 'High'][['ACC', 'LENGTH']]
low_chunks = chunks_matrix[chunks_matrix['ALARM_TYPE'] == 'Low'][['ACC', 'LENGTH']]

# Introduce mean accuracy for each length; reset_index() turns LENGTH back into a regular column
# so that it can be accessed via its column name when plotting
high_chunks_prep = high_chunks.astype(float).groupby('LENGTH').mean().reset_index()
low_chunks_prep = low_chunks.astype(float).groupby('LENGTH').mean().reset_index()

# Define background color, subplots and suptitle
sns.set_style('whitegrid')
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))
fig.suptitle('Correlation of Chunk Accuracy and Its Mean Chunk Length', fontsize=14)

# Left plot shows the high threshold analysis, right plot the low threshold analysis;
# both share the same layout and differ only in data, title and color
colors = sns.color_palette('colorblind')
subplot_specs = [
    (ax1, 'High', high_chunks_prep, colors[0]),
    (ax2, 'Low', low_chunks_prep, colors[1]),
]

for ax, threshold, prepared_chunks, color in subplot_specs:
    ax.plot('LENGTH', 'ACC', data=prepared_chunks, marker='o', color=color)
    ax.set_title(f'Accuracy Regarding {threshold} Thresholds', fontsize=10)
    ax.set_xlabel('Chunk Length', fontsize=8)
    ax.set_ylabel('Chunk Accuracy', fontsize=8)
    ax.set_ylim(bottom=0, top=1.1)

# Improve layout and save figure.
# The directory is derived from n_chunks so that the plot lands next to the run that produced the data.
fig.tight_layout()
#fig.show()
fig.savefig(f'../../plots/darts/{n_chunks}_chunks/correlation_chunk_length_and_accuracy_{model_type}_{parameter}_{endogenous_input}.png', dpi=1200)