# Figures 4(a) and 4(d)

From the `src` directory, run with the default pipeline parameters:

```bash
> python run_pipeline.py --pipeline-type ad --app-id 0 --model-type ae --scoring-method mse
```

Then, run the following section of the notebook.

## Setup and Pipeline Parameters

In [None]:
%matplotlib inline

# enable imports directly from the src directory
# (the appended path is relative: `<parent directory>/src`, resolved from the current working directory)
import os
import sys
sys.path.append(os.path.join(os.pardir, 'src'))

In [None]:
import argparse

# Pipeline parameters, defined here as a plain dictionary instead of being parsed from the command line.
# `app_id` (0), `model_type` ('ae'), `scoring_method` ('mse') and `pipeline_type` ('ad') match the
# `run_pipeline.py` command given at the top of this section.
# NOTE(review): presumably every value has to match the pipeline run whose outputs are loaded below,
# since the input/output paths are derived from `args` through `get_output_path` - confirm.
ARGS = {
    # data-specific arguments
    'data': 'spark',
    'app_id': 0,
    'trace_types': '.',
    'ignored_anomalies': 'none',
    
    # datasets constitution arguments
    'n_starting_removed': 0,
    'n_ending_removed': 0,
    # optional downsampling to perform on both data and labels to save storage
    'pre_sampling_period': '15s',

    # features alteration and transformation arguments
    'alter_bundles': 'spark_bundles',
    'alter_bundle_idx': 0,
    # final sampling period to use for data records
    'data_sampling_period': '15s',
    'data_downsampling_position': 'last',
    # final sampling period to use for labels
    'labels_sampling_period': '15s',
    'transform_chain': 'trace_scaling',
    # if a transformation step is repeated, the same arguments are used for all its instances
    'head_size': 240,
    'online_window_type': 'expanding',
    # if not -1, weight of a regular pretraining of the scaler in
    # the convex combination with its head/head-online training
    'regular_pretraining_weight': -1,
    'scaling_method': 'std',
    # only relevant for "regular" scaling
    'reg_scaler_training': 'all.training',
    'minmax_range': [0, 1],
    'pca_n_components': 13,
    'pca_kernel': 'linear',
    'pca_training': 'all.training',
    'fa_n_components': 13,
    'fa_training': 'all.training',

    # normality modeling arguments
    'modeling_n_periods': -1,
    'modeling_data_prop': 1.0,
    'modeling_data_seed': 0,
    'modeling_split': 'stratified.split',
    'modeling_split_seed': 21,
    'n_period_strata': 3,
    'modeling_val_prop': 0.15,
    'modeling_test_prop': 0.15,
    'model_type': 'ae',
    # FORECASTING MODELS #
    # (`n_back` and `n_forward` are passed to the modeling data splitter for forecasting tasks)
    'n_back': 40,
    'n_forward': 1,
    # RNN
    'rnn_unit_type': 'lstm',
    'rnn_n_hidden_neurons': [144, 40],
    'rnn_dropout': 0.0,
    'rnn_rec_dropout': 0.0,
    'rnn_optimizer': 'adam',
    'rnn_learning_rate': 7.869 * (10 ** -4),
    'rnn_n_epochs': 200,
    'rnn_batch_size': 32,
    # RECONSTRUCTION MODELS #
    # (`window_size` and `window_step` are passed to the modeling data splitter for reconstruction tasks)
    'window_size': 40,
    'window_step': 1,
    # autoencoder
    'ae_latent_dim': 32,
    'ae_type': 'dense',
    'ae_enc_n_hidden_neurons': [200],
    'ae_dec_last_activation': 'linear',
    'ae_dropout': 0.0,
    'ae_dense_layers_activation': 'relu',
    'ae_rec_unit_type': 'lstm',
    'ae_rec_dropout': 0.0,
    'ae_loss': 'mse',
    'ae_optimizer': 'adam',
    'ae_learning_rate': 3.602 * (10 ** -4),
    'ae_n_epochs': 200,
    'ae_batch_size': 32,
    # BiGAN
    'bigan_latent_dim': 32,
    'bigan_enc_type': 'rec',
    'bigan_enc_arch_idx': -1,
    'bigan_enc_rec_n_hidden_neurons': [100],
    'bigan_enc_rec_unit_type': 'lstm',
    'bigan_enc_conv_n_filters': 32,
    'bigan_enc_dropout': 0.0,
    'bigan_enc_rec_dropout': 0.0,
    'bigan_gen_type': 'rec',
    'bigan_gen_last_activation': 'linear',
    'bigan_gen_arch_idx': -1,
    'bigan_gen_rec_n_hidden_neurons': [100],
    'bigan_gen_rec_unit_type': 'lstm',
    'bigan_gen_conv_n_filters': 64,
    'bigan_gen_dropout': 0.0,
    'bigan_gen_rec_dropout': 0.0,
    'bigan_dis_type': 'conv',
    'bigan_dis_arch_idx': 0,
    'bigan_dis_x_rec_n_hidden_neurons': [30, 10],
    'bigan_dis_x_rec_unit_type': 'lstm',
    'bigan_dis_x_conv_n_filters': 32,
    'bigan_dis_x_dropout': 0.0,
    'bigan_dis_x_rec_dropout': 0.0,
    'bigan_dis_z_n_hidden_neurons': [32, 10],
    'bigan_dis_z_dropout': 0.0,
    'bigan_dis_threshold': 0.0,
    'bigan_dis_optimizer': 'adam',
    'bigan_enc_gen_optimizer': 'adam',
    'bigan_dis_learning_rate': 0.0004,
    'bigan_enc_gen_learning_rate': 0.0001,
    'bigan_n_epochs': 200,
    'bigan_batch_size': 32,

    # outlier score assignment arguments
    'scoring_method': 'mse',
    'mse_weight': 0.5,

    # supervised evaluation for assessing scoring performance
    'evaluation_type': 'ad2',
    'recall_alpha': 0.0,
    'recall_omega': 'default',
    'recall_delta': 'flat',
    'recall_gamma': 'dup',
    'precision_omega': 'default',
    'precision_delta': 'flat',
    'precision_gamma': 'dup',
    'f_score_beta': 1.0,

    # outlier score threshold selection arguments
    # (given as lists - presumably the grid of thresholding parameters to evaluate; confirm)
    'thresholding_method': ['std', 'mad', 'iqr'],
    'thresholding_factor': [1.5, 2.0, 2.5, 3.0],
    'n_iterations': [1, 2],
    'removal_factor': [1.0],

    # explanation discovery arguments
    'explanation_method': 'exstream',
    'explained_predictions': 'ground.truth',
    # ED evaluation parameters
    'ed_eval_min_anomaly_length': 1,
    'ed1_consistency_n_disturbances': 5,
    # model-free evaluation
    'mf_eval_min_normal_length': 1,
    'mf_ed1_consistency_sampled_prop': 0.8,
    'mf_ed1_accuracy_n_splits': 5,
    'mf_ed1_accuracy_test_prop': 0.2,
    # model-dependent evaluation
    'md_eval_small_anomalies_expansion': 'before',
    'md_eval_large_anomalies_coverage': 'all',
    # EXstream
    'exstream_fp_scaled_std_threshold': 1.64,
    # MacroBase
    'macrobase_n_bins': 10,
    'macrobase_min_support': 0.4,
    'macrobase_min_risk_ratio': 1.5,
    # LIME
    'lime_n_features': 5,

    # pipeline execution shortcut arguments
    'pipeline_type': 'ad'
}
# expose the parameters as attributes (e.g., `args.model_type`), like parsed command-line arguments
args = argparse.Namespace(**ARGS)

## Figure 4(a) - Trace-wise Separation for T2 Trace of Application 2

In [None]:
import importlib

from utils.common import PIPELINE_TEST_NAME, get_output_path, get_modeling_task_and_classes
from data.helpers import load_datasets_data

# locations of the pipeline step outputs used in this section
DATA_INFO_PATH = get_output_path(args, 'make_datasets')
DATA_INPUT_PATH = get_output_path(args, 'build_features', 'data')
MODEL_INPUT_PATH = get_output_path(args, 'train_model')
OUTPUT_PATH = get_output_path(args, 'train_detector', 'model')

# only the pipeline test set is loaded here
data = load_datasets_data(DATA_INPUT_PATH, DATA_INFO_PATH, [PIPELINE_TEST_NAME])

# the task type selects the module from which scorer classes are imported
task_type, model_classes = get_modeling_task_and_classes(args)
a_t = 'the type of task must be either `forecasting` or `reconstruction`'
assert task_type in ['forecasting', 'reconstruction'], a_t
scorers_module = importlib.import_module(f'scoring.{task_type}.{task_type}_scorers')
scoring_classes = scorers_module.scoring_classes

# the naive forecasting model is instantiated directly, any other model goes through `from_file`
model_class = model_classes[args.model_type]
model = (
    model_class(args, '') if args.model_type == 'naive.forecasting'
    else model_class.from_file(args, MODEL_INPUT_PATH)
)
scorer = scoring_classes[args.scoring_method](args, model, OUTPUT_PATH)

In [None]:
import numpy as np

# name of the T2 trace of application 2, for which outlier scores are derived
trace_name = '2_2_200000_69'

# position of the trace within the test set (first element of each info entry is compared to the name)
matching_indices = [idx for idx, info in enumerate(data['test_info']) if info[0] == trace_name]
trace_idx = matching_indices[0]

# wrap the trace records and labels in arrays holding this single trace
trace_data = np.array([data['test'][trace_idx]])
trace_labels = np.array([data['y_test'][trace_idx]])
trace_scores = scorer.score(trace_data)

In [None]:
from utils.spark import ANOMALY_TYPES
from visualization.periods.array import plot_scores_distributions
from visualization.helpers.spark import METRICS_COLORS
    
# show how the outlier scores of the selected trace are distributed
trace_fig_title = f'Scores Distributions for Trace "{trace_name}"'
plot_scores_distributions(
    trace_scores,
    trace_labels,
    fig_title=trace_fig_title,
    type_colors=METRICS_COLORS,
    anomaly_types=ANOMALY_TYPES,
)

## Figure 4(d) - Outlier Scores of Modeling Test Samples with "Best" Threshold

In [None]:
import pickle

from utils.common import PIPELINE_TRAIN_NAME, MODELING_TEST_NAME, get_best_thresholding_args
from data.helpers import load_mixed_formats
from modeling.data_splitters import get_splitter_classes
from modeling.forecasting.helpers import get_trimmed_periods

# load test samples of the modeling set
# (the splitter keyword arguments depend on the type of modeling task)
if task_type == 'forecasting':
    # NOTE(review): `data` is rebound to the modeling split below, so these trimmed test labels
    # are not used again in this section - confirm whether this step is still needed.
    data[f'y_{PIPELINE_TEST_NAME}'] = get_trimmed_periods(data[f'y_{PIPELINE_TEST_NAME}'], args.n_back)
    kwargs = {'n_back': args.n_back, 'n_forward': args.n_forward}
else:
    kwargs = {'window_size': args.window_size, 'window_step': args.window_step}

print('loading training periods and information...', end=' ', flush=True)
modeling_files = load_mixed_formats(
    [DATA_INPUT_PATH, DATA_INFO_PATH],
    [PIPELINE_TRAIN_NAME, f'{PIPELINE_TRAIN_NAME}_info'],
    ['numpy', 'pickle']
)
print('done.')
print('recovering modeling test samples...', end=' ', flush=True)
data_splitter = get_splitter_classes()[args.modeling_split](args)
# NOTE: from here on, `data` refers to the modeling split and no longer to the pipeline test data,
# so the cells above must be re-run from the data loading cell before being executed again
data = data_splitter.get_modeling_split(
    modeling_files[PIPELINE_TRAIN_NAME], modeling_files[f'{PIPELINE_TRAIN_NAME}_info'], **kwargs
)
# keep the entries whose key contains the modeling test set name, dropping the `_<name>` suffix
modeling_test_data = {
    k.replace(f'_{MODELING_TEST_NAME}', ''): v for k, v in data.items() if MODELING_TEST_NAME in k
}
print('done.')

# load "best" outlier score threshold value
# (the file is a pickle read from the pipeline output directory: only load it from a trusted location)
best_threshold_path = os.path.join(
    get_output_path(get_best_thresholding_args(args), 'train_detector'), 'threshold.pkl'
)
# the context manager closes the file handle once the threshold is loaded
with open(best_threshold_path, 'rb') as threshold_file:
    best_threshold = pickle.load(threshold_file)

In [None]:
# build the modeling test samples: inputs only, or inputs followed by their targets when targets exist
if 'y' not in modeling_test_data:
    modeling_test_samples = modeling_test_data['X']
else:
    y_shape = modeling_test_data['y'].shape
    if len(y_shape) == 2:
        # 2d targets get an extra middle axis of size 1 before being appended to their inputs
        n_samples, n_features = y_shape
        modeling_test_data['y'] = modeling_test_data['y'].reshape((n_samples, 1, n_features))
    sample_pairs = zip(modeling_test_data['X'], modeling_test_data['y'])
    modeling_test_samples = np.array([np.concatenate([X, y]) for X, y in sample_pairs])

# assign an outlier score to every modeling test sample
modeling_test_scores = scorer.score_windows(modeling_test_samples)

In [None]:
# color scheme for this figure: same as the trace figure, except for the "normal" type
metrics_colors = METRICS_COLORS.copy()
metrics_colors['normal'] = 'deepskyblue'

# the modeling test scores are passed as a single period, with all-zero labels of the same length
modeling_scores_period = np.array([np.array(modeling_test_scores)])
modeling_labels_period = np.array([np.zeros(shape=(len(modeling_test_scores),))])

# show the scores distribution with the "best" threshold value highlighted
plot_scores_distributions(
    modeling_scores_period,
    modeling_labels_period,
    fig_title=f'Scores Distributions of the Modeling Test Samples',
    threshold=best_threshold,
    type_colors=metrics_colors,
    anomaly_types=ANOMALY_TYPES,
)

# Experiment 3 (Figure 5, Increasing Training Data)

From the `src` directory, run the following commands with the default pipeline parameters (in particular, `--app-id` must keep its default value of 0, meaning "all applications", as set in `utils/spark.py`):

```bash
> python data/make_datasets.py
> python features/build_features.py

> cd experiments

> experiment_3.sh rnn 0 0
> experiment_3.sh rnn 1 1
> experiment_3.sh rnn 2 2
> experiment_3.sh rnn 3 3
> experiment_3.sh rnn 4 4

> experiment_3.sh ae 0 0
> experiment_3.sh ae 1 1
> experiment_3.sh ae 2 2
> experiment_3.sh ae 3 3
> experiment_3.sh ae 4 4

> experiment_3.sh bigan 0 0
> experiment_3.sh bigan 1 1
> experiment_3.sh bigan 2 2
> experiment_3.sh bigan 3 3
> experiment_3.sh bigan 4 4
```

Then, run the following section of the notebook.

In [None]:
%matplotlib inline

# enable imports directly from the src directory
# (the appended path is relative: `<parent directory>/src`, resolved from the current working directory)
import os
import sys
sys.path.append(os.path.join(os.pardir, 'src'))

In [None]:
import argparse

# Base pipeline parameters for this experiment, defined as a plain dictionary instead of being
# parsed from the command line. Some entries only act as placeholders in this section:
# - `model_type` and `scoring_method` are overridden for each compared AD method;
# - `modeling_n_periods`, `modeling_data_seed` and `modeling_split_seed` are overridden for each
#   call to `get_data_prop_and_performance`.
# NOTE(review): presumably the other values have to match the ones used by `experiment_3.sh`,
# since the result paths are derived from the arguments through `get_output_path` - confirm.
ARGS = {
    # data-specific arguments
    'data': 'spark',
    'app_id': 0,
    'trace_types': '.',
    'ignored_anomalies': 'none',
    
    # datasets constitution arguments
    'n_starting_removed': 0,
    'n_ending_removed': 0,
    # optional downsampling to perform on both data and labels to save storage
    'pre_sampling_period': '15s',

    # features alteration and transformation arguments
    'alter_bundles': 'spark_bundles',
    'alter_bundle_idx': 0,
    # final sampling period to use for data records
    'data_sampling_period': '15s',
    'data_downsampling_position': 'last',
    # final sampling period to use for labels
    'labels_sampling_period': '15s',
    'transform_chain': 'trace_scaling',
    # if a transformation step is repeated, the same arguments are used for all its instances
    'head_size': 240,
    'online_window_type': 'expanding',
    # if not -1, weight of a regular pretraining of the scaler in
    # the convex combination with its head/head-online training
    'regular_pretraining_weight': -1,
    'scaling_method': 'std',
    # only relevant for "regular" scaling
    'reg_scaler_training': 'all.training',
    'minmax_range': [0, 1],
    'pca_n_components': 13,
    'pca_kernel': 'linear',
    'pca_training': 'all.training',
    'fa_n_components': 13,
    'fa_training': 'all.training',

    # normality modeling arguments
    # (number of periods and both seeds are set per evaluated configuration later in this section)
    'modeling_n_periods': -1,
    'modeling_data_prop': 1.0,
    'modeling_data_seed': 0,
    'modeling_split': 'stratified.split',
    'modeling_split_seed': 21,
    'n_period_strata': 3,
    'modeling_val_prop': 0.15,
    'modeling_test_prop': 0.15,
    # (set per compared AD method later in this section)
    'model_type': 'ae',
    # FORECASTING MODELS #
    'n_back': 40,
    'n_forward': 1,
    # RNN
    'rnn_unit_type': 'lstm',
    'rnn_n_hidden_neurons': [144, 40],
    'rnn_dropout': 0.0,
    'rnn_rec_dropout': 0.0,
    'rnn_optimizer': 'adam',
    'rnn_learning_rate': 7.869 * (10 ** -4),
    'rnn_n_epochs': 200,
    'rnn_batch_size': 32,
    # RECONSTRUCTION MODELS #
    'window_size': 40,
    'window_step': 1,
    # autoencoder
    'ae_latent_dim': 32,
    'ae_type': 'dense',
    'ae_enc_n_hidden_neurons': [200],
    'ae_dec_last_activation': 'linear',
    'ae_dropout': 0.0,
    'ae_dense_layers_activation': 'relu',
    'ae_rec_unit_type': 'lstm',
    'ae_rec_dropout': 0.0,
    'ae_loss': 'mse',
    'ae_optimizer': 'adam',
    'ae_learning_rate': 3.602 * (10 ** -4),
    'ae_n_epochs': 200,
    'ae_batch_size': 32,
    # BiGAN
    'bigan_latent_dim': 32,
    'bigan_enc_type': 'rec',
    'bigan_enc_arch_idx': -1,
    'bigan_enc_rec_n_hidden_neurons': [100],
    'bigan_enc_rec_unit_type': 'lstm',
    'bigan_enc_conv_n_filters': 32,
    'bigan_enc_dropout': 0.0,
    'bigan_enc_rec_dropout': 0.0,
    'bigan_gen_type': 'rec',
    'bigan_gen_last_activation': 'linear',
    'bigan_gen_arch_idx': -1,
    'bigan_gen_rec_n_hidden_neurons': [100],
    'bigan_gen_rec_unit_type': 'lstm',
    'bigan_gen_conv_n_filters': 64,
    'bigan_gen_dropout': 0.0,
    'bigan_gen_rec_dropout': 0.0,
    'bigan_dis_type': 'conv',
    'bigan_dis_arch_idx': 0,
    'bigan_dis_x_rec_n_hidden_neurons': [30, 10],
    'bigan_dis_x_rec_unit_type': 'lstm',
    'bigan_dis_x_conv_n_filters': 32,
    'bigan_dis_x_dropout': 0.0,
    'bigan_dis_x_rec_dropout': 0.0,
    'bigan_dis_z_n_hidden_neurons': [32, 10],
    'bigan_dis_z_dropout': 0.0,
    'bigan_dis_threshold': 0.0,
    'bigan_dis_optimizer': 'adam',
    'bigan_enc_gen_optimizer': 'adam',
    'bigan_dis_learning_rate': 0.0004,
    'bigan_enc_gen_learning_rate': 0.0001,
    'bigan_n_epochs': 200,
    'bigan_batch_size': 32,

    # outlier score assignment arguments
    # (`scoring_method` is set per compared AD method later in this section)
    'scoring_method': 'mse',
    'mse_weight': 0.5,

    # supervised evaluation for assessing scoring performance
    'evaluation_type': 'ad2',
    'recall_alpha': 0.0,
    'recall_omega': 'default',
    'recall_delta': 'flat',
    'recall_gamma': 'dup',
    'precision_omega': 'default',
    'precision_delta': 'flat',
    'precision_gamma': 'dup',
    'f_score_beta': 1.0,

    # outlier score threshold selection arguments
    # (given as lists - presumably the grid of thresholding parameters to evaluate; confirm)
    'thresholding_method': ['std', 'mad', 'iqr'],
    'thresholding_factor': [1.5, 2.0, 2.5, 3.0],
    'n_iterations': [1, 2],
    'removal_factor': [1.0],

    # explanation discovery arguments
    'explanation_method': 'exstream',
    'explained_predictions': 'ground.truth',
    # ED evaluation parameters
    'ed_eval_min_anomaly_length': 1,
    'ed1_consistency_n_disturbances': 5,
    # model-free evaluation
    'mf_eval_min_normal_length': 1,
    'mf_ed1_consistency_sampled_prop': 0.8,
    'mf_ed1_accuracy_n_splits': 5,
    'mf_ed1_accuracy_test_prop': 0.2,
    # model-dependent evaluation
    'md_eval_small_anomalies_expansion': 'before',
    'md_eval_large_anomalies_coverage': 'all',
    # EXstream
    'exstream_fp_scaled_std_threshold': 1.64,
    # MacroBase
    'macrobase_n_bins': 10,
    'macrobase_min_support': 0.4,
    'macrobase_min_risk_ratio': 1.5,
    # LIME
    'lime_n_features': 5,

    # pipeline execution shortcut arguments
    'pipeline_type': 'ad'
}
# expose the parameters as attributes (e.g., `args.model_type`), like parsed command-line arguments
args = argparse.Namespace(**ARGS)

In [None]:
import pandas as pd

from utils.common import get_args_string, get_output_path
from data.helpers import load_files


def get_data_prop_and_performance(args, n_periods, data_seed, split_seed, agg='median', granularity='app_avg'):
    """Returns the proportion of normal data used by the method along with its median/best test F1-score.

    The test F1-score is returned for the provided data granularity, either as the median or "best"
    score across the evaluated thresholding parameters.

    The provided `args` are left untouched: the modeling arguments are overridden in a copy.

    Args:
        args (argparse.Namespace): parsed command-line arguments.
        n_periods (int): number of periods used by the method for modeling the normal behavior.
        data_seed (int): modeling data selection random seed.
        split_seed (int): modeling data splitting random seed.
        agg (str): performance aggregation to perform across the test F1-scores (either "best" or "median").
        granularity (str): evaluation granularity (index of the "granularity" column of the evaluation spreadsheet).

    Returns:
        float, float: proportion of normal data used by the method along with its median/best test F1-score.

    Raises:
        FileNotFoundError: if the evaluation spreadsheet does not exist for this configuration
            (presumably also when the periods information file is missing - depends on `load_files`).
        IndexError: if the evaluation spreadsheet has no row for the provided granularity.
    """
    assert agg in ['best', 'median'], 'the provided aggregation method must be either "best" or "median"'
    fn, score_col = 'selected_periods_info', 'TEST_GLOBAL_F1.0_SCORE'

    # `vars(args)` is the namespace's own attribute dictionary: build a new namespace from it
    # before overriding anything, so that the caller's arguments are never modified
    args_copy = argparse.Namespace(**vars(args))
    args_copy.modeling_n_periods = n_periods
    args_copy.modeling_data_seed = data_seed
    args_copy.modeling_split_seed = split_seed

    # proportion of normal data, as saved with the selected periods information of the model
    data_prop = load_files(get_output_path(args_copy, 'train_model'), [fn], 'json')[fn]['data_prop']

    # evaluation spreadsheet, indexed by its first two columns (one of them named "granularity")
    evaluation_fn = f'{get_args_string(args_copy, "ad_evaluation")}_detection_comparison.csv'
    evaluation_path = os.path.join(get_output_path(args_copy, 'train_scorer'), evaluation_fn)
    evaluation_df = pd.read_csv(evaluation_path, index_col=[0, 1])
    row_granularities = evaluation_df.index.get_level_values('granularity')

    if agg == 'median':
        # median of every column per granularity, from which the requested granularity is selected
        median_block = evaluation_df.groupby(row_granularities).median()
        filtered_block = median_block.loc[median_block.index == granularity]
        f_score = filtered_block[score_col].iloc[0]
    else:
        # highest test F1-score among the rows of the requested granularity
        granularity_rows = evaluation_df.loc[row_granularities == granularity]
        f_score = granularity_rows.sort_values(score_col, ascending=False).iloc[0][score_col]
    return data_prop, f_score

In [None]:
# total number of undisturbed (i.e., normal) traces
TOT_N_TRACES = 52

# performance curves, first keyed by AD method and then by random seed
seed_curves = dict()
for model_type, scoring_method in [('ae', 'mse'), ('rnn', 're'), ('bigan', 'mse.ft')]:
    # arguments specific to the AD method (the base arguments are left untouched)
    args_copy = argparse.Namespace(**vars(args))
    args_copy.model_type = model_type
    args_copy.scoring_method = scoring_method
    seed_curves[model_type] = dict()
    for random_seed in range(5):
        # curve of the random seed: `x` holds proportions of normal data, `y` holds performance
        seed_curve = {'x': [], 'y': []}
        for n_periods in range(1, TOT_N_TRACES + 1):
            try:
                x, y = get_data_prop_and_performance(args_copy, n_periods, random_seed, random_seed)
            except FileNotFoundError:
                # no result available for this number of periods: skip the step
                continue
            seed_curve['x'].append(x)
            seed_curve['y'].append(y)
        # seeds for which nothing could be loaded are left out
        if seed_curve['x'] and seed_curve['y']:
            seed_curves[model_type][random_seed] = seed_curve

In [None]:
from scipy.interpolate import make_interp_spline

def get_curve_interpolation(x, y, n_points=300):
    """Returns interpolated (x, y) pairs as evenly-spaced points between the min and max x values.

    The points do not have to be provided in increasing order of x: they are sorted by x as
    (x, y) pairs, so that each y value stays attached to its x value.

    Args:
        x (array-like): x values of the curve (expected to be distinct - NOTE(review): the
            spline construction may reject duplicated x values depending on the scipy version).
        y (array-like): y values of the curve, such that `y[i]` is the value at `x[i]`.
        n_points (int): number of evenly-spaced points to return.

    Returns:
        ndarray, ndarray: the `n_points` new x values and their linearly interpolated y values.
    """
    x_values, y_values = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
    # sort both arrays with the same permutation (sorting x alone would mismatch the pairs)
    order = np.argsort(x_values, kind='stable')
    x_sorted, y_sorted = x_values[order], y_values[order]
    new_x = np.linspace(x_sorted[0], x_sorted[-1], num=n_points)
    # degree-1 B-spline, i.e., piecewise linear interpolation through the points
    spl = make_interp_spline(x_sorted, y_sorted, k=1)
    return new_x, spl(new_x)

def plot_average_curve(ax, curves_dict, curve_label, curve_color, n_points=300):
    """From the provided curves dictionary, plots the average curve across keys, highlighting the std.

    The x values of the curves do not need to be the same, as long as they share the same minimum and
    maximum values, since interpolation will be used to make each curve have `n_points` points.
    The x values shown for the average curve are the interpolated x values of the first curve.

    Nothing is plotted if the curves dictionary is empty.

    Args:
        ax (AxesSubplot): plt.axis on which to plot the average curve.
        curves_dict (dict): curves dictionary, with each curve as a dict with keys `x` and `y`.
        curve_label (str): label to use for the average curve in the legend.
        curve_color (int|str): color of the average curve and of its standard deviation band.
        n_points (int): number of points to show for the curves after interpolation.
    """
    # no curve to average (e.g., no result could be loaded for the method)
    if not curves_dict:
        return

    # interpolate every curve so that they all have `n_points` points
    interp_x, interp_ys = None, []
    for curve in curves_dict.values():
        x, y = get_curve_interpolation(curve['x'], curve['y'], n_points)
        if interp_x is None:
            interp_x = x
        interp_ys.append(y)

    # point-wise mean and standard deviation across the curves
    mean_y = np.mean(interp_ys, axis=0)
    std_y = np.std(interp_ys, axis=0)

    # plot the average curve, highlighting the standard deviation with a band of the same color
    ax.plot(interp_x, mean_y, label=curve_label, c=curve_color)
    ax.fill_between(interp_x, mean_y - std_y, mean_y + std_y, color=curve_color, alpha=0.2)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# legend label, curve color and font sizes used for the figure
labels_dict = {'ae': 'Autoencoder', 'rnn': 'LSTM', 'bigan': 'BiGAN'}
colors_dict = {'ae': 'blue', 'rnn': 'red', 'bigan': 'green'}
fontsizes = {'labels': 20, 'ticks': 18, 'legend': 18}

# one average curve per AD method, all on the same axis
fig, ax = plt.subplots(1, 1, figsize=(12, 3))
for model_type, model_curves in seed_curves.items():
    plot_average_curve(ax, model_curves, labels_dict[model_type], colors_dict[model_type])

# legend placed outside of the axis, on its upper right side
ax.legend(prop={'size': fontsizes['legend']}, frameon=False, bbox_to_anchor=(1.05, 1), loc='upper left')
ax.set_xlabel('Proportion of Normal Data', size=fontsizes['labels'])
ax.set_ylabel('Median F1-Score', size=fontsizes['labels'])
# same label size for major and minor ticks
for tick_kind in ('major', 'minor'):
    ax.tick_params(axis='both', which=tick_kind, labelsize=fontsizes['ticks'])
ax.grid(False)