### Library and file imports 

In [1]:
import glob
import os
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, roc_curve, auc, recall_score, precision_score, f1_score

In [2]:
path_to_normal_operation_files = r"./data/0"
path_to_flow_instability_files = r"./data/4"

all_normal_operation_files = glob.glob(os.path.join(path_to_normal_operation_files, "*.csv"))
all_flow_instability_files = glob.glob(os.path.join(path_to_flow_instability_files, "*.csv"))

### Base samples

In [3]:
list_files = []
for filename in all_normal_operation_files:
    df_normal = pd.read_csv(filename, index_col=None, header=0)
    list_files.append(df_normal)

normal_frame = pd.concat(list_files, axis=0)
del list_files, df_normal
normal_frame.head(3)

Unnamed: 0,timestamp,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
0,2017-02-01 02:02:07.000000,0.0,10092110.0,119.0944,1609800.0,84.59782,1564147.0,,0.0,0.0
1,2017-02-01 02:02:08.000000,0.0,10092000.0,119.0944,1618206.0,84.58997,1564148.0,,0.0,0.0
2,2017-02-01 02:02:09.000000,0.0,10091890.0,119.0944,1626612.0,84.58213,1564148.0,,0.0,0.0


In [4]:
list_files = []
for filename in all_flow_instability_files:
    df_anomaly = pd.read_csv(filename, index_col=None, header=0)
    list_files.append(df_anomaly)

flow_instability_frame = pd.concat(list_files, axis=0)
del list_files, df_anomaly
flow_instability_frame.head(3)

Unnamed: 0,timestamp,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
0,2017-03-16 12:02:03.000000,38265830.0,13654450.0,117.1953,6029680.0,68.64587,3283309.0,,0.0,4
1,2017-03-16 12:02:04.000000,38265830.0,13654520.0,117.1947,6030228.0,68.64333,3283309.0,,0.0,4
2,2017-03-16 12:02:05.000000,38265830.0,13654580.0,117.1942,6030777.0,68.6408,3283308.0,,0.0,4


In [5]:
df_source = pd.concat([normal_frame, flow_instability_frame])
del normal_frame, flow_instability_frame

### Preprocessing of the whole base

In [6]:
# observations without classification are discarded
target_class = 'class'
df_pipe = df_source.dropna(subset=[target_class])

X_pipe = df_pipe.drop(target_class, axis=1)
y_pipe = df_pipe.loc[:, target_class]

del df_pipe

X_pipe_train, X_pipe_test, y_pipe_train, y_pipe_test = train_test_split(
    X_pipe, y_pipe, test_size=0.2, random_state=666
)

del X_pipe, y_pipe

### Pipeline

#### Assembling Pipeline

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FunctionTransformer

In [8]:
columns_to_drop = ['timestamp', 'T-JUS-CKGL']

drop_function = FunctionTransformer(
        lambda x: x.drop(columns=columns_to_drop, axis=1)
)

fill_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

pipe = Pipeline(
            steps=[
                ('dropout', drop_function),
                ('fillna', fill_mean),
                ('transform', MinMaxScaler()),
                ('clf', GaussianNB()),
            ],
            verbose=True
        )

#### Tuning Hyperparameters and Tracking Experiment with MLFlow

In [9]:
import mlflow
mlflow.set_experiment("mack-tac-experiment")

import copy

In [10]:
imps

NameError: name 'imps' is not defined

In [12]:
mlflow.start_run()
    mlflow.set_tags(
        {
            "model": "GaussianNB",
            "class": "experiment"
            }
    )
    var_smoothings = np.logspace(-9, 0, num=10)
    for value in var_smoothings:
        mlflow.start_run(nested=True)
        print(f"Fitting for var_smoothing={value}")
        mlflow.log_param('var_smoothing', value)
        experiment_pipe = copy.deepcopy(pipe)
        experiment_pipe.set_params(clf__var_smoothing=value)
        experiment_pipe.fit(X_pipe_train, y_pipe_train)
        experiment_predictions = experiment_pipe.predict(X_pipe_test)
        mlflow.log_metrics(
        {'accuracy': (experiment_pipe.score(X_pipe_test, y_pipe_test)),
        'recall': (recall_score(y_pipe_test.values, experiment_predictions, pos_label=4)),
        'precision': (precision_score(y_pipe_test.values, experiment_predictions, pos_label=4)),
        'f1-score': (f1_score(y_pipe_test.values, experiment_predictions, pos_label=4))
        }
        )
        mlflow.end_run()

Fitting for var_smoothing=1e-09
[Pipeline] ........... (step 1 of 4) Processing dropout, total=   0.1s
[Pipeline] ............ (step 2 of 4) Processing fillna, total=   1.6s
[Pipeline] ......... (step 3 of 4) Processing transform, total=   0.4s
[Pipeline] ............... (step 4 of 4) Processing clf, total=   1.4s
Fitting for var_smoothing=1e-08
[Pipeline] ........... (step 1 of 4) Processing dropout, total=   0.1s
[Pipeline] ............ (step 2 of 4) Processing fillna, total=   1.5s
[Pipeline] ......... (step 3 of 4) Processing transform, total=   0.4s
[Pipeline] ............... (step 4 of 4) Processing clf, total=   1.4s
Fitting for var_smoothing=1e-07
[Pipeline] ........... (step 1 of 4) Processing dropout, total=   0.1s
[Pipeline] ............ (step 2 of 4) Processing fillna, total=   1.5s
[Pipeline] ......... (step 3 of 4) Processing transform, total=   0.4s
[Pipeline] ............... (step 4 of 4) Processing clf, total=   1.4s
Fitting for var_smoothing=1e-06
[Pipeline] .........

Exception: Run with UUID 7693004e008c41f9804f3ef47d736411 is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True

#### Metrics of Optimal Model

In [19]:
with mlflow.start_run():
    mlflow.set_tags(
        {
            "model": "GaussianNB",
            "class": "optimal"
            }
    )

    optimal_var_smoothing = np.exp(-9)
    mlflow.log_param("var_smoothing", optimal_var_smoothing)

    pipe.set_params(clf__var_smoothing=optimal_var_smoothing)
    pipe.fit(X_pipe_train, y_pipe_train)

    predictions = pipe.predict(X_pipe_test)

    metrics_data = {
        'accuracy': (pipe.score(X_pipe_test, y_pipe_test)),
        'recall': (recall_score(y_pipe_test.values, predictions, pos_label=4)),
        'precision': (precision_score(y_pipe_test.values, predictions, pos_label=4)),
        'f1-score': (f1_score(y_pipe_test.values, predictions, pos_label=4))
    }

    mlflow.log_metrics(metrics_data)
    #mlflow.sklearn.save_model(pipe, "anomaly_detector_clf")

[Pipeline] ........... (step 1 of 4) Processing dropout, total=   0.2s
[Pipeline] ............ (step 2 of 4) Processing fillna, total=   1.5s
[Pipeline] ......... (step 3 of 4) Processing transform, total=   0.4s
[Pipeline] ............... (step 4 of 4) Processing clf, total=   1.4s


In [None]:
# add confusion matrix
metrics_data['cm'] = confusion_matrix(y_pipe_test, predictions)

print(metrics_data)

In [10]:
with open('results/metrics.txt', 'w') as outfile:
    outfile.write(f"\nAccuracy = {round(metrics_data['accuracy'], 4)}, " + 
                    f"Recall = {round(metrics_data['recall'], 4)}, " +
                    f"Precision = {round(metrics_data['precision'], 4)}, " +
                    f"F1 Score = {round(metrics_data['f1-score'], 4)}")

#### Results Report

In [None]:
sns.set_theme()
fig, axs = plt.subplots(1, 2, figsize=(10, 3))

sns.heatmap(metrics_data['cm']/np.sum(metrics_data['cm']), annot=True, fmt='.2%', cmap='Blues', ax=axs[0])
axs[0].set_xlabel('Predicted Labels')
axs[0].set_ylabel('True Labels')
axs[0].xaxis.set_ticklabels(['normal', 'anomaly'])
axs[0].yaxis.set_ticklabels(['normal', 'anomaly'])
axs[0].set_title('Confusion Matrix')

fpr, tpr, thresholds = roc_curve(y_pipe_test, predictions, pos_label=4)
auc_score = (auc(fpr, tpr))

axs[1].plot(fpr, tpr, label=f'(AUC = {auc_score:.2f})', color='red')
axs[1].plot([0, 1], [0, 1], color='black', linestyle='--')
axs[1].set_xlim([0.0, 1.0])
axs[1].set_ylim([0.0, 1.05])
axs[1].set_xlabel('False Positive Rate')
axs[1].set_ylabel('True Positive Rate')
axs[1].set_title(f'ROC Curve')
axs[1].legend(loc="lower right")

plt.plot()
plt.savefig("results/model_results.png", dpi=120)

#### Saving Model

In [12]:
from cloudpickle import dump

In [14]:
model_file = "model/anomaly_detector_pipeline.pkl"

with open(model_file, 'wb') as pkl_file:
    dump(pipe, pkl_file)