## Support vector machine as a fault detection model

In [2]:
from fermfaultdetect.data.utils import load_batchset, dataloader
from fermfaultdetect.utils import get_simulation_dir, get_models_dir
import os
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from fermfaultdetect.fault_detect_models.ml_models import svm_fdm
from fermfaultdetect.model_evaluation import plot_example_set
from fermfaultdetect import model_evaluation as eval
import joblib
from fermfaultdetect.visualizations import visualize
from datetime import datetime
import json

### Load training and test data

In [3]:
seed = 42 # seed handed to the dataloaders here and to the models in later cells

sim_dir = get_simulation_dir() # get directory of simulation data

############################################
model_name = "FILL_IN_MODEL_NAME" # set the name of model (e.g. date or specific name)
train_set_name = "FILL_IN_TRAINING_SET_NAME"
val_set_name = "FILL_IN_VALIDATION_SET_NAME"
############################################

train_path = os.path.join(sim_dir, train_set_name)
val_path = os.path.join(sim_dir, val_set_name)

# Directory that receives the model, metrics and figures of this run.
# exist_ok=True makes re-running the cell safe without a separate (race-prone) existence check.
model_dir = os.path.join(get_models_dir(), model_name)
os.makedirs(model_dir, exist_ok=True)

train_set = load_batchset(train_path)
val_set = load_batchset(val_path)

# Load train data into dataloader, shuffle the batches and standardize.
# The target columns are excluded from standardization.
target_cols = ['defect_steambarrier', 'steam_in_feed', 'blocked_spargers', 'airflow_OOC', 'OUR_OOC', 'no_fault'] # set target columns
train_dl = dataloader(batchset = train_set[:], seed=seed) # load copy of the list
train_dl.shuffle_batches()
train_dl.standardize_data(exclude_cols=target_cols)

# Load validation data into dataloader (not shuffled) and standardize it with the
# standardization imported from the training dataloader.
val_dl = dataloader(batchset = val_set[:], seed=seed)
val_dl.import_standardization(train_dl)
val_dl.standardize_data(exclude_cols=target_cols)

# Retrieve data from dataloader with fused target columns (used for training and accuracy) ...
train_X, train_Y = train_dl.get_data(split_batches=False, target_cols=target_cols, separate_target_matrix=True, fuse_target_cols=True)
val_X, val_Y = val_dl.get_data(split_batches=False, target_cols=target_cols, separate_target_matrix=True, fuse_target_cols=True)
# ... and with the separate (unfused) target columns, used later for the metrics table
_, val_Y_unfused = val_dl.get_data(split_batches=False, target_cols=target_cols, separate_target_matrix=True, fuse_target_cols=False)


# Cut target column to 1D-array: keep only the fused "fault" column
train_Y = train_Y["fault"]
val_Y = val_Y["fault"]

## Optimize C and gamma

### Run gridsearch

In [None]:
# Candidate values for the two hyperparameters; mw is held fixed during the grid search
C_values = [0.1, 1, 10, 100]
gamma_values = [0.1 , 1, 10, 100]
mw = 1

# All (C, gamma) combinations, with C varying slowest (same order as a nested loop)
param_grid = [(c_val, gamma_val) for c_val in C_values for gamma_val in gamma_values]

# One record per trained model: the hyperparameters and the validation accuracy
results = []
for C, gamma in param_grid:
    print(f"Training model with C={C} and gamma={gamma}")
    svm_model_grid = svm_fdm(kernel='rbf', C=C, gamma=gamma, mw = mw, seed=seed)
    svm_model_grid.train(train_X, train_Y)
    accuracy = svm_model_grid.prediction_accuracy(val_X, val_Y)
    results.append({'C': C, 'gamma': gamma, 'accuracy': accuracy})

# Long-format table of all runs
results_df = pd.DataFrame(results)

# Wide-format table (rows: C, columns: gamma) as needed for the heatmap
pivot_table = results_df.pivot(index='C', columns='gamma', values='accuracy')

# Persist the wide-format table next to the model
table_path = os.path.join(model_dir, "svm_gridsearch_heatmap.csv")
pivot_table.to_csv(table_path)

### Visualize gridsearch

In [None]:
# Report the grid point with the highest validation accuracy
best_idx = results_df['accuracy'].idxmax()
best_row = results_df.loc[best_idx]
print(f"Optimal parameters: accuracy = {best_row['accuracy']:.3f}, C = {best_row['C']:.3f}, gamma = {best_row['gamma']}")

# Draw the accuracy grid as a heatmap (rows: C, columns: gamma)
plt.figure(figsize=(10, 8))
visualize.set_plot_params(high_res=True)
heatmap_cmap = visualize.get_hotcold_colormap()
ax = sns.heatmap(pivot_table, annot=False, cmap=heatmap_cmap, fmt=".3f")
# Keep the C tick labels horizontal
ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
plt.xlabel('γ [-]')
plt.ylabel('C [-]')

# Store the figure in the model directory, tagged with the model name
heatmap_file = f"svm_gridsearch_heatmap_{model_name}.png"
plt.savefig(os.path.join(model_dir, heatmap_file), dpi=300)
plt.show()

### Retrain model with optimal C and gamma

In [6]:
### Decide on optimal configuration based on gridsearch ###
C = 1
gamma = 10
###########################################################

# Build the final model from the hand-picked C/gamma (mw = 1) and fit it on the training data
best_params = {"kernel": 'rbf', "C": C, "gamma": gamma, "mw": 1, "seed": seed}
svm_model_best = svm_fdm(**best_params)
svm_model_best.train(train_X, train_Y)

### Optimize moving time window

In [None]:
mw_grid = [1, 2, 3, 4, 5, 10, 15, 20]
mw_accuracy = []

# Sweep the moving time window on the already trained model. The window is changed
# by overwriting the model attribute, so remember the value the model had before the
# sweep and restore it afterwards; otherwise the model silently keeps the last grid
# value (20) in every later cell that predicts with it or saves it.
mw_before_sweep = svm_model_best.mw
try:
    for mw in mw_grid:
        svm_model_best.mw = mw
        accuracy = svm_model_best.prediction_accuracy(val_X, val_Y)
        mw_accuracy.append(accuracy)
finally:
    svm_model_best.mw = mw_before_sweep
print(mw_accuracy)

# Set the plot style BEFORE the figure is created: matplotlib reads rcParams when
# figures/artists are created, so values set after plotting do not restyle this figure.
plt.rcParams['figure.dpi'] = 300  # High resolution for saving
plt.rcParams['font.size'] = 14
plt.rcParams['axes.labelsize'] = 16
plt.rcParams['axes.titlesize'] = 16
plt.rcParams['legend.fontsize'] = 12
plt.rcParams['xtick.labelsize'] = 14
plt.rcParams['ytick.labelsize'] = 14

# Plot the validation accuracy against the moving time window size
colors = visualize.get_thesis_colors()
plt.figure(figsize=(6, 4))
plt.plot(mw_grid, mw_accuracy, color=colors["blue"])
plt.xlabel(r'Moving time window size $\it{n}$ [-]')
plt.ylabel('Accuracy [-]')
plt.xticks([1, 5, 10, 15, 20])
plt.tight_layout()  # keep the enlarged axis labels inside the saved canvas

# Save the plot
plt.savefig(os.path.join(model_dir, "SVM_mw_accuracy.png"), dpi=300)
plt.show()

### Analyse and save best model

In [None]:
# Where the metrics table is stored (passed on as save_path below)
metrics_file = f"svm_metrics_opt_{model_name}.csv"
metrics_path = os.path.join(model_dir, metrics_file)

# Predict on the validation features and compare the predicted "fault" column
# against the unfused validation targets
predictions = svm_model_best.predict(val_X)
predicted_faults = predictions["fault"]
metrics = eval.metrics_table_oneclass(val_Y_unfused, predicted_faults, save_path=metrics_path)
eval.visualize_metrics(metrics)

In [9]:
### Choose optimal moving time window ###
optimal_mw = 1
#########################################

# Apply the chosen window to the model before it is saved. The sweep over the
# moving time window overwrites svm_model_best.mw, so without this assignment the
# stored model could carry a different window than the one written to the config.
svm_model_best.mw = optimal_mw

# create folder to save model and metrics (no-op if it already exists)
model_dir = os.path.join(get_models_dir(), model_name)
os.makedirs(model_dir, exist_ok=True)

# Save model
filename = 'svm_model_best.joblib' # set model name
save_path = os.path.join(model_dir, filename)
joblib.dump(svm_model_best, save_path)

# Create and save config.
# "gamma" and "C" record the values svm_model_best was trained with (the C / gamma
# chosen in the retraining cell), not the grid-search optimum, which may differ from
# the manual choice. NOTE: the grid-search loop reuses the names C and gamma, so
# re-run the retraining cell after any new grid search before saving.
config_model = {
    "model": "SVM",
    "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "train_set": train_set_name,
    "name": model_name,
    "gamma": gamma,
    "C": C,
    "moving time window": optimal_mw
}

# Save the model config as a json file
config_name = "config.json"
config_path = os.path.join(model_dir, config_name)
with open(config_path, 'w') as json_file:
    json.dump(config_model, json_file, indent=4)

### Show performance with an exemplary validation set

In [None]:
# Name of the dataset on which the best model is plotted
example_setname = "FILL_IN_EXAMPLE_SET_NAME"
plot_example_set(
    model=svm_model_best,
    dataset_name=example_setname,
    parameter_plotted="weight",
)