# Model to predict clusters given by the first stage
This notebook contains the code to predict the first-stage cluster of a given cyberattack. The idea is that, once a new cyberattack comes in, our model is capable of classifying the attack into one of the three clusters given by the first stage.

In [None]:
# Required imports 

# Utils functions
from utils import *

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# General imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter


# ML imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


## Read data

In [None]:
# Load the labelled cluster data from disk and keep only the rows whose
# target ('spectral_cluster') is present. The original note reports that
# 4 rows have a NaN target.
df = pd.read_csv("../Data/Cluster_data_wlabels.csv").dropna(subset=['spectral_cluster'])

## Train/test split
First we need to prepare the commands for the train/test split. The commands must be pre-processed so that we keep only unique commands; otherwise the same command could end up in both sets and cause data leakage. Let us apply an improved preprocessing of the commands to ensure that.

In [None]:
# Keep only the command text and its first-stage cluster label.
# .copy() hands the normalisation helper an independent frame instead of a
# selection derived from `df` (whether the helper mutates its argument is not
# visible in this notebook).
df_commands = df[['_source.commands', 'spectral_cluster']].copy()

# Normalise the commands (helper provided by `utils`) and keep a single row per
# distinct command, so the same command cannot land in both train and test.
df_commands = command_improve_normalization(df_commands)
df_commands = df_commands.drop_duplicates(subset=['_source.commands'])

# Features = everything except the target; target = cluster label
X = df_commands.drop('spectral_cluster', axis=1)
y = df_commands['spectral_cluster']

# The clusters are imbalanced, so stratify the split to keep the same cluster
# proportions in both sets. Stratification needs at least 2 samples per class;
# when that does not hold we fall back to the plain random split.
can_stratify = y.value_counts().min() >= 2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y if can_stratify else None,
)

Careful: the dataset is imbalanced! The plot below shows the cluster distribution of the training set.

In [None]:
# Number of unique training commands per cluster
label_counts = Counter(y_train)
labels, counts = zip(*label_counts.items())

# Appearance of the bars, gathered in one place
bar_style = dict(color='skyblue', edgecolor='navy', alpha=0.7)

# Single-axes figure holding the bar chart
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(labels, counts, **bar_style)

# Title and axis labels
ax.set_title('Cluster Distribution', fontsize=16, fontweight='bold')
ax.set_xlabel('Cluster', fontsize=14)
ax.set_ylabel('Count', fontsize=14)

# Horizontal dashed grid lines make the bar heights easier to read
ax.grid(axis='y', linestyle='--', alpha=0.6)

# Only the three cluster ids (0, 1, 2) appear as x-ticks
ax.set_xticks([0, 1, 2])

# Tighten the layout and render
fig.tight_layout()
plt.show()


## Model training

In [None]:
# Train a RandomForest and a SVM
# `train` comes from the star-import of `utils`; its implementation is not
# visible in this notebook. The cells below index the result as
# best_models[0] (Random Forest) and best_models[1] (SVC / SVM) and read each
# entry's 'output' (compared against y_test) and 'f1_score' values.
best_models = train(X_train, X_test, y_train, y_test)

## Results

In [None]:
# Create the figure and axes up front so the requested size is actually used:
# without an explicit `ax`, ConfusionMatrixDisplay.plot() opens its own
# default-sized figure and a figure created beforehand would stay blank.
fig, ax = plt.subplots(figsize=(8, 6))

# Confusion matrix of y_test against the Random Forest 'output'
# (presumably its predictions on the test set)
disp = ConfusionMatrixDisplay(confusion_matrix(y_test, best_models[0]['output']))

# Draw onto our axes; '.0f' prints the cell counts without decimals
disp.plot(ax=ax, values_format='.0f')

# Override the default axis labels and add a title
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix Random Forest')

# Show the plot
plt.show()

In [None]:
# Create the figure and axes up front so the requested size is actually used:
# without an explicit `ax`, ConfusionMatrixDisplay.plot() opens its own
# default-sized figure and a figure created beforehand would stay blank.
fig, ax = plt.subplots(figsize=(8, 6))

# Confusion matrix of y_test against the SVC 'output'
# (presumably its predictions on the test set)
disp = ConfusionMatrixDisplay(confusion_matrix(y_test, best_models[1]['output']))

# Draw onto our axes; '.0f' prints the cell counts without decimals
disp.plot(ax=ax, values_format='.0f')

# Override the default axis labels and add a title
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix SVC')

# Show the plot
plt.show()

In [None]:
# Report the F1-score stored for each trained model, rounded to 3 decimals
print(f"F1-score of Random forest: {best_models[0]['f1_score']:.3f}")
print(f"F1-score of SVM: {best_models[1]['f1_score']:.3f}")