In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report
from tabulate import tabulate
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
import warnings
warnings.filterwarnings('ignore')

from tabulate import tabulate

from tqdm import tqdm

import joblib

from sklearn.cluster import KMeans, DBSCAN, Birch, AgglomerativeClustering, SpectralClustering, OPTICS, MeanShift, AffinityPropagation 
from sklearn.cluster import MeanShift, AffinityPropagation, DBSCAN
import numpy as np

## Supervised Learning

### Binary

In [11]:
df = pd.read_parquet('../archive/preprocessed_DNN.parquet')
target = "Attack_label"
to_drop = ["Attack_type", "Attack_label"]
X_train, X_test, y_train, y_test = train_test_split(df.drop(to_drop, axis=1), df[target], test_size=0.2, random_state=42)

In [14]:
xgb = XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.05, objective='binary:logistic',tree_method='gpu_hist', random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)
dt = DecisionTreeClassifier(random_state=42)
lr = LogisticRegression(random_state=42)
gnb = GaussianNB()

models = [('xgb', xgb), ('knn', knn), ('dt', dt), ('lr', lr), ('gnb', gnb)]

results = []
for model_name, model in tqdm(models):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accuracy = round(accuracy_score(y_test, preds), 3)
    precision = round(precision_score(y_test, preds), 3)
    recall = round(recall_score(y_test, preds), 3)
    f1 = round(f1_score(y_test, preds, average='weighted'), 3)
    results.append([model_name, accuracy, precision, recall, f1])
    
    joblib.dump(model, f'models/binary_{model_name}.joblib')
    
table = tabulate(results, headers=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'], tablefmt='grid')
print(table)    

100%|██████████| 5/5 [20:47<00:00, 249.57s/it]

+---------+------------+-------------+----------+------------+
| Model   |   Accuracy |   Precision |   Recall |   F1 Score |
| xgb     |          1 |           1 |        1 |          1 |
+---------+------------+-------------+----------+------------+
| knn     |          1 |           1 |        1 |          1 |
+---------+------------+-------------+----------+------------+
| dt      |          1 |           1 |        1 |          1 |
+---------+------------+-------------+----------+------------+
| lr      |          1 |           1 |        1 |          1 |
+---------+------------+-------------+----------+------------+
| gnb     |          1 |           1 |        1 |          1 |
+---------+------------+-------------+----------+------------+





### 15 multi class

In [2]:
df = pd.read_parquet('../archive/preprocessed_DNN.parquet')
target = "Attack_type"
to_drop = ["Attack_type", "Attack_label"]
X_train, X_test, y_train, y_test = train_test_split(df.drop(to_drop, axis=1), df[target], test_size=0.2, random_state=42)

In [3]:
xgb = XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.05, objective='multi:softmax',tree_method='gpu_hist', random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)
dt = DecisionTreeClassifier(random_state=42)
lr = LogisticRegression(random_state=42)
gnb = GaussianNB()

models = [('xgb', xgb), ('knn', knn), ('dt', dt), ('lr', lr), ('gnb', gnb)]

results = []
for model_name, model in tqdm(models):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accuracy = round(accuracy_score(y_test, preds), 3)
    precision = round(precision_score(y_test, preds, average='weighted'), 3)
    recall = round(recall_score(y_test, preds, average='weighted'), 3)
    f1 = round(f1_score(y_test, preds, average='weighted'), 3)
    results.append([model_name, accuracy, precision, recall, f1])
    
    joblib.dump(model, f'models/multi_15_{model_name}.joblib')
    
table = tabulate(results, headers=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'], tablefmt='grid')
print(table)    

100%|██████████| 5/5 [20:55<00:00, 251.06s/it]

+---------+------------+-------------+----------+------------+
| Model   |   Accuracy |   Precision |   Recall |   F1 Score |
| xgb     |      0.985 |       0.987 |    0.985 |      0.985 |
+---------+------------+-------------+----------+------------+
| knn     |      0.953 |       0.953 |    0.953 |      0.953 |
+---------+------------+-------------+----------+------------+
| dt      |      0.981 |       0.981 |    0.981 |      0.981 |
+---------+------------+-------------+----------+------------+
| lr      |      0.95  |       0.954 |    0.95  |      0.951 |
+---------+------------+-------------+----------+------------+
| gnb     |      0.902 |       0.942 |    0.902 |      0.89  |
+---------+------------+-------------+----------+------------+





## Unsupervised Learning

In [7]:
def tune_unsupervised_model(model_name, X, n_clusters):

    def tune_meanshift(X, n_clusters):
        bandwidth = 2.0
        model = MeanShift(bandwidth=bandwidth)
        attempts = 0
        while attempts < 10:
            print(attempts)
            num_clusters = len(np.unique(model.fit_predict(X)))
            if num_clusters == n_clusters:
                return ('meanshift', model)
            elif num_clusters < n_clusters:
                bandwidth += 0.1
            else:
                bandwidth -= 0.1
            model = MeanShift(bandwidth=bandwidth)
            attempts += 1
        return ('meanshift_no', model)
    
    def tune_affinity_propagation(X, n_clusters):
        damping = 0.9
        preference = -50
        model = AffinityPropagation(damping=damping, preference=preference)
        attempts = 0
        while attempts < 10:
            print(attempts)
            num_clusters = len(np.unique(model.fit_predict(X)))
            if num_clusters == n_clusters:
                return ('affinity', model)
            elif num_clusters < n_clusters:
                damping += 0.1
                preference -= 5
            else:
                damping -= 0.1
                preference += 5
            model = AffinityPropagation(damping=damping, preference=preference)
            attempts += 1
        return ('affinity_no', model)

    def tune_dbscan(X, n_clusters):
        epsilon = 0.5
        min_samples = 5
        model = DBSCAN(eps=epsilon, min_samples=min_samples)
        attempts = 0
        while attempts < 10:
            print(attempts)
            num_clusters = len(np.unique(model.fit_predict(X)))
            if num_clusters == n_clusters:
                return ('dbscan', model)
            elif num_clusters < n_clusters:
                epsilon += 0.1
                min_samples += 1
            else:
                epsilon -= 0.1
                min_samples -= 1
            model = DBSCAN(eps=epsilon, min_samples=min_samples)
            attempts += 1
        return ('dbscan_no', model)

    def tune_optics(X, n_clusters):
        min_samples = 5
        xi = 0.05
        model = OPTICS(min_samples=min_samples, xi=xi)
        attempts = 0
        while attempts < 10:
            print(attempts)
            num_clusters = len(np.unique(model.fit_predict(X)))
            if num_clusters == n_clusters:
                return ('optics', model)
            elif num_clusters < n_clusters:
                min_samples += 1
                xi += 0.01
            else:
                min_samples -= 1
                xi -= 0.01
            model = OPTICS(min_samples=min_samples, xi=xi)
            attempts += 1
        return ('optics_no', model)

    # Map model names to tuning functions
    tuning_functions = {
        'MeanShift': tune_meanshift,
        'AffinityPropagation': tune_affinity_propagation,
        'DBSCAN': tune_dbscan,
        'OPTICS': tune_optics
    }

    # Get the tuning function for the given model name
    tuning_function = tuning_functions.get(model_name)

    # If the tuning function exists, call it and return the result
    if tuning_function:
        return tuning_function(X,n_clusters)
    else:
        raise ValueError(f'No tuning function found for model name: {model_name}')

In [3]:
def evaluate_unsupervised_model(model_name, model, X, y):
    preds = model.fit_predict(X)
    accuracy = round(accuracy_score(y, preds), 3)
    precision = round(precision_score(y, preds), 3)
    recall = round(recall_score(y, preds), 3)
    f1 = round(f1_score(y, preds, average='weighted'), 3)
    return [model_name, accuracy, precision, recall, f1]

### Binary unsupervised

In [9]:
df = pd.read_parquet('../archive/preprocessed_DNN.parquet')
to_drop = ["Attack_type", "Attack_label"]
X = df.drop(to_drop, axis=1)
y = df["Attack_label"]

In [7]:
kmeans = KMeans(n_clusters=2, random_state=42)
birch = Birch(n_clusters=2)
optics = OPTICS(min_samples=2)

models = [("kmeans", kmeans), ("birch", birch)]

results = []
for model_name, model in tqdm(models):  
    model.fit(X)
    preds = model.labels_
    results.append(evaluate_unsupervised_model(model_name, model, X, y))
    joblib.dump(model, f'models/unsupervised_binary_{model_name}.joblib')

table = tabulate(results, headers=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'], tablefmt='grid')
print(table)

100%|██████████| 2/2 [04:27<00:00, 133.91s/it]

+---------+------------+-------------+----------+------------+
| Model   |   Accuracy |   Precision |   Recall |   F1 Score |
| kmeans  |      0.673 |           0 |        0 |      0.577 |
+---------+------------+-------------+----------+------------+
| birch   |      0.716 |           0 |        0 |      0.598 |
+---------+------------+-------------+----------+------------+





In [10]:
results = []
for model_name in tqdm(('DBSCAN', 'OPTICS', 'MeanShift', 'AffinityPropagation')):
    model_name, model = tune_unsupervised_model(model_name, X, 2)
    results.append(evaluate_unsupervised_model(model_name, model, X, y))
    print(model.get_params())
    joblib.dump(model, f'models/unsupervised_binary_{model_name}.joblib')

table = tabulate(results, headers=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'], tablefmt='grid')
print(table)

  0%|          | 0/4 [00:00<?, ?it/s]

0


: 

### Multi Unsupervised

In [5]:
df = pd.read_parquet('../archive/preprocessed_DNN.parquet')
to_drop = ["Attack_type", "Attack_label"]
X = df.drop(to_drop, axis=1)
y = df["Attack_type"]

In [12]:
kmeans = KMeans(n_clusters=15, random_state=42)
birch = Birch(n_clusters=15)
models = [("kmeans", kmeans), ("birch", birch)]

results = []
for model_name, model in tqdm(models):
    model.fit(X)
    preds = model.labels_
    preds = model.fit_predict(X)
    accuracy = round(accuracy_score(y, preds), 3)
    precision = round(precision_score(y, preds, average='weighted'), 3)
    recall = round(recall_score(y, preds, average='weighted'), 3)
    f1 = round(f1_score(y, preds, average='weighted'), 3)
    results.append([model_name, accuracy, precision, recall, f1])
    joblib.dump(model, f'models/unsupervised_multi_{model_name}.joblib')

table = tabulate(results, headers=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'], tablefmt='grid')
print(table)

100%|██████████| 2/2 [04:28<00:00, 134.01s/it]

+---------+------------+-------------+----------+------------+
| Model   |   Accuracy |   Precision |   Recall |   F1 Score |
| kmeans  |      0.001 |       0     |    0.001 |          0 |
+---------+------------+-------------+----------+------------+
| birch   |      0.012 |       0.027 |    0.012 |          0 |
+---------+------------+-------------+----------+------------+





In [None]:
results = []
for model_name in ('DBSCAN', 'OPTICS', 'MeanShift', 'AffinityPropagation'):
    model_name, model = tune_unsupervised_model(model_name, X, 15)
    results.append(evaluate_unsupervised_model(model_name, model, X, y))
    joblib.dump(model, f'models/unsupervised_multi_{model_name}.joblib')

table = tabulate(results, headers=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'], tablefmt='grid')
print(table)

## Anomaly Detection

In [15]:
df = pd.read_parquet('../archive/preprocessed_DNN.parquet')
to_drop = ["Attack_type", "Attack_label"]
X = df.drop(to_drop, axis=1)
y = df["Attack_label"]

In [18]:
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest


ocsvm = OneClassSVM(nu=0.1)
iso_forest = IsolationForest(contamination=0.1)
results = []
for model_name, model in tqdm([("iso_forest",iso_forest), ("ocsvm",ocsvm)]):
    model.fit(X)
    preds = model.predict(X)
    preds = [1 if i == -1 else 0 for i in preds]
    accuracy = round(accuracy_score(y, preds), 3)
    precision = round(precision_score(y, preds), 3)
    recall = round(recall_score(y, preds), 3)
    f1 = round(f1_score(y, preds, average='weighted'), 3)
    results.append([model_name, accuracy, precision, recall, f1])
    joblib.dump(model, f'models/anomaly_{model_name}.joblib')

table = tabulate(results, headers=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'], tablefmt='grid')
print(table) 


[A
[A