In [36]:
import pandas as pd
import numpy as np
import tensorflow as tf
import datetime

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, recall_score, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

from IPython.display import clear_output

### ***Data Loading***

In [3]:
df = pd.read_csv('/kaggle/input/cicids2017-improved-preprocessed/CICIDS2017_improved-preprocessed.csv')
df.drop(df.columns[0], axis=1, inplace=True)
df.shape

(1715326, 49)

In [4]:
df_benign = df[df['Label'] == 0]
df_anomalous = df[df['Label'] == 1]
columns = df.copy().drop('Label', axis=1)

print(f'Number of benign samples: {df_benign.shape[0]}')
print(f'Number of anomalous samples: {df_anomalous.shape[0]}')

Number of benign samples: 1432918
Number of anomalous samples: 282408


### ***Data splitting***

In [5]:
def data_splitting(df_benign, df_anomalous):
    df_anomalous_train, df_anomalous_test = train_test_split(df_anomalous, test_size=0.25, random_state=42)
    df_benign_train, df_benign_test = train_test_split(df_benign, test_size=0.05, random_state=42)

    df_benign_train = df_benign_train.sample(frac=0.10)
    df_anomalous_train = df_anomalous_train.sample(frac=0.6)

    print(f'Anomalous samples train/test: {df_anomalous_train.shape[0]} - {df_anomalous_test.shape[0]}')
    print(f'Benign samples train/test: {df_benign_train.shape[0]} - {df_benign_test.shape[0]}')
    print()
    
    X_train = pd.concat([df_benign_train, df_anomalous_train], ignore_index=True).sample(frac=1)
    X_test = pd.concat([df_benign_test, df_anomalous_test], ignore_index=True).sample(frac=1)
    y_train = X_train.pop('Label')
    y_test = X_test.pop('Label')

    return X_train, X_test, y_train, y_test

In [6]:
X_train, X_test, y_train, y_test = data_splitting(df_benign, df_anomalous)

Anomalous samples train/test: 127084 - 70602
Benign samples train/test: 136127 - 71646



### ***IDS-Anta solution***

In [7]:
# Implement ACO
class AntColony:
    def __init__(self, num_ants, num_iterations, num_features):
        self.num_ants = num_ants
        self.num_iterations = num_iterations
        self.num_features = num_features

    def select_features(self, X_train, y_train):
        # Implement feature selection using ACO
        # For simplicity, we'll randomly select features
        return np.random.choice(range(self.num_features), size=self.num_features // 2, replace=False)

In [8]:
class MultiArmedBanditThompsonSampling:
    def __init__(self, num_classifiers):
        self.num_classifiers = num_classifiers
        self.successes = defaultdict(int)
        self.failures = defaultdict(int)
        self.selected_classifier = None

    def select_classifier(self):
        max_ucb = -float('inf')
        for clf in range(self.num_classifiers):
            beta_sample = np.random.beta(self.successes[clf] + 1, self.failures[clf] + 1)
            if beta_sample > max_ucb:
                max_ucb = beta_sample
                self.selected_classifier = clf
        return self.selected_classifier

    def update(self, clf_index, success):
        if success:
            self.successes[clf_index] += 1
        else:
            self.failures[clf_index] += 1

In [9]:
# Initialize ACO
aco = AntColony(num_ants=10, num_iterations=50, num_features=X_train.shape[1])
# Select features using ACO
selected_features = aco.select_features(X_train, y_train)

# Extract selected features from the training and testing sets
X_train_aco = X_train.iloc[:, selected_features]
X_test_aco = X_test.iloc[:, selected_features]

#scale data
std_scaler = StandardScaler()
X_train_aco = std_scaler.fit_transform(X_train_aco)
X_test_aco = std_scaler.transform(X_test_aco)

In [10]:
# Initialize classifiers
classifiers = [
    RandomForestClassifier(n_estimators=100, random_state=42),
    LogisticRegression(max_iter=1000, random_state=42),
    #SVC(kernel='linear', random_state=42),
    tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train_aco.shape[1],)),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(2, activation='softmax')
    ])
]
# Initialize Thompson Sampling Multi-Armed Bandit
bandit = MultiArmedBanditThompsonSampling(num_classifiers=len(classifiers))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [12]:
# Perform Thompson Sampling for a fixed number of rounds
num_rounds = 50
for round in range(num_rounds):
    clear_output(wait=True)
    print('iter: ' + str(round))
    print('current time: ' + str(datetime.datetime.now().time()))
    selected_clf_index = bandit.select_classifier()
    selected_clf = classifiers[selected_clf_index]

    if isinstance(selected_clf, tf.keras.Sequential):
        # Compile and train the DNN model
        selected_clf.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        print('Start DNN fitting...')
        selected_clf.fit(X_train_aco, y_train, epochs=10, batch_size=32, validation_data=(X_test_aco, y_test), verbose=0)
        y_pred_probs = selected_clf.predict(X_test_aco)
        y_pred = np.argmax(y_pred_probs, axis=1)
    else:
        # Train the classifier
        print(f'Start ML classifier {selected_clf_index} fitting...')
        selected_clf.fit(X_train_aco, y_train)
        y_pred = selected_clf.predict(X_test_aco)

    # Evaluate the selected classifier and update the bandit
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = report['accuracy']
    bandit.update(selected_clf_index, accuracy)
    
    print('Round Done')

iter: 49
current time: 14:52:59.236231
Start ML classifier 0 fitting...
Round Done


In [17]:
# Generate a classification report for the classifier
best_clf_index = max(bandit.successes, key=bandit.successes.get)
best_clf = classifiers[best_clf_index]

['./best_clf']

In [15]:
if isinstance(best_clf, tf.keras.Sequential):
    best_clf.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    best_clf.fit(X_train_aco, y_train, epochs=10, batch_size=32, validation_data=(X_test_aco, y_test), verbose=0)
    y_pred_probs = best_clf.predict(X_test_aco)
    y_pred = np.argmax(y_pred_probs, axis=1)
else:
    best_clf.fit(X_train_aco, y_train)
    y_pred = best_clf.predict(X_test_aco)

report = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)

### ***Load Black Box samples***

In [18]:
def get_data_with_advs(advs_path):
    df_advs = pd.read_csv(advs_path).assign(Label=1)
    df_advs.drop(df_advs.columns[0], axis=1, inplace=True)

    X = df_advs.copy()
    y = X.pop('Label')

    return df_advs, X, y

def get_advs_samples(preds, df):
    df = df.assign(Pred=preds)
    df_advs = df[df['Pred'] != df['Label']]    
    df_advs = df_advs.drop(['Pred'], axis=1)

    return df_advs

In [19]:
base_advs_csv_path = '/kaggle/input/advs-input/'
df_jsj, X_hsj, y_hsj = get_data_with_advs(base_advs_csv_path + 'hsj_new.csv')
df_boundary, X_boundary, y_boundary = get_data_with_advs(base_advs_csv_path + 'boundary_new.csv')
df_zoo, X_zoo, y_zoo = get_data_with_advs(base_advs_csv_path + 'zoo_new.csv')
df_query_eff, X_query_eff, y_query_eff = get_data_with_advs(base_advs_csv_path + 'query_eff_new.csv')

In [22]:
X_hsj = X_hsj.iloc[:, selected_features]
X_boundary = X_boundary.iloc[:, selected_features]
X_zoo = X_zoo.iloc[:, selected_features]
X_query_eff = X_query_eff.iloc[:, selected_features]

In [25]:
y_pred_hsj = best_clf.predict(np.array(X_hsj))
y_pred_boundary = best_clf.predict(np.array(X_boundary))
y_pred_zoo = best_clf.predict(np.array(X_zoo))
y_pred_query_eff = best_clf.predict(np.array(X_query_eff))

In [38]:
test_accuracy_hsj = accuracy_score(y_hsj, y_pred_hsj) * 100
print(f"Test Accuracy (HSJ): {test_accuracy_hsj} %")

Test Accuracy (HSJ): 74.67200000000001 %


In [33]:
test_accuracy_boundary = accuracy_score(y_boundary, y_pred_boundary) * 100
print(f"Test Accuracy (BOUNDARY): {test_accuracy_boundary} %")

Test Accuracy (BOUNDARY): 0.036000000000000004 %


In [34]:
test_accuracy_zoo = accuracy_score(y_zoo, y_pred_zoo) * 100
print(f"Test Accuracy (ZOO): {test_accuracy_zoo} %")

Test Accuracy (ZOO): 91.598 %


In [39]:
test_accuracy_query_eff = accuracy_score(y_query_eff, y_pred_query_eff) * 100
print(f"Test Accuracy (QUERY EFFICIENT): {test_accuracy_query_eff} %")

Test Accuracy (QUERY EFFICIENT): 51.507999999999996 %
