In [9]:
# Import the necessary files
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, StackingClassifier
from sklearn.pipeline import Pipeline
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import random
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings("ignore")

#### ALL labels Algorithm Implementation

In [10]:
# Read from the important features from features_with_all_labels.txt
subset_feature_dict = {}
delimiter = "------------------"
with open("./data/features_with_all_labels.txt", "r") as file:
    lines = file.readlines()

features_dict = {}

for line in lines[1:]:
    line = line.strip()
    splited_lines = line.split(delimiter)
    features_dict[splited_lines[0]]=splited_lines[1]
    
subset_feature_dict['lasso']=features_dict['lasso']

In [11]:
# Print the subset_feature_dict
subset_feature_dict

{'lasso': 'Fwd IAT Max,PSH Flag Count,ACK Flag Count'}

In [12]:
# read the csv file
df = pd.read_csv('./data/final_data_all_labels.csv')

In [13]:
# Change the dataframe into X and y labels
X = df.drop("Label", axis=1)
y = df["Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

all_X_test, all_y_test= X_test,y_test

In [14]:
# Create a pipeline for lgbm classifier

pipeline = Pipeline([
    ('lgbm', lgb.LGBMClassifier())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'lgbm__n_estimators': [10 ,20, 30],
    'lgbm__learning_rate': [0.01, 0.1, 0.2],
    'lgbm__max_depth': [3, 5, 7]
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

lgb_classifier = grid_search.best_estimator_

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003670 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11025
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 65
[LightGBM] [Info] Start training from score -0.950192
[LightGBM] [Info] Start training from score -3.198257
[LightGBM] [Info] Start training from score -2.995732
[LightGBM] [Info] Start training from score -1.071484
[LightGBM] [Info] Start training from score -7.090077
[LightGBM] [Info] Start training from score -1.972083
[LightGBM] [Info] Start training from score -3.218876
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000931 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11016
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 65
[LightGBM] [Info] Start training from score 

In [16]:
# Create a pipeline for Adaboost classifier

pipeline = Pipeline([
    ('adaboost', AdaBoostClassifier())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'adaboost__n_estimators': [10, 20, 30],
    'adaboost__learning_rate': [0.01, 0.1, 0.2],
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

ada_classifier = grid_search.best_estimator_


Best Parameters: {'adaboost__learning_rate': 0.1, 'adaboost__n_estimators': 10}
Best Accuracy: 0.6716666666666666
Test Accuracy: 0.711


In [17]:
# Create a pipeline for Logistic classifier

pipeline = Pipeline([
    ('logreg', LogisticRegression())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'logreg__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'logreg__penalty': ['l1', 'l2'],
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

logistic_classifier = grid_search.best_estimator_


Best Parameters: {'logreg__C': 100, 'logreg__penalty': 'l2'}
Best Accuracy: 0.8876666666666667
Test Accuracy: 0.883


In [18]:
# Create a pipeline for Naive Bayes classifier

pipeline = Pipeline([
    ('nb', MultinomialNB())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'nb__alpha': [0.1, 0.5, 1.0]
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

naivebayes_classifier = grid_search.best_estimator_



Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.7416666666666666
Test Accuracy: 0.75


In [19]:
# Create a pipeline for Xgboost Classifier

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with XGBoost classifier
pipeline = Pipeline([
    ('xgb', xgb.XGBClassifier())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'xgb__n_estimators': [10, 20, 30],
    # 'xgb__learning_rate': [0.01, 0.1, 0.2],
    # 'xgb__max_depth': [3, 5, 7]
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

xgb_classifier = grid_search.best_estimator_

Best Parameters: {'xgb__n_estimators': 20}
Best Accuracy: 0.9896875000000002
Test Accuracy: 0.98875


In [20]:
# Create a stacking classifier from all the best model objects

base_models = [
    ('xgboost', xgb_classifier),
    ('lightgbm', lgb_classifier),
    ('adaboost', ada_classifier),
    ('logistic', logistic_classifier),
    ('naive_bayes', naivebayes_classifier)
]
meta_model = LogisticRegression()
stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=meta_model)
stacking_classifier.fit(X_train, y_train)
y_predict_stacking = stacking_classifier.predict(X_test)
stacking_accuracy = accuracy_score(y_test, y_predict_stacking)
print('Accuracy of Stacking Classifier: ' + str(stacking_accuracy))

all_stacking_classifier=stacking_classifier

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007950 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11557
[LightGBM] [Info] Number of data points in the train set: 3200, number of used features: 65
[LightGBM] [Info] Start training from score -0.955324
[LightGBM] [Info] Start training from score -3.226719
[LightGBM] [Info] Start training from score -2.958918
[LightGBM] [Info] Start training from score -1.064211
[LightGBM] [Info] Start training from score -7.377759
[LightGBM] [Info] Start training from score -1.981861
[LightGBM] [Info] Start training from score -3.211094
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004384 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11124
[LightGBM] [Info] Number of data points in the train set: 2560, number of used features: 65
[LightGBM] [Info] Start training from score 

#### model with features

In [21]:
# Print the features dict
features_dict

{'variance_threshold': 'Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt

In [22]:
# Create different dataframes for all the important features
df_variance_threshold = df[[x for x in features_dict['variance_threshold'].split(',')]]
df_lasso = df[[x for x in features_dict['lasso'].split(',')]]
df_random_forest_feature_importance = df[[x for x in features_dict['random_forest_feature_importance'].split(',')]]
df_permutation_importance = df[[x for x in features_dict['permutation_importance'].split(',')]]

In [23]:
# Add label column to every df

df_features_dict={
    "df_variance_threshold":df_variance_threshold,
    "df_lasso":df_lasso,
    "df_random_forest_feature_importance":df_random_forest_feature_importance,
    "df_permutation_importance":df_permutation_importance,
}

for df_feature in df_features_dict.keys():
    df_features_dict[df_feature]['Label'] = df['Label']

In [26]:
# A func
def train_classifier(X_train, X_test, y_train, y_test):
    """
        Train and evaluate LGB, AdaBoost, Logistic Regression, Naive Bayes, XGBoost,
        and a Stacking Classifier on the given datasets.

        Parameters:
        - X_train: Training features
        - X_test: Testing features
        - y_train: Training labels
        - y_test: Testing labels

        Returns:
            None
    """

    print("\n")
    print("-"*40)
    print("Running LGB classifier")
    print("-"*40)
    print("\n")

    # LGB classifier
    pipeline = Pipeline([
        ('lgbm', lgb.LGBMClassifier())
    ])

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'lgbm__n_estimators': [10, 20, 30],
        'lgbm__learning_rate': [0.01, 0.1, 0.2],
        'lgbm__max_depth': [3, 5, 7]
    }

    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print("Best Parameters:", grid_search.best_params_)
    print("Best Accuracy:", grid_search.best_score_)
    test_accuracy = grid_search.score(X_test, y_test)
    print("Test Accuracy:", test_accuracy)

    lgb_classifier = grid_search.best_estimator_

    print("\n")
    print("-"*40)
    print("Running Adaboost classifier")
    print("-"*40)
    print("\n")

    pipeline = Pipeline([
        ('adaboost', AdaBoostClassifier())
    ])

    param_grid = {
        'adaboost__n_estimators': [10, 20, 30],
        'adaboost__learning_rate': [0.01, 0.1, 0.2],
    }

    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print("Best Parameters:", grid_search.best_params_)
    print("Best Accuracy:", grid_search.best_score_)
    test_accuracy = grid_search.score(X_test, y_test)
    print("Test Accuracy:", test_accuracy)
    ada_classifier = grid_search.best_estimator_


    # Logistic Classsifier

    print("\n")
    print("-"*40)
    print("Running Logistic classifier")
    print("-"*40)
    print("\n")


    pipeline = Pipeline([
        ('logreg', LogisticRegression())
    ])

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'logreg__C': [0.001, 0.01, 0.1],
        'logreg__penalty': ['l1', 'l2'],
    }

    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print("Best Parameters:", grid_search.best_params_)
    print("Best Accuracy:", grid_search.best_score_)
    test_accuracy = grid_search.score(X_test, y_test)
    print("Test Accuracy:", test_accuracy)
    logistic_classifier = grid_search.best_estimator_

    print("\n")
    print("-"*40)
    print("Running Naive bayes classifier")
    print("-"*40)
    print("\n")

    pipeline = Pipeline([
        ('nb', MultinomialNB())
    ])
    param_grid = {
        'nb__alpha': [0.1, 0.5, 1.0]
    }
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print("Best Parameters:", grid_search.best_params_)
    print("Best Accuracy:", grid_search.best_score_)
    test_accuracy = grid_search.score(X_test, y_test)
    print("Test Accuracy:", test_accuracy)
    naivebayes_classifier = grid_search.best_estimator_

    print("\n")
    print("-"*40)
    print("Running XGB classifier")
    print("-"*40)
    print("\n")

    pipeline = Pipeline([
        ('xgb', xgb.XGBClassifier())
    ])

    param_grid = {
        'xgb__n_estimators': [10, 20, 30],
        'xgb__learning_rate': [0.01, 0.1],
        'xgb__max_depth': [3, 5]
    }

    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

    grid_search.fit(X_train, y_train)

    print("Best Parameters:", grid_search.best_params_)
    print("Best Accuracy:", grid_search.best_score_)

    test_accuracy = grid_search.score(X_test, y_test)
    print("Test Accuracy:", test_accuracy)

    xgb_classifier = grid_search.best_estimator_


    print("\n")
    print("-"*40)
    print("Running Stacking based classifier")
    print("-"*40)
    print("\n")

    # Stacking Classifier
    base_models = [
        ('xgboost', xgb_classifier),
        ('lightgbm', lgb_classifier),
        ('adaboost', ada_classifier),
        ('logistic', logistic_classifier),
        ('naive_bayes', naivebayes_classifier)
    ]
    meta_model = LogisticRegression()
    stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=meta_model)
    stacking_classifier.fit(X_train, y_train)
    y_predict_stacking = stacking_classifier.predict(X_test)
    stacking_accuracy = accuracy_score(y_test, y_predict_stacking)
    print('Accuracy of Stacking Classifier: ' + str(stacking_accuracy))



In [28]:
# Split the dataset into training and testing sets
X=df_variance_threshold.drop('Label',axis=1)
y=df_variance_threshold['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002305 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11124
[LightGBM] [Info] Number of data points in the train set: 2560, number of used features: 65
[LightGBM] [Info] Start training from score -0.955121
[LightGBM] [Info] Start training from score -3.222790
[LightGBM] [Info] Start training from score -2.964961
[LightGBM] [Info] Start training from score -1.064437
[LightGBM] [Info] Start training from score -7.154615
[LightGBM] [Info] Start training from score -1.981294
[LightGBM] [Info] Start training from score -3.213034
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001176 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11109
[LightGBM] [Info] Number of

In [29]:
# Split the dataset into training and testing sets
X=df_permutation_importance.drop('Label',axis=1)
y=df_permutation_importance['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2539
[LightGBM] [Info] Number of data points in the train set: 2560, number of used features: 17
[LightGBM] [Info] Start training from score -0.955121
[LightGBM] [Info] Start training from score -3.222790
[LightGBM] [Info] Start training from score -2.964961
[LightGBM] [Info] Start training from score -1.064437
[LightGBM] [Info] Start training from score -7.154615
[LightGBM] [Info] Start training from score -1.981294
[LightGBM] [Info] Start training from score -3.213034
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_row_wise=true` to remove the overhead.

In [30]:
# Split the dataset into training and testing sets
X=df_random_forest_feature_importance.drop('Label',axis=1)
y=df_random_forest_feature_importance['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [31]:
# Split the dataset into training and testing sets
print("Model running on multiclass labels")
X=df_lasso.drop('Label',axis=1)
y=df_lasso['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)

Model running on multiclass labels


----------------------------------------
Running LGB classifier
----------------------------------------


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 257
[LightGBM] [Info] Number of data points in the train set: 2560, number of used features: 3
[LightGBM] [Info] Start training from score -0.955121
[LightGBM] [Info] Start training from score -3.222790
[LightGBM] [Info] Start training from score -2.964961
[LightGBM] [Info] Start training from score -1.064437
[LightGBM] [Info] Start training from score -7.154615
[LightGBM] [Info] Start training from score -1.981294
[LightGBM] [Info] Start training from score -3.213034
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2

#### TWO labels Algorithm Implementation

In [32]:
# Load data from the 'final_data_two_labels.csv' file into a DataFrame
df = pd.read_csv('./data/final_data_two_labels.csv')

In [33]:
# Read and parse data from a text file containing important features with two labels

delimiter = "------------------"
with open("./data/features_with_two_labels.txt", "r") as file:
    lines = file.readlines()

features_dict = {}
for line in lines[1:]:
    line = line.strip()
    splited_lines = line.split(delimiter)
    features_dict[splited_lines[0]]=splited_lines[1]


In [34]:
# Extract features from DataFrame according to the important features

df_variance_threshold_two = df[[x for x in features_dict['variance_threshold'].split(',')]]
df_random_forest_feature_importance_two = df[[x for x in features_dict['random_forest_feature_importance'].split(',')]]
df_recursive_feature_elimination_two = df[[x for x in features_dict['recursive_feature_elimination'].split(',')]]
df_permutation_importance_two = df[[x for x in features_dict['permutation_importance'].split(',')]]

In [35]:
# Add labels to the dataframe
df_features_dict={
    "df_variance_threshold":df_variance_threshold_two,
    "df_random_forest_feature_importance":df_random_forest_feature_importance_two,
    "df_permutation_importance":df_permutation_importance_two,
    "recursive_feature_elimination":df_recursive_feature_elimination_two
}

df_variance_threshold_two['Label'] = df['Label']
df_random_forest_feature_importance_two['Label'] = df['Label']
df_recursive_feature_elimination_two['Label'] = df['Label']
df_permutation_importance_two['Label'] = df['Label']

In [36]:
# Print the columns
df_permutation_importance_two.columns

Index(['Init_Win_bytes_forward', 'Fwd IAT Min', 'Init_Win_bytes_backward',
       'Bwd IAT Min', 'Packet Length Std', 'Bwd IAT Std', 'PSH Flag Count',
       'Bwd IAT Mean', 'Fwd IAT Mean', 'Subflow Bwd Bytes', 'Idle Max',
       'SYN Flag Count', 'Bwd Packet Length Std', 'Bwd URG Flags',
       'Fwd URG Flags', 'Bwd PSH Flags', 'Fwd PSH Flags', 'FIN Flag Count',
       'Label'],
      dtype='object')

In [37]:
# Split the dataset into training and testing sets
X=df_variance_threshold_two.drop('Label',axis=1)
y=df_variance_threshold_two['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


[LightGBM] [Info] Number of positive: 942, number of negative: 1458
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000752 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5176
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.392500 -> initscore=-0.436816
[LightGBM] [Info] Start training from score -0.436816
[LightGBM] [Info] Number of positive: 942, number of negative: 1458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000245 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5200
[LightGBM] [Info] Number of data points in the train set: 2400, number of

In [38]:
# Split the dataset into training and testing sets
X=df_random_forest_feature_importance_two.drop('Label',axis=1)
y=df_random_forest_feature_importance_two['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


[LightGBM] [Info] Number of positive: 942, number of negative: 1458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000259 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3244
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.392500 -> initscore=-0.436816
[LightGBM] [Info] Start training from score -0.436816
[LightGBM] [Info] Number of positive: 942, number of negative: 1458
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000307 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3244
[LightGBM] [Info] Number of data points in the train set: 2400, number of

In [39]:
# Split the dataset into training and testing sets
X=df_recursive_feature_elimination_two.drop('Label',axis=1)
y=df_recursive_feature_elimination_two['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


[LightGBM] [Info] Number of positive: 942, number of negative: 1458
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000603 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3244
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.392500 -> initscore=-0.436816
[LightGBM] [Info] Start training from score -0.436816
[LightGBM] [Info] Number of positive: 942, number of negative: 1458
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000408 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3244
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pa

In [40]:
# Split the dataset into training and testing sets
X=df_permutation_importance_two.drop('Label',axis=1)
y=df_permutation_importance_two['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


[LightGBM] [Info] Number of positive: 942, number of negative: 1458
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000244 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2320
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.392500 -> initscore=-0.436816
[LightGBM] [Info] Start training from score -0.436816
[LightGBM] [Info] Number of positive: 942, number of negative: 1458
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000337 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2219
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pa

In [41]:
# Generate Subset features for pca
for key, item in features_dict.items():
    if key=='lasso' or key=='variance_threshold':
        continue
    subset_feature_dict[key]=item

### PCA Data preparation

In [44]:
# Split the dataset into training and testing sets
subset_feature_dict

{'lasso': 'Fwd IAT Max,PSH Flag Count,ACK Flag Count',
 'random_forest_feature_importance': 'Init_Win_bytes_forward,Fwd Packet Length Min,Init_Win_bytes_backward,Min Packet Length,Bwd Packet Length Mean,Packet Length Std,Bwd Packet Length Min,Avg Bwd Segment Size,Fwd Packet Length Max,Bwd Packet Length Std,Fwd Header Length,Fwd Header Length.1,Average Packet Size,Total Length of Fwd Packets,Packet Length Mean,Subflow Fwd Bytes,Avg Fwd Segment Size,Packet Length Variance',
 'recursive_feature_elimination': 'Total Length of Fwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Fwd Header Length,Min Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,Average Packet Size,Avg Bwd Segment Size,Fwd Header Length.1,Subflow Fwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward',
 'permutation_importance': 'Init_Win_bytes_forward,Fwd IAT Min,Init_Win_bytes_backward,Bwd IAT Min,Pac

In [46]:
# Combine and filter important features from different methods
imp1 = set([x for x in subset_feature_dict['random_forest_feature_importance'].split(',') if x ])
imp2 = set([x for x in subset_feature_dict['recursive_feature_elimination'].split(',') if x ])
imp3 = set([x for x in subset_feature_dict['permutation_importance'].split(',') if x ])
imp4 = set([x for x in subset_feature_dict['lasso'].split(',') if x ])

finalset = imp1.intersection(imp2).intersection(imp3)
finalset=list(finalset.union(imp4))
print(finalset)
print(len(finalset))

['Bwd Packet Length Std', 'Init_Win_bytes_forward', 'PSH Flag Count', 'Packet Length Std', 'Init_Win_bytes_backward', 'Fwd IAT Max', 'ACK Flag Count']
7


In [47]:
# Read the two labels csv file to dataframe
df_twolabel= pd.read_csv("./data/final_data_two_labels.csv")

In [48]:
# Training on the final set 
X=df_twolabel[finalset]
y=df_twolabel['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)




----------------------------------------
Running LGB classifier
----------------------------------------


[LightGBM] [Info] Number of positive: 942, number of negative: 1458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000192 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 893
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.392500 -> initscore=-0.436816
[LightGBM] [Info] Start training from score -0.436816
[LightGBM] [Info] Number of positive: 942, number of negative: 1458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 896
[LightGBM] [

In [49]:
# Run a base model stacking classifier
base_models = [
    ('xgboost', xgb.XGBClassifier()),
    ('lightgbm', lgb.LGBMClassifier()),
    ('adaboost', AdaBoostClassifier()),
    ('logistic', LogisticRegression()),
    ('naive_bayes', MultinomialNB())
]
meta_model = LogisticRegression()
stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=meta_model)
stacking_classifier.fit(X_train, y_train)
y_predict_stacking = stacking_classifier.predict(X_test)
stacking_accuracy = accuracy_score(y_test, y_predict_stacking)
print('Accuracy of Stacking Classifier: ' + str(stacking_accuracy))

[LightGBM] [Info] Number of positive: 1178, number of negative: 1822
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000134 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 945
[LightGBM] [Info] Number of data points in the train set: 3000, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.392667 -> initscore=-0.436117
[LightGBM] [Info] Start training from score -0.436117
[LightGBM] [Info] Number of positive: 942, number of negative: 1458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000114 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 893
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 7
[LightGBM] [Info] [binary:Boo

In [50]:
# Predicting the label from stacking classifier and finding out the incorrectly classified labels
y_predict = stacking_classifier.predict(X)
incorrect_twolabel_idx = (y != y_predict)

df_exclude_from_pca = X[incorrect_twolabel_idx]
df_exclude_from_pca['Label']=y[incorrect_twolabel_idx]

In [51]:
df_exclude_from_pca

Unnamed: 0,Bwd Packet Length Std,Init_Win_bytes_forward,PSH Flag Count,Packet Length Std,Init_Win_bytes_backward,Fwd IAT Max,ACK Flag Count,Label
309,0.0,0.445572,1.0,0.0,0.44191,0.05011146,0.0,1
833,0.0,0.445572,1.0,0.0,0.44191,0.04560646,0.0,1
1041,0.0,0.003632,0.0,0.0,0.003906,0.0,1.0,1
1173,0.0,0.250153,0.0,0.0,0.003632,0.0,1.0,0
1222,0.0,0.003922,0.0,0.0,0.0,0.07547008,1.0,1
1255,0.020776,0.445572,1.0,0.035068,0.003601,0.08403235,0.0,1
1461,0.0,0.003922,0.0,0.0,0.00351,0.0,1.0,0
1481,0.0,0.445572,1.0,0.0,0.44191,0.04863134,0.0,1
1553,0.0,0.003662,0.0,0.0,0.003601,0.0,1.0,1
1725,0.0,0.01564,1.0,0.00049,1.5e-05,0.0,0.0,1


## WorkIng with PCA Dataset

In [52]:
# Read the csv file
pca_df = pd.read_csv('./data/sample_pca_test.csv')

In [53]:
# Print the shape
pca_df.shape

(4000, 19)

In [54]:
# From the pca_df get the data which doesnt consist the incorrect two label indexes
X_train, y_train =  pca_df[~incorrect_twolabel_idx].drop('Label',axis=1),pca_df[~incorrect_twolabel_idx]['Label']
X_test, y_test =  pca_df[incorrect_twolabel_idx].drop('Label',axis=1),pca_df[incorrect_twolabel_idx]['Label']

In [55]:
X_train.shape,X_test.shape

((3982, 18), (18, 18))

In [56]:
# Use minmax scaler to scale the df
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [57]:
# Use stacking classifier with  base models and run the model
base_models = [
    ('xgboost', xgb.XGBClassifier()),
    ('lightgbm', lgb.LGBMClassifier()),
    ('adaboost', AdaBoostClassifier()),
    ('logistic', LogisticRegression()),
    ('naive_bayes', MultinomialNB())
]
meta_model = LogisticRegression()
stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=meta_model)
stacking_classifier.fit(X_train, y_train)
y_predict_stacking = stacking_classifier.predict(X_test)
stacking_accuracy = accuracy_score(y_test, y_predict_stacking)

print('Accuracy of Stacking Classifier: ' + str(stacking_accuracy))

[LightGBM] [Info] Number of positive: 390, number of negative: 3592
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000356 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4590
[LightGBM] [Info] Number of data points in the train set: 3982, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.097941 -> initscore=-2.220318
[LightGBM] [Info] Start training from score -2.220318
[LightGBM] [Info] Number of positive: 312, number of negative: 2873
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000309 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4590
[LightGBM] [Info] Number of data points in the train set: 3185, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.097959 -> initscore=-2.220109
[LightGBM] [Info] Start training from score -2.220109
[LightGBM] [Info] Nu