In [88]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, StackingClassifier
from sklearn.pipeline import Pipeline
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import random
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings("ignore")

#### ALL labels Algorithm Implementation

In [89]:
subset_feature_dict = {}
delimiter = "------------------"
with open("./data/features_with_all_labels.txt", "r") as file:
    lines = file.readlines()

features_dict = {}

for line in lines[1:]:
    line = line.strip()
    splited_lines = line.split(delimiter)
    features_dict[splited_lines[0]]=splited_lines[1]

subset_feature_dict['lasso']=features_dict['lasso']

In [90]:
subset_feature_dict

{'lasso': 'Fwd IAT Max,PSH Flag Count,ACK Flag Count'}

In [91]:
df = pd.read_csv('./data/final_data_all_labels.csv')

In [92]:
X = df.drop("Label", axis=1)
y = df["Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

all_X_test, all_y_test= X_test,y_test

In [93]:
pipeline = Pipeline([
    ('lgbm', lgb.LGBMClassifier())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'lgbm__n_estimators': [50, 100, 200],
    'lgbm__learning_rate': [0.01, 0.1, 0.2],
    'lgbm__max_depth': [3, 5, 7]
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

lgb_classifier = grid_search.best_estimator_

Best Parameters: {'lgbm__learning_rate': 0.1, 'lgbm__max_depth': 5, 'lgbm__n_estimators': 50}
Best Accuracy: 0.9890000000000001
Test Accuracy: 0.99


In [94]:
pipeline = Pipeline([
    ('adaboost', AdaBoostClassifier())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'adaboost__n_estimators': [50, 100, 200],
    'adaboost__learning_rate': [0.01, 0.1, 0.2],
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

ada_classifier = grid_search.best_estimator_


Best Parameters: {'adaboost__learning_rate': 0.01, 'adaboost__n_estimators': 50}
Best Accuracy: 0.6016666666666668
Test Accuracy: 0.622


In [95]:

pipeline = Pipeline([
    ('logreg', LogisticRegression())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'logreg__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'logreg__penalty': ['l1', 'l2'],
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

logistic_classifier = grid_search.best_estimator_


Best Parameters: {'logreg__C': 100, 'logreg__penalty': 'l2'}
Best Accuracy: 0.8876666666666667
Test Accuracy: 0.883


In [96]:

pipeline = Pipeline([
    ('nb', MultinomialNB())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'nb__alpha': [0.1, 0.5, 1.0]
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

naivebayes_classifier = grid_search.best_estimator_



Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.7416666666666666
Test Accuracy: 0.75


In [97]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with XGBoost classifier
pipeline = Pipeline([
    ('xgb', xgb.XGBClassifier())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'xgb__n_estimators': [50, 100, 200],
    'xgb__learning_rate': [0.01, 0.1, 0.2],
    'xgb__max_depth': [3, 5, 7]
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

xgb_classifier = grid_search.best_estimator_

Best Parameters: {'xgb__learning_rate': 0.1, 'xgb__max_depth': 7, 'xgb__n_estimators': 50}
Best Accuracy: 0.9896874999999999
Test Accuracy: 0.99


In [98]:
base_models = [
    ('xgboost', xgb_classifier),
    ('lightgbm', lgb_classifier),
    ('adaboost', ada_classifier),
    ('logistic', logistic_classifier),
    ('naive_bayes', naivebayes_classifier)
]
meta_model = LogisticRegression()
stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=meta_model)
stacking_classifier.fit(X_train, y_train)
y_predict_stacking = stacking_classifier.predict(X_test)
stacking_accuracy = accuracy_score(y_test, y_predict_stacking)
print('Accuracy of Stacking Classifier: ' + str(stacking_accuracy))

all_stacking_classifier=stacking_classifier

Accuracy of Stacking Classifier: 0.98875


#### model with features

In [11]:
features_dict.keys()

dict_keys(['variance_threshold', 'lasso', 'random_forest_feature_importance', 'recursive_feature_elimination', 'permutation_importance'])

In [12]:
df_variance_threshold = df[[x for x in features_dict['variance_threshold'].split(',')]]
df_lasso = df[[x for x in features_dict['lasso'].split(',')]]
df_random_forest_feature_importance = df[[x for x in features_dict['random_forest_feature_importance'].split(',')]]
df_permutation_importance = df[[x for x in features_dict['permutation_importance'].split(',')]]

In [13]:
df_variance_threshold

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,7.121764e-01,0.001010,0.000607,0.000122,5.393023e-04,0.015026,0.000000,0.007385,0.017604,0.497937,...,0.000167,0.533333,0.000020,0.0,0.000020,0.000020,0.716807,0.000000,0.716807,0.716807
1,1.100000e-06,0.000144,0.000000,0.000013,0.000000e+00,0.001584,0.000000,0.003114,0.003711,0.000000,...,0.000000,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,1.952333e-04,0.000000,0.000101,0.000016,3.581395e-06,0.001926,0.022693,0.007575,0.000000,0.006620,...,0.000000,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,4.452895e-01,0.000144,0.000202,0.000036,1.381395e-05,0.002483,0.023197,0.008753,0.001204,0.015990,...,0.000167,0.533333,0.000307,0.0,0.000307,0.000307,0.448739,0.000000,0.448739,0.448739
4,6.053253e-01,0.001010,0.000506,0.000020,5.398605e-04,0.000856,0.000000,0.001178,0.000802,0.753095,...,0.001001,0.333333,0.000010,0.0,0.000010,0.000010,0.305042,0.586006,0.543697,0.065990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,1.128042e-03,0.000000,0.000101,0.000016,6.651163e-06,0.001926,0.022693,0.007575,0.000000,0.012294,...,0.000000,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3996,4.416667e-07,0.000000,0.000101,0.000000,2.790698e-07,0.000000,0.000000,0.000000,0.000000,0.000516,...,0.000000,0.666667,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3997,4.665850e-03,0.000289,0.000607,0.000009,5.398605e-04,0.000856,0.000000,0.001459,0.001456,0.494756,...,0.000334,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3998,7.134503e-01,0.000866,0.000709,0.000119,5.393023e-04,0.013870,0.000000,0.008224,0.017216,0.746905,...,0.000501,0.333333,0.000109,0.0,0.000109,0.000109,0.710924,0.000000,0.710924,0.710924


In [14]:
# Add label column to every df

df_features_dict={
    "df_variance_threshold":df_variance_threshold,
    "df_lasso":df_lasso,
    "df_random_forest_feature_importance":df_random_forest_feature_importance,
    "df_permutation_importance":df_permutation_importance,
}

for df_feature in df_features_dict.keys():
    df_features_dict[df_feature]['Label'] = df['Label']

In [73]:
def train_classifier(X_train, X_test, y_train, y_test):

    print("\n")
    print("-"*40)
    print("Running LGB classifier")
    print("-"*40)
    print("\n")

    # LGB classifier
    pipeline = Pipeline([
        ('lgbm', lgb.LGBMClassifier())
    ])

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'lgbm__n_estimators': [50, 100, 200],
        'lgbm__learning_rate': [0.01, 0.1, 0.2],
        'lgbm__max_depth': [3, 5, 7]
    }

    # Create GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Print the best parameters and their corresponding accuracy
    print("Best Parameters:", grid_search.best_params_)
    print("Best Accuracy:", grid_search.best_score_)

    # Evaluate the model on the test set
    test_accuracy = grid_search.score(X_test, y_test)
    print("Test Accuracy:", test_accuracy)

    lgb_classifier = grid_search.best_estimator_

    print("\n")
    print("-"*40)
    print("Running Adaboost classifier")
    print("-"*40)
    print("\n")

    # Adaboost
    pipeline = Pipeline([
        ('adaboost', AdaBoostClassifier())
    ])

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'adaboost__n_estimators': [50, 100, 200],
        'adaboost__learning_rate': [0.01, 0.1, 0.2],
    }

    # Create GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

    # Fit the model
    grid_search.fit(X_train, y_train)

    print("Best Parameters:", grid_search.best_params_)
    print("Best Accuracy:", grid_search.best_score_)

    test_accuracy = grid_search.score(X_test, y_test)
    print("Test Accuracy:", test_accuracy)

    ada_classifier = grid_search.best_estimator_


    # Logistic

    print("\n")
    print("-"*40)
    print("Running Logistic classifier")
    print("-"*40)
    print("\n")


    pipeline = Pipeline([
        ('logreg', LogisticRegression())
    ])

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'logreg__C': [0.001, 0.01, 0.1],
        'logreg__penalty': ['l1', 'l2'],
    }

    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

    grid_search.fit(X_train, y_train)

    print("Best Parameters:", grid_search.best_params_)
    print("Best Accuracy:", grid_search.best_score_)

    # Evaluate the model on the test set
    test_accuracy = grid_search.score(X_test, y_test)
    print("Test Accuracy:", test_accuracy)

    logistic_classifier = grid_search.best_estimator_

    print("\n")
    print("-"*40)
    print("Running Naive bayes classifier")
    print("-"*40)
    print("\n")

    #Naive bayes

    pipeline = Pipeline([
        ('nb', MultinomialNB())
    ])

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'nb__alpha': [0.1, 0.5, 1.0]
    }

    # Create GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Print the best parameters and their corresponding accuracy
    print("Best Parameters:", grid_search.best_params_)
    print("Best Accuracy:", grid_search.best_score_)

    # Evaluate the model on the test set
    test_accuracy = grid_search.score(X_test, y_test)
    print("Test Accuracy:", test_accuracy)

    naivebayes_classifier = grid_search.best_estimator_

    print("\n")
    print("-"*40)
    print("Running XGB classifier")
    print("-"*40)
    print("\n")

    pipeline = Pipeline([
        ('xgb', xgb.XGBClassifier())
    ])

    param_grid = {
        'xgb__n_estimators': [50, 100],
        'xgb__learning_rate': [0.01, 0.1],
        'xgb__max_depth': [3, 5]
    }

    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

    grid_search.fit(X_train, y_train)

    print("Best Parameters:", grid_search.best_params_)
    print("Best Accuracy:", grid_search.best_score_)

    test_accuracy = grid_search.score(X_test, y_test)
    print("Test Accuracy:", test_accuracy)

    xgb_classifier = grid_search.best_estimator_


    print("\n")
    print("-"*40)
    print("Running Stacking based classifier")
    print("-"*40)
    print("\n")

    # Stacking
    base_models = [
        ('xgboost', xgb_classifier),
        ('lightgbm', lgb_classifier),
        ('adaboost', ada_classifier),
        ('logistic', logistic_classifier),
        ('naive_bayes', naivebayes_classifier)
    ]
    meta_model = LogisticRegression()
    stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=meta_model)
    stacking_classifier.fit(X_train, y_train)
    y_predict_stacking = stacking_classifier.predict(X_test)
    stacking_accuracy = accuracy_score(y_test, y_predict_stacking)
    print('Accuracy of Stacking Classifier: ' + str(stacking_accuracy))



In [16]:
X=df_variance_threshold.drop('Label',axis=1)
y=df_variance_threshold['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------




Best Parameters: {'lgbm__learning_rate': 0.1, 'lgbm__max_depth': 7, 'lgbm__n_estimators': 200}
Best Accuracy: 0.9896875000000002
Test Accuracy: 0.98125


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.01, 'adaboost__n_estimators': 50}
Best Accuracy: 0.60125
Test Accuracy: 0.62875


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Accuracy: 0.776875
Test Accuracy: 0.825


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.74
Test Accuracy: 0.76


----------------------------------------
Running XGB classifier
----------------------------------------


Best Parameters: {'xgb__learning_rate': 0.1, 'xgb__max_depth': 5, 'xgb__n_estimators'

In [17]:
X=df_permutation_importance.drop('Label',axis=1)
y=df_permutation_importance['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


Best Parameters: {'lgbm__learning_rate': 0.1, 'lgbm__max_depth': 7, 'lgbm__n_estimators': 200}
Best Accuracy: 0.9893750000000001
Test Accuracy: 0.98125


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.01, 'adaboost__n_estimators': 50}
Best Accuracy: 0.60125
Test Accuracy: 0.62875


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Accuracy: 0.6496875000000001
Test Accuracy: 0.6525


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.566875
Test Accuracy: 0.5975


----------------------------------------
Running XGB classifier
--

In [137]:
X=df_random_forest_feature_importance.drop('Label',axis=1)
y=df_random_forest_feature_importance['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [19]:

print("Model running on multiclass labels")

X=df_lasso.drop('Label',axis=1)
y=df_lasso['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)

Model running on multiclass labels


----------------------------------------
Running LGB classifier
----------------------------------------


Best Parameters: {'lgbm__learning_rate': 0.01, 'lgbm__max_depth': 7, 'lgbm__n_estimators': 200}
Best Accuracy: 0.8521875
Test Accuracy: 0.855


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.2, 'adaboost__n_estimators': 200}
Best Accuracy: 0.6915625000000001
Test Accuracy: 0.7175


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Accuracy: 0.60375
Test Accuracy: 0.60625


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.54
Test Accuracy: 0.56875


----------------------------------------
Run

#### TWO labels Algorithm Implementation

In [186]:
df = pd.read_csv('./data/final_data_two_labels.csv')

In [195]:
delimiter = "------------------"
with open("./data/features_with_two_labels.txt", "r") as file:
    lines = file.readlines()

features_dict = {}
for line in lines[1:]:
    line = line.strip()
    splited_lines = line.split(delimiter)
    features_dict[splited_lines[0]]=splited_lines[1]


In [178]:
df_variance_threshold_two = df[[x for x in features_dict['variance_threshold'].split(',')]]
df_random_forest_feature_importance_two = df[[x for x in features_dict['random_forest_feature_importance'].split(',')]]
df_recursive_feature_elimination_two = df[[x for x in features_dict['recursive_feature_elimination'].split(',')]]
df_permutation_importance_two = df[[x for x in features_dict['permutation_importance'].split(',')]]

In [179]:
df_features_dict={
    "df_variance_threshold":df_variance_threshold_two,
    "df_random_forest_feature_importance":df_random_forest_feature_importance_two,
    "df_permutation_importance":df_permutation_importance_two,
    "recursive_feature_elimination":df_recursive_feature_elimination_two
}

for df_feature in df_features_dict.keys():
    df_features_dict[df_feature]['Label'] = df['Label']

    df_recursive_feature_elimination_two

In [180]:
X=df_variance_threshold_two.drop('Label',axis=1)
y=df_variance_threshold_two['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


Best Parameters: {'lgbm__learning_rate': 0.1, 'lgbm__max_depth': 7, 'lgbm__n_estimators': 200}
Best Accuracy: 0.9896666666666667
Test Accuracy: 0.991


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.2, 'adaboost__n_estimators': 200}
Best Accuracy: 0.9650000000000001
Test Accuracy: 0.973


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Accuracy: 0.825
Test Accuracy: 0.858


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.5853333333333334
Test Accuracy: 0.627


----------------------------------------
Running XGB classifier


In [181]:
X=df_random_forest_feature_importance_two.drop('Label',axis=1)
y=df_random_forest_feature_importance_two['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


Best Parameters: {'lgbm__learning_rate': 0.1, 'lgbm__max_depth': 7, 'lgbm__n_estimators': 200}
Best Accuracy: 0.9873333333333333
Test Accuracy: 0.985


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.2, 'adaboost__n_estimators': 200}
Best Accuracy: 0.9536666666666666
Test Accuracy: 0.956


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Accuracy: 0.669
Test Accuracy: 0.686


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.7023333333333334
Test Accuracy: 0.714


----------------------------------------
Running XGB classifier


In [182]:
X=df_recursive_feature_elimination_two.drop('Label',axis=1)
y=df_recursive_feature_elimination_two['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


Best Parameters: {'lgbm__learning_rate': 0.2, 'lgbm__max_depth': 7, 'lgbm__n_estimators': 200}
Best Accuracy: 0.9873333333333333
Test Accuracy: 0.987


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.2, 'adaboost__n_estimators': 200}
Best Accuracy: 0.9523333333333334
Test Accuracy: 0.957


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Accuracy: 0.667
Test Accuracy: 0.686


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.6913333333333334
Test Accuracy: 0.729


----------------------------------------
Running XGB classifier


In [183]:
X=df_permutation_importance_two.drop('Label',axis=1)
y=df_permutation_importance_two['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


Best Parameters: {'lgbm__learning_rate': 0.1, 'lgbm__max_depth': 5, 'lgbm__n_estimators': 200}
Best Accuracy: 0.9906666666666666
Test Accuracy: 0.99


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.2, 'adaboost__n_estimators': 200}
Best Accuracy: 0.9526666666666668
Test Accuracy: 0.966


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Accuracy: 0.8013333333333333
Test Accuracy: 0.822


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.4576666666666666
Test Accuracy: 0.486


----------------------------------------
Running XGB

In [101]:
for key, item in features_dict.items():
    if key=='lasso' or key=='variance_threshold':
        continue
    subset_feature_dict[key]=item

In [103]:
# TODO separate into columns
print("Model running on two labels")
X = df.drop("Label", axis=1)
y = df["Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


Model running on two labels


In [104]:
base_models = [
    ('xgboost', xgb.XGBClassifier()),
    ('lightgbm', lgb.LGBMClassifier()),
    ('adaboost', AdaBoostClassifier()),
    ('logistic', LogisticRegression()),
    ('naive_bayes', MultinomialNB())
]
meta_model = LogisticRegression()
stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=meta_model)
stacking_classifier.fit(X_train, y_train)
y_predict_stacking = stacking_classifier.predict(X_test)
stacking_accuracy = accuracy_score(y_test, y_predict_stacking)
print('Accuracy of Stacking Classifier: ' + str(stacking_accuracy))

Accuracy of Stacking Classifier: 0.99


### PCA implementation

In [187]:
# y_pred = stacking_classifier.predict(df.drop('Label',axis=1))
# incorrect_twolabel_idx = (df['Label'] != y_pred)
# incorrect_predicted_df = df[incorrect_twolabel_idx]

ValueError: Feature shape mismatch, expected: 18, got 77

In [209]:
subset_feature_dict

{'lasso': 'Fwd IAT Max,PSH Flag Count,ACK Flag Count',
 'random_forest_feature_importance': 'Init_Win_bytes_forward,Init_Win_bytes_backward,Min Packet Length,Packet Length Std,Fwd Packet Length Min,Avg Bwd Segment Size,Bwd Packet Length Min,Bwd Packet Length Mean,Fwd Packet Length Max,Packet Length Mean,Packet Length Variance,Total Length of Fwd Packets,Bwd Packet Length Max,Fwd Header Length,Avg Fwd Segment Size,Average Packet Size,Max Packet Length,Bwd Packet Length Std',
 'recursive_feature_elimination': 'Total Length of Fwd Packets,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Fwd Header Length,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,Average Packet Size,Avg Bwd Segment Size,Fwd Header Length.1,Subflow Fwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward',
 'permutation_importance': 'Init_Win_bytes_forward,Init_Win_bytes_backward,Fwd IAT Min,PSH Flag Count,Bw

In [228]:
# TODO
imp1 = set([x for x in subset_feature_dict['random_forest_feature_importance'].split(',') if x ])
imp2 = set([x for x in subset_feature_dict['recursive_feature_elimination'].split(',') if x ])
imp3 = set([x for x in subset_feature_dict['permutation_importance'].split(',') if x ])
imp4 = set([x for x in subset_feature_dict['lasso'].split(',') if x ])


# for key in subset_feature_dict.keys():
#     if key != "random_forest_feature_importance" or "lasso":
#         # b = set([x for x in subset_feature_dict[key].split(',') if x])
#         seta = temp_lis.union(set.intersection(seta,set([x for x in subset_feature_dict[key].split(',') if x])))


finalset = imp1.intersection(imp2).intersection(imp3)
finalset=list(finalset.union(imp4))
print(finalset)
print(len(finalset))


['Bwd Packet Length Mean', 'Packet Length Std', 'PSH Flag Count', 'Packet Length Variance', 'Init_Win_bytes_backward', 'Fwd IAT Max', 'ACK Flag Count', 'Bwd Packet Length Std', 'Max Packet Length', 'Init_Win_bytes_forward', 'Total Length of Fwd Packets', 'Packet Length Mean']
12


In [230]:
df_twolabel= pd.read_csv("./data/final_data_two_labels.csv")

In [234]:
X=df_twolabel[finalset]
y=df_twolabel['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [235]:
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


Best Parameters: {'lgbm__learning_rate': 0.2, 'lgbm__max_depth': 7, 'lgbm__n_estimators': 200}
Best Accuracy: 0.986
Test Accuracy: 0.985


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.2, 'adaboost__n_estimators': 200}
Best Accuracy: 0.9443333333333334
Test Accuracy: 0.961


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Accuracy: 0.7933333333333333
Test Accuracy: 0.817


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 1.0}
Best Accuracy: 0.4716666666666667
Test Accuracy: 0.487


----------------------------------------
Running XGB classifier


In [236]:
base_models = [
    ('xgboost', xgb.XGBClassifier()),
    ('lightgbm', lgb.LGBMClassifier()),
    ('adaboost', AdaBoostClassifier()),
    ('logistic', LogisticRegression()),
    ('naive_bayes', MultinomialNB())
]
meta_model = LogisticRegression()
stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=meta_model)
stacking_classifier.fit(X_train, y_train)
y_predict_stacking = stacking_classifier.predict(X_test)
stacking_accuracy = accuracy_score(y_test, y_predict_stacking)
print('Accuracy of Stacking Classifier: ' + str(stacking_accuracy))

Accuracy of Stacking Classifier: 0.989


In [251]:
y_predict = stacking_classifier.predict(X_test)
incorrect_twolabel_idx = (df_twolabel['Label'] != y_pred)

df_exclude_from_pca = X[incorrect_twolabel_idx]

In [252]:
df_exclude_from_pca['Label']=y[incorrect_twolabel_idx]

In [253]:
df_exclude_from_pca

Unnamed: 0,Bwd Packet Length Mean,Packet Length Std,PSH Flag Count,Packet Length Variance,Init_Win_bytes_backward,Fwd IAT Max,ACK Flag Count,Bwd Packet Length Std,Max Packet Length,Init_Win_bytes_forward,Total Length of Fwd Packets,Packet Length Mean,Label
543,0.004588,0.00326,1.0,1.061279e-05,0.003479,0.001034,0.0,0.002463,0.002226,0.445572,6.1e-05,0.00648,1
605,0.0,0.0,0.0,0.0,0.003754,0.0,1.0,0.0,0.0,0.00383,0.0,0.0,1
1146,0.001551,0.000736,1.0,5.405405e-07,1.5e-05,0.0,0.0,0.0,0.000257,1.0,0.0,0.000969,1
1876,0.001551,0.0,0.0,0.0,0.031311,0.0,1.0,0.0,0.000257,0.003891,2e-06,0.002906,1
2464,0.0,0.0,0.0,0.0,0.0,2e-06,1.0,0.0,0.000257,0.003876,4e-06,0.002906,0
2584,0.001551,0.0,0.0,0.0,0.005127,0.0,1.0,0.0,0.000257,0.003952,2e-06,0.002906,1
2665,0.0,0.243128,0.0,0.05904338,0.003601,0.0,1.0,0.0,0.084889,0.003601,0.000692,0.640333,0
3201,0.146485,0.168647,1.0,0.02840912,0.003601,0.008554,0.0,0.169321,0.097046,0.445572,0.00015,0.163232,1
3265,0.001551,0.0,0.0,0.0,0.00383,0.0,1.0,0.0,0.000257,0.003754,2e-06,0.002906,1
3284,0.001551,0.0,0.0,0.0,0.003906,0.0,1.0,0.0,0.000257,0.003632,2e-06,0.002906,1


## PCA WITH ZISHAN

In [260]:
pca_df = pd.read_csv('./data/sample_pca_test.csv')

In [265]:
pca_df.shape

(4000, 19)

In [264]:
pca_df['Label'].value_counts()

0    3608
1     392
Name: Label, dtype: int64

In [262]:
pca_df[incorrect_twolabel_idx]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,Label
543,-0.69928,0.536016,0.099406,0.296172,-0.030088,-0.043975,0.026154,-0.142628,0.078807,0.01207,-0.007546,0.006533,-0.082427,-0.018856,0.014345,-0.033516,0.027057,-0.022204,0
605,-0.326403,-0.960372,-0.196894,0.045892,-0.502529,0.578415,0.222577,0.015269,-0.060615,-0.023221,-0.01028,-0.016808,-0.02914,-0.005123,0.032271,-0.084165,0.041195,0.00415,0
1146,-0.742576,0.618895,0.096048,0.475762,-0.029228,-0.024974,-0.010756,-0.324608,0.588025,0.061011,0.036499,-0.002365,-0.151749,-0.007891,0.03559,-0.052594,0.060702,-0.038425,0
1876,-0.325169,-0.959099,-0.199298,0.045988,-0.502822,0.579819,0.221138,0.012802,-0.058163,-0.026946,-0.006538,-0.014236,-0.004334,0.003987,0.02652,-0.08046,0.034919,0.003708,0
2464,-0.297795,-0.794256,-0.118694,0.004892,-0.00453,-0.079659,-0.28763,0.017805,-0.062198,-0.008251,-0.200533,-0.032424,-0.019212,-0.009085,0.009743,-0.023946,0.011255,-0.000897,0
2584,-0.299788,-0.788474,-0.118841,-0.006989,-0.015315,-0.067762,-0.265721,0.022541,-0.065481,0.012513,-0.191929,-0.036803,-0.043544,0.00438,0.008305,-0.018841,-0.017912,-0.050077,0
2665,-0.032323,-0.715937,-0.606624,-0.121761,1.282308,0.451636,0.256235,0.014913,-0.00373,0.250798,0.089385,-0.141799,-0.086782,0.7675,-0.117314,0.415653,0.079065,-0.025331,0
3201,-0.480385,0.691755,-0.190966,0.143876,-0.000306,0.024129,-0.002778,-0.108111,0.125467,0.038387,-0.000717,0.011689,-0.065764,-0.009786,0.012102,-0.000806,0.034421,-0.014888,0
3265,-0.327387,-0.954122,-0.198746,0.032571,-0.51244,0.590624,0.242851,0.020329,-0.065137,0.000528,0.00399,-0.021279,-0.056913,0.012066,0.031859,-0.079921,0.013561,-0.044888,0
3284,-0.324859,-0.959767,-0.199639,0.044136,-0.501331,0.578815,0.21928,0.014722,-0.061912,-0.020214,0.000925,-0.015487,-0.026703,0.00094,0.032442,-0.086213,0.041209,0.004328,0


In [255]:
pca_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,Label
0,-0.143753,-0.782227,-0.055468,0.065188,-0.01813,-0.113716,-0.243861,0.006122,-0.073843,-0.011362,-0.193028,-0.032397,-0.018387,-0.002426,0.009735,-0.029356,0.023656,-0.0078,1
1,-0.331393,-0.883429,-0.17453,0.279142,1.201023,0.336282,0.245202,-0.177226,0.296793,-0.062374,-0.16111,-0.047008,-0.029037,0.054116,0.040903,-0.19877,0.125681,-0.001151,1
2,-0.470115,-0.155329,0.254311,-0.425293,-0.012344,-0.217913,0.135217,0.105302,0.099356,0.032333,0.038704,0.006909,-0.004991,-0.034499,-0.037582,0.044955,-0.011845,0.005155,1
3,-0.109564,0.625984,0.458123,0.411551,0.008078,0.17552,-0.122121,0.466561,-0.144026,-0.097631,0.03729,-0.171365,0.008826,-0.062001,0.061098,-0.035839,0.035015,-0.037644,1
4,-0.676691,0.464214,0.111799,0.16393,-0.03068,-0.064653,0.053822,-0.008131,-0.313954,-0.019238,-0.019592,0.017496,-0.033681,-0.035608,-0.003461,-0.011992,-0.00399,-0.009252,1


In [None]:
X=pca_df.drop('Label', axis=1)
y= pca_df['Label']

In [133]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [134]:

base_models = [
    ('xgboost', xgb.XGBClassifier()),
    ('lightgbm', lgb.LGBMClassifier()),
    ('adaboost', AdaBoostClassifier()),
    ('logistic', LogisticRegression()),
    ('naive_bayes', MultinomialNB())
]
meta_model = LogisticRegression()
stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=meta_model)
stacking_classifier.fit(X_train, y_train)
y_predict_stacking = stacking_classifier.predict(X_test)
stacking_accuracy = accuracy_score(y_test, y_predict_stacking)

print('Accuracy of Stacking Classifier: ' + str(stacking_accuracy))

Accuracy of Stacking Classifier: 0.903


In [135]:
incorrect_predicted_df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
543,0.001149592,0.002742,0.00243,6.1e-05,1.981395e-05,0.001841,0.0,0.001481,0.001799,0.00447,...,0.533333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
605,1.491667e-06,0.0,0.000101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1146,0.0007054167,0.0,0.000101,0.0,2.790698e-07,0.0,0.0,0.0,0.0,0.000516,...,0.733333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1876,4.75e-07,0.0,0.000101,2e-06,2.790698e-07,0.000257,0.003026,0.00101,0.0,0.000516,...,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2464,1.783333e-06,0.000144,0.0,4e-06,0.0,0.000257,0.003026,0.00101,0.0,0.0,...,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2584,8e-07,0.0,0.000101,2e-06,2.790698e-07,0.000257,0.003026,0.00101,0.0,0.000516,...,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2665,2.5e-08,0.0,0.000101,0.000692,0.0,0.084889,1.0,0.33379,0.0,0.0,...,0.533333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3201,0.009037083,0.000289,0.000405,0.00015,0.0001054419,0.018365,0.0,0.024071,0.035135,0.194893,...,0.533333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3265,6.166667e-07,0.0,0.000202,2e-06,5.581395e-07,0.000257,0.003026,0.00101,0.0,0.000516,...,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3284,3.5e-07,0.0,0.000101,2e-06,2.790698e-07,0.000257,0.003026,0.00101,0.0,0.000516,...,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [138]:
stacking_classifier.predict(incorrect_predicted_df.drop('Label',axis=1))

ValueError: Feature shape mismatch, expected: 18, got 77