In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


#### ALL labels Algorithm Implementation

In [16]:
delimiter = "------------------"
with open("./data/features_with_all_labels.txt", "r") as file:
    lines = file.readlines()

features_dict = {}

for line in lines[1:]:
    line = line.strip()
    splited_lines = line.split(delimiter)

    features_dict[splited_lines[0]]=splited_lines[1]

In [3]:
df = pd.read_csv('./data/final_data_all_labels.csv')

In [4]:
X = df.drop("Label", axis=1)
y = df["Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
from sklearn.pipeline import Pipeline
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV


pipeline = Pipeline([
    ('lgbm', lgb.LGBMClassifier())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'lgbm__n_estimators': [50, 100, 200],
    'lgbm__learning_rate': [0.01, 0.1, 0.2],
    'lgbm__max_depth': [3, 5, 7]
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

lgb_classifier = grid_search.best_estimator_



Best Parameters: {'lgbm__learning_rate': 0.1, 'lgbm__max_depth': 5, 'lgbm__n_estimators': 50}
Best Accuracy: 0.9890000000000001
Test Accuracy: 0.99


In [6]:
from sklearn.ensemble import AdaBoostClassifier


pipeline = Pipeline([
    ('adaboost', AdaBoostClassifier())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'adaboost__n_estimators': [50, 100, 200],
    'adaboost__learning_rate': [0.01, 0.1, 0.2],
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

ada_classifier = grid_search.best_estimator_




Best Parameters: {'adaboost__learning_rate': 0.01, 'adaboost__n_estimators': 50}
Best Accuracy: 0.6016666666666668
Test Accuracy: 0.622


In [7]:
import warnings
warnings.filterwarnings("ignore")

In [8]:
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ('logreg', LogisticRegression())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'logreg__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'logreg__penalty': ['l1', 'l2'],
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

logistic_classifier = grid_search.best_estimator_


Best Parameters: {'logreg__C': 100, 'logreg__penalty': 'l2'}
Best Accuracy: 0.8876666666666667
Test Accuracy: 0.883


In [9]:
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([
    ('nb', MultinomialNB())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'nb__alpha': [0.1, 0.5, 1.0]
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

naivebayes_classifier = grid_search.best_estimator_



Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.7416666666666666
Test Accuracy: 0.75


In [10]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris  # Replace with your actual dataset

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with XGBoost classifier
pipeline = Pipeline([
    ('xgb', xgb.XGBClassifier())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'xgb__n_estimators': [50, 100, 200],
    'xgb__learning_rate': [0.01, 0.1, 0.2],
    'xgb__max_depth': [3, 5, 7]
}

# Create GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

xgb_classifier = grid_search.best_estimator_

Best Parameters: {'xgb__learning_rate': 0.1, 'xgb__max_depth': 7, 'xgb__n_estimators': 50}
Best Accuracy: 0.9896874999999999
Test Accuracy: 0.99


In [11]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB


base_models = [
    ('xgboost', xgb_classifier),
    ('lightgbm', lgb_classifier),
    ('adaboost', ada_classifier),
    ('logistic', logistic_classifier),
    ('naive_bayes', naivebayes_classifier)
]
meta_model = LogisticRegression()
stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=meta_model)
stacking_classifier.fit(X_train, y_train)
y_predict_stacking = stacking_classifier.predict(X_test)
stacking_accuracy = accuracy_score(y_test, y_predict_stacking)
print('Accuracy of Stacking Classifier: ' + str(stacking_accuracy))

Accuracy of Stacking Classifier: 0.98875


#### model with features

In [36]:
features_dict.keys()

dict_keys(['variance_threshold', 'lasso', 'random_forest_feature_importance', 'recursive_feature_elimination', 'permutation_importance'])

In [69]:
df_variance_threshold = df[[x for x in features_dict['variance_threshold'].split(',')]]
df_lasso = df[[x for x in features_dict['lasso'].split(',')]]
df_random_forest_feature_importance = df[[x for x in features_dict['random_forest_feature_importance'].split(',')]]
df_permutation_importance = df[[x for x in features_dict['permutation_importance'].split(',')]]

In [70]:
df_variance_threshold

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,7.121764e-01,0.001010,0.000607,0.000122,5.393023e-04,0.015026,0.000000,0.007385,0.017604,0.497937,...,0.000167,0.533333,0.000020,0.0,0.000020,0.000020,0.716807,0.000000,0.716807,0.716807
1,1.100000e-06,0.000144,0.000000,0.000013,0.000000e+00,0.001584,0.000000,0.003114,0.003711,0.000000,...,0.000000,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,1.952333e-04,0.000000,0.000101,0.000016,3.581395e-06,0.001926,0.022693,0.007575,0.000000,0.006620,...,0.000000,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,4.452895e-01,0.000144,0.000202,0.000036,1.381395e-05,0.002483,0.023197,0.008753,0.001204,0.015990,...,0.000167,0.533333,0.000307,0.0,0.000307,0.000307,0.448739,0.000000,0.448739,0.448739
4,6.053253e-01,0.001010,0.000506,0.000020,5.398605e-04,0.000856,0.000000,0.001178,0.000802,0.753095,...,0.001001,0.333333,0.000010,0.0,0.000010,0.000010,0.305042,0.586006,0.543697,0.065990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,1.128042e-03,0.000000,0.000101,0.000016,6.651163e-06,0.001926,0.022693,0.007575,0.000000,0.012294,...,0.000000,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3996,4.416667e-07,0.000000,0.000101,0.000000,2.790698e-07,0.000000,0.000000,0.000000,0.000000,0.000516,...,0.000000,0.666667,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3997,4.665850e-03,0.000289,0.000607,0.000009,5.398605e-04,0.000856,0.000000,0.001459,0.001456,0.494756,...,0.000334,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3998,7.134503e-01,0.000866,0.000709,0.000119,5.393023e-04,0.013870,0.000000,0.008224,0.017216,0.746905,...,0.000501,0.333333,0.000109,0.0,0.000109,0.000109,0.710924,0.000000,0.710924,0.710924


In [95]:
# Add label column to every df

df_features_dict={
    "df_variance_threshold":df_variance_threshold,
    "df_lasso":df_lasso,
    "df_random_forest_feature_importance":df_random_forest_feature_importance,
    "df_permutation_importance":df_permutation_importance,
}

for df_feature in df_features_dict.keys():
    df_features_dict[df_feature]['Label'] = df['Label']

In [97]:
def train_classifier(X_train, X_test, y_train, y_test):

    print("\n")
    print("-"*40)
    print("Running LGB classifier")
    print("-"*40)
    print("\n")

    # LGB classifier
    pipeline = Pipeline([
        ('lgbm', lgb.LGBMClassifier())
    ])

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'lgbm__n_estimators': [50, 100, 200],
        'lgbm__learning_rate': [0.01, 0.1, 0.2],
        'lgbm__max_depth': [3, 5, 7]
    }

    # Create GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Print the best parameters and their corresponding accuracy
    print("Best Parameters:", grid_search.best_params_)
    print("Best Accuracy:", grid_search.best_score_)

    # Evaluate the model on the test set
    test_accuracy = grid_search.score(X_test, y_test)
    print("Test Accuracy:", test_accuracy)

    lgb_classifier = grid_search.best_estimator_

    print("\n")
    print("-"*40)
    print("Running Adaboost classifier")
    print("-"*40)
    print("\n")

    # Adaboost
    pipeline = Pipeline([
        ('adaboost', AdaBoostClassifier())
    ])

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'adaboost__n_estimators': [50, 100, 200],
        'adaboost__learning_rate': [0.01, 0.1, 0.2],
    }

    # Create GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Print the best parameters and their corresponding accuracy
    print("Best Parameters:", grid_search.best_params_)
    print("Best Accuracy:", grid_search.best_score_)

    # Evaluate the model on the test set
    test_accuracy = grid_search.score(X_test, y_test)
    print("Test Accuracy:", test_accuracy)

    ada_classifier = grid_search.best_estimator_


    # Logistic

    print("\n")
    print("-"*40)
    print("Running Logistic classifier")
    print("-"*40)
    print("\n")


    pipeline = Pipeline([
        ('logreg', LogisticRegression())
    ])

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'logreg__C': [0.001, 0.01, 0.1],
        'logreg__penalty': ['l1', 'l2'],
    }

    # Create GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Print the best parameters and their corresponding accuracy
    print("Best Parameters:", grid_search.best_params_)
    print("Best Accuracy:", grid_search.best_score_)

    # Evaluate the model on the test set
    test_accuracy = grid_search.score(X_test, y_test)
    print("Test Accuracy:", test_accuracy)

    logistic_classifier = grid_search.best_estimator_

    print("\n")
    print("-"*40)
    print("Running Naive bayes classifier")
    print("-"*40)
    print("\n")

    #Naive bayes

    pipeline = Pipeline([
        ('nb', MultinomialNB())
    ])

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'nb__alpha': [0.1, 0.5, 1.0]
    }

    # Create GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Print the best parameters and their corresponding accuracy
    print("Best Parameters:", grid_search.best_params_)
    print("Best Accuracy:", grid_search.best_score_)

    # Evaluate the model on the test set
    test_accuracy = grid_search.score(X_test, y_test)
    print("Test Accuracy:", test_accuracy)

    naivebayes_classifier = grid_search.best_estimator_

    print("\n")
    print("-"*40)
    print("Running XGB classifier")
    print("-"*40)
    print("\n")

    pipeline = Pipeline([
        ('xgb', xgb.XGBClassifier())
    ])

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'xgb__n_estimators': [50, 100],
        'xgb__learning_rate': [0.01, 0.1],
        'xgb__max_depth': [3, 5]
    }

    # Create GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Print the best parameters and their corresponding accuracy
    print("Best Parameters:", grid_search.best_params_)
    print("Best Accuracy:", grid_search.best_score_)

    # Evaluate the model on the test set
    test_accuracy = grid_search.score(X_test, y_test)
    print("Test Accuracy:", test_accuracy)

    xgb_classifier = grid_search.best_estimator_


    print("\n")
    print("-"*40)
    print("Running Stacking based classifier")
    print("-"*40)
    print("\n")

    # Stacking
    base_models = [
        ('xgboost', xgb_classifier),
        ('lightgbm', lgb_classifier),
        ('adaboost', ada_classifier),
        ('logistic', logistic_classifier),
        ('naive_bayes', naivebayes_classifier)
    ]
    meta_model = LogisticRegression()
    stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=meta_model)
    stacking_classifier.fit(X_train, y_train)
    y_predict_stacking = stacking_classifier.predict(X_test)
    stacking_accuracy = accuracy_score(y_test, y_predict_stacking)
    print('Accuracy of Stacking Classifier: ' + str(stacking_accuracy))



In [98]:
X=df_variance_threshold.drop('Label',axis=1)
y=df_variance_threshold['Label']
X_train, X_test, y_train, y_test = train_test_split(temp_df, y, random_state=42)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


Best Parameters: {'lgbm__learning_rate': 0.1, 'lgbm__max_depth': 7, 'lgbm__n_estimators': 200}
Best Accuracy: 0.9896875000000002
Test Accuracy: 0.98125


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.01, 'adaboost__n_estimators': 50}
Best Accuracy: 0.60125
Test Accuracy: 0.62875


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Accuracy: 0.776875
Test Accuracy: 0.825


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.74
Test Accuracy: 0.76


----------------------------------------
Running XGB classifier
-------------------

In [99]:
X=df_permutation_importance.drop('Label',axis=1)
y=df_permutation_importance['Label']
X_train, X_test, y_train, y_test = train_test_split(temp_df, y, random_state=42)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


Best Parameters: {'lgbm__learning_rate': 0.1, 'lgbm__max_depth': 7, 'lgbm__n_estimators': 200}
Best Accuracy: 0.9893750000000001
Test Accuracy: 0.98125


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.01, 'adaboost__n_estimators': 50}
Best Accuracy: 0.60125
Test Accuracy: 0.62875


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Accuracy: 0.6496875000000001
Test Accuracy: 0.6525


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.566875
Test Accuracy: 0.5975


----------------------------------------
Running XGB classifier
--

In [100]:
X=df_random_forest_feature_importance.drop('Label',axis=1)
y=df_random_forest_feature_importance['Label']
X_train, X_test, y_train, y_test = train_test_split(temp_df, y, random_state=42)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------




Best Parameters: {'lgbm__learning_rate': 0.1, 'lgbm__max_depth': 7, 'lgbm__n_estimators': 200}
Best Accuracy: 0.9878125000000001
Test Accuracy: 0.9775


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.01, 'adaboost__n_estimators': 50}
Best Accuracy: 0.6881250000000001
Test Accuracy: 0.7125


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.001, 'logreg__penalty': 'l2'}
Best Accuracy: 0.5878124999999998
Test Accuracy: 0.6125


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.5821875000000001
Test Accuracy: 0.60875


----------------------------------------
Running XGB classifier
----------------------------------------


Best Parameters: {'xgb__learning_rate': 0.1, '

In [101]:
X=df_lasso.drop('Label',axis=1)
y=df_lasso['Label']
X_train, X_test, y_train, y_test = train_test_split(temp_df, y, random_state=42)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


Best Parameters: {'lgbm__learning_rate': 0.01, 'lgbm__max_depth': 7, 'lgbm__n_estimators': 200}
Best Accuracy: 0.8521875
Test Accuracy: 0.855


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.2, 'adaboost__n_estimators': 200}
Best Accuracy: 0.6915625000000001
Test Accuracy: 0.7175


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Accuracy: 0.60375
Test Accuracy: 0.60625


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.54
Test Accuracy: 0.56875


----------------------------------------
Running XGB classifier
---------------