In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, StackingClassifier
from sklearn.pipeline import Pipeline
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import random
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings("ignore")

#### ALL labels Algorithm Implementation

In [3]:
# Parse the feature-selection results for the all-labels dataset.
# Each line after the first has the form "<method><delimiter><comma-separated features>".
subset_feature_dict = {}
delimiter = "------------------"
with open("./data/features_with_all_labels.txt", "r") as file:
    lines = file.readlines()

features_dict = {}

# The first line is skipped, as in the original parsing.
for line in lines[1:]:
    line = line.strip()
    if not line:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
    splited_lines = line.split(delimiter)
    if len(splited_lines) < 2:
        # Fail with a readable message instead of a bare IndexError.
        raise ValueError(f"Malformed feature line (delimiter not found): {line!r}")
    features_dict[splited_lines[0]] = splited_lines[1]

# Only the lasso subset is carried forward from the all-labels results.
subset_feature_dict['lasso'] = features_dict['lasso']

In [4]:
# Display the feature subsets collected so far.
subset_feature_dict

{'lasso': 'Fwd IAT Max,PSH Flag Count,ACK Flag Count'}

In [5]:
# Load the all-labels dataset from CSV.
df = pd.read_csv('./data/final_data_all_labels.csv')

In [6]:
# Separate the target from the features, then hold out a test set
# (train_test_split's default test size, fixed seed).
y = df["Label"]
X = df.drop(columns=["Label"])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Keep a reference to this hold-out set under separate names.
all_X_test = X_test
all_y_test = y_test

In [7]:
# Hyper-parameter candidates for the LightGBM step.
param_grid = {
    'lgbm__n_estimators': [50, 100, 200],
    'lgbm__learning_rate': [0.01, 0.1, 0.2],
    'lgbm__max_depth': [3, 5, 7]
}

# Single-step pipeline so the grid keys can address the estimator by name.
pipeline = Pipeline(steps=[('lgbm', lgb.LGBMClassifier())])

# 5-fold cross-validated search scored by accuracy.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Best pipeline found by the search.
lgb_classifier = grid_search.best_estimator_

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Accuracy of the best pipeline on the held-out split.
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Best Parameters: {'lgbm__learning_rate': 0.1, 'lgbm__max_depth': 5, 'lgbm__n_estimators': 50}
Best Accuracy: 0.9890000000000001
Test Accuracy: 0.99


In [8]:
# Hyper-parameter candidates for the AdaBoost step.
param_grid = {
    'adaboost__n_estimators': [50, 100, 200],
    'adaboost__learning_rate': [0.01, 0.1, 0.2],
}

# Single-step pipeline so the grid keys can address the estimator by name.
pipeline = Pipeline(steps=[('adaboost', AdaBoostClassifier())])

# 5-fold cross-validated search scored by accuracy.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Best pipeline found by the search.
ada_classifier = grid_search.best_estimator_

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Accuracy of the best pipeline on the held-out split.
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)


Best Parameters: {'adaboost__learning_rate': 0.01, 'adaboost__n_estimators': 50}
Best Accuracy: 0.6016666666666668
Test Accuracy: 0.622


In [9]:

pipeline = Pipeline([
    ('logreg', LogisticRegression())
])

# Define the parameter grid for GridSearchCV.
# The default 'lbfgs' solver does not support penalty='l1': those fits raise,
# GridSearchCV records them as NaN, and the notebook-wide warnings filter hides
# the failure, so the l1 candidates were never actually evaluated.
# The l2 candidates keep the default solver; l1 is paired with 'saga', which supports it.
C_values = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = [
    {'logreg__C': C_values, 'logreg__penalty': ['l2']},
    {'logreg__C': C_values, 'logreg__penalty': ['l1'], 'logreg__solver': ['saga']},
]

# Create GridSearchCV (5-fold CV, accuracy)
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

# Best pipeline found by the search
logistic_classifier = grid_search.best_estimator_


Best Parameters: {'logreg__C': 100, 'logreg__penalty': 'l2'}
Best Accuracy: 0.8876666666666667
Test Accuracy: 0.883


In [10]:

# Smoothing candidates for the multinomial naive Bayes step.
param_grid = {
    'nb__alpha': [0.1, 0.5, 1.0]
}

# Single-step pipeline so the grid keys can address the estimator by name.
pipeline = Pipeline(steps=[('nb', MultinomialNB())])

# 5-fold cross-validated search scored by accuracy.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Best pipeline found by the search.
naivebayes_classifier = grid_search.best_estimator_

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Accuracy of the best pipeline on the held-out split.
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)



Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.7416666666666666
Test Accuracy: 0.75


In [None]:

# Tune XGBoost on the train/test split that is already in scope.
# No new train_test_split here: re-splitting with a different test_size would
# silently replace X_train / X_test for every later cell and leave this model
# tuned and scored on different rows than the other base models (and than
# all_X_test / all_y_test).

# Create a pipeline with XGBoost classifier
pipeline = Pipeline([
    ('xgb', xgb.XGBClassifier())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'xgb__n_estimators': [50, 100],
    # 'xgb__learning_rate': [0.01, 0.1, 0.2],
    # 'xgb__max_depth': [3, 5, 7]
}

# Create GridSearchCV (5-fold CV, accuracy)
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the model on the test set
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

# Best pipeline found by the search
xgb_classifier = grid_search.best_estimator_


In [11]:
# Stack the tuned pipelines; a logistic regression combines their outputs.
meta_model = LogisticRegression()
base_models = [
    # ('xgboost', xgb_classifier),
    ('lightgbm', lgb_classifier),
    ('adaboost', ada_classifier),
    ('logistic', logistic_classifier),
    ('naive_bayes', naivebayes_classifier)
]

# fit() returns the estimator itself, so construction and fitting can be chained.
stacking_classifier = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
).fit(X_train, y_train)

# Score the ensemble on the held-out split.
y_predict_stacking = stacking_classifier.predict(X_test)
stacking_accuracy = accuracy_score(y_test, y_predict_stacking)
print('Accuracy of Stacking Classifier: ' + str(stacking_accuracy))

# Keep the all-labels ensemble under its own name.
all_stacking_classifier = stacking_classifier

Accuracy of Stacking Classifier: 0.991


#### Models trained on each selected feature subset

In [12]:
# Display the parsed mapping of feature-selection method -> comma-separated feature names.
features_dict

{'variance_threshold': 'Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt

In [14]:
# One frame per feature-selection method, holding only that method's columns.
# .copy() makes each frame independent of `df`, so adding a 'Label' column to it
# later is an ordinary assignment rather than a write into a flagged slice
# (which triggers pandas' SettingWithCopyWarning).
df_variance_threshold = df[features_dict['variance_threshold'].split(',')].copy()
df_lasso = df[features_dict['lasso'].split(',')].copy()
df_random_forest_feature_importance = df[features_dict['random_forest_feature_importance'].split(',')].copy()
df_permutation_importance = df[features_dict['permutation_importance'].split(',')].copy()

In [15]:
# Attach the target column to each feature-subset frame.

df_features_dict = dict(
    df_variance_threshold=df_variance_threshold,
    df_lasso=df_lasso,
    df_random_forest_feature_importance=df_random_forest_feature_importance,
    df_permutation_importance=df_permutation_importance,
)

# The dict values are the same objects as the named frames above,
# so the column is added to those frames in place.
for subset_frame in df_features_dict.values():
    subset_frame['Label'] = df['Label']

In [16]:
def _print_banner(title):
    """Print the section banner that precedes each model's results."""
    print("\n")
    print("-"*40)
    print(title)
    print("-"*40)
    print("\n")


def _tune_and_report(title, step_name, estimator, param_grid,
                     X_train, X_test, y_train, y_test):
    """Grid-search one estimator, print its scores and return the best pipeline.

    The estimator is wrapped in a single-step Pipeline named ``step_name`` so that
    ``param_grid`` keys of the form ``"<step_name>__<param>"`` address it. The search
    uses 5-fold cross-validation scored by accuracy. Prints the best parameters,
    the best mean CV accuracy and the accuracy on (X_test, y_test).
    """
    _print_banner(title)

    pipeline = Pipeline([(step_name, estimator)])
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    print("Best Parameters:", grid_search.best_params_)
    print("Best Accuracy:", grid_search.best_score_)

    test_accuracy = grid_search.score(X_test, y_test)
    print("Test Accuracy:", test_accuracy)

    return grid_search.best_estimator_


def train_classifier(X_train, X_test, y_train, y_test):
    """Tune five classifiers, then fit and score a stacking ensemble of them.

    LightGBM, AdaBoost, logistic regression, multinomial naive Bayes and XGBoost
    are each tuned with GridSearchCV (5-fold CV, accuracy) on the training data;
    their best parameters, CV accuracy and test accuracy are printed. The five
    best pipelines then serve as base estimators of a StackingClassifier with a
    LogisticRegression final estimator, which is fitted on the training data and
    whose test accuracy is printed.

    Parameters:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels used only for scoring.

    Returns:
        None. All results are reported through print().
    """
    lgb_classifier = _tune_and_report(
        "Running LGB classifier", 'lgbm', lgb.LGBMClassifier(),
        {
            'lgbm__n_estimators': [50, 100, 200],
            'lgbm__learning_rate': [0.01, 0.1, 0.2],
            'lgbm__max_depth': [3, 5, 7]
        },
        X_train, X_test, y_train, y_test,
    )

    ada_classifier = _tune_and_report(
        "Running Adaboost classifier", 'adaboost', AdaBoostClassifier(),
        {
            'adaboost__n_estimators': [50, 100, 200],
            'adaboost__learning_rate': [0.01, 0.1, 0.2],
        },
        X_train, X_test, y_train, y_test,
    )

    # The default 'lbfgs' solver does not support penalty='l1'; those fits would
    # fail and be scored as NaN by GridSearchCV (hidden by the global warnings
    # filter). l2 keeps the default solver; l1 is paired with 'saga'.
    logreg_C = [0.001, 0.01, 0.1]
    logistic_classifier = _tune_and_report(
        "Running Logistic classifier", 'logreg', LogisticRegression(),
        [
            {'logreg__C': logreg_C, 'logreg__penalty': ['l2']},
            {'logreg__C': logreg_C, 'logreg__penalty': ['l1'], 'logreg__solver': ['saga']},
        ],
        X_train, X_test, y_train, y_test,
    )

    naivebayes_classifier = _tune_and_report(
        "Running Naive bayes classifier", 'nb', MultinomialNB(),
        {
            'nb__alpha': [0.1, 0.5, 1.0]
        },
        X_train, X_test, y_train, y_test,
    )

    xgb_classifier = _tune_and_report(
        "Running XGB classifier", 'xgb', xgb.XGBClassifier(),
        {
            'xgb__n_estimators': [50, 100],
            'xgb__learning_rate': [0.01, 0.1],
            'xgb__max_depth': [3, 5]
        },
        X_train, X_test, y_train, y_test,
    )

    _print_banner("Running Stacking based classifier")

    # Stacking: the tuned pipelines are the base estimators.
    base_models = [
        ('xgboost', xgb_classifier),
        ('lightgbm', lgb_classifier),
        ('adaboost', ada_classifier),
        ('logistic', logistic_classifier),
        ('naive_bayes', naivebayes_classifier)
    ]
    meta_model = LogisticRegression()
    stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=meta_model)
    stacking_classifier.fit(X_train, y_train)
    y_predict_stacking = stacking_classifier.predict(X_test)
    stacking_accuracy = accuracy_score(y_test, y_predict_stacking)
    print('Accuracy of Stacking Classifier: ' + str(stacking_accuracy))



In [17]:
# Evaluate the variance-threshold feature subset on an 80/20 split.
# (A preceding default-size split was dropped: it was overwritten by this one
# before being used.)
X=df_variance_threshold.drop('Label',axis=1)
y=df_variance_threshold['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------




Best Parameters: {'lgbm__learning_rate': 0.1, 'lgbm__max_depth': 7, 'lgbm__n_estimators': 200}
Best Accuracy: 0.9896875000000002
Test Accuracy: 0.98125


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.01, 'adaboost__n_estimators': 50}
Best Accuracy: 0.60125
Test Accuracy: 0.62875


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Accuracy: 0.776875
Test Accuracy: 0.825


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.74
Test Accuracy: 0.76


----------------------------------------
Running XGB classifier
----------------------------------------


Best Parameters: {'xgb__learning_rate': 0.1, 'xgb__max_depth': 5, 'xgb__n_estimators'

In [18]:
# Evaluate the permutation-importance feature subset on an 80/20 split.
# (A preceding default-size split was dropped: it was overwritten by this one
# before being used.)
X=df_permutation_importance.drop('Label',axis=1)
y=df_permutation_importance['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


Best Parameters: {'lgbm__learning_rate': 0.1, 'lgbm__max_depth': 7, 'lgbm__n_estimators': 200}
Best Accuracy: 0.9893750000000001
Test Accuracy: 0.98125


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.01, 'adaboost__n_estimators': 50}
Best Accuracy: 0.60125
Test Accuracy: 0.62875


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Accuracy: 0.6496875000000001
Test Accuracy: 0.6525


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.566875
Test Accuracy: 0.5975


----------------------------------------
Running XGB classifier
--

In [19]:
# Evaluate the random-forest-importance feature subset on an 80/20 split.
# NOTE(review): this cell used to stop after a default-size split that nothing
# consumed; the split size and the train_classifier call now mirror the sibling
# feature-subset cells. Confirm that omitting the evaluation was not intentional.
X=df_random_forest_feature_importance.drop('Label',axis=1)
y=df_random_forest_feature_importance['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)


In [20]:

print("Model running on multiclass labels")

# Evaluate the lasso feature subset.
X=df_lasso.drop('Label',axis=1)
y=df_lasso['Label']

# Split the data into training and testing sets (80/20).
# (A preceding default-size split was dropped: it was overwritten by this one
# before being used.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)

Model running on multiclass labels


----------------------------------------
Running LGB classifier
----------------------------------------


Best Parameters: {'lgbm__learning_rate': 0.01, 'lgbm__max_depth': 7, 'lgbm__n_estimators': 200}
Best Accuracy: 0.8521875
Test Accuracy: 0.855


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.2, 'adaboost__n_estimators': 200}
Best Accuracy: 0.6915625000000001
Test Accuracy: 0.7175


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Accuracy: 0.60375
Test Accuracy: 0.60625


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.54
Test Accuracy: 0.56875


----------------------------------------
Run

#### TWO labels Algorithm Implementation

In [48]:
# Rebind `df` to the two-label dataset; the following cells slice this frame.
df = pd.read_csv('./data/final_data_two_labels.csv')

In [49]:
# Parse the feature-selection results for the two-label dataset.
# Each line after the first has the form "<method><delimiter><comma-separated features>".
delimiter = "------------------"
with open("./data/features_with_two_labels.txt", "r") as file:
    lines = file.readlines()

features_dict = {}
# The first line is skipped, as in the original parsing.
for line in lines[1:]:
    line = line.strip()
    if not line:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
    splited_lines = line.split(delimiter)
    if len(splited_lines) < 2:
        # Fail with a readable message instead of a bare IndexError.
        raise ValueError(f"Malformed feature line (delimiter not found): {line!r}")
    features_dict[splited_lines[0]] = splited_lines[1]


In [50]:
# One frame per feature-selection method, holding only that method's columns.
# .copy() makes each frame independent of `df`, so adding a 'Label' column to it
# later is an ordinary assignment rather than a write into a flagged slice
# (which triggers pandas' SettingWithCopyWarning).
df_variance_threshold_two = df[features_dict['variance_threshold'].split(',')].copy()
df_random_forest_feature_importance_two = df[features_dict['random_forest_feature_importance'].split(',')].copy()
df_recursive_feature_elimination_two = df[features_dict['recursive_feature_elimination'].split(',')].copy()
df_permutation_importance_two = df[features_dict['permutation_importance'].split(',')].copy()

In [51]:
# Name -> frame lookup for the two-label feature subsets.
df_features_dict = {
    "df_variance_threshold": df_variance_threshold_two,
    "df_random_forest_feature_importance": df_random_forest_feature_importance_two,
    "df_permutation_importance": df_permutation_importance_two,
    "recursive_feature_elimination": df_recursive_feature_elimination_two,
}

# Attach the target column to every two-label subset frame.
for two_label_frame in (
    df_variance_threshold_two,
    df_random_forest_feature_importance_two,
    df_recursive_feature_elimination_two,
    df_permutation_importance_two,
):
    two_label_frame['Label'] = df['Label']

In [52]:
# Display the column names of the permutation-importance subset frame.
df_permutation_importance_two.columns

Index(['Init_Win_bytes_forward', 'Init_Win_bytes_backward',
       'min_seg_size_forward', 'Flow IAT Min', 'Bwd Packet Length Std',
       'URG Flag Count', 'Fwd IAT Mean', 'Fwd IAT Total',
       'Total Backward Packets', 'Bwd IAT Max', 'Bwd IAT Mean',
       'Bwd Packet Length Max', 'Fwd Header Length.1', 'Min Packet Length',
       'Subflow Bwd Packets', 'Bwd IAT Min', 'Active Std', 'Idle Std',
       'Label'],
      dtype='object')

In [53]:
# Two-label run on the variance-threshold feature subset.
y = df_variance_threshold_two['Label']
X = df_variance_threshold_two.drop(columns=['Label'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


Best Parameters: {'lgbm__learning_rate': 0.2, 'lgbm__max_depth': 5, 'lgbm__n_estimators': 200}
Best Accuracy: 0.9886666666666667
Test Accuracy: 0.985


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.2, 'adaboost__n_estimators': 200}
Best Accuracy: 0.9416666666666668
Test Accuracy: 0.939


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Accuracy: 0.8170000000000002
Test Accuracy: 0.824


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 1.0}
Best Accuracy: 0.44700000000000006
Test Accuracy: 0.425


----------------------------------------
Running X

In [54]:
# Two-label run on the random-forest-importance feature subset.
y = df_random_forest_feature_importance_two['Label']
X = df_random_forest_feature_importance_two.drop(columns=['Label'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


Best Parameters: {'lgbm__learning_rate': 0.2, 'lgbm__max_depth': 7, 'lgbm__n_estimators': 200}
Best Accuracy: 0.9886666666666667
Test Accuracy: 0.989


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.2, 'adaboost__n_estimators': 200}
Best Accuracy: 0.9626666666666667
Test Accuracy: 0.958


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Accuracy: 0.7433333333333334
Test Accuracy: 0.809


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.711
Test Accuracy: 0.731


----------------------------------------
Running XGB classifier


In [55]:
# Two-label run on the recursive-feature-elimination feature subset.
y = df_recursive_feature_elimination_two['Label']
X = df_recursive_feature_elimination_two.drop(columns=['Label'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


Best Parameters: {'lgbm__learning_rate': 0.2, 'lgbm__max_depth': 7, 'lgbm__n_estimators': 100}
Best Accuracy: 0.9880000000000001
Test Accuracy: 0.987


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.2, 'adaboost__n_estimators': 200}
Best Accuracy: 0.9629999999999999
Test Accuracy: 0.957


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Accuracy: 0.63
Test Accuracy: 0.647


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 0.5}
Best Accuracy: 0.6486666666666666
Test Accuracy: 0.649


----------------------------------------
Running XGB classifier
-

In [56]:
# Two-label run on the permutation-importance feature subset.
y = df_permutation_importance_two['Label']
X = df_permutation_importance_two.drop(columns=['Label'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)



----------------------------------------
Running LGB classifier
----------------------------------------


Best Parameters: {'lgbm__learning_rate': 0.2, 'lgbm__max_depth': 7, 'lgbm__n_estimators': 200}
Best Accuracy: 0.9886666666666667
Test Accuracy: 0.992


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.2, 'adaboost__n_estimators': 200}
Best Accuracy: 0.9506666666666665
Test Accuracy: 0.94


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Accuracy: 0.6699999999999999
Test Accuracy: 0.63


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 0.5}
Best Accuracy: 0.5896666666666667
Test Accuracy: 0.626


----------------------------------------
Running XGB 

In [57]:
# Build the feature subsets used for the PCA step: copy every entry of the
# current features_dict except 'lasso' and 'variance_threshold'.
subset_feature_dict.update(
    (method, feature_csv)
    for method, feature_csv in features_dict.items()
    if method not in ('lasso', 'variance_threshold')
)

In [58]:
# # TODO separate into columns
# print("Model running on two labels")
# X = df.drop("Label", axis=1)
# y = df["Label"]
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [59]:
# base_models = [
#     ('xgboost', xgb.XGBClassifier()),
#     ('lightgbm', lgb.LGBMClassifier()),
#     ('adaboost', AdaBoostClassifier()),
#     ('logistic', LogisticRegression()),
#     ('naive_bayes', MultinomialNB())
# ]
# meta_model = LogisticRegression()
# stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=meta_model)
# stacking_classifier.fit(X_train, y_train)
# y_predict_stacking = stacking_classifier.predict(X_test)
# stacking_accuracy = accuracy_score(y_test, y_predict_stacking)
# print('Accuracy of Stacking Classifier: ' + str(stacking_accuracy))

### PCA Data preparation

In [60]:
# Display the feature subsets gathered for the PCA step.
subset_feature_dict

{'lasso': 'Fwd IAT Max,PSH Flag Count,ACK Flag Count',
 'random_forest_feature_importance': 'Init_Win_bytes_forward,Init_Win_bytes_backward,Fwd Packet Length Min,Fwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Avg Bwd Segment Size,Min Packet Length,Packet Length Std,Avg Fwd Segment Size,Bwd Packet Length Max,Packet Length Mean,Average Packet Size,Total Length of Fwd Packets,Subflow Fwd Bytes,Fwd Header Length.1,Fwd Header Length,Packet Length Variance',
 'recursive_feature_elimination': 'Total Length of Fwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Packets/s,Fwd Header Length,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Subflow Fwd Packets,Subflow Fwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data

In [61]:
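# TODO(review): purpose unclear — the snippet below would select the rows of `df` whose label differs from the stacking prediction.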
# y_pred = stacking_classifier.predict(df.drop('Label',axis=1))
# incorrect_twolabel_idx = (df['Label'] != y_pred)
# incorrect_predicted_df = df[incorrect_twolabel_idx]

In [62]:
# Final feature list: features chosen by all three importance-based methods,
# plus every lasso feature. Empty names (e.g. from a trailing comma) are dropped.
imp1 = {x for x in subset_feature_dict['random_forest_feature_importance'].split(',') if x}
imp2 = {x for x in subset_feature_dict['recursive_feature_elimination'].split(',') if x}
imp3 = {x for x in subset_feature_dict['permutation_importance'].split(',') if x}
imp4 = {x for x in subset_feature_dict['lasso'].split(',') if x}

# sorted() instead of list(): the iteration order of a set of strings changes
# between kernel sessions (string hash randomisation), which would make the
# column order of the training frame differ from run to run.
finalset = sorted((imp1 & imp2 & imp3) | imp4)
print(finalset)
print(len(finalset))

['Fwd IAT Max', 'Init_Win_bytes_backward', 'Min Packet Length', 'Init_Win_bytes_forward', 'PSH Flag Count', 'Fwd Header Length.1', 'Bwd Packet Length Max', 'ACK Flag Count']
8


In [36]:
# Load the two-label dataset into its own frame.
df_twolabel= pd.read_csv("./data/final_data_two_labels.csv")

In [37]:
# Two-label run restricted to the combined final feature list.
y = df_twolabel['Label']
X = df_twolabel[finalset]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
train_classifier(X_train, X_test, y_train, y_test)




----------------------------------------
Running LGB classifier
----------------------------------------


Best Parameters: {'lgbm__learning_rate': 0.2, 'lgbm__max_depth': 7, 'lgbm__n_estimators': 100}
Best Accuracy: 0.9896666666666667
Test Accuracy: 0.99


----------------------------------------
Running Adaboost classifier
----------------------------------------


Best Parameters: {'adaboost__learning_rate': 0.2, 'adaboost__n_estimators': 200}
Best Accuracy: 0.9390000000000001
Test Accuracy: 0.935


----------------------------------------
Running Logistic classifier
----------------------------------------


Best Parameters: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best Accuracy: 0.7836666666666667
Test Accuracy: 0.808


----------------------------------------
Running Naive bayes classifier
----------------------------------------


Best Parameters: {'nb__alpha': 0.1}
Best Accuracy: 0.6
Test Accuracy: 0.624


----------------------------------------
Running XGB classifier
---

In [63]:
# Stacking ensemble built from untuned, default-configured base estimators.
# Uses X_train / y_train / X_test / y_test as left in the notebook's global
# state by the most recently executed split cell.
meta_model = LogisticRegression()
base_models = [
    ('xgboost', xgb.XGBClassifier()),
    ('lightgbm', lgb.LGBMClassifier()),
    ('adaboost', AdaBoostClassifier()),
    ('logistic', LogisticRegression()),
    ('naive_bayes', MultinomialNB())
]

# fit() returns the estimator itself, so construction and fitting can be chained.
stacking_classifier = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
).fit(X_train, y_train)

y_predict_stacking = stacking_classifier.predict(X_test)
stacking_accuracy = accuracy_score(y_test, y_predict_stacking)
print('Accuracy of Stacking Classifier: ' + str(stacking_accuracy))

Accuracy of Stacking Classifier: 0.986


In [64]:
# Predict a label for every row of X with the stacking classifier and flag the
# rows whose prediction differs from the true label.
# NOTE(review): X presumably includes the rows the classifier was fitted on — confirm.
y_predict = stacking_classifier.predict(X)
incorrect_twolabel_idx = (y != y_predict)

# Collect the misclassified rows together with their true label.
# .assign() returns a new frame, so 'Label' is added without assigning into a
# boolean-indexed slice of X (which triggers pandas' SettingWithCopyWarning).
df_exclude_from_pca = X.loc[incorrect_twolabel_idx].assign(Label=y[incorrect_twolabel_idx])

In [65]:
# Display the rows the stacking classifier misclassified.
df_exclude_from_pca

Unnamed: 0,Init_Win_bytes_forward,Init_Win_bytes_backward,min_seg_size_forward,Flow IAT Min,Bwd Packet Length Std,URG Flag Count,Fwd IAT Mean,Fwd IAT Total,Total Backward Packets,Bwd IAT Max,Bwd IAT Mean,Bwd Packet Length Max,Fwd Header Length.1,Min Packet Length,Subflow Bwd Packets,Bwd IAT Min,Active Std,Idle Std,Label
73,0.022049,0.01796,0.533333,6.890756e-07,0.0,1.0,0.0,0.0,0.000101,0.0,0.0,0.0,0.000209,0.0,0.000101,0.0,0.0,0.0,0
314,0.022049,0.017929,0.533333,7.226891e-07,0.0,1.0,0.0,0.0,0.000101,0.0,0.0,0.0,0.000209,0.0,0.000101,0.0,0.0,0.0,0
532,0.445572,0.44191,0.533333,7.310924e-07,0.0,0.0,0.024669,0.048927,0.000101,0.0,0.0,0.0,0.00068,0.0,0.000101,0.0,0.0,0.0,1
817,0.003723,0.508194,0.533333,1.588235e-06,0.0,0.0,0.001191,0.001181,0.000101,0.0,0.0,0.0,0.000497,0.0,0.000101,0.0,0.0,0.0,1
907,0.021973,0.0,0.533333,2.966387e-06,0.0,0.0,3e-06,3e-06,0.0,0.0,0.0,0.0,0.000418,0.0,0.0,0.0,0.0,0.0,0
923,0.445572,0.64711,0.533333,2.689076e-07,0.0,0.0,0.022919,0.045456,0.000101,0.0,0.0,0.0,0.00068,0.0,0.000101,0.0,0.0,0.0,1
1128,0.003922,0.0,0.333333,0.3714286,0.0,0.0,0.371429,0.368333,0.0,0.0,0.0,0.0,0.000262,0.004525,0.0,0.0,0.0,0.0,0
1268,0.003754,0.0,0.533333,1.537815e-06,0.0,0.0,2e-06,2e-06,0.0,0.0,0.0,0.0,0.000418,0.0,0.0,0.0,0.0,0.0,1
2131,1.5e-05,0.0,0.333333,9.245378e-05,0.0,0.0,9.2e-05,9.2e-05,0.0,0.0,0.0,0.0,0.000262,0.004525,0.0,0.0,0.0,0.0,0
2171,0.445572,0.44191,0.533333,5.462185e-07,0.0,0.0,0.021178,0.042004,0.000101,0.0,0.0,0.0,0.00068,0.0,0.000101,0.0,0.0,0.0,1


## Working with the PCA dataset

In [81]:
# Load the PCA sample data from CSV.
pca_df = pd.read_csv('./data/sample_pca_test.csv')

In [82]:
# Show (rows, columns) of the PCA frame.
pca_df.shape

(4000, 19)

In [83]:
# Split the PCA frame with the misclassification mask: rows the two-label
# stacking model got right form the training set, the rows it got wrong form
# the test set.
pca_train_rows = pca_df[~incorrect_twolabel_idx]
pca_test_rows = pca_df[incorrect_twolabel_idx]

X_train = pca_train_rows.drop(columns=['Label'])
y_train = pca_train_rows['Label']
X_test = pca_test_rows.drop(columns=['Label'])
y_test = pca_test_rows['Label']

In [85]:
# Show the shapes of the train and test feature matrices.
X_train.shape,X_test.shape

((3986, 18), (14, 18))

In [84]:
# Learn the min-max scaling from the training features only, then apply the
# same transformation to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [86]:
# Stacking ensemble of default-configured base estimators, fitted on the
# scaled PCA training rows and scored on the held-out rows.
meta_model = LogisticRegression()
base_models = [
    ('xgboost', xgb.XGBClassifier()),
    ('lightgbm', lgb.LGBMClassifier()),
    ('adaboost', AdaBoostClassifier()),
    ('logistic', LogisticRegression()),
    ('naive_bayes', MultinomialNB())
]

# fit() returns the estimator itself, so construction and fitting can be chained.
stacking_classifier = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
).fit(X_train, y_train)

y_predict_stacking = stacking_classifier.predict(X_test)
stacking_accuracy = accuracy_score(y_test, y_predict_stacking)

print('Accuracy of Stacking Classifier: ' + str(stacking_accuracy))

Accuracy of Stacking Classifier: 0.42857142857142855
