In [1]:
import os
import sys
# Put the parent of the current working directory on the import path so the
# project-local `src` package below resolves. This depends on the directory the
# kernel was started from.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Project-local helpers: data preparation and the voting-ensemble wrapper.
from src import data_processing, model_voting

# Base estimators used when building the ensemble.
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): VotingClassifier is not referenced directly in the cells visible
# here; presumably model_voting wraps it -- confirm before removing.
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb

import warnings
# Silences every warning for the whole kernel session, including deprecation and
# numerical warnings from the libraries above. Narrow the filter if any of those
# need to be seen.
warnings.filterwarnings("ignore")

In [2]:
# --- Load ---------------------------------------------------------------
# Path is relative to the notebook's working directory.
fp = "..//data/Darknet.CSV"
data_processor = data_processing.DataProcessor(fp)

# --- Preprocess ---------------------------------------------------------
data_processor.preprocess()

# --- Feature engineering ------------------------------------------------
label = "Label"

# Columns passed as `dropped`: ports, timestamp, flow/IP identifiers and the
# secondary label column.
dropped = [
    "Src Port",
    "Dst Port",
    "Timestamp",
    "Flow ID",
    "Src IP",
    "Dst IP",
    "Label.1",
]

# Collapse the four label strings into a 0/1 target:
# Tor and VPN map to 1, Non-Tor and NonVPN map to 0.
label_map = {
    "Tor": 1,
    "VPN": 1,
    "Non-Tor": 0,
    "NonVPN": 0,
}

one_hot_columns = ["Protocol"]

data_processor.feature_engineering(
    label=label,
    corr_threshold=0.9,
    dropped=dropped,
    label_map=label_map,
    one_hot_columns=one_hot_columns,
)

# --- Train / test split -------------------------------------------------
test_size = 0.2
random_state = 42

data_processor.create_train_test(test_size=test_size, random_state=random_state)

# Pull the four splits out of the processor in one step (same call order as
# fetching them one by one).
X_train, X_test, y_train, y_test = (
    data_processor.get_X_train(),
    data_processor.get_X_test(),
    data_processor.get_y_train(),
    data_processor.get_y_test(),
)

In [3]:
# Persist the processed data through the processor's own writer.
# NOTE(review): arguments look like (name, output directory) -- confirm against
# DataProcessor.save_data.
data_processor.save_data(
    "darknet",
    "..//data/",
)

In [4]:
# Select models
# Base estimators handed to the voting ensemble. Seeds reuse `random_state`
# (the value 42 defined with the train/test split) instead of repeating the
# literal, so the whole run is controlled from one place.
models = [
    DecisionTreeClassifier(random_state=random_state),
    KNeighborsClassifier(n_neighbors=3),
    AdaBoostClassifier(n_estimators=10, random_state=random_state),
    # The target is 0/1 after label_map, i.e. binary classification. "logloss"
    # is XGBoost's binary log-loss metric; "mlogloss" is the multiclass variant
    # and is rejected for 0/1 labels as soon as an evaluation set is supplied.
    # The seed is pinned explicitly for parity with the other estimators.
    xgb.XGBClassifier(eval_metric="logloss", random_state=random_state),
]

# Fit the ensemble with hard (majority) voting. The test split is passed in
# as well; presumably the wrapper scores on it -- confirm in model_voting.
voting_ensemble = model_voting.VotingEnsemble(models)
voting_ensemble.fit_voting_classifier(X_train, X_test, y_train, y_test, voting="hard")

In [5]:
# Accuracy of the fitted ensemble (presumably on the held-out test split --
# confirm in model_voting). The bare last expression is the cell's output.
ensemble_accuracy = voting_ensemble.get_accuracy()
ensemble_accuracy

0.9731773686256494

In [6]:
# Per-class precision / recall / F1 table. It is printed rather than shown as
# the cell value so the multi-line text renders with real line breaks.
report_text = voting_ensemble.get_classification_report()
print(report_text)

              precision    recall  f1-score   support

           0       0.97      1.00      0.98     23429
           1       0.98      0.86      0.92      4868

    accuracy                           0.97     28297
   macro avg       0.98      0.93      0.95     28297
weighted avg       0.97      0.97      0.97     28297



In [7]:
# Write the fitted ensemble to disk under the models directory.
# NOTE(review): a .pkl file is a pickle; only load it back from trusted sources.
voting_ensemble.save_model(
    "voting_ensemble1.pkl",
    "..//models/",
)