In [None]:
import pandas as pd
import numpy as np
import joblib

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.linear_model import LinearRegression

from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import VotingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report

import glob
import os
#merge all files starting in "part" & ending in "csv" inside the selected path
all_files = os.path.join("CIC-IoT-2023/","part*.csv")
#list of all merged files
joined_files = glob.glob(all_files)
#merged files
df = pd.concat(map(pd.read_csv, joined_files), ignore_index=True)
# pd.read_csv(joined_files, chunksize=1000)
#df = pd.read_csv(df, chunksize=1000)

df_test = df.head(5)
df_test

# df = pd.read_csv('part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv')
# df

In [None]:
# PREPARE DATA
#create input data
X = df.drop(columns=['label'])

#create output (prediction) data
y = df['label']

In [5]:
#split data - 80% training, 20% testing
# training input data, testing input data, training output data, testing output data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
# Baseline Random Classifier
dummy_classifier = DummyClassifier(strategy='stratified', random_state=42)
# Fit baseline classifier on training data
dummy_classifier.fit(X_train, y_train)
# Predictions from test data
y_pred_dummy = dummy_classifier.predict(X_test)

dummy_acc = accuracy_score(y_test, y_pred_dummy)
report = classification_report(y_test, y_pred_dummy)

print("Baseline Classifier Accuracy: ", dummy_acc)
print("Classification Report:", report)

Baseline Classifier Accuracy:  0.08728985931289034
Classification Report:                          precision    recall  f1-score   support

       Backdoor_Malware       0.00      0.00      0.00       623
          BenignTraffic       0.02      0.02      0.02    219458
       BrowserHijacking       0.00      0.00      0.00      1201
       CommandInjection       0.00      0.00      0.00      1049
 DDoS-ACK_Fragmentation       0.01      0.01      0.01     56930
        DDoS-HTTP_Flood       0.00      0.00      0.00      5656
        DDoS-ICMP_Flood       0.15      0.15      0.15   1441384
DDoS-ICMP_Fragmentation       0.01      0.01      0.01     90474
      DDoS-PSHACK_Flood       0.09      0.09      0.09    819229
       DDoS-RSTFINFlood       0.09      0.09      0.09    808828
         DDoS-SYN_Flood       0.09      0.09      0.09    812155
         DDoS-SlowLoris       0.00      0.00      0.00      4684
DDoS-SynonymousIP_Flood       0.08      0.08      0.08    719249
         DDoS-T

In [8]:
# RESHAPE DATA from 3d to 2d
#nx, ny = X_train.shape
#d2_train_dataset = X_train.values.reshape(nx,ny)
#d2_train_dataset
models = {}
results = {}

In [None]:
# models = [
#     ('dt', tree.DecisionTreeClassifier()),
#     ('rf', RandomForestClassifier())
# ]

In [9]:
def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    # Train model
    model.fit(X_train, y_train)
    # Predictions
    y_pred = model.predict(X_test)
    y_pred_probability = model.predict_proba(X_test)[:,1]

    # Scoring
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_pred_probability)

    print(f"\n{name} Results: ")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1: {f1:.4f}")
    print(f"ROC-AUC: {roc:.4f}")
    print(f"\nClassification Report: ")
    print(classification_report(y_test, y_pred))

    return{
        'model': model,
        'accuracy': acc,
        'f1_score': f1,
        'roc_auc': roc,
        'y_pred': y_pred,
        'y_pred_probability': y_pred_probability
    }

In [18]:
rf_model = RandomForestClassifier(
    n_estimators=500,
    max_depth=15
)
results['RandomForest'] = evaluate_model('RandomForest', rf_model, X_train, y_train, X_test, y_test)

MemoryError: Unable to allocate 6.40 GiB for an array with shape (37349263, 46) and data type float32

In [None]:
dt_model = tree.DecisionTree()
results['DecisionTree'] = evaluate_model('DecisionTree', dt_model, X_train, y_train, X_test, y_test)

In [None]:
print("Model Comparison")
print("-"*40)

comparison = pd.df({
    'Model': list(results.keys()),
    'Accuracy': [results[m]['accuracy'] for m in results],
    'F1': [results[m]['f1_score'] for m in results],
    'ROC-AUC': [results[m]['roc_auc'] for m in results]
})

#comparison = comparison.sort_values('F1', ascending=False)
print('\n', comparison.to_string(index=False))

# Best model according to F1 score
best_model_name = comparison.iloc[0]['Model']
best_model_result = results[best_model_name]
best_model = best_model_result['model']

print(f"\nBest Model: {best_model_name}")
print(f"Accuracy: {best_model_result['accuracy']:.4f}")
print(f"F1: {best_model_result['f1_score']:.4f}")
print(f"ROC-AUC: {best_model_result['roc_auc']:.4f}")

In [None]:
# VOTING CLASSIFIER
vote_hard = VotingClassifier(estimators=models, voting='hard')
vote_hard.fit(X_train, y_train)

# PREDICT
y_pred = vote_hard.predict(X_test)

#dt = tree.DecisionTreeClassifier()
#rf = RandomForestClassifier()

#dt.fit(X_train, y_train, sample_weight=None)
#rf.fit(X_train, y_train, sample_weight=None)

# Make dt PERSISTANT
#store TRAINED dt in file called IDS_IoT.joblib
#joblib.dump(dt, 'IDS_IoT.joblib')
#load trained dt from file
#dt = joblib.load('IDS_IoT.joblib')

#enter data you want a prediction for
#rf_predictions = rf.predict(X_test)
#dt_predictions = dt.predict(X_test)
# rf_predictions

In [None]:
# SCORING
# accur_score = accuracy_score(y_test, y_pred)
# print("Hard Voting Accuracy: ", accur_score)

# prec_score = precision_score(y_test, y_pred, average='weighted') # "ill-defined" Warning
# print("Precision Score: ", prec_score)

# recall = recall_score(y_test, y_pred, average='weighted')
# print("Recall Score: ", recall)

# f1 = f1_score(y_test, y_pred, average='weighted')
# print("F1 Score: ", f1)

In [None]:
# SCORING
# rf_accur_score = accuracy_score(y_test, rf_predictions) #returns # from 0-1
# dt_accur_score = accuracy_score(y_test, dt_predictions)

# rf_precision = precision_score(y_test, rf_predictions, average=None)
# dt_precision = precision_score(y_test, dt_predictions, average=None)

# rf_recall = recall_score(y_test, rf_predictions, average=None)
# dt_recall = recall_score(y_test, dt_predictions, average=None)

# print("Random Forest\nAccuracy: ", rf_accur_score)
# print("Precision: ", rf_precision)
# print("Recall: ", rf_recall)
# print("-----------------------------------------")
# print("Decision Tree\nAccuracy: ", dt_accur_score)
# print("Precision: ", dt_precision)
# print("Recall: ", dt_recall)


In [None]:
# VISUALIZATIONS
#decisionTree, filename, display specified columns & rules, output data in alphabetical order, writes info on graph, round box edges, color boxes
#tree.export_graphviz(decisionTree, out_file='IDS_IoT_decisionTree.dot', feature_names=X_test.columns, class_names=sorted(y.unique()), label='all', rounded=True, filled=True)
