In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.linear_model import LinearRegression

from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import VotingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report

import glob
import os

# Single fixed seed, reused by every estimator and splitter below, so that
# model evaluation is reproducible from run to run
rand_state = 42
np.random.seed(seed=rand_state)

# Load every CSV shard whose file name starts with "part-00004" from the
# dataset folder and stack them into one DataFrame.
all_files = os.path.join("CIC-IoT-2023/", "part-00004*.csv")

# glob.glob() returns matches in arbitrary, filesystem-dependent order. Sorting
# keeps the row order of `df` (and therefore the seeded train/test split)
# identical from one machine to the next.
joined_files = sorted(glob.glob(all_files))
if not joined_files:
    # Without this guard pd.concat() fails with "No objects to concatenate",
    # which hides the real cause (wrong folder or pattern).
    raise FileNotFoundError(f"No CSV files match the pattern {all_files!r}")

# Read each shard and concatenate them under a fresh 0..n-1 index
df = pd.concat(map(pd.read_csv, joined_files), ignore_index=True)

# Preview of the first five rows (despite its name, `df_test` is not the test split)
df_test = df.head(5)
df_test

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.662395,410946.12,17.0,64.0,1114.559417,1114.559417,0.0,0.0,0.0,0.0,...,0.0,554.0,83767550.0,9.5,33.286634,0.0,0.0,0.0,141.55,Mirai-udpplain
1,0.0,54.0,6.0,65.91,230.285448,230.285448,0.0,0.0,0.0,0.0,...,0.0,54.0,83314670.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS-PSHACK_Flood
2,0.0,0.0,1.0,64.0,7.949886,7.949886,0.0,0.0,0.0,0.0,...,0.0,42.0,83128680.0,9.5,9.165151,0.0,0.0,0.0,141.55,DDoS-ICMP_Flood
3,0.013891,68.56,6.22,64.0,2.693528,2.693528,0.0,1.0,0.0,1.0,...,13.754549,56.64,83344210.0,9.5,10.832015,19.471851,1046.392845,0.27,141.55,DDoS-RSTFINFlood
4,0.0,0.0,47.0,64.0,43.098954,43.098954,0.0,0.0,0.0,0.0,...,0.0,578.0,83647270.0,9.5,34.0,0.0,0.0,0.0,141.55,Mirai-greip_flood


In [2]:
# Dimensions of the combined frame as (n_rows, n_columns)
df.shape

(227491, 47)

In [8]:
# Number of missing values in each column
print(df.isna().sum())

flow_duration      0
Header_Length      0
Protocol Type      0
Duration           0
Rate               0
Srate              0
Drate              0
fin_flag_number    0
syn_flag_number    0
rst_flag_number    0
psh_flag_number    0
ack_flag_number    0
ece_flag_number    0
cwr_flag_number    0
ack_count          0
syn_count          0
fin_count          0
urg_count          0
rst_count          0
HTTP               0
HTTPS              0
DNS                0
Telnet             0
SMTP               0
SSH                0
IRC                0
TCP                0
UDP                0
DHCP               0
ARP                0
ICMP               0
IPv                0
LLC                0
Tot sum            0
Min                0
Max                0
AVG                0
Std                0
Tot size           0
IAT                0
Number             0
Magnitue           0
Radius             0
Covariance         0
Variance           0
Weight             0
label              0
dtype: int64


In [9]:
# Number of rows that exactly repeat an earlier row
print(df.duplicated(keep='first').sum())

0


In [11]:
# Storage type of every column
column_dtypes = df.dtypes
print(column_dtypes)

flow_duration      float64
Header_Length      float64
Protocol Type      float64
Duration           float64
Rate               float64
Srate              float64
Drate              float64
fin_flag_number    float64
syn_flag_number    float64
rst_flag_number    float64
psh_flag_number    float64
ack_flag_number    float64
ece_flag_number    float64
cwr_flag_number    float64
ack_count          float64
syn_count          float64
fin_count          float64
urg_count          float64
rst_count          float64
HTTP               float64
HTTPS              float64
DNS                float64
Telnet             float64
SMTP               float64
SSH                float64
IRC                float64
TCP                float64
UDP                float64
DHCP               float64
ARP                float64
ICMP               float64
IPv                float64
LLC                float64
Tot sum            float64
Min                float64
Max                float64
AVG                float64
S

In [12]:
# PREPARE DATA
# Target: the 'label' column of each row
y = df['label']

# Features: every remaining column
X = df.drop('label', axis=1)

In [13]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training.
# Returned order: training features, testing features, training labels, testing labels.
# NOTE(review): the split is not stratified, so very rare classes may end up
# in only one of the two partitions — consider `stratify=y` if every class has
# enough rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=rand_state,
)

In [14]:
# Baseline Random Classifier: DummyClassifier with the 'stratified' strategy,
# used as the floor that the real models must beat.
dummy_classifier = DummyClassifier(strategy='stratified', random_state=rand_state)
# Fit baseline classifier on training data
dummy_classifier.fit(X_train, y_train)
# Predictions from test data
y_pred_dummy = dummy_classifier.predict(X_test)

dummy_acc = accuracy_score(y_test, y_pred_dummy)
# zero_division=0 reports 0.0 for classes that were never predicted (the same
# value the default produces) without emitting an UndefinedMetricWarning each time.
report = classification_report(y_test, y_pred_dummy, zero_division=0)

print("Baseline Classifier Accuracy: ", dummy_acc)
# Print the report on its own lines so the table header stays aligned with its columns
print("Classification Report:")
print(report)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Baseline Classifier Accuracy:  0.08800193410844194
Classification Report:                          precision    recall  f1-score   support

       Backdoor_Malware       0.00      0.00      0.00         3
          BenignTraffic       0.02      0.02      0.02      1070
       BrowserHijacking       0.00      0.00      0.00         7
       CommandInjection       0.00      0.00      0.00         4
 DDoS-ACK_Fragmentation       0.01      0.01      0.01       293
        DDoS-HTTP_Flood       0.00      0.00      0.00        37
        DDoS-ICMP_Flood       0.15      0.15      0.15      6981
DDoS-ICMP_Fragmentation       0.01      0.01      0.01       428
      DDoS-PSHACK_Flood       0.09      0.08      0.09      4056
       DDoS-RSTFINFlood       0.09      0.09      0.09      3961
         DDoS-SYN_Flood       0.09      0.09      0.09      4014
         DDoS-SlowLoris       0.00      0.00      0.00        18
DDoS-SynonymousIP_Flood       0.08      0.09      0.08      3421
         DDoS-T

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [15]:
# Empty containers filled by later cells. `results` receives one entry per
# evaluated model, keyed by the model's display name.
# (No reshaping step is required here: the feature table is already 2-D.)
models, results = {}, {}

In [16]:
# models = [
#     ('dt', DecisionTreeClassifier()),
#     ('rf', RandomForestClassifier())
# ]

In [17]:
# Robustness check: shuffled, seeded 5-fold stratified splitter used for
# cross-validation
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=rand_state,
)

def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    """Fit ``model``, score it on the hold-out split and cross-validate it.

    Parameters
    ----------
    name : str
        Display name used in the printed report.
    model : estimator
        Classifier providing ``fit``, ``predict``, ``predict_proba`` and,
        once fitted, ``classes_``. It is fitted in place on the training data.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Hold-out features and labels used for the reported metrics.

    Returns
    -------
    dict
        ``'model'`` (the fitted estimator), ``'accuracy'``, ``'f1_score'``
        (weighted average), ``'roc_auc'`` (one-vs-one), ``'cv'`` (mean
        cross-validation score on the training data), ``'y_pred'`` and
        ``'y_pred_proba'``.

    Notes
    -----
    Cross-validation uses the module-level ``cv`` splitter.
    """
    # Train model
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    # Scoring
    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 yields the same 0.0 the default does for classes that
    # are never predicted, but without an UndefinedMetricWarning per call.
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    # The columns of predict_proba follow model.classes_. Passing them as
    # `labels` ties each column to its class explicitly, instead of relying on
    # y_test happening to contain exactly the classes seen during training.
    roc = roc_auc_score(
        y_test,
        y_pred_proba,
        multi_class='ovo',
        labels=model.classes_,
    )

    # Mean score across the folds of the shared splitter (training data only)
    cv_mean = cross_val_score(model, X_train, y_train, cv=cv).mean()

    print(f"\n{name} Results: ")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1: {f1:.4f}")
    print(f"ROC-AUC: {roc:.4f}")
    print(f"CV: {cv_mean:.4f}")
    print("\nClassification Report: ")
    print(classification_report(y_test, y_pred, zero_division=0))

    return {
        'model': model,
        'accuracy': acc,
        'f1_score': f1,
        'roc_auc': roc,
        'cv': cv_mean,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba,
    }

In [18]:
# Random forest configuration: 500 trees of depth <= 15, at least 2 samples
# per leaf, sqrt(n_features) candidates per split, all CPU cores
rf_params = dict(
    n_estimators=500,
    max_depth=15,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=rand_state,
    n_jobs=-1,
)
rf_model = RandomForestClassifier(**rf_params)

# Fit, score and store the outcome under the model's display name
results['Random Forest'] = evaluate_model(
    'Random Forest', rf_model, X_train, y_train, X_test, y_test
)




Random Forest Results: 
Accuracy: 0.9898
F1: 0.9886
ROC-AUC: 0.9706
CV: 0.9897

Classification Report: 


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                         precision    recall  f1-score   support

       Backdoor_Malware       0.00      0.00      0.00         3
          BenignTraffic       0.78      1.00      0.88      1070
       BrowserHijacking       0.00      0.00      0.00         7
       CommandInjection       0.00      0.00      0.00         4
 DDoS-ACK_Fragmentation       1.00      0.98      0.99       293
        DDoS-HTTP_Flood       0.97      0.78      0.87        37
        DDoS-ICMP_Flood       1.00      1.00      1.00      6981
DDoS-ICMP_Fragmentation       1.00      0.98      0.99       428
      DDoS-PSHACK_Flood       1.00      1.00      1.00      4056
       DDoS-RSTFINFlood       1.00      1.00      1.00      3961
         DDoS-SYN_Flood       1.00      1.00      1.00      4014
         DDoS-SlowLoris       0.78      0.78      0.78        18
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      3421
         DDoS-TCP_Flood       1.00      1.00      1.00      4424
         DDoS-UDP_Flood 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [19]:
# Single decision tree limited to depth 15, seeded like the other models
dt_model = DecisionTreeClassifier(random_state=rand_state, max_depth=15)

# Fit, score and store the outcome under the model's display name
results['DecisionTree'] = evaluate_model(
    name='DecisionTree',
    model=dt_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)




DecisionTree Results: 
Accuracy: 0.9886
F1: 0.9883
ROC-AUC: 0.9349
CV: 0.9883

Classification Report: 


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                         precision    recall  f1-score   support

       Backdoor_Malware       0.00      0.00      0.00         3
          BenignTraffic       0.88      0.98      0.93      1070
       BrowserHijacking       0.00      0.00      0.00         7
       CommandInjection       0.00      0.00      0.00         4
 DDoS-ACK_Fragmentation       0.98      0.93      0.95       293
        DDoS-HTTP_Flood       0.88      0.19      0.31        37
        DDoS-ICMP_Flood       1.00      1.00      1.00      6981
DDoS-ICMP_Fragmentation       1.00      0.97      0.99       428
      DDoS-PSHACK_Flood       1.00      1.00      1.00      4056
       DDoS-RSTFINFlood       1.00      1.00      1.00      3961
         DDoS-SYN_Flood       1.00      1.00      1.00      4014
         DDoS-SlowLoris       1.00      0.06      0.11        18
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      3421
         DDoS-TCP_Flood       1.00      1.00      1.00      4424
         DDoS-UDP_Flood 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [47]:
# dt = tree.DecisionTreeClassifier()
# rf = RandomForestClassifier()

# dt.fit(X_train, y_train, sample_weight=None)
# rf.fit(X_train, y_train, sample_weight=None)

In [49]:
# #enter data you want a prediction for
# rf_predictions = rf.predict(X_test)
# dt_predictions = dt.predict(X_test)
# rf_predictions

In [27]:
print("Model Comparison")
print("-"*40)

# One row per evaluated model, in the order the models were added to `results`
comparison = pd.DataFrame({
    'Model': list(results.keys()),
    'Accuracy': [res['accuracy'] for res in results.values()],
    'F1': [res['f1_score'] for res in results.values()],
    'ROC-AUC': [res['roc_auc'] for res in results.values()],
    'CV': [res['cv'] for res in results.values()]
})

print('\n', comparison.to_string(index=False))

# Best model according to F1 score. Select it by value: taking the first row
# of the (unsorted) table would return whichever model was evaluated first,
# not the best one. On a tie the earliest-added model wins.
best_model_name = max(results, key=lambda model_name: results[model_name]['f1_score'])
best_model_result = results[best_model_name]
best_model = best_model_result['model']

print(f"\nBest Model: {best_model_name}")
print(f"Accuracy: {best_model_result['accuracy']:.4f}")
print(f"F1: {best_model_result['f1_score']:.4f}")
print(f"ROC-AUC: {best_model_result['roc_auc']:.4f}")

Model Comparison
----------------------------------------

         Model  Accuracy       F1  ROC-AUC       CV
Random Forest  0.989780 0.988636 0.970574 0.989725
 DecisionTree  0.988593 0.988292 0.934889 0.988291

Best Model: Random Forest
Accuracy: 0.9898
F1: 0.9886
ROC-AUC: 0.9706


In [38]:
# VOTING CLASSIFIER
# Majority ("hard") vote over the decision tree and the random forest.
# NOTE(review): with only two voters every disagreement is a tie — confirm how
# VotingClassifier breaks ties before relying on this ensemble.
vote_hard = VotingClassifier(
    estimators=[('dt', dt_model), ('rf', rf_model)],
    voting='hard',
).fit(X_train, y_train)

# PREDICT
y_pred_vote = vote_hard.predict(X_test)

# To persist the trained tree, dump it with
#   joblib.dump(dt_model, 'IDS_IoT.joblib')
# and reload it later with
#   dt = joblib.load('IDS_IoT.joblib')

# Hold-out predictions of the two individual models
dt_predictions = dt_model.predict(X_test)
rf_predictions = rf_model.predict(X_test)

In [39]:
# SCORING of the hard-voting ensemble on the hold-out split.
# The weighted metrics are "ill-defined" for classes that are never predicted;
# zero_division=0 keeps the 0.0 the default already reports for those classes
# but no longer emits an UndefinedMetricWarning for each call.
accur_score = accuracy_score(y_test, y_pred_vote)
print("Hard Voting Accuracy: ", accur_score)

prec_score = precision_score(y_test, y_pred_vote, average='weighted', zero_division=0)
print("Precision Score: ", prec_score)

recall = recall_score(y_test, y_pred_vote, average='weighted', zero_division=0)
print("Recall Score: ", recall)

f1 = f1_score(y_test, y_pred_vote, average='weighted', zero_division=0)
print("F1 Score: ", f1)

Hard Voting Accuracy:  0.9890766830040221


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Precision Score:  0.9880905759570882
Recall Score:  0.9890766830040221
F1 Score:  0.9877793730189517


In [43]:
# #SCORING
# rf_accur_score = accuracy_score(y_test, rf_predictions) #returns # from 0-1
# dt_accur_score = accuracy_score(y_test, dt_predictions)

# rf_precision = precision_score(y_test, rf_predictions, average='weighted')
# dt_precision = precision_score(y_test, dt_predictions, average='weighted')

# rf_recall = recall_score(y_test, rf_predictions, average='weighted')
# dt_recall = recall_score(y_test, dt_predictions, average='weighted')

# print("Random Forest\nAccuracy: ", rf_accur_score)
# print("Precision: ", rf_precision)
# print("Recall: ", rf_recall)
# print("-----------------------------------------")
# print("Decision Tree\nAccuracy: ", dt_accur_score)
# print("Precision: ", dt_precision)
# print("Recall: ", dt_recall)


In [50]:
# VISUALIZATIONS
# Write the top 5 levels of the decision tree to a Graphviz .dot file with
# fully labelled, rounded, colour-filled nodes.
tree.export_graphviz(
    dt_model,
    out_file='IDS_IoT_decisionTree.dot',
    # plain list of column names rather than a pandas Index
    feature_names=list(X_test.columns),
    # Take the class names from the fitted model so they always line up with
    # its internal class order; sorted(y.unique()) can contain classes that
    # never reached the training split, which would shift every label.
    class_names=[str(label) for label in dt_model.classes_],
    max_depth=5,
    label='all',
    rounded=True,
    filled=True,
)


In [52]:
# Grab the first tree of the random forest and export its top 5 levels
rf_individual_tree = rf_model.estimators_[0]
tree.export_graphviz(
    rf_individual_tree,
    out_file='IDS_IoT_rfTree.dot',
    # plain list of column names rather than a pandas Index
    feature_names=list(X_test.columns),
    # Use the forest's own class list so the names match the classes it was
    # trained on; sorted(y.unique()) can contain classes that never reached
    # the training split, which would shift every label.
    class_names=[str(label) for label in rf_model.classes_],
    max_depth=5,
    label='all',
    rounded=True,
    filled=True,
)
