In [1]:
import pandas as pd
import numpy as np

# Let pandas render up to 200 columns so wide frames are not elided in previews.
pd.options.display.max_columns = 200

In [29]:
# data processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# supervised methods
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, accuracy_score, precision_recall_fscore_support
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [3]:
# unsupervised method
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

## Loading the TON_IoT dataset and having a preview of the data

In [4]:
# Read the CSV (compression is inferred from the file extension) and show the first rows.
df = pd.read_csv('./dataset/toniot.csv.gz', compression='infer')
df.head(n=5)

Unnamed: 0,src_ip,src_port,dst_ip,dst_port,proto,service,duration,src_bytes,dst_bytes,conn_state,missed_bytes,src_pkts,src_ip_bytes,dst_pkts,dst_ip_bytes,dns_query,dns_qclass,dns_qtype,dns_rcode,dns_AA,dns_RD,dns_RA,dns_rejected,ssl_version,ssl_cipher,ssl_resumed,ssl_established,ssl_subject,ssl_issuer,http_trans_depth,http_method,http_uri,http_version,http_request_body_len,http_response_body_len,http_status_code,http_user_agent,http_orig_mime_types,http_resp_mime_types,weird_name,weird_addl,weird_notice,label,type
0,192.168.1.37,4444,192.168.1.193,49178,tcp,-,290.371539,101568,2592,OTH,0,108,108064,31,3832,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-,1,backdoor
1,192.168.1.193,49180,192.168.1.37,8080,tcp,-,0.000102,0,0,REJ,0,1,52,1,40,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-,1,backdoor
2,192.168.1.193,49180,192.168.1.37,8080,tcp,-,0.000148,0,0,REJ,0,1,52,1,40,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-,1,backdoor
3,192.168.1.193,49180,192.168.1.37,8080,tcp,-,0.000113,0,0,REJ,0,1,48,1,40,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-,1,backdoor
4,192.168.1.193,49180,192.168.1.37,8080,tcp,-,0.00013,0,0,REJ,0,1,52,1,40,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-,1,backdoor


#### Available attributes in the dataset

In [5]:
# Show every column name followed by the number of columns.
print(df.columns.values, df.columns.size)

['src_ip' 'src_port' 'dst_ip' 'dst_port' 'proto' 'service' 'duration'
 'src_bytes' 'dst_bytes' 'conn_state' 'missed_bytes' 'src_pkts'
 'src_ip_bytes' 'dst_pkts' 'dst_ip_bytes' 'dns_query' 'dns_qclass'
 'dns_qtype' 'dns_rcode' 'dns_AA' 'dns_RD' 'dns_RA' 'dns_rejected'
 'ssl_version' 'ssl_cipher' 'ssl_resumed' 'ssl_established' 'ssl_subject'
 'ssl_issuer' 'http_trans_depth' 'http_method' 'http_uri' 'http_version'
 'http_request_body_len' 'http_response_body_len' 'http_status_code'
 'http_user_agent' 'http_orig_mime_types' 'http_resp_mime_types'
 'weird_name' 'weird_addl' 'weird_notice' 'label' 'type'] 44


#### The total number of samples

In [6]:
# df.shape[0] is the number of rows, reported here as the number of flow-based samples.
print(f"There are {df.shape[0]} flow-based samples")

There are 211043 flow-based samples


#### Checking the available classes

In [7]:
# Number of samples per value of the `type` column, most frequent first.
df.loc[:, 'type'].value_counts()

type
normal        50000
backdoor      20000
ddos          20000
dos           20000
injection     20000
password      20000
ransomware    20000
scanning      20000
xss           20000
mitm           1043
Name: count, dtype: int64

In [8]:
# Pair of Series for the `label` column: absolute counts first, then proportions.
tuple(df['label'].value_counts(normalize=as_fraction) for as_fraction in (False, True))

(label
 1    161043
 0     50000
 Name: count, dtype: int64,
 label
 1    0.763081
 0    0.236919
 Name: proportion, dtype: float64)

---
#### Signature- or Anomaly-based Discussion

The attribute `label` has only two values: `0` represents a normal sample and `1` represents one of the nine attacks available in the dataset. This attribute is useful both for the binary classification task (normal or attack) and for an anomaly-detection approach that treats `0` as the normal behavior and everything else as non-normal.

As can be seen, the dataset is inherently unbalanced, with 76.3% attack samples and 23.7% normal samples. This imbalance deserves attention because it is the opposite of what happens during operation, where most of the data presented to an ML model for inference would be normal samples.

---

#### Tuple 

This dataset is arranged in a 6-tuple based on the attributes:
- src_ip
- src_port
- dst_ip
- dst_port
- proto
- service

In [9]:
# Count the rows that remain once duplicates of the 6-tuple key columns are dropped.
unique_tuples = len(
    df.drop_duplicates(
        subset=['src_ip', 'src_port', 'dst_ip', 'dst_port', 'proto', 'service']
    )
)
print(f"And there are only {unique_tuples} flow-based samples based on the 6-tuple.")

And there are only 124152 flow-based samples based on the 6-tuple.


#### Preprocessing

To use the dataset, it is important to remove all empty values (`NaN`) and infinite values. Additionally, it is good practice to work with scaled features; more info [here](https://en.wikipedia.org/wiki/Feature_scaling).

In [10]:
# Turn +/-infinity into NaN first, so that a single dropna() discards rows
# holding either kind of unusable value.
df = (
    df
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

In [14]:
# Print a summary of the frame: index range, and each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211043 entries, 0 to 211042
Data columns (total 44 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   src_ip                  211043 non-null  object 
 1   src_port                211043 non-null  int64  
 2   dst_ip                  211043 non-null  object 
 3   dst_port                211043 non-null  int64  
 4   proto                   211043 non-null  object 
 5   service                 211043 non-null  object 
 6   duration                211043 non-null  float64
 7   src_bytes               211043 non-null  int64  
 8   dst_bytes               211043 non-null  int64  
 9   conn_state              211043 non-null  object 
 10  missed_bytes            211043 non-null  int64  
 11  src_pkts                211043 non-null  int64  
 12  src_ip_bytes            211043 non-null  int64  
 13  dst_pkts                211043 non-null  int64  
 14  dst_ip_bytes        

### Applicable features

Defining the applicable features: **only numeric** for simplicity.

In [15]:
# Keep only the numeric columns and record their names as the candidate features.
numeric_df = df.select_dtypes(include=[np.number])
features = list(numeric_df.columns)
features

['src_port',
 'dst_port',
 'duration',
 'src_bytes',
 'dst_bytes',
 'missed_bytes',
 'src_pkts',
 'src_ip_bytes',
 'dst_pkts',
 'dst_ip_bytes',
 'dns_qclass',
 'dns_qtype',
 'dns_rcode',
 'http_request_body_len',
 'http_response_body_len',
 'http_status_code',
 'label']

In [16]:
# 'label' is the target variable, so it must not be used as an input feature.
# Filtering (rather than list.remove) keeps this cell idempotent: list.remove raises
# ValueError when the cell is re-run and 'label' is already gone.
features = [column for column in features if column != 'label']
features

['src_port',
 'dst_port',
 'duration',
 'src_bytes',
 'dst_bytes',
 'missed_bytes',
 'src_pkts',
 'src_ip_bytes',
 'dst_pkts',
 'dst_ip_bytes',
 'dns_qclass',
 'dns_qtype',
 'dns_rcode',
 'http_request_body_len',
 'http_response_body_len',
 'http_status_code']

## Creating the train and test dataset splits

- Training: 70%
- Test: 30%

In [18]:
# Hold out 30% of the rows for testing; the fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, features],
    df.loc[:, 'label'],
    test_size=0.30,
    random_state=42,
)

## Scaling the features

In [20]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

---
# Applying Machine Learning to the NIDS flow-based data
## Supervised Learning

Models evaluated:
- Decision Tree
- Random Forest
- SVM
- Logistic Regression

In [26]:
# Candidate supervised classifiers, all otherwise left at their default settings.
# Every estimator is given the same explicit random_state so that seeding is handled
# uniformly per model.
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)
# Global NumPy seed kept so any later code relying on it behaves as before.
np.random.seed(42)
# SVC was the only estimator without an explicit random_state; pass it like the others
# instead of relying solely on the global NumPy seed above.
svm = SVC(random_state=42)
logistic_regression = LogisticRegression(random_state=42)

# Parallel lists: models[i] is reported under model_names[i] by the cells that zip them.
models = [decision_tree, random_forest, svm, logistic_regression]
model_names = ['Decision Tree', 'Random Forest', 'SVM', 'Logistic Regression']

### Training and Evaluation (Learning Metrics)
The following cell trains each model with the `model.fit()` method and then makes predictions with the `model.predict()` method. It is important to highlight that training is performed on the training data (`X_train_scaled`), and testing (prediction) is performed on the testing subset (`X_test_scaled`).

It uses `model.predict_proba()` or `model.decision_function()` to calculate the ROC curve.

After training, the following metrics are calculated:

- $\text{Accuracy} = \frac{\text{Number of Correct Predictions}}{\text{Total Number of Predictions}}$

- $\text{Precision} = \frac{\text{True Positives}}{\text{True Positives + False Positives}}$

- $\text{Recall} = \frac{\text{True Positives}}{\text{True Positives + False Negatives}}$   (or Sensitivity or True Positive Rate)

- $\text{ROC AUC} = \int_{0}^{1} \text{TPR}(fpr) \, d(fpr)$   (Area Under the Receiver Operating Characteristic Curve)

TPR is the True Positive Rate (or Recall), and $fpr$ is the False Positive Rate.

In [27]:
# Fit each classifier on the scaled training split, then report its test-split metrics.
for name, model in zip(model_names, models):
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # ROC needs a continuous score: use the second predict_proba column when the
    # model exposes predict_proba, otherwise fall back to decision_function.
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test_scaled)[:, 1]
    else:
        y_proba = model.decision_function(X_test_scaled)

    fpr, tpr, _ = roc_curve(y_test, y_proba)
    auc = roc_auc_score(y_test, y_proba)

    print(f"Results for {name}:")
    print(classification_report(y_test, y_pred))
    print(f"ROC AUC: {auc}")
    print("==="*20)

Results for Decision Tree:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     15045
           1       1.00      1.00      1.00     48268

    accuracy                           1.00     63313
   macro avg       1.00      1.00      1.00     63313
weighted avg       1.00      1.00      1.00     63313

ROC AUC: 0.9967573275312318
Results for Random Forest:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15045
           1       1.00      1.00      1.00     48268

    accuracy                           1.00     63313
   macro avg       1.00      1.00      1.00     63313
weighted avg       1.00      1.00      1.00     63313

ROC AUC: 0.999863200101637
Results for SVM:
              precision    recall  f1-score   support

           0       0.95      0.61      0.74     15045
           1       0.89      0.99      0.94     48268

    accuracy                           0.90     63313
  

In [None]:
# Score every model on the test split and collect one summary row per model.
# No fitting happens here: the models are used as they currently are.
results = []

for name, model in zip(model_names, models):
    y_pred = model.predict(X_test_scaled)
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test_scaled)[:, 1]
    else:
        y_proba = model.decision_function(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
    auc = roc_auc_score(y_test, y_proba)

    results.append(
        dict(zip(
            ("Model", "Accuracy", "Precision", "Recall", "F1-Score", "ROC-AUC"),
            (name, accuracy, precision, recall, f1, auc),
        ))
    )

results_df = pd.DataFrame(results)
print(results_df)

## Unsupervised Learning

Model Evaluated:
- KMeans: using two clusters to represent benign and attack samples

In [None]:
# Two clusters, intended to represent benign and attack samples.
# n_init is pinned explicitly because its default differs between scikit-learn
# releases (10 restarts historically, 'auto' in newer ones); leaving it unset makes
# the clustering depend on the installed version and emits a FutureWarning on some.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans = kmeans.fit(X_train_scaled)

In [None]:
# Assign every training and test sample to its nearest fitted cluster centre.
train_labels, test_labels = (
    kmeans.predict(split) for split in (X_train_scaled, X_test_scaled)
)

### Using the Silhouette Score 
The Silhouette Score is a measure used to evaluate the quality of clusters created by clustering algorithms. It calculates how similar an object is to its own cluster compared to other clusters. The score ranges from -1 to 1, where a high value indicates that the object is well-matched to its own cluster and poorly matched to neighboring clusters, thus signifying well-separated clusters. This metric is particularly useful for assessing the effectiveness of clustering algorithms like K-Means on a dataset.

In [None]:
# Score the cluster assignments of each split separately (training first, then test).
silhouette_train, silhouette_test = (
    silhouette_score(points, assignments)
    for points, assignments in (
        (X_train_scaled, train_labels),
        (X_test_scaled, test_labels),
    )
)

print("Silhouette Score for training set: ", silhouette_train)
print("Silhouette Score for test set: ", silhouette_test)