In [37]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import auc, precision_recall_curve, accuracy_score, precision_score, recall_score, roc_auc_score
import os
import glob
from src.helpers import trainer_factory

folder_path = os.getcwd() + "/data/csvs"
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))


def eval_model(model, X_test, y_test):
    score = {}
    predictions = model.predict(X_test)

    predictions[predictions == -1] = 0

    print(pd.DataFrame(predictions).value_counts())

    precision, recall, _ = precision_recall_curve(y_test, predictions)
    score["auprc"] = auc(recall, precision)
    score["auc_roc"] = roc_auc_score(y_test, predictions)
    score["precision"] = precision_score(y_test, predictions)
    score["recall"] = recall_score(y_test, predictions)
    score["accuracy"] = accuracy_score(y_test, predictions)

    return score

print("Setup Complete")

Setup Complete


In [38]:
dfs = []

for file in csv_files:
    filename = file.split("/")[-1]
    df = pd.read_csv(file)
    df["Source File"] = filename
    dfs.append(df)

captures = pd.concat(dfs)

In [39]:
captures.columns

Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', '

In [40]:
captures.rename(columns={name:name.strip() for name in captures.columns}, inplace=True)

In [41]:
captures.shape

(2830743, 80)

In [42]:

captures["Label"].unique()


array(['BENIGN', 'Web Attack � Brute Force', 'Web Attack � XSS',
       'Web Attack � Sql Injection', 'DDoS', 'DoS slowloris',
       'DoS Slowhttptest', 'DoS Hulk', 'DoS GoldenEye', 'Heartbleed',
       'FTP-Patator', 'SSH-Patator', 'Bot', 'Infiltration', 'PortScan'],
      dtype=object)

In [43]:
captures["Label"] = captures["Label"] == 'BENIGN'

In [44]:
captures.dropna(inplace=True)

In [45]:
le = LabelEncoder()

captures["Label"] = le.fit_transform(captures["Label"])

This Next Block is important to note. The Isolation forest cannot deal with numbers that cannot be stored in float32s in its current state. Those must be delt with.

In [46]:
captures = captures[captures["Flow Bytes/s"] != np.inf]

In [47]:
captures.shape

(2827876, 80)

In [48]:
captures["Source File"].unique()

array(['Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
       'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
       'Wednesday-workingHours.pcap_ISCX.csv',
       'Monday-WorkingHours.pcap_ISCX.csv',
       'Tuesday-WorkingHours.pcap_ISCX.csv',
       'Friday-WorkingHours-Morning.pcap_ISCX.csv',
       'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
       'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv'],
      dtype=object)

In [49]:
features = captures.columns.drop(["Label", "Source File"])

X_norm = captures[captures["Source File"] == "Monday-WorkingHours.pcap_ISCX.csv"][features]
y_norm = captures[captures["Source File"] == "Monday-WorkingHours.pcap_ISCX.csv"]["Label"]


In [50]:
isoforest_trainer = trainer_factory(IsolationForest, X_norm, y_norm)

In [51]:
X_attack = captures[captures["Source File"] != "Monday-WorkingHours.pcap_ISCX.csv"][features]
y_attack = captures[captures["Source File"] != "Monday-WorkingHours.pcap_ISCX.csv"]["Label"]

In [52]:
isoforest = isoforest_trainer(random_state=0)

eval_model(isoforest, X_attack, y_attack)

0
1    1973900
0     324495
Name: count, dtype: int64


{'auprc': 0.8954349827165388,
 'auc_roc': 0.6233985633983208,
 'precision': 0.8105861492476822,
 'recall': 0.9185785827507594,
 'accuracy': 0.7756229890858621}

The OCSVM took about an hour to trian and did not perform half as well as the isolation forest.
{'auc': 0.996731548529342,
 'precision': 0.9997661037563736,
 'recall': 0.5087117966295345,
 'accuracy': 0.5148973166032238}

In [53]:
# ocsvm_trainer = trainer_factory(OneClassSVM, X_train, y_train)

In [54]:
# ocsvm, _ = ocsvm_trainer()

# eval_model(ocsvm, X_test, y_test)