In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# (Ignore) Combining the data files

In [2]:
DATA_DIR = "data/"
# benign = pd.read_csv(DATA_DIR + "Benign Traffic.csv")
# ddos_icmp = pd.read_csv(DATA_DIR + "DDoS ICMP Flood.csv")
# ddos_udp = pd.read_csv(DATA_DIR + "DDoS UDP Flood.csv")
# dos_icmp = pd.read_csv(DATA_DIR + "DoS ICMP Flood.csv")
# dos_tcp = pd.read_csv(DATA_DIR + "DoS TCP Flood.csv")
# dos_udp = pd.read_csv(DATA_DIR + "DoS UDP Flood.csv")
# mitm_arp = pd.read_csv(DATA_DIR + "MITM ARP Spoofing.csv")
# ddos_publish = pd.read_csv(DATA_DIR + "MQTT DDoS Publish Flood.csv")
# dos_connect = pd.read_csv(DATA_DIR + "MQTT DoS Connect Flood.csv")
# dos_publish = pd.read_csv(DATA_DIR + "MQTT DoS Publish Flood.csv")
# mqtt_malformed = pd.read_csv(DATA_DIR + "MQTT Malformed.csv")
# os_scan = pd.read_csv(DATA_DIR + "Recon OS Scan.csv")
# ping_sweep = pd.read_csv(DATA_DIR + "Recon Ping Sweep.csv")
# port_scan = pd.read_csv(DATA_DIR + "Recon Port Scan.csv")
# vuln_scan = pd.read_csv(DATA_DIR + "Recon Vulnerability Scan.csv")

# # Combine all dataframes into a single dataframe and export to parquet
# all_data = pd.concat([
#     benign,
#     ddos_icmp,
#     ddos_udp,
#     dos_icmp,
#     dos_tcp,
#     dos_udp,
#     mitm_arp,
#     ddos_publish,
#     dos_connect,
#     dos_publish,
#     mqtt_malformed,
#     os_scan,
#     ping_sweep,
#     port_scan,
#     vuln_scan
# ], ignore_index=True)
# all_data.to_parquet(DATA_DIR + "combined_traffic.parquet", index=False)

# Initial Data Exploration

In [3]:
data = pd.read_parquet(DATA_DIR + "combined_traffic.parquet")

In [4]:
RANDOM_SEED = 42

In [6]:
# Counts for each type of attack
data["Attack Name"].value_counts()

Attack Name
DoS TCP Flood               2106916
Recon Port Scan              485522
MQTT DDoS Publish Flood      413913
MQTT DoS Connect Flood       238031
Recon OS Scan                 85317
Benign Traffic                32620
Recon Vulnerability Scan       8321
DoS UDP Flood                  3115
DDoS UDP Flood                 2576
DDoS ICMP Flood                2552
MQTT Malformed                 2246
DoS ICMP Flood                 2107
MITM ARP Spoofing              1053
MQTT DoS Publish Flood          953
Recon Ping Sweep                 71
Name: count, dtype: int64

**Note**: Distribution for each type of attack & benign wildly varies. There's only 32620 benign, but orders of magnitude more for DoS TCP Flood. We may want to take subsets to have a better distribution and not have crazy class imbalance which can cause issues with biased predictions. 

# Create our main dataset from sampling subsets
We essentially want a roughly 50/50 split between benign and malicious traffic. We can take all the data for attacks that don't have many samples (< 3000) and then randomly sample 3000 samples for the others that do.

In [7]:
recon_ping_sweep = data[data["Attack Name"] == "Recon Ping Sweep"] # 71
mqtt_dos_publish_flood = data[data["Attack Name"] == "MQTT DoS Publish Flood"] # 953
mitm_arp_spoofing = data[data["Attack Name"] == "MITM ARP Spoofing"] # 1053
dos_icmp_flood = data[data["Attack Name"] == "DoS ICMP Flood"] # 2107
mqtt_malformed = data[data["Attack Name"] == "MQTT Malformed"] # 2246
ddos_icmp_flood = data[data["Attack Name"] == "DDoS ICMP Flood"] # 2552
ddos_udp_flood = data[data["Attack Name"] == "DDoS UDP Flood"] # 2576

In [8]:
benign_traffic = data[data["Attack Name"] == "Benign Traffic"]
benign_traffic.shape[0]

32620

In [9]:
dos_udp_flood = data[data["Attack Name"] == "DoS UDP Flood"].sample(n=3000, random_state=RANDOM_SEED)
recon_vuln_scan = data[data["Attack Name"] == "Recon Vulnerability Scan"].sample(n=3000, random_state=RANDOM_SEED)
mqtt_dos_connect_flood = data[data["Attack Name"] == "MQTT DoS Connect Flood"].sample(n=3000, random_state=RANDOM_SEED)
mqtt_ddos_publish_flood = data[data["Attack Name"] == "MQTT DDoS Publish Flood"].sample(n=3000, random_state=RANDOM_SEED)
recon_port_scan = data[data["Attack Name"] == "Recon Port Scan"].sample(n=3000, random_state=RANDOM_SEED)
dos_tcp_flood = data[data["Attack Name"] == "DoS TCP Flood"].sample(n=3000, random_state=RANDOM_SEED)

In [10]:
sampled_data = pd.concat([
    benign_traffic,
    recon_ping_sweep,
    mqtt_dos_publish_flood,
    mitm_arp_spoofing,
    dos_icmp_flood,
    mqtt_malformed,
    ddos_icmp_flood,
    ddos_udp_flood,
    dos_udp_flood,
    recon_vuln_scan,
    mqtt_dos_connect_flood,
    mqtt_ddos_publish_flood,
    recon_port_scan,
    dos_tcp_flood
])
sampled_data.shape

(62178, 85)

In [11]:
sampled_data.to_parquet(DATA_DIR + "sampled_traffic.parquet", index=False)