In [1]:
"""
This extracts a test set. Different days for attack and benign than the training days.
"""
import numpy as np
import pandas as pd
import dask.dataframe as dd # get dask with: pip install "dask[complete]"
from os import listdir
from os.path import isfile, join
import os

Since the original dataset files are too big to be loaded into memory, and since the benign cases are spread out across all the parts, we extract what we need from all of them to prepare our extract.

In [11]:
# Assumes we've placed program in root of dataset folder, where both days are subdirectories.
path1 = os.getcwd() + "/01-12"
path2 = os.getcwd() + "/03-11"

In [12]:
# Get list of files in each directory.
fileList = [path1 + '/' + f for f in listdir(path1) if isfile(join(path1, f))]
fileList2 = [path2 + '/' + f for f in listdir(path2) if isfile(join(path2, f))]

In [13]:
"""
This function extracts an equal amount of benign and attack data for each day and forms the train/test sets.

Needs Dask due to how large many of the datasets are.
"""
def extract(fileList):

    print("Beginning extraction of benign data...")
    # First pass aggregates BENIGN data.
    i = 0
    outFrame = None
    for f in fileList:
        df = dd.read_csv(f, dtype={'SimillarHTTP': 'object'})
        lind = f.rfind('/')
        slind = f.rfind('/', 0,lind-1)
        print(f[slind:])
        #print(f"{df[' Label'].value_counts().compute()}\n")

        # Pull benign samples out and get their quantity.
        ben_df = df[df[' Label'] == 'BENIGN']

        if i == 0:
            outFrame = ben_df
            i += 1

        else:    
            outFrame = dd.concat([outFrame, ben_df])

        #print(f"Aggregated df stats: {outFrame[' Label'].value_counts().compute()}\n")

    # Get the total count of BENIGN samples that we've aggregated.
    shben = outFrame.shape
    ben_num = shben[0].compute()
    print(f"Number of Benign samples aggregated: {ben_num}")

    # Get an upper bound on the number of attack samples to extract. Goal is to get enough to be able to make them all equal after cleaning.
    ben_num = 100000
    
    print("Beginning extraction of attack data...")

    # Second pass pulls out attack data equal in quantity to the count of benign data that we've gathered.
    for f in fileList:
        df = dd.read_csv(f, dtype={'SimillarHTTP': 'object'})
        lind = f.rfind('/')
        slind = f.rfind('/', 0,lind-1)
        print(f[slind:])
  
        # Get list of label categories
        cats = list(set(list(df[' Label'])))

        # Go through non-benign categories and add a subsample equal in quantity to the number of benign samples.
        for cat in cats:

            if cat != "BENIGN":

                print(f"Appending data for {cat}...")
                # Find our how many samples we have
                cat_df = df[df[' Label'] == cat]
                shcat = cat_df.shape
                cat_num = shcat[0].compute()

                # Get a sample if there is equal/more attack data, otherwise add everything.
                # dask only does approximate sampling, so we'll have to do some post-processing to make the numbers exactly equal.
                if cat_num > ben_num:
                    ret_df = cat_df.sample(frac = ben_num / cat_num)

                else:
                    ret_df = cat_df

                outFrame = dd.concat([outFrame, ret_df])
                #print(f"Aggregated df stats: {outFrame[' Label'].value_counts().compute()}\n")

    # Return our aggregated DataFrame
    print(f"Aggregated df stats: {outFrame[' Label'].value_counts().compute()}\n")
    print("Extraction completed.")
    return outFrame
    


In [14]:
train = extract(fileList)
test = extract(fileList2)

Beginning extraction of benign data...
/01-12/DrDoS_DNS.csv
/01-12/DrDoS_LDAP.csv
/01-12/DrDoS_MSSQL.csv
/01-12/DrDoS_NetBIOS.csv
/01-12/DrDoS_NTP.csv
/01-12/DrDoS_SNMP.csv
/01-12/DrDoS_SSDP.csv
/01-12/DrDoS_UDP.csv
/01-12/Syn.csv
/01-12/TFTP.csv
/01-12/UDPLag.csv
Number of Benign samples aggregated: 56863
Beginning extraction of attack data...
/01-12/DrDoS_DNS.csv
Appending data for DrDoS_DNS...
/01-12/DrDoS_LDAP.csv
Appending data for DrDoS_LDAP...
/01-12/DrDoS_MSSQL.csv
Appending data for DrDoS_MSSQL...
/01-12/DrDoS_NetBIOS.csv
Appending data for DrDoS_NetBIOS...
/01-12/DrDoS_NTP.csv
Appending data for DrDoS_NTP...
/01-12/DrDoS_SNMP.csv
Appending data for DrDoS_SNMP...
/01-12/DrDoS_SSDP.csv
Appending data for DrDoS_SSDP...
/01-12/DrDoS_UDP.csv
Appending data for DrDoS_UDP...
/01-12/Syn.csv
Appending data for Syn...
/01-12/TFTP.csv
Appending data for TFTP...
/01-12/UDPLag.csv
Appending data for UDP-lag...
Appending data for WebDDoS...
Aggregated df stats: DrDoS_DNS        100001
DrDo

Save extracts, then reload them to finish processing.

In [15]:
dd.to_csv(df = train, filename = "day1_v2.csv", single_file=True)

['c:\\Users\\icarus\\Documents\\school\\Fall 2022\\CSI 5388\\dataset\\day1_v2.csv']

In [16]:
dd.to_csv(df = test, filename = "day2_v2.csv", single_file=True)

['c:\\Users\\icarus\\Documents\\school\\Fall 2022\\CSI 5388\\dataset\\day2_v2.csv']

In [2]:
df1 = pd.read_csv("day1_v2.csv")
df2 = pd.read_csv("day2_v2.csv")

  df1 = pd.read_csv("day1_v2.csv")
  df2 = pd.read_csv("day2_v2.csv")


In [3]:
print(df1[" Label"].value_counts())
print(df2[" Label"].value_counts())

DrDoS_DNS        100001
DrDoS_LDAP       100001
DrDoS_SNMP       100001
DrDoS_SSDP       100001
DrDoS_MSSQL      100000
DrDoS_NetBIOS    100000
DrDoS_NTP        100000
Syn              100000
UDP-lag          100000
DrDoS_UDP         99999
TFTP              99999
BENIGN            56863
WebDDoS             439
Name:  Label, dtype: int64
Syn        200005
UDP        199999
NetBIOS    199998
MSSQL      124392
LDAP       109932
Portmap    100000
BENIGN      56965
UDPLag       1873
Name:  Label, dtype: int64


Cleaning

In [4]:
# Remove socket information and timestamp
drop_cols = ['Flow ID', ' Source IP', ' Source Port', ' Destination IP',' Destination Port', ' Timestamp']

df1 = df1.drop(drop_cols, axis=1)
df2 = df2.drop(drop_cols, axis=1)

# remove infinity values
df1 = df1.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

# replace invalid SimilarHTTP values with NaN
for col in df1.columns:
    if col != " Label":
        df1[col] = pd.to_numeric(df1[col], errors='coerce')

# Repeat with multiclass dataset
df2 = df2.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

for col in df2.columns:
    if col != " Label":
        df2[col] = pd.to_numeric(df2[col], errors='coerce')

# drop remaining NaN values from both
df1 = df1.dropna(axis=0)
df2 = df2.dropna(axis=0)

In [5]:
print(df1[" Label"].value_counts())
print(df2[" Label"].value_counts())

DrDoS_SNMP       99788
DrDoS_NTP        99404
DrDoS_UDP        98687
DrDoS_SSDP       98400
DrDoS_LDAP       98268
DrDoS_MSSQL      97266
TFTP             97162
DrDoS_DNS        96801
DrDoS_NetBIOS    96793
UDP-lag          90049
Syn              87308
BENIGN           54369
WebDDoS            322
Name:  Label, dtype: int64
UDP        197434
NetBIOS    190701
Syn        184986
MSSQL      120069
LDAP       107638
Portmap     94849
BENIGN      54581
UDPLag       1873
Name:  Label, dtype: int64


We will now balance the samples and create our binary and multiclass datasets

In [6]:
# Get benign samples from Day 2 and cull them to the lowest number of attack samples found in Day 1 (excluding webddos)
day2ben = df1[df1[" Label"] == "BENIGN"].copy()
day2ben = day2ben.sample(n=50000,random_state=42)

In [7]:
# get list of labels so we can pull an equal number of samples from each.
cols = df2[" Label"].unique()
print(df2[" Label"].unique())

['BENIGN' 'LDAP' 'NetBIOS' 'MSSQL' 'Portmap' 'Syn' 'UDP' 'UDPLag']


In [9]:
# Assemble equal number of samples. Will be restructured in multiclass and binary datasets.
outMult = None

for val in cols:

    # too small, so we drop it
    if val == 'UDPLag':
        slice = df2[df2[" Label"] == val].copy()

    # replace benign with day 2.
    elif val == "BENIGN":
        slice = day2ben

    # get all attacks of this type and cull to appropriate number
    else:
        slice = df2[df2[" Label"] == val].copy()
        slice = slice.sample(n=50000,random_state=42) # Ensure equal sample number

    # add to output
    if outMult is None:
        outMult = slice
    else:
        outMult = pd.concat([outMult, slice])

print(outMult[" Label"].value_counts())

BENIGN     50000
LDAP       50000
NetBIOS    50000
MSSQL      50000
Portmap    50000
Syn        50000
UDP        50000
UDPLag      1873
Name:  Label, dtype: int64


In [11]:
outBin = outMult.drop(["Unnamed: 0.1", "Unnamed: 0"], axis= 1)
outBin.head()

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label
22951,6,665498,48,59,4430.0,9674.0,517.0,0.0,92.291667,139.362829,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN
39086,17,14,4,0,804.0,0.0,201.0,201.0,201.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,BENIGN
53068,17,20912,2,2,64.0,204.0,32.0,32.0,32.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN
43265,6,152,1,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,BENIGN
45432,6,1,2,0,31.0,0.0,31.0,0.0,15.5,21.92031,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN


In [12]:
outBin.to_csv("testing_set.csv", index=False)