In [1]:
"""
This was designed to get a bigger extract of attack data to see if it improves our Multiclass results.

Author: Wesley
"""
import numpy as np
import pandas as pd
import dask.dataframe as dd # get dask with: pip install "dask[complete]"
from os import listdir
from os.path import isfile, join
import os

The original dataset files are too big to load into memory, and the benign cases are spread across all of the parts, so we pull what we need from each file to build our extract.

In [2]:
# The notebook is expected to run from the dataset root, which holds one sub-folder per capture day.
# Paths are joined with '/' on purpose: later cells locate the folder name by searching for '/'.
path1 = f"{os.getcwd()}/01-12"
path2 = f"{os.getcwd()}/03-11"

In [3]:
# Collect the full path of every regular file (sub-directories are skipped) in each day's folder.
fileList = [f"{path1}/{name}" for name in listdir(path1) if isfile(join(path1, name))]
fileList2 = [f"{path2}/{name}" for name in listdir(path2) if isfile(join(path2, name))]

In [4]:
"""
Made this to see what was going on so I knew how much data to yank out.
"""
def peek(fileList):
    """
    Print the per-label row counts of every CSV file in ``fileList``.

    Each file is opened with Dask, so it never has to fit in memory at once.
    Nothing is aggregated or returned; the function only prints.

    Parameters
    ----------
    fileList : list of str
        Paths of the CSV files to inspect. Paths are expected to use '/'
        separators, as produced by the file-list cell of this notebook.
    """
    # Message text is kept unchanged so it matches the recorded notebook output.
    print("Beginning extraction of benign data...")

    for f in fileList:
        # 'SimillarHTTP' is forced to object dtype when the file is read.
        df = dd.read_csv(f, dtype={'SimillarHTTP': 'object'})

        # Show only "/<parent folder>/<file name>": cut at the second-to-last '/'.
        lind = f.rfind('/')
        slind = f.rfind('/', 0,lind-1)
        print(f[slind:])

        # .compute() triggers the actual scan of the file.
        print(f"{df[' Label'].value_counts().compute()}\n")

In [5]:
# Print the per-label counts for every file in the 01-12 folder, then the 03-11 folder.
peek(fileList)
peek(fileList2)

Beginning extraction of benign data...
/01-12/DrDoS_DNS.csv
DrDoS_DNS    5071011
BENIGN          3402
Name:  Label, dtype: int64

/01-12/DrDoS_LDAP.csv
DrDoS_LDAP    2179930
BENIGN           1612
Name:  Label, dtype: int64

/01-12/DrDoS_MSSQL.csv
DrDoS_MSSQL    4522492
BENIGN            2006
Name:  Label, dtype: int64

/01-12/DrDoS_NetBIOS.csv
DrDoS_NetBIOS    4093279
BENIGN              1707
Name:  Label, dtype: int64

/01-12/DrDoS_NTP.csv
DrDoS_NTP    1202642
BENIGN         14365
Name:  Label, dtype: int64

/01-12/DrDoS_SNMP.csv
DrDoS_SNMP    5159870
BENIGN           1507
Name:  Label, dtype: int64

/01-12/DrDoS_SSDP.csv
DrDoS_SSDP    2610611
BENIGN            763
Name:  Label, dtype: int64

/01-12/DrDoS_UDP.csv
DrDoS_UDP    3134645
BENIGN          2157
Name:  Label, dtype: int64

/01-12/Syn.csv
Syn       1582289
BENIGN        392
Name:  Label, dtype: int64

/01-12/TFTP.csv
TFTP      20082580
BENIGN       25247
Name:  Label, dtype: int64

/01-12/UDPLag.csv
UDP-lag    366461
BENIGN   

In [8]:
"""
This function extracts attack data for day 1.

Needs Dask due to how large many of the datasets are.
"""
def extract(fileList, max_per_class=366000, random_state=None):
    """
    Build a Dask DataFrame holding a capped subsample of every attack class.

    Needs Dask due to how large many of the datasets are.

    For each file, every label other than "BENIGN" is collected. A class with
    more than ``max_per_class`` rows is down-sampled to roughly that many rows
    (Dask sampling is fraction-based, so the count is approximate and has to be
    made exact in post-processing); smaller classes are kept in full.

    Parameters
    ----------
    fileList : list of str
        Paths of the CSV files to scan. Paths are expected to use '/'
        separators, as produced by the file-list cell of this notebook.
    max_per_class : int, default 366000
        Upper bound on the number of rows kept per attack class. The goal is to
        keep enough rows that all classes can be made equal after cleaning.
    random_state : int or None, default None
        Seed forwarded to ``sample`` so the extract can be reproduced.
        ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    dask.dataframe.DataFrame
        Concatenation of the per-class subsamples.

    Raises
    ------
    ValueError
        If no attack rows were found (e.g. ``fileList`` is empty).
    """
    print("Beginning extraction of attack data...")

    # Per-class pieces are collected here and concatenated once at the end.
    pieces = []

    for f in fileList:
        df = dd.read_csv(f, dtype={'SimillarHTTP': 'object'})

        # Show only "/<parent folder>/<file name>": cut at the second-to-last '/'.
        lind = f.rfind('/')
        slind = f.rfind('/', 0,lind-1)
        print(f[slind:])

        # One pass over the file yields both the label categories and their row
        # counts, instead of materialising every label and re-scanning per class.
        label_counts = df[' Label'].value_counts().compute()

        for cat, cat_num in label_counts.items():

            if cat == "BENIGN":
                continue

            print(f"Appending data for {cat}...")
            cat_df = df[df[' Label'] == cat]
            cat_num = int(cat_num)

            # Down-sample only when the class exceeds the cap, otherwise keep everything.
            if cat_num > max_per_class:
                ret_df = cat_df.sample(frac = max_per_class / cat_num, random_state = random_state)
            else:
                ret_df = cat_df

            pieces.append(ret_df)

    if not pieces:
        raise ValueError("No attack samples found in the supplied files.")

    outFrame = dd.concat(pieces)

    # Return our aggregated DataFrame
    print(f"Aggregated df stats: {outFrame[' Label'].value_counts().compute()}\n")
    print("Extraction completed.")
    return outFrame
    


In [9]:
# Day 1: build the capped attack extract from the 01-12 files (result is a Dask DataFrame).
train = extract(fileList)

Beginning extraction of attack data...
/01-12/DrDoS_DNS.csv
Appending data for DrDoS_DNS...
/01-12/DrDoS_LDAP.csv
Appending data for DrDoS_LDAP...
/01-12/DrDoS_MSSQL.csv
Appending data for DrDoS_MSSQL...
/01-12/DrDoS_NetBIOS.csv
Appending data for DrDoS_NetBIOS...
/01-12/DrDoS_NTP.csv
Appending data for DrDoS_NTP...
/01-12/DrDoS_SNMP.csv
Appending data for DrDoS_SNMP...
/01-12/DrDoS_SSDP.csv
Appending data for DrDoS_SSDP...
/01-12/DrDoS_UDP.csv
Appending data for DrDoS_UDP...
/01-12/Syn.csv
Appending data for Syn...
/01-12/TFTP.csv
Appending data for TFTP...
/01-12/UDPLag.csv
Appending data for UDP-lag...
Appending data for WebDDoS...
Aggregated df stats: DrDoS_SNMP       366003
DrDoS_DNS        366001
DrDoS_NTP        366001
DrDoS_NetBIOS    366001
DrDoS_UDP        366001
Syn              366001
DrDoS_MSSQL      366000
UDP-lag          366000
DrDoS_LDAP       365998
DrDoS_SSDP       365998
TFTP             365998
WebDDoS             439
Name:  Label, dtype: int64

Extraction completed

Save extracts, then reload them to finish processing.

In [10]:
# Write the Dask extract to a single CSV file (single_file=True) so pandas can reload it in the next cell.
dd.to_csv(df = train, filename = "day1_bigAttack.csv", single_file=True)

['c:\\Users\\icarus\\Documents\\school\\Fall 2022\\CSI 5388\\dataset\\day1_bigAttack.csv']

In [11]:
# Read 'SimillarHTTP' as object, exactly as the Dask reads above do, so the column is not
# parsed chunk by chunk into mixed types (presumably the source of the warning this cell emitted).
# Non-numeric values in it are coerced to NaN during cleaning.
df1 = pd.read_csv("day1_bigAttack.csv", dtype={'SimillarHTTP': 'object'})

  df1 = pd.read_csv("day1_bigAttack.csv")


In [12]:
# Class counts right after reloading the saved extract.
print(df1[" Label"].value_counts())

DrDoS_SNMP       366003
DrDoS_DNS        366001
DrDoS_NetBIOS    366001
DrDoS_NTP        366001
DrDoS_UDP        366001
Syn              366001
DrDoS_MSSQL      366000
UDP-lag          366000
DrDoS_LDAP       365998
DrDoS_SSDP       365998
TFTP             365998
WebDDoS             439
Name:  Label, dtype: int64


Cleaning

In [13]:
# Remove socket information and timestamp
drop_cols = ['Flow ID', ' Source IP', ' Source Port', ' Destination IP',' Destination Port', ' Timestamp']

df1 = df1.drop(drop_cols, axis=1)

# Coerce every feature column to numeric first; values that cannot be parsed become NaN.
# This has to happen before the infinity filter: an infinity stored as text in an
# object-dtype column only turns into a real +/-inf once it has been converted.
for col in df1.columns:
    if col != " Label":
        df1[col] = pd.to_numeric(df1[col], errors='coerce')

# Treat +/-inf as missing, then drop every row with a missing value in one pass.
df1 = df1.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

In [14]:
# Class counts after cleaning, to see how many rows each class lost.
print(df1[" Label"].value_counts())

DrDoS_SNMP       365218
DrDoS_NTP        363719
DrDoS_UDP        361324
DrDoS_SSDP       360093
DrDoS_LDAP       359373
DrDoS_MSSQL      355888
TFTP             355534
DrDoS_NetBIOS    354327
DrDoS_DNS        354276
UDP-lag          329659
Syn              319212
WebDDoS             322
Name:  Label, dtype: int64


We will now balance the samples and create our binary and multiclass datasets

In [15]:
# Distinct labels, in order of first appearance; the balancing cell iterates over these.
cols = df1[" Label"].unique()
print(cols)

['DrDoS_DNS' 'DrDoS_LDAP' 'DrDoS_MSSQL' 'DrDoS_NetBIOS' 'DrDoS_NTP'
 'DrDoS_SNMP' 'DrDoS_SSDP' 'DrDoS_UDP' 'Syn' 'TFTP' 'UDP-lag' 'WebDDoS']


In [16]:
# Assemble equal number of samples. Will be restructured in multiclass and binary datasets.
# 'WebDDoS' is too small to balance against the other classes, so it is dropped.
# Every other class is culled to exactly 319000 rows with a fixed seed; each class is
# sampled independently so the selected rows are reproducible per class.
# All pieces are concatenated once instead of growing the frame inside a loop.
outMult = pd.concat(
    [
        df1[df1[" Label"] == val].sample(n=319000, random_state=42)
        for val in cols
        if val != 'WebDDoS'
    ]
)

print(outMult[" Label"].value_counts())

DrDoS_DNS        319000
DrDoS_LDAP       319000
DrDoS_MSSQL      319000
DrDoS_NetBIOS    319000
DrDoS_NTP        319000
DrDoS_SNMP       319000
DrDoS_SSDP       319000
DrDoS_UDP        319000
Syn              319000
TFTP             319000
UDP-lag          319000
Name:  Label, dtype: int64


In [17]:
# Keep only non-benign rows to get our multiclass dataset.
# .copy() makes the result an independent frame, so the label assignment below
# does not write into a slice of outMult (SettingWithCopyWarning pattern).
multiclass = outMult[outMult[" Label"] != "BENIGN"].copy()

# Remove the annoying DrDoS prefix from labels (literal, non-regex replacement).
multiclass[" Label"] = multiclass[" Label"].str.replace("DrDoS_", "", regex=False)

print(multiclass[" Label"].value_counts())

DNS        319000
LDAP       319000
MSSQL      319000
NetBIOS    319000
NTP        319000
SNMP       319000
SSDP       319000
UDP        319000
Syn        319000
TFTP       319000
UDP-lag    319000
Name:  Label, dtype: int64


In [18]:
# Discard the two leftover "Unnamed" columns, then preview the result.
multiclass = multiclass.drop(columns=["Unnamed: 0.1", "Unnamed: 0"])
multiclass.head()

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label
173758,17,1,2,0,2896.0,0.0,1448.0,1448.0,1448.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DNS
229441,17,1,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DNS
92964,17,1,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DNS
26856,17,45,2,0,2930.0,0.0,1465.0,1465.0,1465.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DNS
10704,17,229,2,0,2896.0,0.0,1448.0,1448.0,1448.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DNS


In [19]:
# Persist the balanced multiclass dataset; index=False keeps the row index out of the file.
multiclass.to_csv("multiclass.csv", index=False)

In [20]:

# Final check of the class balance in the saved dataset.
print(multiclass[" Label"].value_counts())

DNS        319000
LDAP       319000
MSSQL      319000
NetBIOS    319000
NTP        319000
SNMP       319000
SSDP       319000
UDP        319000
Syn        319000
TFTP       319000
UDP-lag    319000
Name:  Label, dtype: int64
