In [2]:
"""
This extracts data from the dataset files since they are too big to all fit in memory.

Assumes the original dataset folders are located in the same directory in their original form.

Author: Wesley
"""
import numpy as np
import pandas as pd
import dask.dataframe as dd # get dask with: pip install "dask[complete]"
from os import listdir
from os.path import isfile, join
import os

Because the original dataset files are too large to load into memory at once, and the benign samples are spread across all of the files, we process each file separately and pull what we need from every one of them to build our extract.

In [38]:
# The notebook is expected to run from the dataset root, which holds one sub-folder per day.
dataset_root = os.getcwd()
path1 = dataset_root + "/01-12"
path2 = dataset_root + "/03-11"

In [23]:
# Collect the regular files (sub-directories are skipped) found directly inside each day folder.
fileList, fileList2 = (
    [folder + '/' + entry for entry in listdir(folder) if isfile(join(folder, entry))]
    for folder in (path1, path2)
)

In [27]:
def purge(fileList):
    """
    Clean every raw CSV in ``fileList`` and write ``<name>_cleaned.csv`` next to it.

    For each file this drops the socket/timestamp columns, coerces every
    non-label column to numeric, removes rows containing NaN or +/-infinity,
    drops the ``Unnamed: 0`` column when present, removes duplicate rows,
    writes the result without the index and prints the label counts.

    Paths containing ``TFTP.csv`` are skipped (that file is too big to load in
    one go), and so are paths ending in ``_cleaned.csv`` so that re-running the
    cell does not try to clean its own previous output.

    Parameters
    ----------
    fileList : iterable of str
        Paths of the CSV files to clean.

    Returns
    -------
    None
    """
    # Socket information and timestamp.
    drop_cols = ['Flow ID', ' Source IP', ' Source Port', ' Destination IP',' Destination Port', ' Timestamp']

    for f in fileList:

        # Bypass the one that's too big.
        if 'TFTP.csv' in f:
            continue

        # Outputs of a previous run no longer have the raw columns; skip them.
        if f.endswith("_cleaned.csv"):
            continue

        df = pd.read_csv(f, dtype={'SimillarHTTP': 'object'})
        df = df.drop(drop_cols, axis=1)

        # Coerce first: invalid entries (e.g. in SimillarHTTP) become NaN and
        # textual infinities become +/-inf, so the single clean-up pass below
        # catches them. Removing infinities before coercing let those through.
        for col in df.columns:
            if col != " Label":
                df[col] = pd.to_numeric(df[col], errors='coerce')

        # Remove infinite and missing values.
        df = df.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

        # The index column is not always present; do not fail when it is missing.
        df = df.drop(columns=["Unnamed: 0"], errors="ignore")

        df = df.drop_duplicates()

        # "<dir>/<file>.csv" -> "<dir>/<file>_cleaned.csv"
        root, _ext = os.path.splitext(f)
        new_path = root + "_cleaned.csv"

        df.to_csv(new_path, index=False)

        print(df[" Label"].value_counts())
        print(f"Wrote {new_path}")

        


In [28]:
# Clean the files of the first day folder, then those of the second.
for day_files in (fileList, fileList2):
    purge(day_files)

DrDoS_DNS    108065
BENIGN         2924
Name:  Label, dtype: int64
Wrote c:\Users\icarus\Documents\school\Fall 2022\CSI 5388\dataset/01-12/DrDoS_DNS_cleaned.csv
DrDoS_LDAP    28839
BENIGN         1359
Name:  Label, dtype: int64
Wrote c:\Users\icarus\Documents\school\Fall 2022\CSI 5388\dataset/01-12/DrDoS_LDAP_cleaned.csv
DrDoS_MSSQL    193584
BENIGN           1810
Name:  Label, dtype: int64
Wrote c:\Users\icarus\Documents\school\Fall 2022\CSI 5388\dataset/01-12/DrDoS_MSSQL_cleaned.csv
DrDoS_NetBIOS    17953
BENIGN            1568
Name:  Label, dtype: int64
Wrote c:\Users\icarus\Documents\school\Fall 2022\CSI 5388\dataset/01-12/DrDoS_NetBIOS_cleaned.csv
DrDoS_NTP    1112274
BENIGN         12724
Name:  Label, dtype: int64
Wrote c:\Users\icarus\Documents\school\Fall 2022\CSI 5388\dataset/01-12/DrDoS_NTP_cleaned.csv
DrDoS_SNMP    111993
BENIGN          1253
Name:  Label, dtype: int64
Wrote c:\Users\icarus\Documents\school\Fall 2022\CSI 5388\dataset/01-12/DrDoS_SNMP_cleaned.csv
DrDoS_SSDP  

In [39]:
def purge_chunk(df):
    """
    Clean one chunk of a raw CSV and return the cleaned frame.

    Drops the socket/timestamp columns, coerces every non-label column to
    numeric, removes rows containing NaN or +/-infinity, drops the
    ``Unnamed: 0`` column when present and removes duplicate rows.
    The frame passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk that still contains the raw socket/timestamp columns.

    Returns
    -------
    pandas.DataFrame
        The cleaned chunk.

    Raises
    ------
    KeyError
        If one of the socket/timestamp columns is missing from ``df``.
    """
    drop_cols = ['Flow ID', ' Source IP', ' Source Port', ' Destination IP',' Destination Port', ' Timestamp']

    df = df.drop(drop_cols, axis=1)

    # Coerce first: invalid entries (e.g. in SimillarHTTP) become NaN and
    # textual infinities become +/-inf, so the single clean-up pass below
    # catches them. Removing infinities before coercing let those through.
    for col in df.columns:
        if col != " Label":
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Remove infinite and missing values.
    df = df.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

    # The index column is not always present; do not fail when it is missing.
    df = df.drop(columns=["Unnamed: 0"], errors="ignore")

    return df.drop_duplicates()


In [40]:
# TFTP.csv is skipped by purge(); read it in chunks of one million rows and clean each chunk.
chunksize = 1_000_000
tftp_source = os.getcwd() + "/01-12/TFTP.csv"
with pd.read_csv(tftp_source, chunksize=chunksize, dtype={'SimillarHTTP': 'object'}) as reader:
    chunklist = [purge_chunk(chunk) for chunk in reader]

# Stitch the cleaned chunks back into a single frame.
tf = pd.concat(chunklist)


In [41]:
# Duplicates were removed per chunk only, so deduplicate again across chunks.
tf = tf.drop_duplicates()
# NOTE(review): unlike purge(), this writes the index as well (no index=False);
# that looks like a source of an extra 'Unnamed: 0' column downstream - confirm.
tftp_target = os.getcwd() + "/01-12/TFTP_cleaned.csv"
tf.to_csv(tftp_target)

In [45]:
# The notebook is expected to run from the dataset root; the "-c" folders are
# read here in place of the raw day folders.
cleaned_root = os.getcwd()
path1 = cleaned_root + "/01-12-c"
path2 = cleaned_root + "/03-11-c"
# Collect the regular files found directly inside each folder.
fileList, fileList2 = (
    [folder + '/' + entry for entry in listdir(folder) if isfile(join(folder, entry))]
    for folder in (path1, path2)
)

In [46]:
"""
This function gathers all benign data plus a capped (approximately sampled) amount of each attack category for one day; the per-day results form the train/test sets.

Needs Dask due to how large many of the datasets are.
"""
def extract(fileList, max_attack_samples=600000, random_state=42):
    """
    Build one lazy Dask frame holding all BENIGN rows plus a capped sample of
    every attack category found in the given CSV files.

    First pass: the BENIGN rows of every file are concatenated and counted.
    Second pass: for every non-BENIGN label of every file, all rows are kept
    when there are at most ``max_attack_samples`` of them; otherwise a random
    fraction ``max_attack_samples / count`` is taken. Dask only samples
    approximately, so the per-label counts still need post-processing to be
    made exactly equal.

    Parameters
    ----------
    fileList : iterable of str
        Paths of the cleaned CSV files of one day.
    max_attack_samples : int, optional
        Upper bound on the number of rows kept per attack label and file.
        Defaults to 600000, the value that was previously hard-coded.
    random_state : int or None, optional
        Seed passed to Dask's ``sample`` so the extract is reproducible.
        Pass ``None`` for unseeded sampling.

    Returns
    -------
    dask.dataframe.DataFrame
        BENIGN rows followed by the (sampled) attack rows.

    Raises
    ------
    ValueError
        If ``fileList`` is empty.
    """
    fileList = list(fileList)
    if not fileList:
        raise ValueError("fileList is empty; nothing to extract.")

    def _short_name(path):
        # "/<folder>/<file>" tail of the path, used for progress output only.
        lind = path.rfind('/')
        slind = path.rfind('/', 0, lind - 1)
        return path[slind:]

    print("Beginning extraction of benign data...")
    # First pass aggregates BENIGN data. The lazy frames are kept so the
    # second pass does not have to open the files again.
    frames = []
    benign_parts = []
    for f in fileList:
        df = dd.read_csv(f, dtype={'SimillarHTTP': 'object'})
        print(_short_name(f))
        frames.append((f, df))
        benign_parts.append(df[df[' Label'] == 'BENIGN'])

    outFrame = dd.concat(benign_parts)

    # Get the total count of BENIGN samples that we've aggregated (informational only).
    ben_num = outFrame.shape[0].compute()
    print(f"Number of Benign samples aggregated: {ben_num}")

    print("Beginning extraction of attack data...")

    # Second pass pulls out attack data, capped at max_attack_samples per label.
    attack_parts = []
    for f, df in frames:
        print(_short_name(f))

        # Distinct labels of this file, computed by Dask instead of iterating every row in Python.
        cats = df[' Label'].unique().compute()

        for cat in cats:
            if cat == "BENIGN":
                continue

            print(f"Appending data for {cat}...")

            # Find out how many samples we have.
            cat_df = df[df[' Label'] == cat]
            cat_num = cat_df.shape[0].compute()

            # Sample when there is more attack data than the cap, otherwise take everything.
            if cat_num > max_attack_samples:
                ret_df = cat_df.sample(frac=max_attack_samples / cat_num, random_state=random_state)
            else:
                ret_df = cat_df

            attack_parts.append(ret_df)

    outFrame = dd.concat([outFrame] + attack_parts)

    # Return our aggregated DataFrame
    print(f"Aggregated df stats: {outFrame[' Label'].value_counts().compute()}\n")
    print("Extraction completed.")
    return outFrame
    


In [47]:
# Day-1 files become the training extract, day-2 files the test extract (evaluated in that order).
train, test = (extract(day_files) for day_files in (fileList, fileList2))

Beginning extraction of benign data...
/01-12-c/DrDoS_DNS_cleaned.csv
/01-12-c/DrDoS_LDAP_cleaned.csv
/01-12-c/DrDoS_MSSQL_cleaned.csv
/01-12-c/DrDoS_NetBIOS_cleaned.csv
/01-12-c/DrDoS_NTP_cleaned.csv
/01-12-c/DrDoS_SNMP_cleaned.csv
/01-12-c/DrDoS_SSDP_cleaned.csv
/01-12-c/DrDoS_UDP_cleaned.csv
/01-12-c/Syn_cleaned.csv
/01-12-c/TFTP_cleaned.csv
/01-12-c/UDPLag_cleaned.csv
Number of Benign samples aggregated: 50111
Beginning extraction of attack data...
/01-12-c/DrDoS_DNS_cleaned.csv
Appending data for DrDoS_DNS...
/01-12-c/DrDoS_LDAP_cleaned.csv
Appending data for DrDoS_LDAP...
/01-12-c/DrDoS_MSSQL_cleaned.csv
Appending data for DrDoS_MSSQL...
/01-12-c/DrDoS_NetBIOS_cleaned.csv
Appending data for DrDoS_NetBIOS...
/01-12-c/DrDoS_NTP_cleaned.csv
Appending data for DrDoS_NTP...
/01-12-c/DrDoS_SNMP_cleaned.csv
Appending data for DrDoS_SNMP...
/01-12-c/DrDoS_SSDP_cleaned.csv
Appending data for DrDoS_SSDP...
/01-12-c/DrDoS_UDP_cleaned.csv
Appending data for DrDoS_UDP...
/01-12-c/Syn_cleaned.

Save extracts, then reload them to finish processing.

In [48]:
# Materialise the lazy day-1 extract into one CSV file; the list of written paths is the cell output.
train.to_csv("day1_v5.csv", single_file=True)

['c:\\Users\\icarus\\Documents\\school\\Fall 2022\\CSI 5388\\dataset\\day1_v5.csv']

In [49]:
# Materialise the lazy day-2 extract into one CSV file; the list of written paths is the cell output.
test.to_csv("day2_v5.csv", single_file=True)

['c:\\Users\\icarus\\Documents\\school\\Fall 2022\\CSI 5388\\dataset\\day2_v5.csv']

In [3]:
# Reload both saved extracts as in-memory pandas frames (day 1 -> df1, day 2 -> df2).
df1, df2 = (pd.read_csv(extract_file) for extract_file in ("day1_v5.csv", "day2_v5.csv"))

In [4]:
# Preview the first five rows of the day-1 extract.
df1.head(n=5)

Unnamed: 0.2,Unnamed: 0.1,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label,Unnamed: 0
0,30,6,40335006,9,10,8.0,62.0,1.0,0.0,0.888889,...,90287.0,90185.0,9993447.0,40495.753715,10018634.0,9933709.0,0.0,0,BENIGN,
1,52,0,113244633,56,0,0.0,0.0,0.0,0.0,0.0,...,2978061.0,4.0,9188876.0,809901.667647,9882838.0,6781893.0,0.0,0,BENIGN,
2,92,6,95628949,21,20,20.0,0.0,1.0,0.0,0.952381,...,15367.0,14798.0,10014180.0,5184.077926,10016037.0,10000366.0,0.0,0,BENIGN,
3,93,6,95613243,21,20,20.0,0.0,1.0,0.0,0.952381,...,26534.0,26473.0,10004310.0,21.430119,10004365.0,10004289.0,0.0,0,BENIGN,
4,94,6,95597710,21,20,20.0,0.0,1.0,0.0,0.952381,...,26522.0,26437.0,10006080.0,5194.514136,10019931.0,10004309.0,0.0,0,BENIGN,


In [5]:
# Preview the first five rows of the day-2 extract.
df2.head(n=5)

Unnamed: 0.1,Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label
0,35,6,1,2,0,12.0,0.0,6.0,6.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN
1,60,6,47,2,0,12.0,0.0,6.0,6.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,BENIGN
2,61,6,1,2,0,12.0,0.0,6.0,6.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN
3,119,6,8001234,4,0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,8001232.0,0.0,8001232.0,8001232.0,0.0,1,BENIGN
4,154,6,112729696,13,6,368.0,196.0,46.0,6.0,28.307692,...,82.024387,62271.0,62155.0,56301460.5,3535899.0,58801719.0,53801202.0,0.0,0,BENIGN


In [6]:
# Per-label row counts of each reloaded extract, day 1 first.
for day_frame in (df1, df2):
    print(day_frame[" Label"].value_counts())

DrDoS_UDP        600001
DrDoS_SSDP       600000
TFTP             600000
DrDoS_NTP        599999
DrDoS_MSSQL      193584
Syn              155495
DrDoS_SNMP       111993
DrDoS_DNS        108065
UDP-lag           89032
BENIGN            50111
DrDoS_LDAP        28839
DrDoS_NetBIOS     17953
WebDDoS             297
Name:  Label, dtype: int64
UDP        656477
Syn        531997
MSSQL      258469
BENIGN      44896
LDAP        17042
NetBIOS     11310
Portmap      1598
UDPLag        455
Name:  Label, dtype: int64


Cleaning

We will now balance the samples across classes and create our binary and multiclass datasets.

In [7]:
# Standardize label names across both days (strip the "DrDoS_" prefix on day 1,
# rename "UDPLag" to "UDP-lag" on day 2), then list the labels present in each frame.
df1[" Label"] = df1[" Label"].map(lambda label: label.replace("DrDoS_", ""))
df2[" Label"] = df2[" Label"].map(lambda label: label.replace("UDPLag", "UDP-lag"))

cols1 = df1[" Label"].unique()
cols2 = df2[" Label"].unique()
print(cols1)
print(cols2)

['BENIGN' 'DNS' 'LDAP' 'MSSQL' 'NetBIOS' 'NTP' 'SNMP' 'SSDP' 'UDP' 'Syn'
 'TFTP' 'UDP-lag' 'WebDDoS']
['BENIGN' 'NetBIOS' 'LDAP' 'MSSQL' 'Portmap' 'Syn' 'UDP' 'UDP-lag']


In [8]:
# Remove one leftover index column from each frame (the name differs per day).
df1 = df1.drop(columns=["Unnamed: 0.1"])
df2 = df2.drop(columns=["Unnamed: 0"])

In [9]:
# Remove exact duplicate rows within each day (first occurrence is kept).
df1, df2 = df1.drop_duplicates(keep="first"), df2.drop_duplicates(keep="first")

In [10]:
# Per-label row counts after deduplication, day 1 first.
for day_frame in (df1, df2):
    print(day_frame[" Label"].value_counts())

SSDP       600000
UDP        600000
TFTP       600000
NTP        599998
MSSQL      193584
Syn        155495
SNMP       111988
DNS        108065
UDP-lag     89032
BENIGN      49302
LDAP        28839
NetBIOS     17953
WebDDoS       297
Name:  Label, dtype: int64
UDP        647996
Syn        525832
MSSQL      254092
BENIGN      43575
LDAP        16568
NetBIOS     10231
Portmap      1598
UDP-lag       455
Name:  Label, dtype: int64


In [11]:
# Stack both days row-wise; the original index values are kept (no ignore_index).
merge_df = pd.concat([df1, df2], axis=0)

In [12]:
# Per-label row counts of the merged frame.
merged_label_counts = merge_df[" Label"].value_counts()
print(merged_label_counts)

UDP        1247996
Syn         681327
SSDP        600000
TFTP        600000
NTP         599998
MSSQL       447676
SNMP        111988
DNS         108065
BENIGN       92877
UDP-lag      89487
LDAP         45407
NetBIOS      28184
Portmap       1598
WebDDoS        297
Name:  Label, dtype: int64


In [13]:
# Rows that appear in both days are duplicates too; keep the first occurrence and recount.
merge_df = merge_df.drop_duplicates(keep="first")
deduped_label_counts = merge_df[" Label"].value_counts()
print(deduped_label_counts)

UDP        1200195
Syn         681311
SSDP        600000
TFTP        600000
NTP         599998
MSSQL       349292
SNMP        111988
DNS         108065
BENIGN       92738
UDP-lag      89411
LDAP         38114
NetBIOS      25572
Portmap       1598
WebDDoS        297
Name:  Label, dtype: int64


In [26]:
# Assemble an equal number of samples per attack class, then split every class
# sample into a binary part and a multiclass part.
#
# Changes compared with the earlier version of this cell (selected rows are identical):
#   * the builtin `slice` is no longer rebound at notebook scope,
#   * the magic numbers are named,
#   * the parts are collected in lists and concatenated once.
SAMPLES_PER_CLASS = 23186          # rows drawn per attack class
BINARY_SAMPLES_PER_CLASS = 7729    # first N drawn rows go to the binary set, the rest to multiclass
# NOTE(review): 7729 is about ceil(92738 BENIGN rows / 12 attack classes) - presumably chosen
# so that the attacks together balance BENIGN in the binary set; confirm.
PORTMAP_COPIES = 16                # 1598 Portmap rows * 16 = 25568 >= SAMPLES_PER_CLASS

bin_parts = []
mult_parts = []
ben = None
for val in merge_df[" Label"].unique():

    # too small, so we drop it
    if val == 'WebDDoS':
        continue

    class_rows = merge_df[merge_df[" Label"] == val].copy()

    # Retain benign data separately; it is appended to the binary set in full.
    if val == "BENIGN":
        ben = class_rows
        continue

    # Portmap is too small, so it is padded by repeating its rows before sampling.
    # NOTE(review): the repeated rows can land in both the train and the test split
    # later on - confirm this oversampling is acceptable.
    if val == "Portmap":
        class_rows = pd.concat([class_rows] * PORTMAP_COPIES)

    # Ensure equal sample number
    picked = class_rows.sample(n=SAMPLES_PER_CLASS, random_state=42)

    bin_parts.append(picked.iloc[:BINARY_SAMPLES_PER_CLASS, :])
    mult_parts.append(picked.iloc[BINARY_SAMPLES_PER_CLASS:, :])

outMult = pd.concat(mult_parts)
outBin = pd.concat(bin_parts + [ben])
print(outMult[" Label"].value_counts())
print(outBin[" Label"].value_counts())


DNS        15457
LDAP       15457
MSSQL      15457
NetBIOS    15457
NTP        15457
SNMP       15457
SSDP       15457
UDP        15457
Syn        15457
TFTP       15457
UDP-lag    15457
Portmap    15457
Name:  Label, dtype: int64
BENIGN     92738
DNS         7729
LDAP        7729
MSSQL       7729
NetBIOS     7729
NTP         7729
SNMP        7729
SSDP        7729
UDP         7729
Syn         7729
TFTP        7729
UDP-lag     7729
Portmap     7729
Name:  Label, dtype: int64


In [32]:
# Drop every pandas-generated index column, not just 'Unnamed: 0': the df1.head()/df2.head()
# previews also show 'Unnamed: 0.1' and 'Unnamed: 0.2', which would otherwise stay in the
# datasets as features. Selecting by prefix also makes this cell safe to re-run.
outBin = outBin.drop(columns=[col for col in outBin.columns if str(col).startswith("Unnamed")])
outMult = outMult.drop(columns=[col for col in outMult.columns if str(col).startswith("Unnamed")])

Binary dataset will be binarized later.

In [33]:
from sklearn.model_selection import train_test_split

# Separate features and labels so the split can be stratified on the label.
binary_y = outBin[" Label"].copy()
binary_x = outBin.drop([" Label"], axis=1)

multiclass_y = outMult[" Label"].copy()
multiclass_x = outMult.drop([" Label"], axis=1)

# 80/20 stratified split of the binary dataset.
X_train, X_test, y_train, y_test = train_test_split(binary_x, binary_y, test_size=0.20, random_state=42, stratify = binary_y)
# Re-attach the labels on NEW frames. Assigning into X_train/X_test through an alias
# silently put the label column into the feature frames (and can trigger pandas'
# SettingWithCopyWarning); .assign leaves the feature frames untouched.
binary_train = X_train.assign(**{" Label": y_train.values})
binary_test = X_test.assign(**{" Label": y_test.values})

# Same 80/20 stratified split for the multiclass dataset.
X_train, X_test, y_train, y_test = train_test_split(multiclass_x, multiclass_y, test_size=0.20, random_state=42, stratify = multiclass_y)
multiclass_train = X_train.assign(**{" Label": y_train.values})
multiclass_test = X_test.assign(**{" Label": y_test.values})

In [34]:
# Per-label row counts of every split: binary train/test, then multiclass train/test.
for split_frame in (binary_train, binary_test, multiclass_train, multiclass_test):
    print(split_frame[" Label"].value_counts())

BENIGN     74190
TFTP        6184
UDP         6184
NetBIOS     6183
SSDP        6183
SNMP        6183
NTP         6183
Portmap     6183
DNS         6183
LDAP        6183
MSSQL       6183
UDP-lag     6183
Syn         6183
Name:  Label, dtype: int64
BENIGN     18548
UDP-lag     1546
MSSQL       1546
LDAP        1546
Syn         1546
Portmap     1546
NTP         1546
SSDP        1546
DNS         1546
SNMP        1546
NetBIOS     1546
TFTP        1545
UDP         1545
Name:  Label, dtype: int64
TFTP       12366
MSSQL      12366
DNS        12366
LDAP       12366
Syn        12366
Portmap    12366
UDP        12366
NTP        12365
UDP-lag    12365
SSDP       12365
NetBIOS    12365
SNMP       12365
Name:  Label, dtype: int64
UDP-lag    3092
SSDP       3092
SNMP       3092
NTP        3092
NetBIOS    3092
Portmap    3091
TFTP       3091
DNS        3091
Syn        3091
LDAP       3091
MSSQL      3091
UDP        3091
Name:  Label, dtype: int64


In [38]:
# Persist the four splits to the working directory, without the index.
for split_frame, split_file in (
    (binary_train, "binary_train.csv"),
    (binary_test, "binary_test.csv"),
    (multiclass_train, "multiclass_train.csv"),
    (multiclass_test, "multiclass_test.csv"),
):
    split_frame.to_csv(split_file, index=False)