In [10]:
import numpy as np
import pandas as pd
import dask.dataframe as dd # get dask with: pip install "dask[complete]"
from os import listdir
from os.path import isfile, join
import os

Since the original dataset files are too big to be loaded into memory, and since the benign cases are spread out across all the parts, we extract what we need from all of them to prepare our extract.

In [11]:
# Assumes we've placed program in root of dataset folder, where both days are subdirectories.
path1 = os.getcwd() + "/01-12"
path2 = os.getcwd() + "/03-11"

In [12]:
# Get list of files in each directory.
fileList = [path1 + '/' + f for f in listdir(path1) if isfile(join(path1, f))]
fileList2 = [path2 + '/' + f for f in listdir(path2) if isfile(join(path2, f))]

In [13]:
"""
This function extracts an equal amount of benign and attack data for each day and forms the train/test sets.

Needs Dask due to how large many of the datasets are.
"""
def extract(fileList):

    print("Beginning extraction of benign data...")
    # First pass aggregates BENIGN data.
    i = 0
    outFrame = None
    for f in fileList:
        df = dd.read_csv(f, dtype={'SimillarHTTP': 'object'})
        lind = f.rfind('/')
        slind = f.rfind('/', 0,lind-1)
        print(f[slind:])
        #print(f"{df[' Label'].value_counts().compute()}\n")

        # Pull benign samples out and get their quantity.
        ben_df = df[df[' Label'] == 'BENIGN']

        if i == 0:
            outFrame = ben_df
            i += 1

        else:    
            outFrame = dd.concat([outFrame, ben_df])

        #print(f"Aggregated df stats: {outFrame[' Label'].value_counts().compute()}\n")

    # Get the total count of BENIGN samples that we've aggregated.
    shben = outFrame.shape
    ben_num = shben[0].compute()
    print(f"Number of Benign samples aggregated: {ben_num}")

    # Get an upper bound on the number of attack samples to extract. Goal is to get enough to be able to make them all equal after cleaning.
    ben_num = 100000
    
    print("Beginning extraction of attack data...")

    # Second pass pulls out attack data equal in quantity to the count of benign data that we've gathered.
    for f in fileList:
        df = dd.read_csv(f, dtype={'SimillarHTTP': 'object'})
        lind = f.rfind('/')
        slind = f.rfind('/', 0,lind-1)
        print(f[slind:])
  
        # Get list of label categories
        cats = list(set(list(df[' Label'])))

        # Go through non-benign categories and add a subsample equal in quantity to the number of benign samples.
        for cat in cats:

            if cat != "BENIGN":

                print(f"Appending data for {cat}...")
                # Find our how many samples we have
                cat_df = df[df[' Label'] == cat]
                shcat = cat_df.shape
                cat_num = shcat[0].compute()

                # Get a sample if there is equal/more attack data, otherwise add everything.
                # dask only does approximate sampling, so we'll have to do some post-processing to make the numbers exactly equal.
                if cat_num > ben_num:
                    ret_df = cat_df.sample(frac = ben_num / cat_num)

                else:
                    ret_df = cat_df

                outFrame = dd.concat([outFrame, ret_df])
                #print(f"Aggregated df stats: {outFrame[' Label'].value_counts().compute()}\n")

    # Return our aggregated DataFrame
    print(f"Aggregated df stats: {outFrame[' Label'].value_counts().compute()}\n")
    print("Extraction completed.")
    return outFrame
    


In [14]:
train = extract(fileList)
test = extract(fileList2)

Beginning extraction of benign data...
/01-12/DrDoS_DNS.csv
/01-12/DrDoS_LDAP.csv
/01-12/DrDoS_MSSQL.csv
/01-12/DrDoS_NetBIOS.csv
/01-12/DrDoS_NTP.csv
/01-12/DrDoS_SNMP.csv
/01-12/DrDoS_SSDP.csv
/01-12/DrDoS_UDP.csv
/01-12/Syn.csv
/01-12/TFTP.csv
/01-12/UDPLag.csv
Number of Benign samples aggregated: 56863
Beginning extraction of attack data...
/01-12/DrDoS_DNS.csv
Appending data for DrDoS_DNS...
/01-12/DrDoS_LDAP.csv
Appending data for DrDoS_LDAP...
/01-12/DrDoS_MSSQL.csv
Appending data for DrDoS_MSSQL...
/01-12/DrDoS_NetBIOS.csv
Appending data for DrDoS_NetBIOS...
/01-12/DrDoS_NTP.csv
Appending data for DrDoS_NTP...
/01-12/DrDoS_SNMP.csv
Appending data for DrDoS_SNMP...
/01-12/DrDoS_SSDP.csv
Appending data for DrDoS_SSDP...
/01-12/DrDoS_UDP.csv
Appending data for DrDoS_UDP...
/01-12/Syn.csv
Appending data for Syn...
/01-12/TFTP.csv
Appending data for TFTP...
/01-12/UDPLag.csv
Appending data for UDP-lag...
Appending data for WebDDoS...
Aggregated df stats: DrDoS_DNS        100001
DrDo

Save extracts, then reload them to finish processing.

In [15]:
dd.to_csv(df = train, filename = "day1_v2.csv", single_file=True)

['c:\\Users\\icarus\\Documents\\school\\Fall 2022\\CSI 5388\\dataset\\day1_v2.csv']

In [16]:
dd.to_csv(df = test, filename = "day2_v2.csv", single_file=True)

['c:\\Users\\icarus\\Documents\\school\\Fall 2022\\CSI 5388\\dataset\\day2_v2.csv']

In [21]:
df1 = pd.read_csv("day1_v2.csv")
df2 = pd.read_csv("day2_v2.csv")

  df1 = pd.read_csv("day1_v2.csv")
  df2 = pd.read_csv("day2_v2.csv")


In [22]:
print(df1[" Label"].value_counts())
print(df2[" Label"].value_counts())

DrDoS_DNS        100001
DrDoS_LDAP       100001
DrDoS_SNMP       100001
DrDoS_SSDP       100001
DrDoS_MSSQL      100000
DrDoS_NetBIOS    100000
DrDoS_NTP        100000
Syn              100000
UDP-lag          100000
DrDoS_UDP         99999
TFTP              99999
BENIGN            56863
WebDDoS             439
Name:  Label, dtype: int64
Syn        200005
UDP        199999
NetBIOS    199998
MSSQL      124392
LDAP       109932
Portmap    100000
BENIGN      56965
UDPLag       1873
Name:  Label, dtype: int64


Cleaning

In [23]:
# Remove socket information and timestamp
drop_cols = ['Flow ID', ' Source IP', ' Source Port', ' Destination IP',' Destination Port', ' Timestamp']

df1 = df1.drop(drop_cols, axis=1)
df2 = df2.drop(drop_cols, axis=1)

# remove infinity values
df1 = df1.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

# replace invalid SimilarHTTP values with NaN
for col in df1.columns:
    if col != " Label":
        df1[col] = pd.to_numeric(df1[col], errors='coerce')

# Repeat with multiclass dataset
df2 = df2.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

for col in df2.columns:
    if col != " Label":
        df2[col] = pd.to_numeric(df2[col], errors='coerce')

# drop remaining NaN values from both
df1 = df1.dropna(axis=0)
df2 = df2.dropna(axis=0)

In [24]:
print(df1[" Label"].value_counts())
print(df2[" Label"].value_counts())

DrDoS_SNMP       99788
DrDoS_NTP        99404
DrDoS_UDP        98687
DrDoS_SSDP       98400
DrDoS_LDAP       98268
DrDoS_MSSQL      97266
TFTP             97162
DrDoS_DNS        96801
DrDoS_NetBIOS    96793
UDP-lag          90049
Syn              87308
BENIGN           54369
WebDDoS            322
Name:  Label, dtype: int64
UDP        197434
NetBIOS    190701
Syn        184986
MSSQL      120069
LDAP       107638
Portmap     94849
BENIGN      54581
UDPLag       1873
Name:  Label, dtype: int64


We will now balance the samples and create our binary and multiclass datasets

In [25]:
# Get benign samples from Day 2 and cull them to the lowest number of attack samples found in Day 1 (excluding webddos)
day2ben = df2[df2[" Label"] == "BENIGN"].copy()
day2ben = day2ben.sample(n=54581,random_state=42)

In [26]:
# get list of labels so we can pull an equal number of samples from each.
cols = df1[" Label"].unique()
print(df1[" Label"].unique())

['BENIGN' 'DrDoS_DNS' 'DrDoS_LDAP' 'DrDoS_MSSQL' 'DrDoS_NetBIOS'
 'DrDoS_NTP' 'DrDoS_SNMP' 'DrDoS_SSDP' 'DrDoS_UDP' 'Syn' 'TFTP' 'UDP-lag'
 'WebDDoS']


In [27]:
# Assemble equal number of samples. Will be restructured in multiclass and binary datasets.
outMult = None

for val in cols:

    # too small, so we drop it
    if val == 'WebDDoS':
        continue

    # replace benign with day 2.
    elif val == "BENIGN":
        slice = day2ben

    # get all attacks of this type and cull to appropriate number
    else:
        slice = df1[df1[" Label"] == val].copy()
        slice = slice.sample(n=54581,random_state=42) # Ensure equal sample number

    # add to output
    if outMult is None:
        outMult = slice
    else:
        outMult = pd.concat([outMult, slice])

print(outMult[" Label"].value_counts())

BENIGN           54581
DrDoS_DNS        54581
DrDoS_LDAP       54581
DrDoS_MSSQL      54581
DrDoS_NetBIOS    54581
DrDoS_NTP        54581
DrDoS_SNMP       54581
DrDoS_SSDP       54581
DrDoS_UDP        54581
Syn              54581
TFTP             54581
UDP-lag          54581
Name:  Label, dtype: int64


In [28]:
# Drop the benign samples to get our multiclass dataset.
multiclass = outMult[outMult[" Label"] != "BENIGN"]

# Remove the annoying DrDOS prefix from labels.
multiclass[" Label"] = multiclass[" Label"].apply(lambda x: x.replace("DrDoS_", ""))

print(multiclass[" Label"].value_counts())

DNS        54581
LDAP       54581
MSSQL      54581
NetBIOS    54581
NTP        54581
SNMP       54581
SSDP       54581
UDP        54581
Syn        54581
TFTP       54581
UDP-lag    54581
Name:  Label, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiclass[" Label"] = multiclass[" Label"].apply(lambda x: x.replace("DrDoS_", ""))


In [29]:
# binary uses all columns
binary = outMult.copy()

# Remove the annoying DrDOS prefix from labels.
binary[" Label"] = binary[" Label"].apply(lambda x: x.replace("DrDoS_", ""))

print(binary[" Label"].value_counts())

BENIGN     54581
DNS        54581
LDAP       54581
MSSQL      54581
NetBIOS    54581
NTP        54581
SNMP       54581
SSDP       54581
UDP        54581
Syn        54581
TFTP       54581
UDP-lag    54581
Name:  Label, dtype: int64


Binary dataset will be binarized later.

In [30]:
# get attack labels
cols = binary[" Label"].unique()

# make new attack label comprised of equal subsets of attacks.
targ = int(54581 / 11)

outBin = None

for val in cols:

    # pull category
    slice = binary[binary[" Label"] == val].copy()

    # binary is equal to subset * 11 so it's equal.
    if val == "BENIGN":
        slice = slice.sample(n=targ * 11,random_state=42) # Ensure equal sample number

    # get subset and covert to attack type.
    else:
        slice = slice.sample(n=targ,random_state=42) # Ensure equal sample number
        #slice[" Label"] = slice[" Label"].apply(lambda x: "ATTACK")

    # add to output
    if outBin is None:
        outBin = slice
    else:
        outBin = pd.concat([outBin, slice])

print(outBin[" Label"].value_counts())

BENIGN     54571
DNS         4961
LDAP        4961
MSSQL       4961
NetBIOS     4961
NTP         4961
SNMP        4961
SSDP        4961
UDP         4961
Syn         4961
TFTP        4961
UDP-lag     4961
Name:  Label, dtype: int64


In [37]:
outBin = outBin.drop(["Unnamed: 0.1", "Unnamed: 0"], axis= 1)
outBin.head()

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label
11664,6,119360531,136,213,41528.0,155400.0,2955.0,0.0,305.352941,610.445587,...,1522822.0,2403023.0,249428.0,58229360.5,565451.372605,58629195.0,57829526.0,0.0,0,BENIGN
12455,6,118838133,40,43,4216.0,16444.0,696.0,0.0,105.4,188.610683,...,804055.4,1152339.0,15233.0,58827757.0,223646.561181,58985899.0,58669615.0,0.0,0,BENIGN
21420,6,108,1,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,BENIGN
36079,17,21925,2,2,62.0,94.0,31.0,31.0,31.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN
27929,6,141,1,2,6.0,12.0,6.0,6.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,BENIGN


In [35]:
multiclass = multiclass.drop(["Unnamed: 0.1", "Unnamed: 0"], axis= 1)
multiclass.head()

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label
144777,17,1,2,0,768.0,0.0,384.0,384.0,384.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DNS
139287,17,1,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DNS
119172,17,1,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DNS
113293,17,2,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DNS
62213,17,1,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DNS


In [38]:
multiclass.to_csv("multiclass.csv", index=False)
outBin.to_csv("binary.csv", index=False)

In [40]:
print(outBin[" Label"].value_counts())
print(multiclass[" Label"].value_counts())

BENIGN     54571
DNS         4961
LDAP        4961
MSSQL       4961
NetBIOS     4961
NTP         4961
SNMP        4961
SSDP        4961
UDP         4961
Syn         4961
TFTP        4961
UDP-lag     4961
Name:  Label, dtype: int64
DNS        54581
LDAP       54581
MSSQL      54581
NetBIOS    54581
NTP        54581
SNMP       54581
SSDP       54581
UDP        54581
Syn        54581
TFTP       54581
UDP-lag    54581
Name:  Label, dtype: int64
