In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd # get dask with: pip install "dask[complete]"
from os import listdir
from os.path import isfile, join
import os

In [37]:
# The notebook is expected to sit in the dataset root, next to one sub-folder per capture day.
# Paths are built with '/' explicitly (not os.path.join) so later code can split on '/'.
path1 = "/".join((os.getcwd(), "01-12"))
path2 = "/".join((os.getcwd(), "03-11"))

In [38]:
# Full path of every regular file found directly inside each day's folder (sub-folders are skipped).
fileList = [f"{path1}/{name}" for name in listdir(path1) if isfile(join(path1, name))]
fileList2 = [f"{path2}/{name}" for name in listdir(path2) if isfile(join(path2, name))]

In [46]:
def _short_name(path):
    """Return the trailing '/<folder>/<file>' part of `path`, used only for progress messages."""
    last = path.rfind('/')
    prev = path.rfind('/', 0, last) if last > 0 else -1
    # Fall back to the whole path when it does not contain two '/' separators.
    return path[prev:] if prev != -1 else path


def extract(fileList, random_state=None):
    """
    Extract benign and attack data for one day and combine them into a single frame.

    The work is done in two passes over the CSV files in `fileList`:

    1. Every row whose ' Label' is 'BENIGN' is collected and the total is counted.
    2. For every other label in every file, all rows are kept when the file holds at most
       as many rows of that label as the benign total; otherwise a random fraction
       ``benign_total / label_rows`` of them is sampled.

    Needs Dask due to how large many of the datasets are.

    Notes
    -----
    * Sampling is done by fraction, so the per-label totals are only approximately equal
      to the benign total; exact equalisation needs post-processing by the caller.
    * Balancing is applied per file. A label that occurs in several files contributes up
      to the benign total from *each* of them, so its overall count can exceed it.
    * Progress messages and the final label counts are printed to stdout; printing the
      final counts triggers a computation of the combined frame.

    Parameters
    ----------
    fileList : list of str
        Paths of the CSV files to read. Each file must contain a ' Label' column.
    random_state : int, optional
        Seed forwarded to the sampling step. Leave as None for a different sample on
        every run (the previous behaviour); pass an int for reproducible output.

    Returns
    -------
    dask.dataframe.DataFrame
        Lazy frame holding the benign rows of all files followed by the selected attack rows.

    Raises
    ------
    ValueError
        If `fileList` is empty.
    """
    if not fileList:
        raise ValueError("fileList is empty: no CSV files to extract from.")

    # Open each file once and reuse the lazy frame in both passes.
    frames = [dd.read_csv(f, dtype={'SimillarHTTP': 'object'}) for f in fileList]

    print("Beginning extraction of benign data...")
    # First pass aggregates BENIGN data.
    benign_parts = []
    for f, df in zip(fileList, frames):
        print(_short_name(f))
        benign_parts.append(df[df[' Label'] == 'BENIGN'])
    benign = dd.concat(benign_parts)

    # Get the total count of BENIGN samples that we've aggregated.
    ben_num = int(benign.shape[0].compute())
    print(f"Number of Benign samples aggregated: {ben_num}")

    print("Beginning extraction of attack data...")

    # Second pass pulls out attack data, capped (approximately) at the benign count per file and label.
    parts = [benign]
    for f, df in zip(fileList, frames):
        print(_short_name(f))

        # One pass over the label column yields both the distinct labels and their row counts.
        label_counts = df[' Label'].value_counts().compute()

        for cat, cat_num in label_counts.items():
            if cat == "BENIGN":
                continue

            print(f"Appending data for {cat}...")
            cat_df = df[df[' Label'] == cat]
            cat_num = int(cat_num)

            # Down-sample only when this file has more rows of the label than the benign total;
            # otherwise keep everything.
            if cat_num > ben_num:
                cat_df = cat_df.sample(frac=ben_num / cat_num, random_state=random_state)

            parts.append(cat_df)

    # Concatenate once instead of growing the frame inside the loops.
    outFrame = dd.concat(parts)

    # Return our aggregated DataFrame
    print(f"Aggregated df stats: {outFrame[' Label'].value_counts().compute()}\n")
    print("Extraction completed.")
    return outFrame
    


In [47]:
# Files from the 01-12 folder form the training set, files from the 03-11 folder the test set.
train, test = (extract(day_files) for day_files in (fileList, fileList2))

Beginning extraction...
/01-12/DrDoS_DNS.csv
DrDoS_DNS    5071011
BENIGN          3402
Name:  Label, dtype: int64

Aggregated df stats: BENIGN    3402
Name:  Label, dtype: int64

/01-12/DrDoS_LDAP.csv
DrDoS_LDAP    2179930
BENIGN           1612
Name:  Label, dtype: int64

Aggregated df stats: BENIGN    5014
Name:  Label, dtype: int64

/01-12/DrDoS_MSSQL.csv
DrDoS_MSSQL    4522492
BENIGN            2006
Name:  Label, dtype: int64

Aggregated df stats: BENIGN    7020
Name:  Label, dtype: int64

/01-12/DrDoS_NetBIOS.csv
DrDoS_NetBIOS    4093279
BENIGN              1707
Name:  Label, dtype: int64

Aggregated df stats: BENIGN    8727
Name:  Label, dtype: int64

/01-12/DrDoS_NTP.csv
DrDoS_NTP    1202642
BENIGN         14365
Name:  Label, dtype: int64

Aggregated df stats: BENIGN    23092
Name:  Label, dtype: int64

/01-12/DrDoS_SNMP.csv
DrDoS_SNMP    5159870
BENIGN           1507
Name:  Label, dtype: int64

Aggregated df stats: BENIGN    24599
Name:  Label, dtype: int64

/01-12/DrDoS_SSDP.cs

In [50]:
# Materialise the lazy training frame into one CSV file rather than one file per partition.
train.to_csv("day1.csv", single_file=True)

['c:\\Users\\icarus\\Documents\\school\\Fall 2022\\CSI 5388\\dataset\\day1.csv']

In [49]:
# Materialise the lazy test frame into one CSV file rather than one file per partition.
test.to_csv("day2.csv", single_file=True)

['c:\\Users\\icarus\\Documents\\school\\Fall 2022\\CSI 5388\\dataset\\day2.csv']

In [2]:
# Load the two exported CSV files back into memory as pandas DataFrames.
df1, df2 = (pd.read_csv(csv_name) for csv_name in ("day1.csv", "day2.csv"))

  df1 = pd.read_csv("day1.csv")
  df2 = pd.read_csv("day2.csv")


In [5]:
# Show how many rows each label has in day 1 and then day 2, to check the class balance.
for day_frame in (df1, df2):
    print(day_frame[" Label"].value_counts())

DrDoS_UDP        56866
DrDoS_DNS        56865
DrDoS_MSSQL      56865
Syn              56864
BENIGN           56863
DrDoS_SNMP       56863
TFTP             56863
UDP-lag          56863
DrDoS_LDAP       56862
DrDoS_SSDP       56862
DrDoS_NetBIOS    56861
DrDoS_NTP        56861
WebDDoS            439
Name:  Label, dtype: int64
Syn        113931
NetBIOS    113929
UDP        113928
MSSQL       81357
LDAP        66896
BENIGN      56965
Portmap     56965
UDPLag       1873
Name:  Label, dtype: int64
