In [6]:
#############################################################################################
# Notebook to be used for combining, downsampling, creating binary and/or multiclass files, #
# and creating csv and/or pickle files.                                                     #
#############################################################################################

In [7]:
# Import libraries
import numpy as np
import pandas as pd
import os
import sys
from sklearn.utils import shuffle

In [8]:
# List of cleaned data files to combine. Each name is joined onto the processed-data
# folder when it is read, so give file names only (no directory part).
# CONFIG NEEDED: Uncomment only the specific files to be processed on your node.
# GOTCHA: every active entry except the very last one must end with a comma. Two
# adjacent string literals without a comma are silently concatenated by Python into
# one (non-existent) file name - check this when uncommenting the last line of a group.

cleaned_csv_files = [
#  '02-14-2018-bruteforce-ftp-ssh.csv',
#  '02-15-2018-dos-goldeneye-slowloris.csv',
#  '02-16-2018-dos-slowhttp-hulk.csv',
#  '02-21-2018-ddos-loic-udp.csv',
#  '02-22-2018-bruteforce-webxss.csv',
#  '02-23-2018-bruteforce-webxss-sql.csv',
#  '02-28-2018-infiltration.csv',
#  '03-01-2018-botnet.csv',
#  '03-02-2018-infiltration.csv',
#  '02-20-2018-ddos-loic-tcp.csv'   # WARNING: 4GB FILE.

# Can also be used to combine the stratified test or train files as below
 'test-02-14-2018-bruteforce-ftp-ssh.csv',
 'test-02-15-2018-dos-goldeneye-slowloris.csv',
 'test-02-16-2018-dos-slowhttp-hulk.csv',
 'test-02-21-2018-ddos-loic-udp.csv',
 'test-02-22-2018-bruteforce-webxss.csv',
 'test-02-23-2018-bruteforce-webxss-sql.csv',
 'test-02-28-2018-infiltration.csv',
 'test-03-01-2018-botnet.csv',
 'test-03-02-2018-infiltration.csv',
 'test-02-20-2018-ddos-loic-tcp.csv'   # WARNING: LARGE FILE.
]


# CONFIG NEEDED: Change Binary and Multi-class output file names if needed.
# These are base names only: the '.csv' / '.pickle' extension is appended when the
# files are written, and they are created inside the processed-data folder.
multi_class_file = 'COORD-TEST-IDS-2018-multiclass'
binary_class_file = 'COORD-TEST-IDS-2018-binaryclass'

In [9]:
# Set the folder names for raw data and processed files, relative to this notebook
# CONFIG NEEDED: Change '../data' and 'processed' to what you named your directories
# Raw Data Files Location: final_project/data
# Processed Data Files Location: final_project/data/processed
# Input files are read from, and output files written to, the processed folder.

rawdata_path = '../data'
processed_path = os.path.join(rawdata_path, 'processed')

# CONFIG NEEDED: Change to True if you want to downsample data and update the sample size as needed
# When enabled, each input file is downsampled per 'label' value: a label group
# larger than sample_size is cut to sample_size rows, smaller groups are kept whole.
down_sample = False
sample_size = 10000

# CONFIG NEEDED: Change to True as needed for multi-class or binary class files.
# Note: at least one of these has to be True for a combined data file to be created.
multi_class = True
binary_class = True

# CONFIG NEEDED: Change to True if you want to create a pickle file.
# The pickle is written in addition to the CSV, not instead of it.
create_pickle = False

In [10]:
# Return at most `sample_size` rows of a frame (applied to each label group when downsampling)
def get_samples(x, n=None, random_state=None):
    """Return a random sample of at most ``n`` rows from ``x``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to sample from. The notebook passes one ``label`` group at a time
        via ``groupby(...).apply(get_samples)``.
    n : int, optional
        Maximum number of rows to keep. When omitted, the notebook-level
        ``sample_size`` setting is used, which is the original behaviour.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` so a selection can be reproduced.
        ``None`` (the default) keeps the original unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        ``x`` itself (not a copy) when it has fewer than ``n`` rows, otherwise
        ``n`` rows drawn from it.
    """
    # Only reading the module-level setting, so no ``global`` statement is needed.
    # It is looked up at call time, so editing the config cell takes effect on re-run.
    limit = sample_size if n is None else n

    # Groups that are already small enough are passed through untouched.
    if limit > x.shape[0]:
        return x
    return x.sample(limit, random_state=random_state)

In [11]:
# Read every configured file, downsample it if requested, and combine the results.
# The frames are collected in a list and concatenated once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and appending
# inside the loop re-copied the whole accumulated frame on every iteration.
if not cleaned_csv_files:
    raise ValueError('cleaned_csv_files is empty: uncomment at least one file to process')

frames = []      # one (possibly downsampled) frame per input file
total_rows = 0   # running row count, reported after each file

for position, name in enumerate(cleaned_csv_files):
    fname = os.path.join(processed_path, name)
    print('reading:' if position == 0 else 'appending:', fname)
    df1 = pd.read_csv(fname)

    # If downsampling is required, cap every label group at sample_size rows
    if down_sample:
        df1 = df1.groupby('label', as_index=False, group_keys=False).apply(get_samples)
        # Print name if downsampled
        print('downsampled:', fname)

    frames.append(df1)
    total_rows += df1.shape[0]

    # Check if created/downsampled correctly. The column count shown is that of the
    # file just read; the exact combined shape is printed after concatenation below.
    if position > 0:
        print('updated datafile shape')
    print((total_rows, df1.shape[1]))

# Single concatenation; ignore_index gives the combined frame a fresh 0..n-1 index
df = pd.concat(frames, ignore_index=True)
del frames  # drop the extra references so the per-file frames can be freed

# Shuffle the data records and print final shape.
# NOTE: no random_state is passed, so the row order differs from run to run.
df = shuffle(df)
print('final datafile')
print(df.shape)

reading: ../data/processed/test-02-14-2018-bruteforce-ftp-ssh.csv
(104703, 80)
appending: ../data/processed/test-02-15-2018-dos-goldeneye-slowloris.csv
updated datafile shape
(209250, 80)
appending: ../data/processed/test-02-16-2018-dos-slowhttp-hulk.csv
updated datafile shape
(314108, 80)
appending: ../data/processed/test-02-21-2018-ddos-loic-udp.csv
updated datafile shape
(418966, 80)
appending: ../data/processed/test-02-22-2018-bruteforce-webxss.csv
updated datafile shape
(523620, 80)
appending: ../data/processed/test-02-23-2018-bruteforce-webxss-sql.csv
updated datafile shape
(628283, 80)
appending: ../data/processed/test-02-28-2018-infiltration.csv
updated datafile shape
(689378, 80)
appending: ../data/processed/test-03-01-2018-botnet.csv
updated datafile shape
(722380, 80)
appending: ../data/processed/test-03-02-2018-infiltration.csv
updated datafile shape
(827089, 80)
appending: ../data/processed/test-02-20-2018-ddos-loic-tcp.csv
updated datafile shape
(1619696, 80)
final datafi

In [12]:
# Write the combined data with its original (multi-class) labels
if multi_class:
    print('creating multi-class file')

    # Base path without extension; the CSV is always written
    outFile = os.path.join(processed_path, multi_class_file)
    csv_target = outFile + '.csv'
    df.to_csv(csv_target, index=False)

    # The pickle copy is optional and only produced on request
    if create_pickle:
        pickle_target = outFile + '.pickle'
        df.to_pickle(pickle_target)

creating multi-class file


In [13]:
# Create a binary-class label file
if binary_class:
    print('creating binary-class file')

    # A missing label would compare as "not Benign" below and be counted as an
    # attack without any warning, so stop with a clear message instead.
    if df['label'].isna().any():
        raise ValueError('label column contains missing values; cannot derive binary labels')

    # Map benign rows to 0, all others as 1. Comparing against 'Benign' replaces the
    # hand-maintained dictionary of attack names, which failed on any label not listed.
    # The result goes into a new frame rather than overwriting df['label']: mutating df
    # made this cell crash when re-run and caused a re-run of the multi-class cell to
    # write binary labels. The cost is a temporary copy of the data.
    df_binary = df.assign(label=(df['label'] != 'Benign').astype(int))

    outFile = os.path.join(processed_path, binary_class_file)
    df_binary.to_csv(outFile + '.csv', index=False)
    if create_pickle: # if pickle file is requested
        df_binary.to_pickle(outFile + '.pickle')

    # Release the temporary copy; the multi-class frame in df is left untouched
    del df_binary

creating binary-class file


In [14]:
# Completion marker: printed once execution reaches this cell
print('all done...')

all done...


In [15]:
############################################################################################
# Cells below this are only needed if you want to test if the files were created correctly #
# Comment/Uncomment these cells as needed                                                  #
############################################################################################

In [17]:
# Read the multi-class output file back and check its label counts.
# The path is built from the same settings used to write the file, so it cannot
# drift out of sync with the configuration cells.
# CONFIG NEEDED: Point check_file at another file if that is the one you want to check
check_file = os.path.join(processed_path, multi_class_file + '.csv')
df = pd.read_csv(check_file)
df['label'].value_counts()

Benign                      1344917
DDOS attack-HOIC              68602
DDoS attacks-LOIC-HTTP        57619
DoS attacks-Hulk              46192
Bot                           28619
FTP-BruteForce                19336
SSH-Bruteforce                18759
Infilteration                 16148
DoS attacks-SlowHTTPTest      13989
DoS attacks-GoldenEye          4151
DoS attacks-Slowloris          1099
DDOS attack-LOIC-UDP            173
Brute Force -Web                 61
Brute Force -XSS                 23
SQL Injection                     8
Name: label, dtype: int64

In [18]:
# Read the binary-class output file back and check its label counts.
# The path is built from the same settings used to write the file, so it cannot
# drift out of sync with the configuration cells.
# CONFIG NEEDED: Point check_file at another file if that is the one you want to check
check_file = os.path.join(processed_path, binary_class_file + '.csv')
df1 = pd.read_csv(check_file)
df1['label'].value_counts()

0    1344917
1     274779
Name: label, dtype: int64