In [47]:
#############################################################################################
# Notebook to be used for doing a stratified split of the cleaned datasets                  #
# and creating csv files.                                                                   #
#############################################################################################

In [48]:
# Import libraries
import numpy as np
import pandas as pd
import os
import sys
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [49]:
# Cleaned data files to run the stratified split on (file names only; the
# directory they are read from is set in a separate cell).
# CONFIG NEEDED: Uncomment only the specific files to be processed on your node.
# NOTE: every entry, including the commented-out ones, ends with a comma.
# Without it, uncommenting a line would make Python silently merge two adjacent
# string literals into one non-existent file name.

cleaned_csv_files = [
    '02-14-2018-bruteforce-ftp-ssh.csv',
    '02-15-2018-dos-goldeneye-slowloris.csv',
    '02-16-2018-dos-slowhttp-hulk.csv',
    '02-21-2018-ddos-loic-udp.csv',
    '02-22-2018-bruteforce-webxss.csv',
    '02-23-2018-bruteforce-webxss-sql.csv',
    '02-28-2018-infiltration.csv',
    '03-01-2018-botnet.csv',
    # '03-02-2018-infiltration.csv',
    # '02-20-2018-ddos-loic-tcp.csv',   # WARNING: 4GB FILE.
]

# CONFIG NEEDED: Change Binary and Multi-class output file names if needed
# multi_class_file = 'IDS-2018-multiclass'
# binary_class_file = 'IDS-2018-binaryclass'

# Prefixes prepended to an input file name to build the output file names
test_prefix = 'test-'
train_prefix = 'train-'

# Passed as `test_size` to train_test_split: the fraction of rows held out for the test set
test_size = 0.1

In [50]:
# Set the folder names for the raw data and the processed files.
# Both are relative paths, so they resolve against the kernel's working directory.
# CONFIG NEEDED: Change '../data' and 'processed' to what you named your directories
# Raw Data Files Location: final_project/data
# Processed Data Files Location: final_project/data/processed

rawdata_path = '../data'
# The processed folder is a sub-directory of the raw data folder
processed_path = os.path.join(rawdata_path, 'processed')

In [51]:
# For every file in cleaned_csv_files: read it from the processed folder, do a
# stratified train/test split on the 'label' column, and write the two parts
# back to the processed folder as '<train_prefix><name>' and '<test_prefix><name>'.
for csv_name in cleaned_csv_files:
    source_file = os.path.join(processed_path, csv_name)
    print('reading:', source_file)
    features = pd.read_csv(source_file)

    # pop() removes the target column from `features` in place and returns it
    labels = features.pop('label')
    print(features.shape, labels.shape)

    # stratify on the labels so both parts keep the class proportions;
    # the fixed random_state makes the split repeatable
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        stratify=labels,
        shuffle=True,
        random_state=1,
    )
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    # Train part is written first, then the test part
    for prefix, X_part, y_part in (
        (train_prefix, X_train, y_train),
        (test_prefix, X_test, y_test),
    ):
        out_file = os.path.join(processed_path, prefix + csv_name)
        # re-attach the label as the last column (index-aligned join) before saving
        X_part.join(y_part).to_csv(out_file, index=False)
        print('finished writing:', out_file)

reading: ../data/processed/02-14-2018-bruteforce-ftp-ssh.csv
(1047028, 79) (1047028,)
(942325, 79) (104703, 79) (942325,) (104703,)
finished writing: ../data/processed/train-02-14-2018-bruteforce-ftp-ssh.csv
finished writing: ../data/processed/test-02-14-2018-bruteforce-ftp-ssh.csv
reading: ../data/processed/02-15-2018-dos-goldeneye-slowloris.csv
(1045469, 79) (1045469,)
(940922, 79) (104547, 79) (940922,) (104547,)
finished writing: ../data/processed/train-02-15-2018-dos-goldeneye-slowloris.csv
finished writing: ../data/processed/test-02-15-2018-dos-goldeneye-slowloris.csv
reading: ../data/processed/02-16-2018-dos-slowhttp-hulk.csv
(1048574, 79) (1048574,)
(943716, 79) (104858, 79) (943716,) (104858,)
finished writing: ../data/processed/train-02-16-2018-dos-slowhttp-hulk.csv
finished writing: ../data/processed/test-02-16-2018-dos-slowhttp-hulk.csv
reading: ../data/processed/02-21-2018-ddos-loic-udp.csv
(1048575, 79) (1048575,)
(943717, 79) (104858, 79) (943717,) (104858,)
finished wri

In [57]:
# Read one generated train file and check its label counts
# CONFIG NEEDED: Change check_name to the file you want to check.
# The path is built from processed_path / train_prefix so it follows the
# configuration cells instead of a separately hard-coded location.
# NOTE: the split cell must already have produced this file; in the default
# file list '03-02-2018-infiltration.csv' is commented out.
check_name = '03-02-2018-infiltration.csv'
df = pd.read_csv(os.path.join(processed_path, train_prefix + check_name))
df['label'].value_counts()

Benign    684802
Bot       257572
Name: label, dtype: int64

In [58]:
# Read one generated test file and check its label counts
# CONFIG NEEDED: Change check_name to the file you want to check.
# The path is built from processed_path / test_prefix so it follows the
# configuration cells instead of a separately hard-coded location.
# NOTE: the split cell must already have produced this file; in the default
# file list '03-02-2018-infiltration.csv' is commented out.
check_name = '03-02-2018-infiltration.csv'
df = pd.read_csv(os.path.join(processed_path, test_prefix + check_name))
df['label'].value_counts()

Benign    76090
Bot       28619
Name: label, dtype: int64