In [72]:
# Import libraries
import numpy as np
import pandas as pd
import os
import re
import sys
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [73]:
# Raw CSV files (one per capture day) that will be combined, in processing order
# CONFIG NEEDED: Keep only the specific files to be processed on your node
csv_files = [
    day + '.csv'
    for day in (
        '02-14-2018',
        '02-15-2018',
        '02-16-2018',
        '02-21-2018',
        '02-22-2018',
        '02-23-2018',
        '02-28-2018',
        '03-01-2018',
        '03-02-2018',
    )
]

# Binary label mapping: 'Benign' becomes 0 and every attack label becomes 1
label_maps = {
    'Benign': 0,
    **dict.fromkeys(
        (
            'FTP-BruteForce',
            'SSH-Bruteforce',
            'DoS attacks-GoldenEye',
            'DoS attacks-Slowloris',
            'DoS attacks-SlowHTTPTest',
            'DoS attacks-Hulk',
            'Brute Force -Web',
            'Brute Force -XSS',
            'SQL Injection',
            'Infilteration',
            'Bot',
            'DDOS attack-HOIC',
            'DDoS attacks-LOIC-HTTP',
            'DDOS attack-LOIC-UDP',
        ),
        1,
    ),
}

# CONFIG NEEDED: Change Binary and Multi-class output file names if needed
multi_class_file = 'DATA-IDS-2018-multiclass'
binary_class_file = 'DATA-IDS-2018-binaryclass'

# CONFIG NEEDED: Change Train and Test output file name prefixes if needed
test_prefix = 'TEST-'
train_prefix = 'TRAIN-'

# CONFIG NEEDED: Adjust the split size (fraction of rows held out for testing)
# and the number of training files the training rows are divided into
test_size = 0.10
num_trainers = 4

In [74]:
# Folder locations for the raw and the processed data files
#   Raw Data Files Location:       final_project/data
#   Processed Data Files Location: final_project/data/processed
# CONFIG NEEDED: Change '../data' and 'processed' to what you named your directories
rawdata_path = '../data'
processed_path = os.path.join(rawdata_path, 'processed')

# CONFIG NEEDED: Set each flag to True for the kind of output files you need.
# Note: at least one of these has to be True for the combined data file to be created.
multi_class, binary_class = True, True

In [None]:
# Load every configured raw CSV file and combine them into one DataFrame.
# The identifier columns listed below are dropped right after each file is read;
# errors='ignore' tolerates files that do not contain all of them.
if not csv_files:
    raise ValueError('csv_files is empty: list at least one raw data file to process')

dropped_columns = ['Flow ID', 'Src IP', 'Dst IP', 'Src Port']

frames = []
for position, name in enumerate(csv_files):
    fname = os.path.join(rawdata_path, name)
    print('reading:' if position == 0 else 'appending:', fname)
    frames.append(pd.read_csv(fname).drop(columns=dropped_columns, errors='ignore'))

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the whole growing frame on every loop iteration.
df = pd.concat(frames, ignore_index=True)
del frames  # only the combined frame is needed from here on

# Print the combined shape and remember the raw row count for later reporting
print('Combined Raw Datafile Shape')
print(df.shape)

num_of_raw_records = df.shape[0]
print('Original Number of Records: ', num_of_raw_records)

reading: ../data/02-14-2018.csv
appending: ../data/02-15-2018.csv
appending: ../data/02-16-2018.csv


  interactivity=interactivity, compiler=compiler, result=result)


appending: ../data/02-21-2018.csv
appending: ../data/02-22-2018.csv
appending: ../data/02-23-2018.csv
appending: ../data/02-28-2018.csv


In [None]:
# Drop every row that contains a NaN or an infinite value, reporting counts before and after
def _count_invalid_cells(frame):
    """Return how many cells of `frame` are NaN, +inf or -inf."""
    return frame.isin([np.nan, np.inf, -np.inf]).sum().sum()


print('Number of Infinity or NaN Values')
print(_count_invalid_cells(df))

# Turn infinities into NaN first so that dropna() removes both kinds of rows,
# then renumber the remaining rows from zero
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna()
df = df.reset_index(drop=True)

dropped_NaN_records = num_of_raw_records - len(df)
print('Number of NaN/Inf Records Dropped: ', dropped_NaN_records)

# Verify that nothing invalid is left
print('Remaining Infinity or NaN Values')
print(_count_invalid_cells(df))

print('Combined Raw Datafile Shape')
print(df.shape)

In [None]:
# Remove duplicate header rows, i.e. rows whose 'Dst Port' value contains the header text itself.
# astype(str) keeps the check working when 'Dst Port' was parsed as a purely numeric column
# (no repeated header rows in the selected files); the .str accessor raises AttributeError on
# non-string columns.
df = df[~df['Dst Port'].astype(str).str.contains('Dst Port', na=False)]

In [None]:
# Normalise the column headers: lower-case each one and replace every
# non-word character (spaces, slashes, dots, ...) with an underscore
column_name_regex = re.compile(r"\W", re.IGNORECASE)

df.columns = list(
    map(lambda header: column_name_regex.sub('_', header.lower()), df.columns)
)

In [None]:
# Display the number of rows per label before any label is filtered out
print('Original Dataset Value Counts')
df['label'].value_counts()

In [None]:
# Keep only the labels that have more than 20,000 rows; rarer labels are dropped entirely
rows_per_label = df['label'].value_counts()
df = df[df['label'].map(rows_per_label) > 20000]

In [None]:
# Display the number of rows per label that remain in df at this point
print('Dataset Value Counts After Dropping Minimal Attacks')
df['label'].value_counts()

In [None]:
# Separate the target column from the feature columns.
# NOTE: pop() removes 'label' from df in place, so this cell cannot be re-run
# without first rebuilding df.
y = df.pop('label')
X = df

# Stratified, seeded split: the label proportions are kept in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    stratify=y,
    shuffle=True,
    random_state=1,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

# Put the labels back next to their feature rows (aligned on the row index)
dftrain = pd.concat([X_train, y_train], axis=1)
dftest = pd.concat([X_test, y_test], axis=1)

In [None]:
# Write the multi-class test file and one multi-class training file per trainer
if multi_class:
    # to_csv does not create missing folders, so make sure the output folder exists
    os.makedirs(processed_path, exist_ok=True)

    print('Creating Multi-Class Test File')
    # NOTE(review): test_prefix already ends with '-', so this name contains a double dash
    # ('TEST--...'); the name is kept unchanged so anything reading the file is unaffected.
    outTestFile = os.path.join(processed_path, test_prefix + '-' + multi_class_file + '.csv')
    # Drop timestamp column; errors='ignore' lets this cell be re-run after the column is gone
    dftest = dftest.drop(columns='timestamp', errors='ignore')
    dftest.to_csv(outTestFile, index=False)
    print('finished writing:', outTestFile)

    # Sort the training data by timestamp and split it into num_trainers consecutive chunks
    # NOTE(review): if 'timestamp' is still a text column here, this ordering is
    # lexicographic rather than chronological — confirm against the raw data format.
    dftrain = dftrain.sort_values(by='timestamp', ascending=True)

    # Same chunk sizes as np.array_split (the first `extra_rows` chunks get one more row).
    # Slicing with iloc avoids calling np.array_split on a DataFrame, which relies on
    # DataFrame.swapaxes (deprecated in recent pandas releases).
    base_rows, extra_rows = divmod(len(dftrain), num_trainers)
    df_train_split = []
    start = 0
    for chunk_number in range(num_trainers):
        stop = start + base_rows + (1 if chunk_number < extra_rows else 0)
        df_train_split.append(dftrain.iloc[start:stop])
        start = stop

    for x in range(0, num_trainers):
        print('Creating Multi-Class Training File: ', str(x+1))
        outTrainFile = os.path.join(processed_path, train_prefix + str(x+1) + '-' + multi_class_file + '.csv')
        df_train_split[x].to_csv(outTrainFile, index=False)
        print('finished writing:', outTrainFile)

In [None]:
# Sanity check: reload the first multi-class training file and show its label distribution.
# The path is built from the config values using the same pattern the export cell writes
# (1-based chunk number followed by '-'); the previous hard-coded 'TRAIN-0DATA-...' name is
# never produced by this notebook.
check_file = os.path.join(processed_path, train_prefix + '1' + '-' + multi_class_file + '.csv')
df = pd.read_csv(check_file)
df['label'].value_counts()

In [None]:
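# NOTE: The binary-class export below is disabled, so `binary_class` currently has no effect.
# Before re-enabling it, build each input path with os.path.join (the read_csv calls below pass
# the folder and the file name as two separate arguments) and make the file names match the
# ones written by the multi-class export cell.
# if binary_class: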
#     print('Creating Binary-Class Test File')
#     df = pd.read_csv(processed_path, test_prefix + multi_class_file + '.csv')
#     outTestFile = os.path.join(processed_path, test_prefix + binary_class_file + '.csv')

#     # Map benign rows to 0, all others as 1
#     df['label'] = df['label'].map(label_maps).astype(int)
#     df.to_csv(outTestFile, index=False)
#     print('finished writing:', outTestFile)

#     for x in range(0, num_trainers):
#         print('Creating Binary-Class Training File: ', str(x+1))
#         df = pd.read_csv(processed_path, train_prefix + str(x+1) + multi_class_file + '.csv')

#         # Map benign rows to 0, all others as 1
#         df['label'] = df['label'].map(label_maps).astype(int)

#         outTrainFile = os.path.join(processed_path, train_prefix + str(x+1) + binary_class_file + '.csv')
#         df.to_csv(outTrainFile, index=False)
#         print('finished writing:', outTrainFile)

In [None]:
# df = pd.read_csv("../data/processed/train-02-20-2018-ddos-loic-tcp.csv")
# df['label'].value_counts()

In [None]:
# Final marker so a full run of the notebook visibly reaches the end
print('all done...')