In [1]:
# Import libraries
import numpy as np
import pandas as pd
import os
import re
import sys
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# Put the parent directory and '../fl-ids' at the front of the module search path.
# Presumably this is what lets the project-local imports further down
# (`conf`, `util.data_loader`) resolve — confirm which directory provides them.
sys.path.insert(0, '..')
sys.path.insert(0, '../fl-ids')

# Loading Files

In [2]:
# Raw daily capture files to combine, listed in the order they are read.
# CONFIG NEEDED: Comment out any file that should not be processed on your node
csv_files = [
    '02-14-2018.csv',
    '02-15-2018.csv',
    '02-16-2018.csv',
    '02-21-2018.csv',
    '02-22-2018.csv',
    '02-23-2018.csv',
    '02-28-2018.csv',
    '03-01-2018.csv',
    '03-02-2018.csv',
    '02-20-2018.csv',
]

# Binary label mapping: 'Benign' -> 0, every attack label -> 1.
label_maps = {'Benign': 0}
label_maps.update(dict.fromkeys(
    [
        'FTP-BruteForce',
        'SSH-Bruteforce',
        'DoS attacks-GoldenEye',
        'DoS attacks-Slowloris',
        'DoS attacks-SlowHTTPTest',
        'DoS attacks-Hulk',
        'Brute Force -Web',
        'Brute Force -XSS',
        'SQL Injection',
        'Infilteration',
        'Bot',
        'DDOS attack-HOIC',
        'DDoS attacks-LOIC-HTTP',
        'DDOS attack-LOIC-UDP',
    ],
    1,
))

# CONFIG NEEDED: Change Binary and Multi-class output file names if needed
multi_class_file = 'DATA-IDS-2018-multiclass'
binary_class_file = 'DATA-IDS-2018-binaryclass'

# CONFIG NEEDED: Change Train and Test output file name prefixes if needed.
test_prefix, train_prefix = 'TEST-', 'TRAIN-'

# CONFIG NEEDED: Adjust the split size.
test_size = 0.10   # passed to train_test_split as the test fraction
num_trainers = 4   # number of trainers

In [3]:
# Where the raw CSV files live and where processed files are written,
# relative to this notebook.
# CONFIG NEEDED: Point rawdata_path at your raw-data directory; processed files
# go into the 'processed' sub-directory inside it.
rawdata_path = '../data/CSE-CIC-IDS2018'
processed_path = os.path.join(rawdata_path, 'processed')

# CONFIG NEEDED: Set to True as needed for multi-class or binary-class files.
# Note: at least one of these has to be True for the combined data file to be created.
# (In this notebook the flags are only read by the commented-out cells at the end.)
multi_class, binary_class = True, False

In [4]:
# Read every raw CSV listed in csv_files and combine them into one DataFrame.
# 'Timestamp' is dropped on load; errors='ignore' tolerates files without it.
frames = []
for position, name in enumerate(csv_files):
    fname = os.path.join(rawdata_path, name)
    print('reading:' if position == 0 else 'appending:', fname)
    frames.append(pd.read_csv(fname).drop(columns=['Timestamp'], errors='ignore'))

# Concatenate once. DataFrame.append was deprecated in pandas 1.4 and removed in
# pandas 2.0, and appending inside the loop re-copied the accumulated frame for
# every additional file.
df = pd.concat(frames, ignore_index=True)
del frames  # only the combined frame is needed from here on

# NOTE(review): the recorded run reports 83 columns here while the train/test
# frames further down have 80, so the input files may not all share the same
# columns. Columns missing from a file are filled with NaN by the concatenation,
# and the NaN-dropping cell below would then discard those rows — confirm.

# Print the combined shape (no shuffling happens in this cell)
print('Combined Raw Datafile Shape')
print(df.shape)

num_of_raw_records = df.shape[0]
print('Original Number of Records: ', num_of_raw_records)

reading: ../data/CSE-CIC-IDS2018\02-14-2018.csv
appending: ../data/CSE-CIC-IDS2018\02-15-2018.csv
appending: ../data/CSE-CIC-IDS2018\02-16-2018.csv


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


appending: ../data/CSE-CIC-IDS2018\02-21-2018.csv
appending: ../data/CSE-CIC-IDS2018\02-22-2018.csv
appending: ../data/CSE-CIC-IDS2018\02-23-2018.csv
appending: ../data/CSE-CIC-IDS2018\02-28-2018.csv
appending: ../data/CSE-CIC-IDS2018\03-01-2018.csv
appending: ../data/CSE-CIC-IDS2018\03-02-2018.csv
appending: ../data/CSE-CIC-IDS2018\02-20-2018.csv
Combined Raw Datafile Shape
(16233002, 83)
Original Number of Records:  16233002


# Data Processing

In [None]:
# Count, then remove, every record that contains an infinite or missing value.
invalid_values = [np.nan, np.inf, -np.inf]

print('Number of Infinity or NaN Values')
print(df.isin(invalid_values).sum().sum())

# Turn +/-inf into NaN so that a single dropna() removes both kinds of bad
# record, then renumber the index from zero.
df = (
    df.replace([np.inf, -np.inf], np.nan)
      .dropna()
      .reset_index(drop=True)
)

dropped_NaN_records = num_of_raw_records - df.shape[0]
print('Number of NaN/Inf Records Dropped: ', dropped_NaN_records)

# Re-count after cleaning as a sanity check
print('Remaining Infinity or NaN Values')
print(df.isin(invalid_values).sum().sum())

print('Combined Raw Datafile Shape')
print(df.shape)

In [None]:
df.columns

In [None]:
# Remove header lines repeated inside the data: such rows carry the text
# 'Dst Port' in the 'Dst Port' column. Non-string cells yield NaN from the
# .str accessor and are treated as "not a header" via na=False.
is_repeated_header = df['Dst Port'].str.contains('Dst Port', na=False)
df = df[~is_repeated_header]

In [None]:
# Normalise the column headers: lower-case each one and replace every non-word
# character (spaces, slashes, ...) with '_', e.g. 'Dst Port' -> 'dst_port'.
column_name_regex = re.compile(r"\W", re.IGNORECASE)

df.columns = df.columns.map(lambda header: column_name_regex.sub('_', header.lower()))

In [None]:
print('Original Dataset Value Counts')
df['label'].value_counts()

In [None]:
# Keep only the labels that occur in more than 20,000 rows (a label with exactly
# 20,000 rows is dropped as well).
# NOTE(review): the value counts recorded further down still list classes with far
# fewer than 20,000 rows, and the trainer label lists and bootstrapping cells rely
# on those classes, so this filter does not appear to have been applied in the
# recorded run — confirm whether it is meant to run.
rows_per_label = df.groupby('label')['label'].transform('size')
df = df[rows_per_label > 20000]

In [None]:
print('Dataset Value Counts After Dropping Minimal Attacks')
df['label'].value_counts()

# Train Test Split

In [None]:
# Separate the target from the features. pop() removes 'label' from df in place,
# so X and df are the same label-free object afterwards.
y = df.pop('label')
X = df

# Seeded, shuffled split that is stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    stratify=y,
    shuffle=True,
    random_state=1,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

# Re-attach the label column (aligned on the row index) to each partition.
dftrain, dftest = X_train.join(y_train), X_test.join(y_test)

In [21]:
dftrain.shape

(14524388, 80)

In [22]:
dftest.shape

(1613821, 80)

In [26]:
dftest['label'].value_counts()

Benign                      1339118
DDOS attack-HOIC              68601
DDoS attacks-LOIC-HTTP        57619
DoS attacks-Hulk              46191
Bot                           28619
FTP-BruteForce                19335
SSH-Bruteforce                18759
Infilteration                 16074
DoS attacks-SlowHTTPTest      13989
DoS attacks-GoldenEye          4151
DoS attacks-Slowloris          1099
DDOS attack-LOIC-UDP            173
Brute Force -Web                 61
Brute Force -XSS                 23
SQL Injection                     9
Name: label, dtype: int64

In [27]:
dftrain['label'].value_counts()

Benign                      12052057
DDOS attack-HOIC              617411
DDoS attacks-LOIC-HTTP        518572
DoS attacks-Hulk              415721
Bot                           257572
FTP-BruteForce                174019
SSH-Bruteforce                168830
Infilteration                 144665
DoS attacks-SlowHTTPTest      125901
DoS attacks-GoldenEye          37357
DoS attacks-Slowloris           9891
DDOS attack-LOIC-UDP            1557
Brute Force -Web                 550
Brute Force -XSS                 207
SQL Injection                     78
Name: label, dtype: int64

# Splitting Trainers

In [None]:
# Trainer 1: 
#     DDOS attack-HOIC + DoS attacks-GoldenEye + Brute Force -Web + SQL Injection: 617411 + 37357 + 550 + 78 = 655,396
# Trainer 2: 
#     DDoS attacks-LOIC-HTTP + Infilteration + Brute Force -XSS: 518572 + 144665 + 207 = 663,444
# Trainer 3:
#     DoS attacks-Hulk + FTP-BruteForce + DDOS attack-LOIC-UDP: 415721 + 174019 + 1557 = 591,297
# Trainer 4: 
#     Bot + SSH-Bruteforce + DoS attacks-SlowHTTPTest + DoS attacks-Slowloris: 257572 + 168830 + 125901 + 9891 = 562,194

In [35]:
# Attack labels assigned to each of the four trainers; every attack label
# appears in exactly one list.
train_1_labels = [
    'DDOS attack-HOIC',
    'DoS attacks-GoldenEye',
    'Brute Force -Web',
    'SQL Injection',
]
train_2_labels = [
    'DDoS attacks-LOIC-HTTP',
    'Infilteration',
    'Brute Force -XSS',
]
train_3_labels = [
    'DoS attacks-Hulk',
    'FTP-BruteForce',
    'DDOS attack-LOIC-UDP',
]
train_4_labels = [
    'Bot',
    'DoS attacks-SlowHTTPTest',
    'SSH-Bruteforce',
    'DoS attacks-Slowloris',
]

In [36]:
# For every trainer, select the training rows whose label is in that trainer's list.
df_train_1 = dftrain.loc[dftrain['label'].isin(train_1_labels)]
df_train_2 = dftrain.loc[dftrain['label'].isin(train_2_labels)]
df_train_3 = dftrain.loc[dftrain['label'].isin(train_3_labels)]
df_train_4 = dftrain.loc[dftrain['label'].isin(train_4_labels)]

In [37]:
# Report the size of each trainer's attack-only subset.
for trainer_no, attack_rows in enumerate((df_train_1, df_train_2, df_train_3, df_train_4), start=1):
    print(f'Training set {trainer_no} shape (without benigns): {attack_rows.shape}')

Training set 1 shape (without benigns): (655396, 80)
Training set 2 shape (without benigns): (663444, 80)
Training set 3 shape (without benigns): (591297, 80)
Training set 4 shape (without benigns): (562194, 80)


## Splitting Benign Data Across Trainers

In [38]:
# All benign training rows; these are divided between the trainers below.
df_benign = dftrain.loc[dftrain['label'].eq('Benign')]

In [39]:
df_benign.shape

(12052057, 80)

In [40]:
# Shuffle the benign rows, then cut them into four near-equal consecutive parts,
# one per trainer. random_state fixes the shuffle so each trainer receives the
# same benign rows on every run; without it the partition (and the files written
# later) changed on every execution.
df_benign = df_benign.sample(frac=1, random_state=1)
df_benign_1, df_benign_2, df_benign_3, df_benign_4 = np.array_split(df_benign, 4)

In [41]:
# Report the size of each trainer's benign share.
for trainer_no, benign_rows in enumerate((df_benign_1, df_benign_2, df_benign_3, df_benign_4), start=1):
    print(f'Benign set {trainer_no} shape: {benign_rows.shape}')

Benign set 1 shape: (3013015, 80)
Benign set 2 shape: (3013014, 80)
Benign set 3 shape: (3013014, 80)
Benign set 4 shape: (3013014, 80)


## Concatenating Dataframes

In [42]:
# Build each trainer's full training set: its attack rows followed by its benign share.
df_train_full_1, df_train_full_2, df_train_full_3, df_train_full_4 = (
    pd.concat([attack_rows, benign_rows])
    for attack_rows, benign_rows in (
        (df_train_1, df_benign_1),
        (df_train_2, df_benign_2),
        (df_train_3, df_benign_3),
        (df_train_4, df_benign_4),
    )
)

In [43]:
# Report the size of each trainer's combined (attack + benign) training set.
for trainer_no, full_rows in enumerate(
    (df_train_full_1, df_train_full_2, df_train_full_3, df_train_full_4), start=1
):
    print(f'Full dataset {trainer_no} shape: {full_rows.shape}')

Full dataset 1 shape: (3668411, 80)
Full dataset 2 shape: (3676458, 80)
Full dataset 3 shape: (3604311, 80)
Full dataset 4 shape: (3575208, 80)


# Last Processing Steps

In [50]:
from conf import LABEL_TO_ID
from util.data_loader import get_id_from_label

In [53]:
# Replace trainer 1's label values with whatever get_id_from_label(label, LABEL_TO_ID)
# returns for each of them (integer ids, judging by the recorded output below).
# NOTE(review): 'label' is overwritten in place, so re-running this cell would pass
# the ids (not the original strings) back into get_id_from_label — confirm that
# helper tolerates it, or restart the kernel before re-running.
df_train_full_1['label'] = df_train_full_1['label'].apply(lambda x: get_id_from_label(x, LABEL_TO_ID))

In [56]:
df_train_full_1['label'].unique()

array([ 4,  9, 12, 14,  0])

In [57]:
# Apply the same label -> id conversion to the remaining three trainers' frames.
for frame in (df_train_full_2, df_train_full_3, df_train_full_4):
    frame['label'] = frame['label'].apply(lambda name: get_id_from_label(name, LABEL_TO_ID))

In [58]:
# Show which label ids each trainer ended up with.
for frame in (df_train_full_1, df_train_full_2, df_train_full_3, df_train_full_4):
    print(frame['label'].unique())

[ 4  9 12 14  0]
[ 1 10 13  0]
[ 8  5 11  0]
[7 6 3 2 0]


In [60]:
# Map trainer number (as a string, '1'..'4') to that trainer's full training frame.
df_train_dict = {
    str(trainer_no): frame
    for trainer_no, frame in enumerate(
        (df_train_full_1, df_train_full_2, df_train_full_3, df_train_full_4),
        start=1,
    )
}

# Writing Files

In [59]:
# Write the held-out test set to a single CSV under processed_path.
print('Creating Multi-Class Test File')
os.makedirs(processed_path, exist_ok=True)  # to_csv does not create missing directories
# test_prefix already ends with '-', so the name contains a double dash ('TEST--...').
# Left unchanged so previously written / consumed file names stay the same.
test_file_name = os.path.join(processed_path, test_prefix + '-' + multi_class_file + '.csv')
# 'Timestamp' is already dropped when the raw files are loaded, so an unconditional
# drop('timestamp') raises KeyError on a fresh run. errors='ignore' removes the
# column when it is present and is a no-op otherwise.
dftest.drop(columns=['timestamp'], errors='ignore').to_csv(test_file_name, index=False)
print('Finished writing: ', test_file_name)

Creating Multi-Class Test File
Finished writing:  ../data/processed/TEST--DATA-IDS-2018-multiclass.csv


In [62]:
# Write one multi-class training CSV per trainer under processed_path.
os.makedirs(processed_path, exist_ok=True)  # to_csv does not create missing directories
for i, trainer_frame in df_train_dict.items():
    # Message corrected: these are the per-trainer training files, not test files.
    print(f'Creating Multi-Class Training File for Trainer {i}')
    train_file_name = os.path.join(processed_path, f'{train_prefix}{i}-{multi_class_file}.csv')
    # 'Timestamp' is already dropped when the raw files are loaded; errors='ignore'
    # avoids a KeyError when the column is absent and still drops it when present.
    trainer_frame.drop(columns=['timestamp'], errors='ignore').to_csv(train_file_name, index=False)
    print('Finished writing: ', train_file_name)

Creating Multi-Class Test File for Trainer 1
Finished writing:  ../data/processed/TRAIN-1-DATA-IDS-2018-multiclass.csv
Creating Multi-Class Test File for Trainer 2
Finished writing:  ../data/processed/TRAIN-2-DATA-IDS-2018-multiclass.csv
Creating Multi-Class Test File for Trainer 3
Finished writing:  ../data/processed/TRAIN-3-DATA-IDS-2018-multiclass.csv
Creating Multi-Class Test File for Trainer 4
Finished writing:  ../data/processed/TRAIN-4-DATA-IDS-2018-multiclass.csv


In [63]:
# Read the test CSV back from disk so the next cell can check the shape that was written.
test_read = pd.read_csv(test_file_name)

In [64]:
print(test_read.shape) # Should match dftest.shape = (1613821, 80) (-1 column with dropped timestamp)

(1613821, 79)


In [65]:
# Drop the read-back copy and the trainer dict; df_train_dict is rebuilt further
# down for the bootstrapped frames.
del test_read, df_train_dict

# Bootstrapping Minority Classes 

## Trainer 1

In [67]:
df_train_full_1['label'].value_counts()

0     3013015
4      617411
9       37357
12        550
14         78
Name: label, dtype: int64

In [68]:
# Oversample trainer 1's two rarest classes by sampling with replacement:
# label 14 -> 10,000 rows and label 12 -> 20,000 rows.
# random_state fixes the draw so the oversampled training file is reproducible;
# without a seed every run produced a different resample.
df_1_label_14_bootstrapped = df_train_full_1[df_train_full_1['label'] == 14].sample(
    10000, replace=True, random_state=1
)
df_1_label_12_bootstrapped = df_train_full_1[df_train_full_1['label'] == 12].sample(
    20000, replace=True, random_state=1
)

In [76]:
df_train_full_1[~df_train_full_1.label.isin([12, 14])].label.value_counts()

0    3013015
4     617411
9      37357
Name: label, dtype: int64

In [77]:
# Trainer 1's oversampled training set: every row except the original label-12 and
# label-14 rows, followed by their bootstrapped replacements.
df_train_bootstrapped_1 = pd.concat([
    df_train_full_1.loc[~df_train_full_1['label'].isin([12, 14])],
    df_1_label_14_bootstrapped,
    df_1_label_12_bootstrapped,
])

In [78]:
df_train_bootstrapped_1.label.value_counts()

0     3013015
4      617411
9       37357
12      20000
14      10000
Name: label, dtype: int64

## Trainer 2

In [79]:
df_train_full_2['label'].value_counts()

0     3013014
10     518572
1      144665
13        207
Name: label, dtype: int64

In [80]:
# Oversample trainer 2's rarest class (label 13) to 20,000 rows by sampling with
# replacement. random_state fixes the draw so the result is reproducible; without
# a seed every run produced a different resample.
df_2_label_13_bootstrapped = df_train_full_2[df_train_full_2['label'] == 13].sample(
    20000, replace=True, random_state=1
)

In [81]:
# Trainer 2's oversampled training set: every row except the original label-13
# rows, followed by their bootstrapped replacement.
df_train_bootstrapped_2 = pd.concat([
    df_train_full_2.loc[~df_train_full_2['label'].isin([13])],
    df_2_label_13_bootstrapped,
])

In [82]:
df_train_bootstrapped_2.label.value_counts()

0     3013014
10     518572
1      144665
13      20000
Name: label, dtype: int64

## Trainer 3

In [83]:
df_train_full_3['label'].value_counts()

0     3013014
8      415721
5      174019
11       1557
Name: label, dtype: int64

In [84]:
# Oversample trainer 3's rarest class (label 11) to 50,000 rows by sampling with
# replacement. random_state fixes the draw so the result is reproducible; without
# a seed every run produced a different resample.
df_3_label_11_bootstrapped = df_train_full_3[df_train_full_3['label'] == 11].sample(
    50000, replace=True, random_state=1
)

In [85]:
# Trainer 3's oversampled training set: every row except the original label-11
# rows, followed by their bootstrapped replacement.
df_train_bootstrapped_3 = pd.concat([
    df_train_full_3.loc[~df_train_full_3['label'].isin([11])],
    df_3_label_11_bootstrapped,
])

In [86]:
df_train_bootstrapped_3.label.value_counts()

0     3013014
8      415721
5      174019
11      50000
Name: label, dtype: int64

## Trainer 4

In [87]:
df_train_full_4['label'].value_counts()

0    3013014
7     257572
3     168830
6     125901
2       9891
Name: label, dtype: int64

In [88]:
# Oversample trainer 4's rarest class (label 2) to 30,000 rows by sampling with
# replacement. random_state fixes the draw so the result is reproducible; without
# a seed every run produced a different resample.
df_4_label_2_bootstrapped = df_train_full_4[df_train_full_4['label'] == 2].sample(
    30000, replace=True, random_state=1
)

In [89]:
# Trainer 4's oversampled training set: every row except the original label-2
# rows, followed by their bootstrapped replacement.
df_train_bootstrapped_4 = pd.concat([
    df_train_full_4.loc[~df_train_full_4['label'].isin([2])],
    df_4_label_2_bootstrapped,
])

In [90]:
df_train_bootstrapped_4.label.value_counts()

0    3013014
7     257572
3     168830
6     125901
2      30000
Name: label, dtype: int64

# Writing Bootstrap Oversampled Files

In [91]:
# Map trainer number (as a string, '1'..'4') to that trainer's oversampled frame.
df_train_dict = {
    str(trainer_no): frame
    for trainer_no, frame in enumerate(
        (
            df_train_bootstrapped_1,
            df_train_bootstrapped_2,
            df_train_bootstrapped_3,
            df_train_bootstrapped_4,
        ),
        start=1,
    )
}

In [93]:
# Oversampled files go into a sibling directory named '<processed_path>_bootstrap'.
bootstrap_processed_path = processed_path + '_bootstrap'

In [96]:
# Write one oversampled multi-class training CSV per trainer under bootstrap_processed_path.
os.makedirs(bootstrap_processed_path, exist_ok=True)  # to_csv does not create missing directories
for i, trainer_frame in df_train_dict.items():
    print(f'Creating Multi-Class Oversampled File for Trainer {i}')
    train_file_name = os.path.join(bootstrap_processed_path, f'{train_prefix}{i}-{multi_class_file}-bootstrap.csv')
    # 'Timestamp' is already dropped when the raw files are loaded; errors='ignore'
    # avoids a KeyError when the column is absent and still drops it when present.
    trainer_frame.drop(columns=['timestamp'], errors='ignore').to_csv(train_file_name, index=False)
    print('Finished writing: ', train_file_name)

Creating Multi-Class Oversampled File for Trainer 1
Finished writing:  ../data/processed_bootstrap/TRAIN-1-DATA-IDS-2018-multiclass-bootstrap.csv
Creating Multi-Class Oversampled File for Trainer 2
Finished writing:  ../data/processed_bootstrap/TRAIN-2-DATA-IDS-2018-multiclass-bootstrap.csv
Creating Multi-Class Oversampled File for Trainer 3
Finished writing:  ../data/processed_bootstrap/TRAIN-3-DATA-IDS-2018-multiclass-bootstrap.csv
Creating Multi-Class Oversampled File for Trainer 4
Finished writing:  ../data/processed_bootstrap/TRAIN-4-DATA-IDS-2018-multiclass-bootstrap.csv


In [None]:
# # Create a multi-class label file
# if multi_class:
#     print('Creating Multi-Class Test File')
#     outTestFile = os.path.join(processed_path, test_prefix + '-' + multi_class_file + '.csv')
#     dftest = dftest.drop('timestamp', axis=1)      # Drop timestamp column
#     dftest.to_csv(outTestFile, index=False)
#     print('finished writing:', outTestFile)
    
#     # Sort training data based of timestamp and split into four equal chunks
#     dftrain = dftrain.sort_values(by='timestamp', ascending=True)
#     df_train_split = np.array_split(dftrain, num_trainers)
    
#     for x in range(0, num_trainers):
#         print('Creating Multi-Class Training File: ', str(x+1))
#         outTrainFile = os.path.join(processed_path, train_prefix + str(x+1) + '-' + multi_class_file + '.csv')
#         df_train_split[x].to_csv(outTrainFile, index=False)
#         print('finished writing:', outTrainFile)

In [None]:
# df = pd.read_csv("../data/processed/TRAIN-0DATA-IDS-2018-multiclass.csv")
# df['label'].value_counts()

In [None]:
# if binary_class:
#     print('Creating Binary-Class Test File')
#     df = pd.read_csv(processed_path, test_prefix + multi_class_file + '.csv')
#     outTestFile = os.path.join(processed_path, test_prefix + binary_class_file + '.csv')

#     # Map benign rows to 0, all others as 1
#     df['label'] = df['label'].map(label_maps).astype(int)
#     df.to_csv(outTestFile, index=False)
#     print('finished writing:', outTestFile)

#     for x in range(0, num_trainers):
#         print('Creating Binary-Class Training File: ', str(x+1))
#         df = pd.read_csv(processed_path, train_prefix + str(x+1) + multi_class_file + '.csv')

#         # Map benign rows to 0, all others as 1
#         df['label'] = df['label'].map(label_maps).astype(int)

#         outTrainFile = os.path.join(processed_path, train_prefix + str(x+1) + binary_class_file + '.csv')
#         df.to_csv(outTrainFile, index=False)
#         print('finished writing:', outTrainFile)

In [None]:
# df = pd.read_csv("../data/processed/train-02-20-2018-ddos-loic-tcp.csv")
# df['label'].value_counts()

In [None]:
print('all done...')