In [2]:
# Import libraries
import numpy as np
import pandas as pd
import os
import re
import sys
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

sys.path.insert(0, '..')
sys.path.insert(0, '../fl-ids')

In [3]:
print(pd.__version__)

1.2.2


# Loading Files

In [4]:
# Create a set of the raw data and processed files name
# CONFIG NEEDED: Uncomment only the specific files to be processed on your node

csv_files = [
 '02-14-2018.csv',
 '02-15-2018.csv',
 '02-16-2018.csv',
 '02-21-2018.csv',
 '02-22-2018.csv',
 '02-23-2018.csv',
 '02-28-2018.csv',
 '03-01-2018.csv',
 '03-02-2018.csv',
 '02-20-2018.csv'    
]

label_maps = {'Benign': 0, 'FTP-BruteForce': 1, 'SSH-Bruteforce': 1, 'DoS attacks-GoldenEye': 1, 'DoS attacks-Slowloris': 1,
         'DoS attacks-SlowHTTPTest': 1, 'DoS attacks-Hulk': 1, 'Brute Force -Web': 1, 'Brute Force -XSS': 1,
         'SQL Injection': 1, 'Infilteration': 1, 'Bot': 1, 'DDOS attack-HOIC': 1, 'DDoS attacks-LOIC-HTTP': 1, 
         'DDOS attack-LOIC-UDP': 1}
 
# CONFIG NEEDED: Change Binary and Multi-class output file names if needed
multi_class_file = 'DATA-IDS-2018-multiclass'
binary_class_file = 'DATA-IDS-2018-binaryclass'

# CONFIG NEEDED: Change Train and Test output file names if needed. Adjust the split size.
test_prefix = 'TEST-'
train_prefix = 'TRAIN-'

test_size = 0.10
num_trainers = 4

In [5]:
# Set the folder name for raw data and processed files under the project directory
# CONFIG NEEDED: Change the './data' and 'processed' to what you named your directories
# Raw Data Files Location: final_project/data
# Processed Data Files Location: final_project/data/processed

rawdata_path = '../data/CSE-CIC-IDS2018'
processed_path = os.path.join(rawdata_path, 'processed')

# CONFIG NEEDED: Change to true as needed for multi-class or binary class files. 
# Note atleast one of these has to be true for the combined data file to be created. 
multi_class = True
binary_class = False

In [5]:
# Read the first file from the list to be processed
fname = os.path.join(rawdata_path, csv_files[0])
print('reading:', fname)
df = pd.read_csv(fname, low_memory=False).drop(columns=['Flow ID', 'Src IP', 'Dst IP', 'Src Port'], errors='ignore')

# Read the remaining files in the list
for name in csv_files[1:]:
    fname = os.path.join(rawdata_path, name)
    print('appending:', fname)
    df1 = pd.read_csv(fname, low_memory=False).drop(columns=['Flow ID', 'Src IP', 'Dst IP', 'Src Port'], errors='ignore')    
    df = df.append(df1, ignore_index=True)

# print final shape
print('Combined Raw Datafile Shape')
print(df.shape)

num_of_raw_records = df.shape[0]
print('Original Number of Records: ', num_of_raw_records)

reading: ../data/CSE-CIC-IDS2018/02-14-2018.csv
appending: ../data/CSE-CIC-IDS2018/02-15-2018.csv
appending: ../data/CSE-CIC-IDS2018/02-16-2018.csv
appending: ../data/CSE-CIC-IDS2018/02-21-2018.csv
appending: ../data/CSE-CIC-IDS2018/02-22-2018.csv
appending: ../data/CSE-CIC-IDS2018/02-23-2018.csv
appending: ../data/CSE-CIC-IDS2018/02-28-2018.csv
appending: ../data/CSE-CIC-IDS2018/03-01-2018.csv
appending: ../data/CSE-CIC-IDS2018/03-02-2018.csv
appending: ../data/CSE-CIC-IDS2018/02-20-2018.csv
Combined Raw Datafile Shape
(16233002, 80)
Original Number of Records:  16233002


In [6]:
print('Original Dataset Value Counts')
df['Label'].value_counts()

Original Dataset Value Counts


Benign                      13484708
DDOS attack-HOIC              686012
DDoS attacks-LOIC-HTTP        576191
DoS attacks-Hulk              461912
Bot                           286191
FTP-BruteForce                193360
SSH-Bruteforce                187589
Infilteration                 161934
DoS attacks-SlowHTTPTest      139890
DoS attacks-GoldenEye          41508
DoS attacks-Slowloris          10990
DDOS attack-LOIC-UDP            1730
Brute Force -Web                 611
Brute Force -XSS                 230
SQL Injection                     87
Label                             59
Name: Label, dtype: int64

# Data Processing

In [8]:
# Remove infinity and NaN values
print('Number of Infinity or NaN Values')
print(df.isin([np.nan, np.inf, -np.inf]).sum().sum())

# Replace infinity to NaN and drop NaN values
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna()
df = df.reset_index(drop=True)

dropped_NaN_records = num_of_raw_records - df.shape[0]
print('Number of NaN/Inf Records Dropped: ', dropped_NaN_records)

# Check infinity and NaN values
print('Remaining Infinity or NaN Values')
print(df.isin([np.nan, np.inf, -np.inf]).sum().sum())

print('Combined Raw Datafile Shape')
print(df.shape)

Number of Infinity or NaN Values
179219
Number of NaN/Inf Records Dropped:  92547
Remaining Infinity or NaN Values
0
Combined Raw Datafile Shape
(16140455, 80)


In [9]:
df.columns

Index(['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
      

In [10]:
df = df.drop('Timestamp', axis=1)

In [11]:
# Remove duplicate headers
df = df[~df['Dst Port'].str.contains('Dst Port', na=False)]

In [12]:
# Clean (spaces, special characters, etc.) column headers and lower case 
column_name_regex = re.compile(r"\W", re.IGNORECASE)
df.columns = [column_name_regex.sub('_', c.lower()) for c in df.columns]

In [13]:
print('Original Dataset Value Counts')
df['label'].value_counts()

Original Dataset Value Counts


Benign                      13393005
DDOS attack-HOIC              686012
DDoS attacks-LOIC-HTTP        576191
DoS attacks-Hulk              461912
Bot                           286191
FTP-BruteForce                193354
SSH-Bruteforce                187589
Infilteration                 161096
DoS attacks-SlowHTTPTest      139890
DoS attacks-GoldenEye          41508
DoS attacks-Slowloris          10990
DDOS attack-LOIC-UDP            1730
Brute Force -Web                 611
Brute Force -XSS                 230
SQL Injection                     87
Name: label, dtype: int64

In [19]:
print('Creating combined dataset after cleaning and preprocessing')
dataset_file_name = os.path.join(processed_path, 'CSE-CIC-IDS2018-CombinedDataset.csv')
print('writing:', dataset_file_name)
df.to_csv(dataset_file_name, index=False)
print('Finished writing: ', dataset_file_name)

Creating combined dataset after cleaning and preprocessing
writing: ../data/CSE-CIC-IDS2018/processed/CSE-CIC-IDS2018-CombinedDataset.csv
Finished writing:  ../data/CSE-CIC-IDS2018/processed/CSE-CIC-IDS2018-CombinedDataset.csv


In [35]:
print('Number of Infinity or NaN Values')
print(df.isin([np.nan, np.inf, -np.inf]).sum().sum())

Number of Infinity or NaN Values
0


#  Loading combined dataset from CSV file (auto inference of data types)

In [5]:
dataset_file_name = os.path.join(processed_path, 'CSE-CIC-IDS2018-CombinedDataset.csv')
print('reading:', dataset_file_name)
combined_dataset_df = pd.read_csv(dataset_file_name, low_memory=False)

reading: ../data/CSE-CIC-IDS2018/processed/CSE-CIC-IDS2018-CombinedDataset.csv


In [6]:
# print final shape
print('Combined Dataset Shape')
print(combined_dataset_df.shape)

Combined Dataset Shape
(16140396, 79)


In [7]:
print('Combined Dataset Value Counts')
combined_dataset_df['label'].value_counts()

Combined Dataset Value Counts


Benign                      13393005
DDOS attack-HOIC              686012
DDoS attacks-LOIC-HTTP        576191
DoS attacks-Hulk              461912
Bot                           286191
FTP-BruteForce                193354
SSH-Bruteforce                187589
Infilteration                 161096
DoS attacks-SlowHTTPTest      139890
DoS attacks-GoldenEye          41508
DoS attacks-Slowloris          10990
DDOS attack-LOIC-UDP            1730
Brute Force -Web                 611
Brute Force -XSS                 230
SQL Injection                     87
Name: label, dtype: int64

In [8]:
print('Number of Infinity or NaN Values')
print(combined_dataset_df.isin([np.nan, np.inf, -np.inf]).sum().sum())

Number of Infinity or NaN Values
6426


In [9]:
num_of_raw_records_combined = combined_dataset_df.shape[0]
print('Original Number of Records in combined dataset : ', num_of_raw_records_combined)

Original Number of Records in combined dataset :  16140396


In [10]:
# Replace infinity to NaN and drop NaN values
combined_dataset_df = combined_dataset_df.replace([np.inf, -np.inf], np.nan)
combined_dataset_df = combined_dataset_df.dropna()
combined_dataset_df = combined_dataset_df.reset_index(drop=True)

dropped_NaN_records_combined = num_of_raw_records_combined - combined_dataset_df.shape[0]
print('Number of NaN/Inf Records Dropped in combined dataset: ', dropped_NaN_records_combined)

# Check infinity and NaN values
print('Remaining Infinity or NaN Values')
print(combined_dataset_df.isin([np.nan, np.inf, -np.inf]).sum().sum())

print('Combined Dataset Shape')
print(combined_dataset_df.shape)

Number of NaN/Inf Records Dropped in combined dataset:  3213
Remaining Infinity or NaN Values
0
Combined Dataset Shape
(16137183, 79)


In [11]:
# Drop attack types that have less than 20K rows.
reduced_df = combined_dataset_df.groupby('label').filter(lambda x : len(x) > 20000)

In [12]:
print('Dataset Value Counts After Dropping Minimal Attacks')
reduced_df['label'].value_counts()

Dataset Value Counts After Dropping Minimal Attacks


Benign                      13390249
DDOS attack-HOIC              686012
DDoS attacks-LOIC-HTTP        576191
DoS attacks-Hulk              461912
Bot                           286191
FTP-BruteForce                193354
SSH-Bruteforce                187589
Infilteration                 160639
DoS attacks-SlowHTTPTest      139890
DoS attacks-GoldenEye          41508
Name: label, dtype: int64

# Train Test Split

In [13]:
# Split the dataset into test and train data
y = reduced_df.pop('label')
X = reduced_df

test_size = 0.10

# split into train test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify=y, shuffle=True, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

dftrain = X_train.join(y_train)
dftest = X_test.join(y_test)

(14511181, 78) (1612354, 78) (14511181,) (1612354,)


In [14]:
dftrain.shape

(14511181, 79)

In [15]:
dftest.shape

(1612354, 79)

In [16]:
dftest['label'].value_counts()

Benign                      1339025
DDOS attack-HOIC              68601
DDoS attacks-LOIC-HTTP        57619
DoS attacks-Hulk              46191
Bot                           28619
FTP-BruteForce                19336
SSH-Bruteforce                18759
Infilteration                 16064
DoS attacks-SlowHTTPTest      13989
DoS attacks-GoldenEye          4151
Name: label, dtype: int64

In [17]:
dftrain['label'].value_counts()

Benign                      12051224
DDOS attack-HOIC              617411
DDoS attacks-LOIC-HTTP        518572
DoS attacks-Hulk              415721
Bot                           257572
FTP-BruteForce                174018
SSH-Bruteforce                168830
Infilteration                 144575
DoS attacks-SlowHTTPTest      125901
DoS attacks-GoldenEye          37357
Name: label, dtype: int64

In [18]:
# Finding the null values.
print(dftrain.isin([np.nan, np.inf, -np.inf]).sum().sum())

0


In [19]:
for i in range(len(dftrain.columns)):
      print(dftrain.columns[i], '-', dftrain.dtypes[i])

dst_port - int64
protocol - int64
flow_duration - int64
tot_fwd_pkts - int64
tot_bwd_pkts - int64
totlen_fwd_pkts - float64
totlen_bwd_pkts - float64
fwd_pkt_len_max - float64
fwd_pkt_len_min - float64
fwd_pkt_len_mean - float64
fwd_pkt_len_std - float64
bwd_pkt_len_max - float64
bwd_pkt_len_min - float64
bwd_pkt_len_mean - float64
bwd_pkt_len_std - float64
flow_byts_s - float64
flow_pkts_s - float64
flow_iat_mean - float64
flow_iat_std - float64
flow_iat_max - float64
flow_iat_min - float64
fwd_iat_tot - float64
fwd_iat_mean - float64
fwd_iat_std - float64
fwd_iat_max - float64
fwd_iat_min - float64
bwd_iat_tot - float64
bwd_iat_mean - float64
bwd_iat_std - float64
bwd_iat_max - float64
bwd_iat_min - float64
fwd_psh_flags - int64
bwd_psh_flags - int64
fwd_urg_flags - int64
bwd_urg_flags - int64
fwd_header_len - int64
bwd_header_len - int64
fwd_pkts_s - float64
bwd_pkts_s - float64
pkt_len_min - float64
pkt_len_max - float64
pkt_len_mean - float64
pkt_len_std - float64
pkt_len_var - fl

# Bootstrapping data using SMOTE

In [20]:
# summarize distribution before resampling and augmentation of data
from collections import Counter

counter = Counter(y_train)
for k,v in counter.items():
    per = v / len(y_train) * 100
    print('label=%s, n=%d (%.3f%%)' % (k, v, per))

label=Benign, n=12051224 (83.048%)
label=DoS attacks-Hulk, n=415721 (2.865%)
label=DDOS attack-HOIC, n=617411 (4.255%)
label=DDoS attacks-LOIC-HTTP, n=518572 (3.574%)
label=Bot, n=257572 (1.775%)
label=DoS attacks-SlowHTTPTest, n=125901 (0.868%)
label=FTP-BruteForce, n=174018 (1.199%)
label=Infilteration, n=144575 (0.996%)
label=SSH-Bruteforce, n=168830 (1.163%)
label=DoS attacks-GoldenEye, n=37357 (0.257%)


In [21]:
most_frequent_label = dftrain['label'].value_counts().idxmax()
most_frequent_label

'Benign'

In [22]:
most_frequent_label_count = dftrain[dftrain['label'] == most_frequent_label].shape[0]
most_frequent_label_count

12051224

In [23]:
sampling_strategy_dict = {
    "DDOS attack-HOIC" : round(0.2*most_frequent_label_count),
    "DDoS attacks-LOIC-HTTP" : round(0.2*most_frequent_label_count),
    "DoS attacks-Hulk" : round(0.2*most_frequent_label_count),
    "Bot" : round(0.1*most_frequent_label_count),
    "FTP-BruteForce" : round(0.1*most_frequent_label_count),
    "SSH-Bruteforce" : round(0.1*most_frequent_label_count),
    "Infilteration" : round(0.1*most_frequent_label_count),
    "DoS attacks-SlowHTTPTest" : round(0.1*most_frequent_label_count),
    "DoS attacks-GoldenEye" : round(0.1*most_frequent_label_count)
}

In [24]:
# Upsample the minority class and randomly downsample the majority class.
over = SMOTE(sampling_strategy=sampling_strategy_dict)

In [25]:
# Fit the model to generate the data.
oversampled_trainX, oversampled_trainY = over.fit_resample(X_train, y_train)

In [26]:
#oversampled_trainY.value_counts()
# summarize distribution
from collections import Counter

counter = Counter(oversampled_trainY)
for k,v in counter.items():
    per = v / len(oversampled_trainY) * 100
    print('label=%s, n=%d (%.3f%%)' % (k, v, per))

label=Benign, n=12051224 (45.455%)
label=DoS attacks-Hulk, n=2410245 (9.091%)
label=DDOS attack-HOIC, n=2410245 (9.091%)
label=DDoS attacks-LOIC-HTTP, n=2410245 (9.091%)
label=Bot, n=1205122 (4.545%)
label=DoS attacks-SlowHTTPTest, n=1205122 (4.545%)
label=FTP-BruteForce, n=1205122 (4.545%)
label=Infilteration, n=1205122 (4.545%)
label=SSH-Bruteforce, n=1205122 (4.545%)
label=DoS attacks-GoldenEye, n=1205122 (4.545%)


In [27]:
sampling_strategy_dict_under = {
    "Benign": round(0.7*most_frequent_label_count)    
}
under = RandomUnderSampler(sampling_strategy=sampling_strategy_dict_under)
#steps = [('o', over), ('u', under)]
#pipeline = Pipeline(steps=steps)

In [28]:
# Fit the model to generate the data.
sampled_trainX, sampled_trainY = under.fit_resample(oversampled_trainX, oversampled_trainY)

In [29]:
#oversampled_trainY.value_counts()
# summarize distribution
from collections import Counter

counter = Counter(sampled_trainY)
for k,v in counter.items():
    per = v / len(sampled_trainY) * 100
    print('label=%s, n=%d (%.3f%%)' % (k, v, per))

label=Benign, n=8435857 (36.842%)
label=Bot, n=1205122 (5.263%)
label=DDOS attack-HOIC, n=2410245 (10.526%)
label=DDoS attacks-LOIC-HTTP, n=2410245 (10.526%)
label=DoS attacks-GoldenEye, n=1205122 (5.263%)
label=DoS attacks-Hulk, n=2410245 (10.526%)
label=DoS attacks-SlowHTTPTest, n=1205122 (5.263%)
label=FTP-BruteForce, n=1205122 (5.263%)
label=Infilteration, n=1205122 (5.263%)
label=SSH-Bruteforce, n=1205122 (5.263%)


In [30]:
sampled_train = pd.concat([pd.DataFrame(sampled_trainX), pd.DataFrame(sampled_trainY)], axis=1)
sampled_train.shape

(22897324, 79)

In [31]:
sampled_train.columns

Index(['dst_port', 'protocol', 'flow_duration', 'tot_fwd_pkts', 'tot_bwd_pkts',
       'totlen_fwd_pkts', 'totlen_bwd_pkts', 'fwd_pkt_len_max',
       'fwd_pkt_len_min', 'fwd_pkt_len_mean', 'fwd_pkt_len_std',
       'bwd_pkt_len_max', 'bwd_pkt_len_min', 'bwd_pkt_len_mean',
       'bwd_pkt_len_std', 'flow_byts_s', 'flow_pkts_s', 'flow_iat_mean',
       'flow_iat_std', 'flow_iat_max', 'flow_iat_min', 'fwd_iat_tot',
       'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max', 'fwd_iat_min',
       'bwd_iat_tot', 'bwd_iat_mean', 'bwd_iat_std', 'bwd_iat_max',
       'bwd_iat_min', 'fwd_psh_flags', 'bwd_psh_flags', 'fwd_urg_flags',
       'bwd_urg_flags', 'fwd_header_len', 'bwd_header_len', 'fwd_pkts_s',
       'bwd_pkts_s', 'pkt_len_min', 'pkt_len_max', 'pkt_len_mean',
       'pkt_len_std', 'pkt_len_var', 'fin_flag_cnt', 'syn_flag_cnt',
       'rst_flag_cnt', 'psh_flag_cnt', 'ack_flag_cnt', 'urg_flag_cnt',
       'cwe_flag_count', 'ece_flag_cnt', 'down_up_ratio', 'pkt_size_avg',
       'fwd_seg_siz

In [32]:
print('Creating sampled dataset after oversampling minority classes and undersampling majority classes')
sampled_dataset_file_name = os.path.join(processed_path, 'CSE-CIC-IDS2018-SampledDataset.csv')
print('writing:', sampled_dataset_file_name)
sampled_train.to_csv(sampled_dataset_file_name, index=False)
print('Finished writing: ', sampled_dataset_file_name)

Creating sampled dataset after oversampling minority classes and undersampling majority classes
writing: ../data/CSE-CIC-IDS2018/processed/CSE-CIC-IDS2018-SampledDataset.csv
Finished writing:  ../data/CSE-CIC-IDS2018/processed/CSE-CIC-IDS2018-SampledDataset.csv


In [33]:
print('Creating Multi-Class Test File')
test_file_name = os.path.join(processed_path, test_prefix + '-' + multi_class_file + '.csv')
dftest.to_csv(test_file_name, index=False)
print('Finished writing: ', test_file_name)

Creating Multi-Class Test File
Finished writing:  ../data/CSE-CIC-IDS2018/processed/TEST--DATA-IDS-2018-multiclass.csv


# Splitting Trainers

In [1]:
# Trainer 1: 
#     DDOS attack-HOIC + DoS attacks-GoldenEye + Brute Force -Web + SQL Injection: 617411 + 37357 + 550 + 78 = 655,396
# Trainer 2: 
#     DDoS attacks-LOIC-HTTP + Infilteration + Brute Force -XSS: 518572 + 144665 + 207 = 663,444
# Trainer 3:
#     DoS attacks-Hulk + FTP-BruteForce + DDOS attack-LOIC-UDP: 415721 + 174019 + 1557 = 591,297
# Trainer 4: 
#     Bot + DoS attacks-SlowHTTPTest + SSH-Bruteforce  + DoS attacks-Slowlori: 257572 + 168830 + 125901 + 9891 = 562,194

In [7]:
IDS_df = pd.read_csv("../data/CSE-CIC-IDS2018/processed/CSE-CIC-IDS2018-SampledDataset.csv", low_memory=False)
IDS_df['label'].value_counts()

Benign                      8435857
DoS attacks-Hulk            2410245
DDOS attack-HOIC            2410245
DDoS attacks-LOIC-HTTP      2410245
FTP-BruteForce              1205122
Bot                         1205122
DoS attacks-SlowHTTPTest    1205122
Infilteration               1205122
SSH-Bruteforce              1205122
DoS attacks-GoldenEye       1205122
Name: label, dtype: int64

In [8]:
train_1_labels = ['DDOS attack-HOIC', 'DoS attacks-GoldenEye']
train_2_labels = ['DDoS attacks-LOIC-HTTP', 'Infilteration' ]
train_3_labels = ['DoS attacks-Hulk', 'FTP-BruteForce' ]
train_4_labels = ['Bot', 'DoS attacks-SlowHTTPTest', 'SSH-Bruteforce']

In [9]:
df_train_1 = IDS_df[IDS_df.label.isin(train_1_labels)]
df_train_2 = IDS_df[IDS_df.label.isin(train_2_labels)]
df_train_3 = IDS_df[IDS_df.label.isin(train_3_labels)]
df_train_4 = IDS_df[IDS_df.label.isin(train_4_labels)]

In [10]:
print(f'Training set 1 shape (without benigns): {df_train_1.shape}')
print(f'Training set 2 shape (without benigns): {df_train_2.shape}')
print(f'Training set 3 shape (without benigns): {df_train_3.shape}')
print(f'Training set 4 shape (without benigns): {df_train_4.shape}')

Training set 1 shape (without benigns): (3615367, 79)
Training set 2 shape (without benigns): (3615367, 79)
Training set 3 shape (without benigns): (3615367, 79)
Training set 4 shape (without benigns): (3615366, 79)


## Splitting Benign Data Across Trainers

In [11]:
df_benign = IDS_df[IDS_df['label'] == 'Benign']

In [12]:
df_benign.shape

(8435857, 79)

In [13]:
df_benign = df_benign.sample(frac=1) # Shuffle data
df_benign_1, df_benign_2, df_benign_3, df_benign_4 = np.array_split(df_benign, 4)  

In [14]:
print(f'Benign set 1 shape: {df_benign_1.shape}')
print(f'Benign set 2 shape: {df_benign_2.shape}')
print(f'Benign set 3 shape: {df_benign_3.shape}')
print(f'Benign set 4 shape: {df_benign_4.shape}')

Benign set 1 shape: (2108965, 79)
Benign set 2 shape: (2108964, 79)
Benign set 3 shape: (2108964, 79)
Benign set 4 shape: (2108964, 79)


## Concatenating Dataframes

In [15]:
df_train_full_1 = pd.concat([df_train_1, df_benign_1])
df_train_full_2 = pd.concat([df_train_2, df_benign_2])
df_train_full_3 = pd.concat([df_train_3, df_benign_3])
df_train_full_4 = pd.concat([df_train_4, df_benign_4])

In [16]:
print(f'Full dataset 1 shape: {df_train_full_1.shape}')
print(f'Full dataset 2 shape: {df_train_full_2.shape}')
print(f'Full dataset 3 shape: {df_train_full_3.shape}')
print(f'Full dataset 4 shape: {df_train_full_4.shape}')

Full dataset 1 shape: (5724332, 79)
Full dataset 2 shape: (5724331, 79)
Full dataset 3 shape: (5724331, 79)
Full dataset 4 shape: (5724330, 79)


# Writing Files

In [59]:
print('Creating Multi-Class Test File')
test_file_name = os.path.join(processed_path, test_prefix + '-' + multi_class_file + '.csv')
# dftest = dftest.drop('timestamp', axis=1) # Drop timestamp column
dftest.drop('timestamp', axis=1).to_csv(test_file_name, index=False)
print('Finished writing: ', test_file_name)

Creating Multi-Class Test File
Finished writing:  ../data/processed/TEST--DATA-IDS-2018-multiclass.csv


In [20]:
df_train_dict = {
    '1': df_train_full_1,
    '2': df_train_full_2,
    '3': df_train_full_3,
    '4': df_train_full_4
}

In [21]:
bootstrap_processed_path = f'{processed_path}_bootstrap'

In [24]:
for i in df_train_dict.keys():
    print(f'Creating Multi-Class Oversampled File for Trainer {i}')
    train_file_name = os.path.join(bootstrap_processed_path, f'{train_prefix}{i}-{multi_class_file}-bootstrap.csv')
    df_train_dict[i].to_csv(train_file_name, index=False)
    print('Finished writing: ', train_file_name)

Creating Multi-Class Oversampled File for Trainer 1
Finished writing:  ../data/CSE-CIC-IDS2018/processed_bootstrap/TRAIN-1-DATA-IDS-2018-multiclass-bootstrap.csv
Creating Multi-Class Oversampled File for Trainer 2
Finished writing:  ../data/CSE-CIC-IDS2018/processed_bootstrap/TRAIN-2-DATA-IDS-2018-multiclass-bootstrap.csv
Creating Multi-Class Oversampled File for Trainer 3
Finished writing:  ../data/CSE-CIC-IDS2018/processed_bootstrap/TRAIN-3-DATA-IDS-2018-multiclass-bootstrap.csv
Creating Multi-Class Oversampled File for Trainer 4
Finished writing:  ../data/CSE-CIC-IDS2018/processed_bootstrap/TRAIN-4-DATA-IDS-2018-multiclass-bootstrap.csv
