# Data Splitter
---

In [1]:
# TODO: define whether to use binary or multiclass classification.
# This flag is read by the cells below:
#   True  -> the multiclass dataset CSV and multiclass label-encoding JSON are loaded
#   False -> the binary dataset CSV and binary label-encoding JSON are loaded
multiclass = True

## Load Preprocessed Data

In [2]:
import pandas as pd

# Load the preprocessed dataset that matches the selected classification mode.
dataset_file = 'multiclass_min_max_combined.csv' if multiclass else 'binary_min_max_combined.csv'
df = pd.read_csv(f'../CICIDS2017/preprocessed/{dataset_file}')

# Only the last expression of a cell is displayed automatically, so the shape
# has to be printed explicitly; a bare `df.shape` here would be evaluated and
# silently discarded.
print(df.shape)
df.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.138552,5.083333e-07,5e-06,0.0,3.100775e-07,0.0,8.1e-05,0.00086,0.000337,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.000809,0.0004499583,0.0,3e-06,3.643411e-06,1.571432e-07,0.001894,0.020215,0.007911,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.015808,7.499999e-07,0.0,3e-06,0.0,9.153974e-09,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
3,0.00676,0.0006526416,2.3e-05,2.1e-05,0.0001494574,5.139956e-06,0.051289,0.0,0.054089,0.071124,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.000809,1.758333e-06,5e-06,7e-06,5.426357e-06,3.814156e-07,0.00141,0.015054,0.005891,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


## Split Data

In [3]:
# split data
from sklearn.model_selection import train_test_split

# Target: the ' Label' column (the leading space is part of the column name).
y = df[' Label']
# Features: every column except the target.
X = df.drop(' Label', axis=1)

### One-Hot Label Encoding

In [4]:
import json


def get_label_encoding():
    """Return the class names from the label-encoding JSON of the active mode.

    The multiclass or the binary mapping file is read depending on the
    module-level ``multiclass`` flag. The JSON object is looked up with the
    stringified ids "0" .. "n-1" (n = number of entries), so a mapping with
    other keys raises ``KeyError``.

    Returns:
        list: class names ordered by their numeric id.
    """
    encoding_path = (
        "../CICIDS2017/preprocessed/multiclass_label_encoding.json"
        if multiclass
        else "../CICIDS2017/preprocessed/binary_label_encoding.json"
    )
    with open(encoding_path, "r") as handle:
        id_to_name = json.load(handle)

    # Walk the ids in ascending numeric order; the JSON keys are strings.
    names_by_id = []
    for class_id in range(len(id_to_name)):
        names_by_id.append(id_to_name[str(class_id)])
    return names_by_id

In [8]:
# def set_single_class_encoding():
#     y.columns = ["BENIGN", "MALICIOUS"]

# def set_multi_class_encoding():
#     import json
#     with open('../CICIDS2017/preprocessed/multiclass_label_encoding.json') as f:
#         label_encoding = json.load(f)
#         y.columns = [label_encoding[str(col)] for col in y.columns]

In [6]:
# One-hot-encode the labels, then split features and labels into train/test sets.
print("Label One-Hot-Encoding....")
label_encoding = get_label_encoding()

# Encode straight from df rather than from y, so re-running this cell always
# starts from the raw label column instead of an already encoded frame.
y = pd.get_dummies(df[' Label'])

# Every class in the encoding file must have produced a column; fail loudly
# (as the positional column assignment used to) when the data disagrees.
if len(y.columns) != len(label_encoding):
    raise ValueError(
        f"Expected {len(label_encoding)} label columns, got {len(y.columns)}: {list(y.columns)}"
    )

# Name each indicator column through its own class id instead of by position,
# so an unexpected column order cannot silently attach the wrong class name.
y.columns = [label_encoding[int(class_id)] for class_id in y.columns]
print(y[:2])

print("Split Data....")
# 25 % of the rows go to the test set; random_state pins the shuffle.
# NOTE(review): no stratify= is passed, so rare classes are not guaranteed
# proportional representation in both splits - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print(f"Train Data: X: {X_train.shape}, y: {y_train.shape}")
print(f"Test Data: X: {X_test.shape}, y: {y_test.shape}")

Label One-Hot-Encoding....
   BENIGN    Bot   DDoS  DoS GoldenEye  DoS Hulk  DoS Slowhttptest  \
0    True  False  False          False     False             False   
1    True  False  False          False     False             False   

   DoS Slowloris  FTP-Patator  Heartbleed  Infiltration  PortScan  \
0          False        False       False         False     False   
1          False        False       False         False     False   

   SSH-Patator  Web Attack Brute Force  Web Attack Sql Injection  \
0        False                   False                     False   
1        False                   False                     False   

   Web Attack XSS  
0           False  
1           False  
Split Data....
Train Data: X: (2120907, 70), y: (2120907, 15)
Test Data: X: (706969, 70), y: (706969, 15)


## Store Train and Test Data

In [10]:
# Persist the full train/test split as CSV files (row index is not written).
print("Storing Train and Test Split.....")

# The label files carry the classification mode in their name. Deriving it from
# the flag keeps a binary run from being written to the *_multiclass.csv files.
label_kind = "multiclass" if multiclass else "binary"

X_train.to_csv('../CICIDS2017/train_test_split/X_train.csv', index=False)
print("Stored X_train")
X_test.to_csv('../CICIDS2017/train_test_split/X_test.csv', index=False)
print("Stored X_test")
y_train.to_csv(f'../CICIDS2017/train_test_split/y_train_{label_kind}.csv', index=False)
print("Stored y_train")
y_test.to_csv(f'../CICIDS2017/train_test_split/y_test_{label_kind}.csv', index=False)
print("Stored y_test")
print("Done!")

Storing Train and Test Split.....
Stored X_train
Stored X_test
Stored y_train
Stored y_test
Done!


## Store Small Train and Test Data

In [5]:
# Small subset: the first 2000 training rows and the first 500 test rows
# (500 / 2000 = 0.25 test rows per training row).
print("Storing Small Train and Test Split.....")
for split_name, frame, n_rows in (
    ("X_train", X_train, 2000),
    ("X_test", X_test, 500),
    ("y_train", y_train, 2000),
    ("y_test", y_test, 500),
):
    # Slice first, then write without the row index.
    frame[:n_rows].to_csv(f'../CICIDS2017/train_test_split/{split_name}_small.csv', index=False)
    print(f"Stored {split_name}_small")
print("Done!")

Storing Small Train and Test Split.....
Stored X_train_small
Stored X_test_small
Stored y_train_small
Stored y_test_small
Done!


## Store Train and Test Data for Prototype

In [4]:
# Prototype subset: the first 120000 training rows and the first 30000 test rows.
print("Storing Train and Test Split for Prototyping.....")
for split_name, frame, n_rows in (
    ("X_train", X_train, 120000),
    ("X_test", X_test, 30000),
    ("y_train", y_train, 120000),
    ("y_test", y_test, 30000),
):
    # Slice first, then write without the row index.
    frame[:n_rows].to_csv(f'../CICIDS2017/train_test_split/{split_name}_proto.csv', index=False)
    print(f"Stored {split_name}_proto")
print("Done!")

Storing Train and Test Split for Prototyping.....
Stored X_train_proto
Stored X_test_proto
Stored y_train_proto
Stored y_test_proto
Done!
