# Data Splitter
---

## Load Preprocessed Data

In [1]:
from pathlib import Path

import pandas as pd

# Load the preprocessed CICIDS2017 CSV into a DataFrame.
df = pd.read_csv('../CICIDS2017/preprocessed/binary_min_max_combined.csv')
# A notebook cell only renders its *last* expression, so a bare `df.shape`
# in the middle of the cell is silently discarded. Print it explicitly.
print(df.shape)
df.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.837186,1.333333e-07,5e-06,0.0,9.302326e-07,0.0,0.000242,0.002581,0.00101,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.84007,1.016667e-06,0.0,3e-06,4.651163e-07,9.153974e-09,0.000242,0.002581,0.00101,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.840085,5.416666e-07,0.0,3e-06,4.651163e-07,9.153974e-09,0.000242,0.002581,0.00101,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.705516,3.916666e-07,0.0,3e-06,4.651163e-07,9.153974e-09,0.000242,0.002581,0.00101,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.837156,1.333333e-07,5e-06,0.0,9.302326e-07,0.0,0.000242,0.002581,0.00101,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


## Split Data

In [3]:
# Separate the target column from the feature columns.
from sklearn.model_selection import train_test_split

# The label column name carries a leading space (' Label').
y = df[' Label']
X = df.drop(' Label', axis=1)

### One-Hot Label Encoding and Train/Test Split

In [4]:
# Expand the label into one indicator column per class and name the two
# resulting columns (get_dummies orders them by sorted label value).
y = pd.get_dummies(y).set_axis(["BENIGN", "MALICIOUS"], axis=1)
print("Label One-Hot-Encoding....")
print(y.head(2))

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
print("Split Data....")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
print(f"Train Data: X: {X_train.shape}, y: {y_train.shape}")
print(f"Test Data: X: {X_test.shape}, y: {y_test.shape}")

Label One-Hot-Encoding....
   BENIGN  MALICIOUS
0    True      False
1    True      False
Split Data....
Train Data: X: (2120907, 70), y: (2120907, 2)
Test Data: X: (706969, 70), y: (706969, 2)


## Store Train and Test Data

In [11]:
# Persist the full train/test split as four CSV files (no index column).
split_dir = Path('../CICIDS2017/train_test_split')
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op if it is already there).
split_dir.mkdir(parents=True, exist_ok=True)

print("Storing Train and Test Split.....")
# Write order matches the original cell: X_train, X_test, y_train, y_test.
for split_name, split_frame in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    split_frame.to_csv(split_dir / f"{split_name}.csv", index=False)
    print(f"Stored {split_name}")
print("Done!")

## Store Small Train and Test Data

In [5]:
# Persist a small subset of the split for quick experiments.
# 500 test rows per 2000 train rows, i.e. a test:train ratio of 0.25
# (test rows are 20% of the small set as a whole).
N_TRAIN_SMALL = 2000
N_TEST_SMALL = 500

small_split_dir = Path('../CICIDS2017/train_test_split')
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op if it is already there).
small_split_dir.mkdir(parents=True, exist_ok=True)

print("Storing Small Train and Test Split.....")
# .iloc[:n] takes the first n rows by position, so X and y subsets stay aligned.
for small_name, small_frame in (
    ("X_train_small", X_train.iloc[:N_TRAIN_SMALL]),
    ("X_test_small", X_test.iloc[:N_TEST_SMALL]),
    ("y_train_small", y_train.iloc[:N_TRAIN_SMALL]),
    ("y_test_small", y_test.iloc[:N_TEST_SMALL]),
):
    small_frame.to_csv(small_split_dir / f"{small_name}.csv", index=False)
    print(f"Stored {small_name}")
print("Done!")

Storing Small Train and Test Split.....
Stored X_train_small
Stored X_test_small
Stored y_train_small
Stored y_test_small
Done!
