# Data Preprocessing

### On the CICIDS2017 Dataset
---------------------------------------------

## Combine all Data Files

In [7]:
import pandas as pd
import os

# Combine all CICIDS2017 CSV files into a single DataFrame.
path = '../data/CICIDS2017/'

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the combined frame reproducible across machines and filesystems.
csv_files = sorted(name for name in os.listdir(path) if name.endswith('.csv'))

# Read every file first and concatenate once at the end. Calling pd.concat
# inside the loop re-copies all previously loaded rows on every iteration.
frames = []
for file in csv_files:
    print(file)
    df = pd.read_csv(os.path.join(path, file))
    frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory contains no CSV files (same result the incremental version gave).
combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

combined_df.head()

Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Friday-WorkingHours-Morning.pcap_ISCX.csv
Monday-WorkingHours.pcap_ISCX.csv
Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Tuesday-WorkingHours.pcap_ISCX.csv
Wednesday-workingHours.pcap_ISCX.csv


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [8]:
# Show how many rows each raw label has, followed by the frame's (rows, columns).
# Note: the label column name in this dataset carries a leading space.
print(combined_df[' Label'].value_counts(), f"Shape:  {combined_df.shape}", sep='\n')

 Label
BENIGN                        2273097
DoS Hulk                       231073
PortScan                       158930
DDoS                           128027
DoS GoldenEye                   10293
FTP-Patator                      7938
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1966
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64
Shape:  (2830743, 79)


## Remove NaN and Infinity Values

In [9]:
import numpy as np

# Remove rows with NaN values. A new frame is bound instead of mutating in
# place, so any other reference to the loaded data is left untouched.
combined_df = combined_df.dropna()

# Flag rows that contain +inf or -inf in any column.
inf_rows = combined_df.isin([np.inf, -np.inf]).any(axis=1)
print(f"Number of rows with infinity values: {inf_rows.sum()}")

# Drop the flagged rows and renumber the index 0..n-1.
# Without the reset the index keeps gaps where rows were removed. Later steps
# rebuild scaled feature frames with a fresh 0..n-1 index and join them to the
# labels with pd.concat(axis=1), which aligns on index labels: a gapped index
# would pair features with the wrong labels and introduce NaN rows.
combined_df = combined_df[~inf_rows].reset_index(drop=True)

# Check whether any NaN or Infinity values remain.
print("Removing NaN and Infinity values....")
print(f"Remaining NaN Values: {combined_df.isnull().sum()}")
print(f"Remaining Infinity values: {combined_df.isin([np.inf, -np.inf]).any(axis=1).sum()}")
print('Shape after removing NaN and Infinity values: ', combined_df.shape)


Number of rows with infinity values: 1509
Removing NaN and Infinity values....
Remaining NaN Values:  Destination Port              0
 Flow Duration                 0
 Total Fwd Packets             0
 Total Backward Packets        0
Total Length of Fwd Packets    0
                              ..
Idle Mean                      0
 Idle Std                      0
 Idle Max                      0
 Idle Min                      0
 Label                         0
Length: 79, dtype: int64
Remaining Infinity values: 0
Shape after removing NaN and Infinity values:  (2827876, 79)


## Symbolic Feature Encoding

In [10]:
from sklearn.preprocessing import LabelEncoder

# Check which columns hold symbolic (object-typed) values.
print("Checking for symbolic values....")
symbolic_values = combined_df.select_dtypes(include=['object']).columns
print(f"Symbolic values: {symbolic_values}")

# Split the label column from the features.
# drop() without inplace returns a new frame, so combined_df keeps its
# ' Label' column and this cell can be re-run without a KeyError. The previous
# version aliased feature_df to combined_df and dropped the column in place,
# which silently removed it from combined_df as well.
label_df = combined_df[' Label']
feature_df = combined_df.drop(columns=[' Label'])

# Binary encoding: 0 for 'BENIGN', 1 for every other label.
# Vectorised comparison instead of a per-row Python lambda.
binary_label_df = (label_df != 'BENIGN').astype('int64')
print(f"Binary Label: {binary_label_df.value_counts()}")

# Multiclass encoding: LabelEncoder maps each distinct label to an integer
# code; the mapping is available afterwards as label_encoder.classes_.
label_encoder = LabelEncoder()
multiclass_label_df = label_encoder.fit_transform(label_df)
print(f"Multiclass Label: {label_encoder.classes_}")

# Print the encoded label values and how often each occurs.
unique_labels, counts = np.unique(multiclass_label_df, return_counts=True)
print(f"Unique Labels: {unique_labels}")
print(f"Unique Labels Counts: {counts}")


Checking for symbolic values....
Symbolic values: Index([' Label'], dtype='object')
Binary Label:  Label
0    2271320
1     556556
Name: count, dtype: int64
Multiclass Label: ['BENIGN' 'Bot' 'DDoS' 'DoS GoldenEye' 'DoS Hulk' 'DoS Slowhttptest'
 'DoS slowloris' 'FTP-Patator' 'Heartbleed' 'Infiltration' 'PortScan'
 'SSH-Patator' 'Web Attack � Brute Force' 'Web Attack � Sql Injection'
 'Web Attack � XSS']
Unique Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
Unique Labels Counts: [2271320    1956  128025   10293  230124    5499    5796    7935      11
      36  158804    5897    1507      21     652]


## Normalization

In [11]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Produce three variants of the feature matrix: min-max scaled, standardised,
# and unscaled. Each scaler is fitted on the full feature frame, and its array
# output is wrapped back into a DataFrame carrying the original column names.
scaler = StandardScaler()
min_max_scaler = MinMaxScaler()

# Variant 1: min-max scaling
min_max_feature_df = pd.DataFrame(
    min_max_scaler.fit_transform(feature_df),
    columns=feature_df.columns,
)
print("Min-Max Normalization")
print(min_max_feature_df.head())

# Variant 2: standard scaling
standard_feature_df = pd.DataFrame(
    scaler.fit_transform(feature_df),
    columns=feature_df.columns,
)
print("Standard Normalization")
print(standard_feature_df.head())

# Variant 3: features left as they are (same object as feature_df, not a copy)
print("No Normalization")
no_norm_df = feature_df
print(no_norm_df.head())


Min-Max Normalization
    Destination Port   Flow Duration   Total Fwd Packets  \
0           0.837186    1.333333e-07            0.000005   
1           0.840070    1.016667e-06            0.000000   
2           0.840085    5.416666e-07            0.000000   
3           0.705516    3.916666e-07            0.000000   
4           0.837156    1.333333e-07            0.000005   

    Total Backward Packets  Total Length of Fwd Packets  \
0                 0.000000                 9.302326e-07   
1                 0.000003                 4.651163e-07   
2                 0.000003                 4.651163e-07   
3                 0.000003                 4.651163e-07   
4                 0.000000                 9.302326e-07   

    Total Length of Bwd Packets   Fwd Packet Length Max  \
0                  0.000000e+00                0.000242   
1                  9.153974e-09                0.000242   
2                  9.153974e-09                0.000242   
3                  9.15397

## Store Data

In [12]:
def _attach_labels(features, labels):
    """Return `features` with `labels` appended as the last column, paired by row position.

    pd.concat(axis=1) aligns its inputs on index labels. The scaled feature
    frames carry a fresh 0..n-1 index, while the unscaled features and the
    binary labels may still carry the index left over from row filtering, so a
    plain concat can pair features with the wrong label and add NaN rows.
    Resetting both indexes makes the pairing purely positional.

    Raises:
        ValueError: if `features` and `labels` differ in length, since a
            positional pairing would then be meaningless.
    """
    if len(features) != len(labels):
        raise ValueError(
            f"features ({len(features)} rows) and labels ({len(labels)} rows) differ in length"
        )
    return pd.concat(
        [features.reset_index(drop=True), labels.reset_index(drop=True)],
        axis=1,
    )


# The multiclass labels are a plain array; wrap them once so they can be
# concatenated. The Series is unnamed, so the label column is called 0 in the
# multiclass frames (unchanged from before, so existing readers keep working).
multiclass_label_series = pd.Series(multiclass_label_df)

# Combine features and labels for every (label type, normalization) pair.
binary_min_max_combined_df = _attach_labels(min_max_feature_df, binary_label_df)
print("Binary Min-Max Combined")
print(binary_min_max_combined_df.head(1))
binary_standard_combined_df = _attach_labels(standard_feature_df, binary_label_df)
binary_no_norm_combined_df = _attach_labels(no_norm_df, binary_label_df)
multiclass_min_max_combined_df = _attach_labels(min_max_feature_df, multiclass_label_series)
multiclass_standard_combined_df = _attach_labels(standard_feature_df, multiclass_label_series)
multiclass_no_norm_combined_df = _attach_labels(no_norm_df, multiclass_label_series)

# Store the dataframes as CSV files, one per variant.
output_dir = '../data/preprocessed/'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
frames_to_store = {
    'binary_min_max_combined': binary_min_max_combined_df,
    'binary_standard_combined': binary_standard_combined_df,
    'binary_no_norm_combined': binary_no_norm_combined_df,
    'multiclass_min_max_combined': multiclass_min_max_combined_df,
    'multiclass_standard_combined': multiclass_standard_combined_df,
    'multiclass_no_norm_combined': multiclass_no_norm_combined_df,
}
for file_stem, frame in frames_to_store.items():
    frame.to_csv(output_dir + file_stem + '.csv', index=False)
print("Dataframes stored to csv....")

Binary Min-Max Combined
    Destination Port   Flow Duration   Total Fwd Packets  \
0           0.837186    1.333333e-07            0.000005   

    Total Backward Packets  Total Length of Fwd Packets  \
0                      0.0                 9.302326e-07   

    Total Length of Bwd Packets   Fwd Packet Length Max  \
0                           0.0                0.000242   

    Fwd Packet Length Min   Fwd Packet Length Mean   Fwd Packet Length Std  \
0                0.002581                  0.00101                     0.0   

   ...   min_seg_size_forward  Active Mean   Active Std   Active Max  \
0  ...                    1.0          0.0          0.0          0.0   

    Active Min  Idle Mean   Idle Std   Idle Max   Idle Min   Label  
0          0.0        0.0        0.0        0.0        0.0     0.0  

[1 rows x 79 columns]
Dataframes stored to csv....
