# Data Preprocessing

### On the CICIDS2017 Dataset
---------------------------------------------

## Combine all Data Files

In [2]:
import pandas as pd
import os

# combine all CICIDS2017 files
path = '../CICIDS2017/raw/'
combined_df = pd.DataFrame()
for file in os.listdir(path):
    if file.endswith('.csv'):
        print(file)
        df = pd.read_csv(path + file)
        combined_df = pd.concat([combined_df, df], ignore_index=True)

    

combined_df.head()

Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Friday-WorkingHours-Morning.pcap_ISCX.csv
Monday-WorkingHours.pcap_ISCX.csv
Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Tuesday-WorkingHours.pcap_ISCX.csv
Wednesday-workingHours.pcap_ISCX.csv


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [2]:
print(combined_df[' Label'].value_counts())
print('Shape: ', combined_df.shape)

df = combined_df.copy()

 Label
BENIGN                        2273097
DoS Hulk                       231073
PortScan                       158930
DDoS                           128027
DoS GoldenEye                   10293
FTP-Patator                      7938
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1966
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64
Shape:  (2830743, 79)


## Remove NaN and Infinity Values

In [3]:
import numpy as np

# print number of rows with NaN values
print("Number of rows with NaN values: ", df.isnull().sum().sum())

# print number of rows with Infinity values
inf_rows = df.isin([np.inf, -np.inf]).any(axis=1)
print(f"Number of rows with Infinity values: {inf_rows.sum()}")


print("Removing NaN and Infinity values....")
# remove NaN values
df.dropna(inplace=True)
# remove Infinity values
df = df[~inf_rows]


# check if there are still NaN or Infinity values
print(f"Remaining NaN Values: {df.isnull().sum().sum()}")
print(f"Remaining Infinity values: {df.isin([np.inf, -np.inf]).any(axis=1).sum()}")
print('Shape after removing NaN and Infinity values: ', df.shape)


Number of rows with NaN values:  1358
Number of rows with Infinity values: 2867
Removing NaN and Infinity values....


  df = df[~inf_rows]


Remaining NaN Values: 0
Remaining Infinity values: 0
Shape after removing NaN and Infinity values:  (2827876, 79)


## Shuffle Data

In [4]:
# shuffle data
df = df.sample(frac=1, random_state=187).reset_index(drop=True)
print("Shuffled data:")
print(df.head(2))


Shuffled data:
    Destination Port   Flow Duration   Total Fwd Packets  \
0                 80        63095538                   7   
1              51235             159                   1   

    Total Backward Packets  Total Length of Fwd Packets  \
0                        0                            0   
1                        1                            0   

    Total Length of Bwd Packets   Fwd Packet Length Max  \
0                             0                       0   
1                             0                       0   

    Fwd Packet Length Min   Fwd Packet Length Mean   Fwd Packet Length Std  \
0                       0                      0.0                     0.0   
1                       0                      0.0                     0.0   

   ...   min_seg_size_forward  Active Mean   Active Std   Active Max  \
0  ...                     40    7008270.0          0.0      7008270   
1  ...                     32          0.0          0.0            0 

## Split Label & Features

In [5]:
feature_df = df.drop(columns=' Label')
label_df = df[' Label']

## Symbolic Feature Encoding

In [6]:
# check for symbolic values
print("Checking for symbolic values....")
symbolic_values = df.select_dtypes(include=['object']).columns
print(f"Symbolic values: {symbolic_values}")

Checking for symbolic values....
Symbolic values: Index([' Label'], dtype='object')


In [7]:
from sklearn.preprocessing import LabelEncoder

# binary label encoding 
print("Binary label encoding....")
binary_label_df = label_df.apply(lambda x: 0 if x == 'BENIGN' else 1)
print(binary_label_df.value_counts())
print(binary_label_df.shape)

# multi-class label encoding
print("Multi-class label encoding....")
label_encoder = LabelEncoder()
multiclass_label_df = pd.DataFrame()
multiclass_label_df[' Label'] = label_encoder.fit_transform(label_df)
print(multiclass_label_df.value_counts())
print(multiclass_label_df.shape)

Binary label encoding....
 Label
0    2271320
1     556556
Name: count, dtype: int64
(2827876,)
Multi-class label encoding....
 Label
0         2271320
4          230124
10         158804
2          128025
3           10293
7            7935
11           5897
6            5796
5            5499
1            1956
12           1507
14            652
9              36
13             21
8              11
Name: count, dtype: int64
(2827876, 1)


## Remove Feature Columns with only 0 values

In [8]:
# check for columns which contain only 0 values
print("Columns which contain only 0 values....")
zero_columns = feature_df.columns[(feature_df.sum() == 0)]
print(f"Zero Columns: {zero_columns}")
# drop columns with only 0 values
feature_df.drop(columns=zero_columns, inplace=True)
print("Dropped Zero Columns....")
print(feature_df.shape)

Columns which contain only 0 values....
Zero Columns: Index([' Bwd PSH Flags', ' Bwd URG Flags', 'Fwd Avg Bytes/Bulk',
       ' Fwd Avg Packets/Bulk', ' Fwd Avg Bulk Rate', ' Bwd Avg Bytes/Bulk',
       ' Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate'],
      dtype='object')
Dropped Zero Columns....
(2827876, 70)


In [10]:
# print out all feature names
print(feature_df.columns)

Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Fwd URG Flags',
       ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s',
       ' Bwd Packets/s', ' Min Packet Length', ' Max Packet Length',
       ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance',
       'FIN Flag Count', ' SYN Flag Count', 

## Normalization

### Min-Max, Standard and No-Norm Normalization

In [9]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# min-max normalization
print("Min-Max Normalization....")
min_max_scaler = MinMaxScaler()
min_max_norm_feature_df = pd.DataFrame(min_max_scaler.fit_transform(feature_df), columns=feature_df.columns, index=df.index)
print(min_max_norm_feature_df.shape)
print(min_max_norm_feature_df.head(2))

# standardization without label column
print("Standardization....")
scaler = StandardScaler()
standard_feature_df = pd.DataFrame(scaler.fit_transform(feature_df), columns=feature_df.columns, index=df.index)
print(standard_feature_df.shape)
print(standard_feature_df.head(2))

# no normalization
print("No Normalization....")
no_norm_feature_df = feature_df.copy()
print(no_norm_feature_df.shape)
print(no_norm_feature_df.head(2))

Min-Max Normalization....
(2827876, 70)
    Destination Port   Flow Duration   Total Fwd Packets  \
0           0.001221        0.525796            0.000027   
1           0.781796        0.000001            0.000000   

    Total Backward Packets  Total Length of Fwd Packets  \
0                 0.000000                          0.0   
1                 0.000003                          0.0   

    Total Length of Bwd Packets   Fwd Packet Length Max  \
0                           0.0                     0.0   
1                           0.0                     0.0   

    Fwd Packet Length Min   Fwd Packet Length Mean   Fwd Packet Length Std  \
0                     0.0                      0.0                     0.0   
1                     0.0                      0.0                     0.0   

   ...   act_data_pkt_fwd   min_seg_size_forward  Active Mean   Active Std  \
0  ...                0.0                    1.0     0.063712          0.0   
1  ...                0.0       

## Store Data

In [9]:
# combine features and labels
binary_min_max_combined_df = pd.concat([min_max_norm_feature_df, binary_label_df], axis=1)
print(f"Binary Min-Max Combined : {binary_min_max_combined_df.shape}")
print(binary_min_max_combined_df.iloc[:, [0, -1]].head(2))
binary_standard_combined_df = pd.concat([standard_feature_df, binary_label_df], axis=1)
print(f"Binary Standard Combined : {binary_standard_combined_df.shape}")
print(binary_standard_combined_df.iloc[:, [0, -1]].head(2))
binary_no_norm_combined_df = pd.concat([no_norm_feature_df, binary_label_df], axis=1)
print(f"Binary No Norm Combined : {binary_no_norm_combined_df.shape}")
print(binary_no_norm_combined_df.iloc[:, [0, -1]].head(2))
multiclass_min_max_combined_df = pd.concat([min_max_norm_feature_df, multiclass_label_df], axis=1)
print(f"Multiclass Min-Max Combined : {multiclass_min_max_combined_df.shape}")
print(multiclass_min_max_combined_df.iloc[:, [0, -1]].head(2))
multiclass_standard_combined_df = pd.concat([standard_feature_df, multiclass_label_df], axis=1)
print(f"Multiclass Standard Combined : {multiclass_standard_combined_df.shape}")
print(multiclass_standard_combined_df.iloc[:, [0, -1]].head(2))
multiclass_no_norm_combined_df = pd.concat([no_norm_feature_df, multiclass_label_df], axis=1)
print(f"Multiclass No Norm Combined : {multiclass_no_norm_combined_df.shape}")
print(multiclass_no_norm_combined_df.iloc[:, [0, -1]].head(2))


# store dataframes to csv
print("Store Dataframes to csv....")
binary_min_max_combined_df.to_csv('../CICIDS2017/preprocessed/binary_min_max_combined.csv', index=False)
print("... binary_min_max_combined.csv")
binary_standard_combined_df.to_csv('../CICIDS2017/preprocessed/binary_standard_combined.csv', index=False)
print("... binary_standard_combined.csv")
binary_no_norm_combined_df.to_csv('../CICIDS2017/preprocessed/binary_no_norm_combined.csv', index=False)
print("... binary_no_norm_combined.csv")
multiclass_min_max_combined_df.to_csv('../CICIDS2017/preprocessed/multiclass_min_max_combined.csv', index=False)
print("... multiclass_min_max_combined.csv")
multiclass_standard_combined_df.to_csv('../CICIDS2017/preprocessed/multiclass_standard_combined.csv', index=False)
print("... multiclass_standard_combined.csv")
multiclass_no_norm_combined_df.to_csv('../CICIDS2017/preprocessed/multiclass_no_norm_combined.csv', index=False)
print("... multiclass_no_norm_combined.csv")

Binary Min-Max Combined : (2827876, 71)
   Destination Port  Label
0          0.837186      0
1          0.840070      0
Binary Standard Combined : (2827876, 71)
   Destination Port  Label
0          2.561161      0
1          2.571503      0
Binary No Norm Combined : (2827876, 71)
   Destination Port  Label
0             54865      0
1             55054      0
Multiclass Min-Max Combined : (2830743, 71)
   Destination Port  Label
0          0.837186    0.0
1          0.840070    0.0
Multiclass Standard Combined : (2830743, 71)
   Destination Port  Label
0          2.561161    0.0
1          2.571503    0.0
Multiclass No Norm Combined : (2830743, 71)
   Destination Port  Label
0           54865.0    0.0
1           55054.0    0.0
Store Dataframes to csv....
... binary_min_max_combined.csv
... binary_standard_combined.csv
... binary_no_norm_combined.csv
... multiclass_min_max_combined.csv
... multiclass_standard_combined.csv
... multiclass_no_norm_combined.csv


In [60]:
print("Storing additional Data....")

# store min-max values
min = min_max_scaler.data_min_
max = min_max_scaler.data_max_
min_max_df = pd.DataFrame({'min': min, 'max': max}, index=feature_df.columns)
min_max_df.to_csv('../CICIDS2017/preprocessed/min_max_values.csv')
print("... stored min_max_values.csv")

# store feature types
feature_types = feature_df.dtypes.to_numpy()
np.save('../CICIDS2017/preprocessed/feature_types.npy', feature_types)
print("... stored feature_types.npy")

Storing Min-Max values to csv....
... stored min_max_values.csv
