In [None]:
from collections import OrderedDict

import numpy as np
import pandas as pd
from google.colab import data_table
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

In [None]:
# helper that renders tabular data with the google colab data_table tool
def visual(inp):
  """Wrap `inp` in a Colab `DataTable` and return it for display.

  `inp` is passed straight to `data_table.DataTable`; the cells in this
  notebook call it with pandas DataFrames.
  """
  # class-level setting, so it affects every DataTable created afterwards
  # (presumably the limit on displayed columns -- confirm against Colab docs)
  data_table.DataTable.max_columns = 100
  table = data_table.DataTable(inp)
  return table

In [None]:
# read the raw flow records from the csv file into a DataFrame
df = pd.read_csv(
    "/content/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    low_memory=False,
)
# (rows, columns) of the raw data
df.shape

(225745, 79)

In [None]:
# data description: summary statistics (count, mean, std, min, quartiles, max)
# of the columns covered by describe(), rendered as an interactive table
visual(df.describe())

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,...,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0
mean,8879.61946,16241650.0,4.874916,4.572775,939.463346,5960.477,538.535693,27.882221,164.826715,214.907242,...,3.311497,21.482753,184826.1,12934.36,208084.9,177620.1,10322140.0,3611943.0,12878130.0,7755355.0
std,19754.6474,31524370.0,15.422874,21.755356,3249.403484,39218.34,1864.128991,163.324159,504.892965,797.411073,...,12.270018,4.166799,797925.0,210273.7,900235.0,784260.2,21853030.0,12756890.0,26921260.0,19831090.0
min,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,80.0,71180.0,2.0,1.0,26.0,0.0,6.0,0.0,6.0,0.0,...,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,80.0,1452333.0,3.0,4.0,30.0,164.0,20.0,0.0,8.666667,5.301991,...,2.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,80.0,8805237.0,5.0,5.0,63.0,11601.0,34.0,6.0,32.0,10.263203,...,4.0,20.0,1878.0,0.0,1878.0,1862.0,8239725.0,0.0,8253838.0,7422849.0
max,65532.0,119999900.0,1932.0,2942.0,183012.0,5172346.0,11680.0,1472.0,3867.0,6692.644993,...,1931.0,52.0,100000000.0,39500000.0,100000000.0,100000000.0,120000000.0,65300000.0,120000000.0,120000000.0


In [None]:
# encode the string values of the label column as integers:
#   0 represents no attack (BENIGN)
#   1 represents a DDoS attack
non_numeric_label = dict(BENIGN=0, DDoS=1)
df[" Label"] = df[" Label"].map(non_numeric_label)
# class counts; dropna=False also shows values that did not map (NaN)
df[" Label"].value_counts(dropna=False)

1    128027
0     97718
Name:  Label, dtype: int64

In [None]:
# collect the column names in a list
columns = list(df.columns)

# print the maximum value of each column
for name in columns:
  print(f"{name} -> {df[name].max()}")


 Destination Port -> 65532
 Flow Duration -> 119999937
 Total Fwd Packets -> 1932
 Total Backward Packets -> 2942
Total Length of Fwd Packets -> 183012
 Total Length of Bwd Packets -> 5172346
 Fwd Packet Length Max -> 11680
 Fwd Packet Length Min -> 1472
 Fwd Packet Length Mean -> 3867.0
 Fwd Packet Length Std -> 6692.644993
Bwd Packet Length Max -> 11680
 Bwd Packet Length Min -> 1460
 Bwd Packet Length Mean -> 5800.5
 Bwd Packet Length Std -> 8194.660487
Flow Bytes/s -> inf
 Flow Packets/s -> inf
 Flow IAT Mean -> 107000000.0
 Flow IAT Std -> 69200000.0
 Flow IAT Max -> 120000000
 Flow IAT Min -> 107000000
Fwd IAT Total -> 120000000
 Fwd IAT Mean -> 120000000.0
 Fwd IAT Std -> 76700000.0
 Fwd IAT Max -> 120000000
 Fwd IAT Min -> 120000000
Bwd IAT Total -> 120000000
 Bwd IAT Mean -> 120000000.0
 Bwd IAT Std -> 76700000.0
 Bwd IAT Max -> 120000000
 Bwd IAT Min -> 120000000
Fwd PSH Flags -> 1
 Bwd PSH Flags -> 0
 Fwd URG Flags -> 0
 Bwd URG Flags -> 0
 Fwd Header Length -> 39396
 Bwd He

In [None]:
# put all ports and their usage in a dictionary
# key: port, value: usage (number of rows with that destination port)
# value_counts() does the counting in one vectorized pass instead of a
# Python loop over every row; rows with a missing port (if any) are not counted
port_counts = df[" Destination Port"].value_counts()

# keep the dictionary ordered by port number (ascending), not by usage
port_dicts = OrderedDict(sorted(port_counts.items()))

# find the highest usage count (0 when the data has no rows)
values = port_dicts.values()
max_used_port = max(values, default=0)

# report every port that reaches that count (ties are all printed)
for port, calls in port_dicts.items():
  if calls == max_used_port:
    print(f"Max used port is '{port}' with '{max_used_port}' calls")

Max used port is '80' with '136951' calls


In [None]:
# port 80 is the most used port: count its rows in total, attacked and not attacked
on_port_80 = df[' Destination Port'] == 80
is_attack = df[' Label'] == 1
is_benign = df[' Label'] == 0

# summing a boolean mask counts the rows where it is True
print(f"port 80, used: {on_port_80.sum()}")
print(f"port 80, attacked: {(on_port_80 & is_attack).sum()}")
print(f"port 80, not attacked: {(on_port_80 & is_benign).sum()}")

port 80, used: 136951
port 80, attacked: 128024
port 80, not attacked: 8927


In [None]:
# optimize the int type values: shrink every int64 column to the narrowest
# signed integer dtype (int8 / int16 / int32) that holds ALL of its values.
# pd.to_numeric(downcast='integer') checks the minimum as well as the maximum,
# so large negative values can no longer overflow, and columns that do not fit
# into int32 simply stay int64.
for i in columns:
  if df[i].dtype == 'int64':
    df[i] = pd.to_numeric(df[i], downcast='integer')

In [None]:
# count the null values of every column and print the counts in descending order
print(df.isnull().sum(axis=0).sort_values(ascending=False))

Flow Bytes/s             4
 Destination Port        0
 Bwd Avg Bytes/Bulk      0
 Fwd Avg Packets/Bulk    0
Fwd Avg Bytes/Bulk       0
                        ..
 Bwd IAT Mean            0
Bwd IAT Total            0
 Fwd IAT Min             0
 Fwd IAT Max             0
 Label                   0
Length: 79, dtype: int64


In [None]:
# drop the samples (rows) that have a null value in any of the listed columns;
# the main dataset is modified in place
df.dropna(
    subset=[
        'Flow Bytes/s',
        ' Label',
        ' Init_Win_bytes_backward',
        ' Fwd Avg Packets/Bulk',
        ' Fwd Avg Bulk Rate',
    ],
    inplace=True,
)

In [None]:
# print the per-column null counts in descending order again, to check the
# result of the dropna calls in the previous cell
print(df.isnull().sum(axis=0).sort_values(ascending=False))

 Destination Port        0
 ECE Flag Count          0
 Fwd Avg Packets/Bulk    0
Fwd Avg Bytes/Bulk       0
 Fwd Header Length.1     0
                        ..
Bwd IAT Total            0
 Fwd IAT Min             0
 Fwd IAT Max             0
 Fwd IAT Std             0
 Label                   0
Length: 79, dtype: int64


In [None]:
# summary statistics after the rows with null values were dropped
visual(df.describe())
# Flow Bytes/s and Flow Packets/s max values are infinity

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
count,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0,...,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0
mean,8879.294213,16241940.0,4.874972,4.572851,939.479993,5960.583,538.545235,27.882715,164.829636,214.91105,...,21.482513,184829.4,12934.59,208088.6,177623.2,10322330.0,3612007.0,12878360.0,7755493.0,0.567141
std,19754.491905,31524580.0,15.423004,21.755541,3249.429866,39218.68,1864.144128,163.325564,504.896961,797.417625,...,4.16639,797931.7,210275.5,900242.5,784266.8,21853180.0,12757000.0,26921450.0,19831240.0,0.495473
min,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,80.0,71183.0,2.0,1.0,26.0,0.0,6.0,0.0,6.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,80.0,1452362.0,3.0,4.0,30.0,164.0,20.0,0.0,8.666667,5.301991,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,80.0,8805245.0,5.0,5.0,63.0,11601.0,34.0,6.0,32.0,10.263203,...,20.0,1878.0,0.0,1878.0,1862.0,8239754.0,0.0,8253869.0,7422871.0,1.0
max,65532.0,119999900.0,1932.0,2942.0,183012.0,5172346.0,11680.0,1472.0,3867.0,6692.644993,...,52.0,100000000.0,39500000.0,100000000.0,100000000.0,120000000.0,65300000.0,120000000.0,120000000.0,1.0


In [None]:
# remove the two rate columns from the features in a single call
df = df.drop(columns=['Flow Bytes/s', ' Flow Packets/s'])

# refresh the list of column names
columns = list(df.columns)

In [None]:
# other than the ports column and the label, normalize all features between zero and one
scaler = MinMaxScaler(feature_range=(0, 1))

# the port is left as-is and the label is the target, not a feature: scaling it
# would turn a label column that only contains 1 into all zeros
not_scaled = {' Destination Port', ' Label'}
candidates = [
    i for i in df.select_dtypes(include='number').columns if i not in not_scaled
]

# MinMaxScaler raises ValueError on infinite values; such columns are still
# left unscaled, but they are reported instead of being skipped silently
has_inf = np.isinf(df[candidates]).any(axis=0)
skipped = [i for i in candidates if has_inf[i]]
to_scale = [i for i in candidates if not has_inf[i]]
if skipped:
  print(f"not scaled because of infinite values: {skipped}")

# min-max scaling works column by column, so one fit over all columns gives the
# same values as fitting each column separately
if to_scale and len(df) > 0:
  df[to_scale] = scaler.fit_transform(df[to_scale])

In [None]:
# delete the columns whose values are all zero (minimum and maximum both 0);
# they are collected first and dropped in one call, so the frame is copied once
zero_columns = [i for i in columns if df[i].min() == 0 and df[i].max() == 0]
df = df.drop(columns=zero_columns)

# update columns list so it matches the remaining columns
columns = df.columns.tolist()

In [None]:
# summary statistics after scaling and after dropping the all-zero columns
visual(df.describe())

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
count,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0,...,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0,225741.0
mean,8879.294213,0.13535,0.002007,0.001554,0.005133,0.001152,0.046108,0.018942,0.042625,0.032112,...,0.413125,0.001848,0.000327,0.002081,0.001776,0.086019,0.055314,0.10732,0.064629,0.567141
std,19754.491905,0.262705,0.007987,0.007395,0.017755,0.007582,0.159601,0.110955,0.130566,0.119148,...,0.080123,0.007979,0.005323,0.009002,0.007843,0.18211,0.19536,0.224345,0.16526,0.495473
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,80.0,0.000593,0.000518,0.00034,0.000142,0.0,0.000514,0.0,0.001552,0.0,...,0.384615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,80.0,0.012103,0.001036,0.00136,0.000164,3.2e-05,0.001712,0.0,0.002241,0.000792,...,0.384615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,80.0,0.073377,0.002071,0.0017,0.000344,0.002243,0.002911,0.004076,0.008275,0.001534,...,0.384615,1.9e-05,0.0,1.9e-05,1.9e-05,0.068665,0.0,0.068782,0.061857,1.0
max,65532.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# data shape (rows, columns) after the pre-processing part
df.shape

(225741, 67)

In [None]:
# save the preprocessed dataset as a csv file in the current working directory
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default; pass index=False if downstream readers should not see that column
df.to_csv('pre_processed_dataset.csv')