In [1]:
# Step 1: imports and notebook-wide setup.
import warnings
# Silence every warning category for the rest of the notebook.
# NOTE(review): this also hides pandas / scikit-learn deprecation notices -
# consider narrowing the filter to the specific warnings that are expected.
warnings.filterwarnings("ignore")

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_recall_fscore_support
from sklearn.metrics import f1_score,roc_auc_score
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the raw CICIDS flow records, then start the wall-clock timer
# (the CSV load itself is therefore not part of the measured time).
df = pd.read_csv(filepath_or_buffer='./CICIDS-7-flows.csv')
start_time = time.time()

In [3]:
# Preview the loaded frame (the notebook renders its first and last rows).
df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,65511209.0,6.0,6.0,288.0,288.0,48.0,48.0,48.000000,0.00000,48.0,...,20.0,1506210.0,0.000000e+00,1506210.0,1506210.0,64004884.00,0.000000e+00,64004884.0,64004884.0,BENIGN
1,67037196.0,8.0,8.0,384.0,384.0,48.0,48.0,48.000000,0.00000,48.0,...,8.0,11034681.0,0.000000e+00,11034681.0,11034681.0,55956316.00,0.000000e+00,55956316.0,55956316.0,BENIGN
2,67987663.0,6.0,6.0,288.0,288.0,48.0,48.0,48.000000,0.00000,48.0,...,20.0,996497.0,0.000000e+00,996497.0,996497.0,66991029.00,0.000000e+00,66991029.0,66991029.0,BENIGN
3,72017628.0,8.0,8.0,384.0,384.0,48.0,48.0,48.000000,0.00000,48.0,...,8.0,15016472.0,0.000000e+00,15016472.0,15016472.0,56970996.00,0.000000e+00,56970996.0,56970996.0,BENIGN
4,96404610.0,58.0,33.0,12242.0,8238.0,1337.0,0.0,211.068966,287.25341,3069.0,...,32.0,3124660.0,3.965763e+06,8266119.0,386.0,20976398.75,2.298942e+07,54036735.0,5025593.0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
687102,119835.0,1.0,1.0,45.0,103.0,45.0,45.0,45.000000,0.00000,103.0,...,20.0,0.0,0.000000e+00,0.0,0.0,0.00,0.000000e+00,0.0,0.0,BENIGN
687103,155149.0,1.0,1.0,45.0,140.0,45.0,45.0,45.000000,0.00000,140.0,...,20.0,0.0,0.000000e+00,0.0,0.0,0.00,0.000000e+00,0.0,0.0,BENIGN
687104,4.0,2.0,0.0,124.0,0.0,62.0,62.0,62.000000,0.00000,0.0,...,32.0,0.0,0.000000e+00,0.0,0.0,0.00,0.000000e+00,0.0,0.0,BENIGN
687105,125.0,2.0,2.0,96.0,96.0,48.0,48.0,48.000000,0.00000,48.0,...,20.0,0.0,0.000000e+00,0.0,0.0,0.00,0.000000e+00,0.0,0.0,BENIGN


In [4]:
# Keep only the five traffic classes used in this experiment.
labels = {'BENIGN', 'DDoS', 'DoS Hulk', 'FTP-Patator', 'SSH-Patator'}
# Take an explicit copy: the filtered frame is assigned to column-by-column in
# the following cells, and writing into a boolean-mask selection is exactly the
# pattern pandas reports as SettingWithCopyWarning.
df = df[df['Label'].isin(labels)].copy()

In [5]:
# Class distribution after filtering to the selected labels.
df['Label'].value_counts()

BENIGN         349107
DoS Hulk       229705
DDoS            84396
FTP-Patator      7752
SSH-Patator      5854
Name: Label, dtype: int64

In [6]:
# Standardise every numeric column with a z-score: (value - mean) / std.

def _zscore(column):
    """Return `column` centred on its mean and scaled by its standard deviation."""
    return (column - column.mean()) / column.std()

# Force the label column to object dtype so the dtype filter below skips it.
df['Label'] = df['Label'].astype('object')
col_dtypes = df.dtypes
features = col_dtypes[col_dtypes != 'object'].index
df[features] = df[features].apply(_zscore)

# Missing values (e.g. the 0/0 produced by a constant column) become 0.
df = df.fillna(0)


In [7]:
# Encode the class names as consecutive integers starting at 0.
labelencoder = LabelEncoder()
# Address the column by name and assign a fresh column so the result has an
# integer dtype; a positional `df.iloc[:, -1] = ...` assumes 'Label' is the last
# column and, on recent pandas, writes into the existing object-dtype column.
df['Label'] = labelencoder.fit_transform(df['Label'])

In [9]:
# Persist the normalised, label-encoded frame without the index column.
# (the file was previously named 'CICIDS_paper_subset_downsampledBenign_z_normalized.csv')
df.to_csv(path_or_buf='./CICIDS_final_baseline.csv', index=False)

In [10]:
# Split by class: rows labelled 3 or 4 (the minority classes) are kept in
# full, every other row goes into the majority frame that is sampled below.
is_minority = df['Label'].isin([3, 4])
df_minor = df[is_minority]
df_major = df.drop(df_minor.index)

In [11]:
# Feature frame (everything but the label) and flat label vector for the
# majority-class rows.
X = df_major.drop(columns=['Label'])
y = np.ravel(df_major.iloc[:, -1].values)

In [12]:
# Cluster the majority-class samples with mini-batch k-means; a proportion of
# every cluster is selected in a later cell.
from sklearn.cluster import MiniBatchKMeans
clusterer = MiniBatchKMeans(n_clusters=1000, random_state=0)
kmeans = clusterer.fit(X)

In [13]:
# Attach each row's cluster id as a new 'klabel' column.
klabel = kmeans.labels_
df_major = df_major.assign(klabel=klabel)

In [14]:
# Number of samples assigned to each k-means cluster.
df_major['klabel'].value_counts()

546    9343
0      6922
4      6607
357    5856
808    4412
       ... 
232      10
487      10
943       9
310       9
203       4
Name: klabel, Length: 994, dtype: int64

In [15]:
# Move 'Label' to the last position so that it follows the new 'klabel' column.
# The order is derived from the frame itself rather than a hard-coded index,
# so the label stays last whatever the number of feature columns is.
cols = [name for name in df_major.columns if name != 'Label']
cols.append('Label')
df_major = df_major.loc[:, cols]

In [16]:
# Inspect the majority-class frame; 'klabel' now sits just before 'Label'.
df_major

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,klabel,Label
0,0.905281,-0.001712,-0.002260,-0.043944,-0.007894,-0.213954,0.260959,-0.148888,-0.226821,-0.645426,...,2.531753,-0.029505,1.977578,2.682155,0.967979,-0.178118,0.917289,0.997378,614,0
1,0.941962,0.001887,0.000401,-0.027245,-0.007835,-0.213954,0.260959,-0.148888,-0.226821,-0.645426,...,19.305200,-0.029505,15.175870,20.395345,0.765190,-0.178118,0.719493,0.794202,224,0
2,0.964809,-0.001712,-0.002260,-0.043944,-0.007894,-0.213954,0.260959,-0.148888,-0.226821,-0.645426,...,1.634480,-0.029505,1.271553,1.734611,1.043216,-0.178118,0.990674,1.072760,614,0
3,1.061681,0.001887,0.000401,-0.027245,-0.007835,-0.213954,0.260959,-0.148888,-0.226821,-0.645426,...,26.314547,-0.029505,20.691219,27.797395,0.790755,-0.178118,0.744429,0.819816,935,0
4,1.647889,0.091852,0.033652,2.035431,-0.003014,0.933074,-0.368658,0.393395,0.376629,0.334022,...,5.380792,16.554593,11.341017,-0.117134,-0.116151,2.493613,0.672319,-0.491484,72,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
687102,-0.666581,-0.010708,-0.008910,-0.086213,-0.008008,-0.216624,0.221608,-0.158864,-0.226821,-0.627594,...,-0.119704,-0.029505,-0.108737,-0.117852,-0.644664,-0.178118,-0.655647,-0.618349,542,0
687103,-0.665732,-0.010708,-0.008910,-0.086213,-0.007985,-0.216624,0.221608,-0.158864,-0.226821,-0.615598,...,-0.119704,-0.029505,-0.108737,-0.117852,-0.644664,-0.178118,-0.655647,-0.618349,758,0
687104,-0.669461,-0.008909,-0.010240,-0.072471,-0.008071,-0.201496,0.444597,-0.102331,-0.226821,-0.660988,...,-0.119704,-0.029505,-0.108737,-0.117852,-0.644664,-0.178118,-0.655647,-0.618349,260,0
687105,-0.669458,-0.008909,-0.007580,-0.077342,-0.008012,-0.213954,0.260959,-0.148888,-0.226821,-0.645426,...,-0.119704,-0.029505,-0.108737,-0.117852,-0.644664,-0.178118,-0.655647,-0.618349,507,0


In [17]:
def typicalSampling(group, frac=0.1, random_state=0):
    """Draw a random subset of the rows of one k-means cluster.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single cluster, as passed by ``groupby(...).apply``.
    frac : float, default 0.1
        Fraction of the group's rows to keep (0.008 was the earlier setting).
    random_state : int or None, default 0
        Seed forwarded to ``DataFrame.sample`` so that re-running the notebook
        selects the same rows; pass ``None`` for a fresh draw on every call.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original index labels.
    """
    return group.sample(frac=frac, random_state=random_state)

# Run the sampler on every cluster and collect the sampled rows in one frame.
by_cluster = df_major.groupby('klabel', group_keys=False)
result = by_cluster.apply(typicalSampling)

In [18]:
# Class distribution of the sampled majority rows.
result['Label'].value_counts()

0    34908
2    22979
1     8435
Name: Label, dtype: int64

In [19]:
# Inspect the sampled rows (the original index labels are retained).
result

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,klabel,Label
349865,-0.669461,-0.008909,-0.010240,-0.094041,-0.008071,-0.256668,-0.368658,-0.308511,-0.226821,-0.660988,...,-0.119704,-0.029505,-0.108737,-0.117852,-0.644664,-0.178118,-0.655647,-0.618349,0,2
220265,-0.669461,-0.008909,-0.010240,-0.094041,-0.008071,-0.256668,-0.368658,-0.308511,-0.226821,-0.660988,...,-0.119704,-0.029505,-0.108737,-0.117852,-0.644664,-0.178118,-0.655647,-0.618349,0,2
240604,-0.669461,-0.008909,-0.010240,-0.094041,-0.008071,-0.256668,-0.368658,-0.308511,-0.226821,-0.660988,...,-0.119704,-0.029505,-0.108737,-0.117852,-0.644664,-0.178118,-0.655647,-0.618349,0,2
323608,-0.669461,-0.008909,-0.010240,-0.094041,-0.008071,-0.256668,-0.368658,-0.308511,-0.226821,-0.660988,...,-0.119704,-0.029505,-0.108737,-0.117852,-0.644664,-0.178118,-0.655647,-0.618349,0,2
394245,-0.669461,-0.008909,-0.010240,-0.094041,-0.008071,-0.256668,-0.368658,-0.308511,-0.226821,-0.660988,...,-0.119704,-0.029505,-0.108737,-0.117852,-0.644664,-0.178118,-0.655647,-0.618349,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380747,1.730174,-0.003511,-0.000929,-0.032289,-0.000953,0.048554,-0.368658,-0.072402,0.092667,0.747393,...,-0.100319,-0.029505,-0.093484,-0.097381,1.867339,-0.178118,1.794505,1.898459,999,2
206544,1.664318,-0.005310,-0.000929,-0.023592,-0.000953,0.098386,-0.368658,0.028194,0.190222,0.747393,...,-0.119704,-0.029505,-0.108737,-0.117852,1.799311,-0.178118,1.728152,1.830301,999,2
288277,1.632354,-0.005310,-0.000929,-0.033507,-0.000953,0.047664,-0.368658,-0.019194,0.130357,0.747393,...,-0.119704,-0.029505,-0.108737,-0.117852,1.769076,-0.178118,1.698662,1.800009,999,2
284476,1.699046,-0.005310,-0.000929,-0.039943,-0.000953,0.014739,-0.368658,-0.049955,0.091499,0.747393,...,-0.119704,-0.029505,-0.108737,-0.117852,1.834585,-0.178118,1.762557,1.865643,999,2


In [20]:
# Drop the helper cluster column, then add the untouched minority-class rows.
result = result.drop(['klabel'],axis=1)
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# equivalent and keeps the same row order and index labels.
result = pd.concat([result, df_minor])

In [21]:
# Save the under-sampled data set; the index is not written to the file.
result.to_csv(path_or_buf='./CICIDS_final_baseline_undersampled.csv', index=False)

In [22]:
# Reload the under-sampled data set from disk.
# (previously this cell read './CICIDS_final_baseline.csv')
df = pd.read_csv(filepath_or_buffer='./CICIDS_final_baseline_undersampled.csv')

In [23]:
# Preview the reloaded frame; reading it back from CSV gives a fresh 0..n-1 index.
df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,-0.669461,-0.008909,-0.010240,-0.094041,-0.008071,-0.256668,-0.368658,-0.308511,-0.226821,-0.660988,...,0.002696,-0.119704,-0.029505,-0.108737,-0.117852,-0.644664,-0.178118,-0.655647,-0.618349,2
1,-0.669461,-0.008909,-0.010240,-0.094041,-0.008071,-0.256668,-0.368658,-0.308511,-0.226821,-0.660988,...,0.002696,-0.119704,-0.029505,-0.108737,-0.117852,-0.644664,-0.178118,-0.655647,-0.618349,2
2,-0.669461,-0.008909,-0.010240,-0.094041,-0.008071,-0.256668,-0.368658,-0.308511,-0.226821,-0.660988,...,0.002696,-0.119704,-0.029505,-0.108737,-0.117852,-0.644664,-0.178118,-0.655647,-0.618349,2
3,-0.669461,-0.008909,-0.010240,-0.094041,-0.008071,-0.256668,-0.368658,-0.308511,-0.226821,-0.660988,...,0.002696,-0.119704,-0.029505,-0.108737,-0.117852,-0.644664,-0.178118,-0.655647,-0.618349,2
4,-0.669461,-0.008909,-0.010240,-0.094041,-0.008071,-0.256668,-0.368658,-0.308511,-0.226821,-0.660988,...,0.002696,-0.119704,-0.029505,-0.108737,-0.117852,-0.644664,-0.178118,-0.655647,-0.618349,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79923,-0.342317,0.027077,0.032322,0.255247,-0.006386,0.312842,-0.368658,-0.004985,0.063466,-0.344556,...,0.002696,-0.119704,-0.029505,-0.108737,-0.117852,-0.644664,-0.178118,-0.655647,-0.618349,4
79924,-0.341255,0.027077,0.033652,0.255247,-0.006386,0.312842,-0.368658,-0.004985,0.063466,-0.344556,...,0.002696,-0.119704,-0.029505,-0.108737,-0.117852,-0.644664,-0.178118,-0.655647,-0.618349,4
79925,-0.340594,0.027077,0.033652,0.255247,-0.006386,0.312842,-0.368658,-0.004985,0.063466,-0.344556,...,0.002696,-0.119704,-0.029505,-0.108737,-0.117852,-0.644664,-0.178118,-0.655647,-0.618349,4
79926,-0.341932,0.027077,0.033652,0.255247,-0.006386,0.312842,-0.368658,-0.004985,0.063466,-0.344556,...,0.002696,-0.119704,-0.029505,-0.108737,-0.117852,-0.644664,-0.178118,-0.655647,-0.618349,4


In [24]:
# Every non-object column other than 'Label' is a candidate feature.
feature_dtypes = df.dtypes.drop('Label')
features = feature_dtypes[feature_dtypes != 'object'].index
# The labels themselves are kept as plain integers.
df['Label'] = df['Label'].astype('int')

In [29]:
# Feature matrix (all columns but the label) and flat label vector.
X = df.drop(columns=['Label']).to_numpy()
y = np.ravel(df.iloc[:, -1].to_numpy())

In [32]:
# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [33]:
# Score every feature by its mutual information with the class label.
from sklearn.feature_selection import mutual_info_classif
# The estimator is randomised (small noise is added to continuous features),
# so the seed is fixed - as in the other stochastic steps of this notebook -
# to make the feature ranking repeatable across runs.
importances = mutual_info_classif(X_train, y_train, random_state=0)

In [34]:
# Rank the features by their importance (rounded to 4 decimals) and total the scores.
rounded_scores = [round(score, 4) for score in importances]
f_list = sorted(zip(rounded_scores, features), reverse=True)
Sum = 0
fs = []
for score, feature in f_list:
    Sum = Sum + score
    fs.append(feature)

In [35]:
# Walk down the ranking, keeping features until their normalised
# importances add up to at least 90 % of the total.
normalised = [round(share, 4) for share in importances / Sum]
f_list2 = sorted(zip(normalised, features), reverse=True)
Sum2 = 0
fs = []
for share, feature in f_list2:
    Sum2 = Sum2 + share
    fs.append(feature)
    if Sum2 >= 0.9:
        break

In [36]:
# Matrix restricted to the features retained by the importance cut-off.
X_fs = df.loc[:, fs].to_numpy()

In [37]:
# (rows, retained features) after the importance-based cut.
X_fs.shape

(79928, 46)

In [38]:
# FCBFK is provided by the project-local FCBF_module.
# NOTE(review): k=20 presumably caps the number of selected features (the
# selection shown further down has 20 entries) - confirm against FCBF_module.
from FCBF_module import FCBF, FCBFK, FCBFiP, get_i
fcbf = FCBFK(k = 20)
# The selector is fitted in the next cell via fit_transform.

In [39]:
# Fit the selector on the pre-filtered features and keep its transformed output.
# The original note puts the runtime of this cell at about 5 minutes.
X_fss = fcbf.fit_transform(X_fs,y)

In [40]:
# Selection stored by the fitted selector.
# NOTE(review): presumably column positions within X_fs - confirm against FCBF_module.
selected_features = fcbf.idx_sel

In [41]:
# Show the selection returned by the selector.
selected_features

[3, 28, 19, 16, 30, 31, 32, 33, 1, 6, 12, 13, 4, 5, 15, 11, 14, 34, 9, 10]

In [42]:
# Class distribution of the under-sampled data set used for feature selection.
df['Label'].value_counts()

0    34908
2    22979
1     8435
3     7752
4     5854
Name: Label, dtype: int64

In [43]:
# Report the wall-clock time elapsed since start_time was recorded
# (start_time is set right after the initial CSV load).
end_time = time.time()

print(f'Elapsed time: {end_time-start_time} seconds')

Elapsed time: 294.63885045051575 seconds
