In [1]:
from openfe import OpenFE, tree_to_formula, transform, TwoStageFeatureSelector
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef
from IPython.display import clear_output
from FS import pso, hho
from sklearn.ensemble import RandomForestClassifier

## Preprocessing

In [2]:
df = pd.read_csv('F:/Kỳ 4/Thầy Hùng/OpenFE/data/MQTT-IoT-IDS2020/mqtt_50k.csv')

In [3]:
df.head(5)

Unnamed: 0,ip_src,ip_dst,prt_src,prt_dst,proto,fwd_num_pkts,bwd_num_pkts,fwd_mean_iat,bwd_mean_iat,fwd_std_iat,...,fwd_num_bytes,bwd_num_bytes,fwd_num_psh_flags,bwd_num_psh_flags,fwd_num_rst_flags,bwd_num_rst_flags,fwd_num_urg_flags,bwd_num_urg_flags,is_attack,category
0,10.0.0.13,192.168.1.7,32964,1883,6,7,5,0.000342,0.0004,0.000356,...,464,272,3,1,0,0,0,0,0,sparta
1,192.168.2.5,10.0.0.12,58390,786,17,1,1,0.0,0.0,0.0,...,28,28,0,0,0,0,0,0,1,scan_sU
2,192.168.2.5,10.0.0.8,58390,38,17,1,1,0.0,0.0,0.0,...,28,28,0,0,0,0,0,0,1,scan_sU
3,10.0.0.16,192.168.1.7,58728,1883,6,7,5,0.00048,0.000616,0.000615,...,445,272,3,1,0,0,0,0,0,sparta
4,10.0.0.13,192.168.1.7,60512,1883,6,7,5,0.00029,0.000341,0.000312,...,462,272,3,1,0,0,0,0,0,sparta


In [4]:
df.shape

(50000, 33)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 33 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ip_src             50000 non-null  object 
 1   ip_dst             50000 non-null  object 
 2   prt_src            50000 non-null  int64  
 3   prt_dst            50000 non-null  int64  
 4   proto              50000 non-null  int64  
 5   fwd_num_pkts       50000 non-null  int64  
 6   bwd_num_pkts       50000 non-null  int64  
 7   fwd_mean_iat       50000 non-null  float64
 8   bwd_mean_iat       50000 non-null  float64
 9   fwd_std_iat        50000 non-null  float64
 10  bwd_std_iat        50000 non-null  float64
 11  fwd_min_iat        50000 non-null  float64
 12  bwd_min_iat        50000 non-null  float64
 13  fwd_max_iat        50000 non-null  float64
 14  bwd_max_iat        50000 non-null  float64
 15  fwd_mean_pkt_len   50000 non-null  float64
 16  bwd_mean_pkt_len   500

In [6]:
df.isnull().sum()

ip_src               0
ip_dst               0
prt_src              0
prt_dst              0
proto                0
fwd_num_pkts         0
bwd_num_pkts         0
fwd_mean_iat         0
bwd_mean_iat         0
fwd_std_iat          0
bwd_std_iat          0
fwd_min_iat          0
bwd_min_iat          0
fwd_max_iat          0
bwd_max_iat          0
fwd_mean_pkt_len     0
bwd_mean_pkt_len     0
fwd_std_pkt_len      0
bwd_std_pkt_len      0
fwd_min_pkt_len      0
bwd_min_pkt_len      0
fwd_max_pkt_len      0
bwd_max_pkt_len      0
fwd_num_bytes        0
bwd_num_bytes        0
fwd_num_psh_flags    0
bwd_num_psh_flags    0
fwd_num_rst_flags    0
bwd_num_rst_flags    0
fwd_num_urg_flags    0
bwd_num_urg_flags    0
is_attack            0
category             0
dtype: int64

In [7]:
df.isnull().sum()
df.dropna(inplace=True)

In [8]:
df['category'].value_counts()

category
sparta             17603
normal             16580
scan_sU             7646
scan_A              4953
mqtt_bruteforce     3218
Name: count, dtype: int64

In [9]:
#Encode
label_encoder = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
    	df[col] = label_encoder.fit_transform(df[col])

In [10]:
for col in df.columns:
  if len(df[col].unique()) <= 1:
    print(col)
    df.drop(col, axis=1, inplace=True)

bwd_num_urg_flags


In [11]:
features = df.drop(['category', 'is_attack', 'ip_src', 'ip_dst'], axis=1)
labels = pd.DataFrame(df['category'])

## Training

In [12]:
def get_score(train_x, test_x, train_y, test_y):
    gbm = lgb.LGBMClassifier()
    gbm.fit(train_x, train_y)
    pred = gbm.predict(test_x)
    clear_output()
    accuracy = accuracy_score(test_y, pred)
    f1 = f1_score(test_y, pred, average='macro')
    mcc = matthews_corrcoef(test_y, pred)
    return {'Accuracy': accuracy, 'F1 Score': f1, 'MCC': mcc}

In [13]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)

In [14]:
results = {}

In [15]:
base_data_score = get_score(X_train, X_test, y_train, y_test)
# results['Base data'] = base_data_score
base_data_score

{'Accuracy': 0.86664,
 'F1 Score': 0.8925882963383482,
 'MCC': 0.8162979148365876}

## OpenFE

In [16]:
ofe = OpenFE()
features = ofe.fit(data = X_train, label = y_train, n_jobs=4)

The number of candidate features is 4103
Start stage I selection.


100%|██████████| 16/16 [02:11<00:00,  8.22s/it]


1630 same features have been deleted.
Meet early-stopping in successive feature-wise halving.


100%|██████████| 16/16 [15:07<00:00, 56.70s/it]


The number of remaining candidate features is 2000
Start stage II selection.


100%|██████████| 16/16 [01:07<00:00,  4.24s/it]


Finish data processing.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.723151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 391947
[LightGBM] [Info] Number of data points in the train set: 30000, number of used features: 2027


In [17]:
X_train_expanded, X_test_expanded = transform(X_train, X_test, ofe.new_features_list, n_jobs = 4)

In [18]:
print('Number of features after using OpenFE:', len(X_train_expanded.columns))

Number of features after using OpenFE: 2028


In [19]:
openFE_score = get_score(X_train_expanded, X_test_expanded, y_train, y_test)
results['OpenFE'] = openFE_score
openFE_score

{'Accuracy': 0.88808, 'F1 Score': 0.906926913655813, 'MCC': 0.8458644219329308}

# Features Selection PSO

In [16]:
xtrain, xvalid, ytrain, yvalid = train_test_split(X_train.values, y_train.values, test_size=0.25, stratify=y_train)
fold = {'xt':xtrain, 'yt':ytrain, 'xv':xvalid, 'yv':yvalid}

# parameter
k    = 5     # k-value in KNN
N    = 5    # number of particles
T    = 10    # maximum number of iterations
w    = 0.9
c1   = 2
c2   = 2
opts = {'k':k, 'fold':fold, 'N':N, 'T':T, 'w':w, 'c1':c1, 'c2':c2}

In [17]:
# perform feature selection
fmdl = pso.jfs(X_train, y_train, opts)
sf   = fmdl['sf']

Iteration: 1
Best (PSO): 0.3177468571428572
Iteration: 2
Best (PSO): 0.3177468571428572
Iteration: 3
Best (PSO): 0.3169673142857143
Iteration: 4
Best (PSO): 0.3166505142857143
Iteration: 5
Best (PSO): 0.31593622857142856
Iteration: 6
Best (PSO): 0.31593622857142856
Iteration: 7
Best (PSO): 0.3156194285714286
Iteration: 8
Best (PSO): 0.29909600000000003
Iteration: 9
Best (PSO): 0.29502742857142855
Iteration: 10
Best (PSO): 0.29502742857142855


In [22]:
features_selected = X_train.columns[sf]
features_selected

Index(['bwd_num_pkts', 'fwd_mean_iat', 'bwd_mean_iat', 'fwd_std_iat',
       'bwd_std_iat', 'fwd_mean_pkt_len', 'fwd_std_pkt_len', 'fwd_min_pkt_len',
       'bwd_min_pkt_len', 'fwd_max_pkt_len', 'fwd_num_psh_flags'],
      dtype='object')

In [23]:
X_train_pso = X_train[features_selected]
X_test_pso = X_test[features_selected]

In [24]:
pso_score = get_score(X_train_pso, X_test_pso, y_train, y_test)
results['PSO'] = pso_score
pso_score

{'Accuracy': 0.76704,
 'F1 Score': 0.8146410939257634,
 'MCC': 0.6761654582050493}

# HHO


In [18]:
# perform feature selection
fmdl = hho.jfs(X_train, y_train, opts)
sf   = fmdl['sf']

Iteration: 1
Best (HHO): 0.30073577142857144
Iteration: 2
Best (HHO): 0.2991766857142857
Iteration: 3
Best (HHO): 0.29839714285714286
Iteration: 4
Best (HHO): 0.29839714285714286
Iteration: 5
Best (HHO): 0.29839714285714286
Iteration: 6
Best (HHO): 0.29839714285714286
Iteration: 7
Best (HHO): 0.29839714285714286
Iteration: 8
Best (HHO): 0.29839714285714286
Iteration: 9
Best (HHO): 0.29839714285714286
Iteration: 10
Best (HHO): 0.29839714285714286


In [19]:
features_selected = X_train.columns[sf]
features_selected

Index(['proto', 'bwd_num_pkts', 'fwd_mean_iat', 'bwd_mean_iat', 'fwd_std_iat',
       'bwd_std_iat', 'bwd_min_iat', 'fwd_max_iat', 'fwd_mean_pkt_len',
       'bwd_mean_pkt_len', 'fwd_min_pkt_len', 'bwd_num_bytes',
       'fwd_num_psh_flags', 'bwd_num_psh_flags', 'bwd_num_rst_flags'],
      dtype='object')

In [20]:
X_train_hho = X_train[features_selected]
X_test_hho = X_test[features_selected]

In [21]:
hho_score = get_score(X_train_hho, X_test_hho, y_train, y_test)
results['HHO'] = hho_score
hho_score

{'Accuracy': 0.76272,
 'F1 Score': 0.8096659558169137,
 'MCC': 0.6697986071611925}

# PSO + OpenFE

In [29]:
ofe = OpenFE()
features = ofe.fit(data = X_train_pso, label = y_train, n_jobs=4)

The number of candidate features is 613
Start stage I selection.


100%|██████████| 16/16 [00:56<00:00,  3.54s/it]


122 same features have been deleted.
Meet early-stopping in successive feature-wise halving.


100%|██████████| 16/16 [03:54<00:00, 14.67s/it]


The number of remaining candidate features is 491
Start stage II selection.


100%|██████████| 16/16 [00:24<00:00,  1.53s/it]


Finish data processing.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072018 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 82362
[LightGBM] [Info] Number of data points in the train set: 30000, number of used features: 499


In [30]:
X_train_pso_expanded, X_test_pso_expanded = transform(X_train_pso, X_test_pso, ofe.new_features_list, n_jobs = 4)

In [31]:
print('Number of features after using OpenFE:', len(X_train_pso_expanded.columns))

Number of features after using OpenFE: 502


In [32]:
pso_openFE_score = get_score(X_train_pso_expanded, X_test_pso_expanded, y_train, y_test)
results['PSO + OpenFE'] = pso_openFE_score
pso_openFE_score

{'Accuracy': 0.77792,
 'F1 Score': 0.8218992585416173,
 'MCC': 0.6919765475464567}

# HHO + OpenFE

In [33]:
ofe = OpenFE()
features = ofe.fit(data = X_train_hho, label = y_train, n_jobs=4)

The number of candidate features is 686
Start stage I selection.


100%|██████████| 16/16 [00:42<00:00,  2.69s/it]


171 same features have been deleted.
Meet early-stopping in successive feature-wise halving.


100%|██████████| 16/16 [04:22<00:00, 16.40s/it]


The number of remaining candidate features is 515
Start stage II selection.


100%|██████████| 16/16 [00:47<00:00,  2.95s/it]


Finish data processing.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.114742 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 79650
[LightGBM] [Info] Number of data points in the train set: 30000, number of used features: 524


In [34]:
X_train_hho_expanded, X_test_hho_expanded = transform(X_train_hho, X_test_hho, ofe.new_features_list, n_jobs = 4)

In [35]:
print('Number of features after using OpenFE:', len(X_train_hho_expanded.columns))

Number of features after using OpenFE: 526


In [36]:
hho_openFE_score = get_score(X_train_hho_expanded, X_test_hho_expanded, y_train, y_test)
results['PSO + OpenFE'] = hho_openFE_score
hho_openFE_score

{'Accuracy': 0.76016,
 'F1 Score': 0.8114151081321431,
 'MCC': 0.6671932723045716}