In [100]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [101]:
df = pd.read_csv('./data/CICIDS2017_sample.csv')
features_map = {}

In [102]:
df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,4,2,0,37,0,31,6,18.500000,17.677670,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,142377,46,62,1325,105855,570,0,28.804348,111.407285,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,118873,23,28,1169,45025,570,0,50.826087,156.137367,2896,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,143577,43,55,1301,107289,570,0,30.255814,115.178969,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,143745,49,59,1331,110185,570,0,27.163265,108.067176,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,234,2,2,64,232,32,32,32.000000,0.000000,116,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
56657,133288,2,2,94,482,47,47,47.000000,0.000000,241,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
56658,11507694,5,4,450,3525,450,0,90.000000,201.246118,3525,...,32,893.0,0.0,893,893,6503640.0,0.0,6503640,6503640,DoS
56659,11507707,8,6,416,11632,416,0,52.000000,147.078211,5792,...,32,897.0,0.0,897,897,6503122.0,0.0,6503122,6503122,DoS


In [103]:
df['Label'].unique()

array(['BENIGN', 'DoS', 'PortScan', 'Bot', 'Infiltration', 'WebAttack',
       'BruteForce'], dtype=object)

In [104]:
## handle inf values, first replace with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [105]:
## missing values in the following columns and imputing with mean
df.isnull().sum()
for col in df.columns:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].mean())
        print(col)

Flow Bytes/s
Flow Packets/s


In [106]:
# feature without Label/target
features_without_target = list(df.dtypes[df.dtypes != 'object'].index)
features_without_target
for cl in features_without_target:
    print(f'column is {cl} ----- min value is  {df[cl].min()} :::::  max value is   {df[cl].max()}')

column is Flow Duration ----- min value is  0 :::::  max value is   119999998
column is Total Fwd Packets ----- min value is  1 :::::  max value is   6930
column is Total Backward Packets ----- min value is  0 :::::  max value is   9877
column is Total Length of Fwd Packets ----- min value is  0 :::::  max value is   2866110
column is Total Length of Bwd Packets ----- min value is  0 :::::  max value is   21500000
column is Fwd Packet Length Max ----- min value is  0 :::::  max value is   23360
column is Fwd Packet Length Min ----- min value is  0 :::::  max value is   1983
column is Fwd Packet Length Mean ----- min value is  0.0 :::::  max value is   5940.857143
column is Fwd Packet Length Std ----- min value is  0.0 :::::  max value is   7049.469004000001
column is Bwd Packet Length Max ----- min value is  0 :::::  max value is   11632
column is Bwd Packet Length Min ----- min value is  0 :::::  max value is   1448
column is Bwd Packet Length Mean ----- min value is  0.0 :::::  max v

In [107]:
# Assigning numerical Label values with LabelEncoder
labelencoder = LabelEncoder()
df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])

  df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])


In [108]:
df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,4,2,0,37,0,31,6,18.500000,17.677670,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
1,142377,46,62,1325,105855,570,0,28.804348,111.407285,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
2,118873,23,28,1169,45025,570,0,50.826087,156.137367,2896,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
3,143577,43,55,1301,107289,570,0,30.255814,115.178969,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
4,143745,49,59,1331,110185,570,0,27.163265,108.067176,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,234,2,2,64,232,32,32,32.000000,0.000000,116,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
56657,133288,2,2,94,482,47,47,47.000000,0.000000,241,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
56658,11507694,5,4,450,3525,450,0,90.000000,201.246118,3525,...,32,893.0,0.0,893,893,6503640.0,0.0,6503640,6503640,3
56659,11507707,8,6,416,11632,416,0,52.000000,147.078211,5792,...,32,897.0,0.0,897,897,6503122.0,0.0,6503122,6503122,3


In [109]:
## Scale the dataset, until now all the missing and inf handled
scaling = MinMaxScaler()
df_scaled = pd.DataFrame(scaling.fit_transform(df[features_without_target]))
df_scaled.columns=features_without_target
df_scaled['Label']=df['Label']

In [110]:
df_scaled

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,3.333333e-08,0.000144,0.000000,0.000013,0.000000,0.001327,0.003026,0.003114,0.002508,0.000000,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0
1,1.186475e-03,0.006494,0.006277,0.000462,0.004923,0.024401,0.000000,0.004849,0.015804,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0
2,9.906083e-04,0.003175,0.002835,0.000408,0.002094,0.024401,0.000000,0.008555,0.022149,0.248968,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0
3,1.196475e-03,0.006061,0.005568,0.000454,0.004990,0.024401,0.000000,0.005093,0.016339,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0
4,1.197875e-03,0.006927,0.005973,0.000464,0.005125,0.024401,0.000000,0.004572,0.015330,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,1.950000e-06,0.000144,0.000202,0.000022,0.000011,0.001370,0.016137,0.005386,0.000000,0.009972,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0
56657,1.110733e-03,0.000144,0.000202,0.000033,0.000022,0.002012,0.023701,0.007911,0.000000,0.020719,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0
56658,9.589745e-02,0.000577,0.000405,0.000157,0.000164,0.019264,0.000000,0.015149,0.028548,0.303043,...,0.533333,0.000009,0.0,0.000009,0.000009,0.054652,0.0,0.054652,0.054652,3
56659,9.589756e-02,0.001010,0.000607,0.000145,0.000541,0.017808,0.000000,0.008753,0.020864,0.497937,...,0.533333,0.000009,0.0,0.000009,0.000009,0.054648,0.0,0.054648,0.054648,3


In [111]:
df_scaled['Label'].unique()

array([0, 3, 5, 1, 4, 6, 2])

In [112]:
## write the scaled doc for testing
df_scaled.to_csv('./data/minmaxscaled_test.csv',index=0)

In [113]:
## 4000 samples
df_sample = df_scaled.sample(n=4000)
df_sample.shape

(4000, 78)

In [114]:
df_sample

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
14735,4.083333e-07,0.000000,0.000101,0.000000e+00,2.790698e-07,0.000000,0.000000,0.000000,0.000000,0.000516,...,0.666667,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000,0.0,0.000000,0.000000,5
36694,5.372358e-02,0.000577,0.000101,4.291531e-05,2.139535e-06,0.001969,0.000000,0.004141,0.003302,0.003955,...,0.333333,2.338070e-03,0.0,2.338070e-03,2.338070e-03,0.052177,0.0,0.052177,0.052177,0
2558,6.688893e-01,0.001155,0.000506,2.163211e-05,5.398605e-04,0.000856,0.000000,0.001160,0.000752,0.502063,...,0.333333,4.770063e-02,0.0,4.770063e-02,4.770063e-02,0.634454,0.0,0.634454,0.634454,3
46929,8.315972e-01,0.000866,0.000506,1.123474e-04,5.393023e-04,0.013527,0.000000,0.007743,0.016892,0.871389,...,0.333333,5.000000e-08,0.0,5.000000e-08,5.000000e-08,0.836975,0.0,0.836975,0.836975,3
33532,7.416667e-07,0.000000,0.000101,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.533333,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000,0.0,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26544,9.699444e-01,0.072593,0.051028,5.982045e-02,1.406512e-04,0.043964,0.003026,0.057262,0.033026,0.000516,...,0.333333,2.190000e-01,0.0,2.190000e-01,2.190000e-01,0.348739,0.0,0.348739,0.348739,4
10136,2.666667e-07,0.000000,0.000101,6.978099e-07,2.790698e-07,0.000086,0.001009,0.000337,0.000000,0.000516,...,0.400000,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000,0.0,0.000000,0.000000,5
15861,2.333333e-07,0.000000,0.000101,6.978099e-07,2.790698e-07,0.000086,0.001009,0.000337,0.000000,0.000516,...,0.400000,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000,0.0,0.000000,0.000000,5
19763,8.333333e-09,0.000144,0.000000,2.152744e-04,0.000000e+00,0.026413,0.000000,0.051929,0.061889,0.000000,...,0.533333,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000,0.0,0.000000,0.000000,0


In [115]:
df_sample['Label'].unique()

array([5, 0, 3, 1, 2, 6, 4])

In [116]:
## Result without any feature selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

y=df_sample['Label']
X=df_sample.drop('Label', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30,random_state=1)

clf = RandomForestClassifier(random_state=0) 
 

clf.fit(X_train, y_train)
 

y_pred = clf.predict(X_test)
 

from sklearn import metrics 
print()
 

print("ACCURACY OF THE MODEL without any feature selection: ", metrics.accuracy_score(y_test, y_pred))


ACCURACY OF THE MODEL without any feature selection:  0.9708333333333333


In [117]:
y_test.unique()

array([0, 3, 5, 6, 1, 2])

In [118]:
## Variance Threshold
##VARIANCE THRESHOLD

pipe1 = Pipeline([
        # ('scale', MinMaxScaler()),
        ('vt', VarianceThreshold()),
        ('rfc',  RandomForestClassifier())])

param_grid1 = dict(
                  vt__threshold=[0,0.01,0.02,0.03,0.04,0.05],
                  rfc__random_state=[0] 
                  )

grid1 = GridSearchCV(pipe1, param_grid=param_grid1, cv=10, n_jobs=1, scoring= 'accuracy')
grid1.fit(X_train, y_train)
print(f'The score ***********   {grid1.score(X_test,y_test)}')
print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid1.best_estimator_}')



The score ***********   0.9758333333333333
{'mean_fit_time': array([0.38016925, 0.2807508 , 0.25228446, 0.25020499, 0.2681313 ,
       0.23706801]), 'std_fit_time': array([0.01864169, 0.01351661, 0.01134349, 0.00903615, 0.01623457,
       0.01027592]), 'mean_score_time': array([0.01012468, 0.0070967 , 0.00797265, 0.00816562, 0.00826883,
       0.00969801]), 'std_score_time': array([0.00555025, 0.00443239, 0.00318185, 0.00338899, 0.00363627,
       0.00344365]), 'param_rfc__random_state': masked_array(data=[0, 0, 0, 0, 0, 0],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_vt__threshold': masked_array(data=[0, 0.01, 0.02, 0.03, 0.04, 0.05],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'rfc__random_state': 0, 'vt__threshold': 0}, {'rfc__random_state': 0, 'vt__threshold': 0.01}, {'rfc__random_state': 0, 'vt__threshold': 0.02}, {'rfc__random_

In [119]:
grid1.best_estimator_.named_steps["vt"].get_support()

array([ True, False, False, False, False, False, False, False, False,
        True, False,  True,  True, False, False, False,  True,  True,
       False,  True, False,  True,  True, False,  True, False,  True,
        True, False,  True, False, False, False, False, False, False,
       False, False,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True, False, False, False,  True, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False,  True,  True, False,  True, False, False, False,
       False,  True,  True,  True,  True])

In [120]:
selected=[]
idx=0
toSelect=grid1.best_estimator_.named_steps["vt"].get_support()
for i in toSelect:
  if(i == True):
    print(f'--- > {features_without_target[idx]}')
    selected.append(features_without_target[idx])

  idx=idx+1  

print(f'---selected variance threshold columns    {selected}')
print(f" number of features selected by variance threshold -->> {len(selected)}")
features_map["variance_threshold"] = selected 

--- > Flow Duration
--- > Bwd Packet Length Max
--- > Bwd Packet Length Mean
--- > Bwd Packet Length Std
--- > Flow IAT Std
--- > Flow IAT Max
--- > Fwd IAT Total
--- > Fwd IAT Std
--- > Fwd IAT Max
--- > Bwd IAT Total
--- > Bwd IAT Std
--- > Bwd IAT Max
--- > Fwd PSH Flags
--- > Max Packet Length
--- > Packet Length Mean
--- > Packet Length Std
--- > Packet Length Variance
--- > FIN Flag Count
--- > SYN Flag Count
--- > PSH Flag Count
--- > ACK Flag Count
--- > URG Flag Count
--- > Average Packet Size
--- > Avg Bwd Segment Size
--- > Init_Win_bytes_forward
--- > Init_Win_bytes_backward
--- > min_seg_size_forward
--- > Idle Mean
--- > Idle Std
--- > Idle Max
--- > Idle Min
---selected variance threshold columns    ['Flow Duration', 'Bwd Packet Length Max', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow IAT Std', 'Flow IAT Max', 'Fwd IAT Total', 'Fwd IAT Std', 'Fwd IAT Max', 'Bwd IAT Total', 'Bwd IAT Std', 'Bwd IAT Max', 'Fwd PSH Flags', 'Max Packet Length', 'Packet Length Mea

In [121]:
y_train

3628     3
50205    3
41700    3
26620    0
7208     0
        ..
50503    3
18608    1
9111     5
47951    3
16128    5
Name: Label, Length: 2800, dtype: int32

In [122]:
##Lasso
## LASSO
from sklearn.linear_model import Lasso

pipe2 = Pipeline([
        # ('scale', MinMaxScaler()),
        ('lasso', Lasso())])


param_grid2 = dict(
                  lasso__alpha=np.arange(0.1,10,0.1)
                   
                  )

grid2 = GridSearchCV(pipe2, param_grid=param_grid2, cv=10, n_jobs=1, scoring= "neg_mean_squared_error")
grid2.fit(X_train, y_train)


print(f'The best estimator is --------->>   {grid2.best_estimator_}')
print()
print()
print(f' best params --->>  {grid2.best_params_}')
print()
coefficients = grid2.best_estimator_.named_steps['lasso'].coef_

print(f' the coefficients --->>  {coefficients}')


importance = np.abs(coefficients)

impr_features=np.array(features_without_target)[importance > 0]
print()
print()


print(f' important features   --->>  {impr_features}')

print()
print()

redundant_features=np.array(features_without_target)[importance == 0]
print(f' redundant features --->>  {redundant_features}')


The best estimator is --------->>   Pipeline(steps=[('lasso', Lasso(alpha=0.1))])


 best params --->>  {'lasso__alpha': 0.1}

 the coefficients --->>  [ 0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.          0.         -0.          0.
  0.         -0.          0.          0.          0.          0.
 -0.          0.          0.          0.          0.58005207 -0.
 -0.         -0.          0.          0.         -0.         -0.
  0.          0.          0.         -0.         -0.          0.
  0.         -0.          0.          0.          0.          0.
  0.         -0.          0.          1.39032941  0.1018493  -0.
  0.          0.         -0.          0.         -0.          0.
 -0.          0.          0.          0.          0.          0.
  0.         -0.         -0.         -0.         -0.          0.
  0.         -0.          0.         -0.         -0.         -0.
  0.          0.          0.          0.          0.        ]


 imp

In [123]:
features_map["lasso"]=impr_features

In [124]:
### Now using random forest using the important features

df_lasso=df_sample
df_lasso = df_lasso.drop(redundant_features, axis=1)

#df_lasso.shape
#df_lasso.head()

y=df_lasso['Label']
X=df_lasso.drop('Label', axis=1)
X_train_ll, X_test_ll, y_train_ll, y_test_ll = train_test_split(X, y, test_size = 0.30,random_state=1)

clf = RandomForestClassifier(random_state=0) 

clf.fit(X_train_ll, y_train_ll)

y_pred = clf.predict(X_test_ll)
 

from sklearn import metrics 
print()
 

print("ACCURACY OF THE MODEL with lasso features: ", metrics.accuracy_score(y_test_ll, y_pred))


ACCURACY OF THE MODEL with lasso features:  0.8216666666666667


In [125]:
df_lasso

Unnamed: 0,Fwd IAT Max,PSH Flag Count,ACK Flag Count,Label
14735,0.000000e+00,1.0,0.0,5
36694,5.217714e-02,0.0,1.0,0
2558,6.344538e-01,0.0,1.0,3
46929,8.369748e-01,0.0,1.0,3
33532,0.000000e+00,0.0,1.0,0
...,...,...,...,...
26544,3.487395e-01,0.0,1.0,4
10136,0.000000e+00,1.0,0.0,5
15861,0.000000e+00,1.0,0.0,5
19763,8.403361e-09,0.0,1.0,0


In [126]:
### Random forest feature importance



pipe3 = Pipeline([
        # ('scale', MinMaxScaler()),
        ('rfc',  RandomForestClassifier())])

param_grid3 = dict(
                  
                  rfc__criterion = ['gini','entropy'] ,
                  
                  rfc__random_state= [0]
                  )

grid3 = GridSearchCV(pipe3, param_grid=param_grid3, cv=10, n_jobs=1, scoring= 'accuracy')
grid3.fit(X_train, y_train)
print()
print()
print(f'The score ***********   {grid3.score(X_test,y_test)}')
print()
print()
print(grid3.cv_results_)
print()
print()
print(f'The best estimator is --------->>   {grid3.best_estimator_}')

print()
print()

importances=grid3.best_estimator_.named_steps["rfc"].feature_importances_


indices=np.argsort(importances)[::-1]

print(f' importances  --->   {importances}')
print()
print(f' indices  --->   {indices}')

print()
importantFeature=grid3.best_estimator_.named_steps["rfc"].feature_importances_

df_show=pd.DataFrame(importantFeature,index=features_without_target,columns=['importance']).sort_values('importance',ascending=False)
df_show.head(18)






The score ***********   0.9708333333333333


{'mean_fit_time': array([0.3314218 , 0.42307487]), 'std_fit_time': array([0.01306596, 0.01050217]), 'mean_score_time': array([0.00640571, 0.00646398]), 'std_score_time': array([0.00400664, 0.00447274]), 'param_rfc__criterion': masked_array(data=['gini', 'entropy'],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'param_rfc__random_state': masked_array(data=[0, 0],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'params': [{'rfc__criterion': 'gini', 'rfc__random_state': 0}, {'rfc__criterion': 'entropy', 'rfc__random_state': 0}], 'split0_test_score': array([0.98928571, 0.98928571]), 'split1_test_score': array([0.975     , 0.97857143]), 'split2_test_score': array([0.97857143, 0.97142857]), 'split3_test_score': array([0.975     , 0.96785714]), 'split4_test_score': array([0.98571429, 0.98928571]), 'split5_test_score': array([0.97857143, 0.97857143]), 'split6_test_score': ar

Unnamed: 0,importance
Packet Length Std,0.0582
Avg Bwd Segment Size,0.052293
Init_Win_bytes_forward,0.049856
Packet Length Mean,0.045814
Bwd Packet Length Std,0.038669
Bwd Packet Length Mean,0.038214
Average Packet Size,0.036888
Init_Win_bytes_backward,0.034483
Fwd Header Length.1,0.031719
Bwd Packet Length Max,0.031376


In [127]:
selected=[]
for i in range (0,18):
  selected.append(features_without_target[indices[i]])
selected
features_map["random_forest_feature_importance"]= selected

In [128]:
### use the above features... Tuesday

X_train_pi=X_train[selected]
X_train_pi.shape
X_test_pi=X_test[selected]
################################################

pipe3 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfc', RandomForestClassifier())
        ])

param_grid3 = dict(
                  
                  rfc__random_state=[0],
                  rfc__criterion=['gini'] 
                   
                  )

grid3 = GridSearchCV(pipe3, param_grid=param_grid3, cv=10)
grid3.fit(X_train_pi, y_train)
print(f'The score ***********   {grid3.score(X_test_pi,y_test)}')
#print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid3.best_estimator_}')



The score ***********   0.9833333333333333
The best estimator is --------->>   Pipeline(steps=[('rfc', RandomForestClassifier(random_state=0))])


In [129]:
## Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

pipe4 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfe', RFE(RandomForestClassifier())),
        ('rfc', RandomForestClassifier())
        ])

param_grid4 = dict(
                  
                  rfe__n_features_to_select=[18,5,25],
                  rfe__step=[0.5] 
                   
                  )

grid4 = GridSearchCV(pipe4, param_grid=param_grid4, cv=10)
grid4.fit(X_train, y_train)
print(f'The score ***********   {grid4.score(X_test,y_test)}')
#print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid4.best_estimator_}')


#print(f'{grid4.best_estimator_.named_steps["rfe"].support_}')

toSelect=grid4.best_estimator_.named_steps["rfe"].support_
idx=0
print()
print()
selected=[]
for i in toSelect:
  if(i== True):
    print(f'--- > {features_without_target[idx]}')
    selected.append(features_without_target[idx])

  idx=idx+1  

print(f'---selected RFE columns    {selected}')
features_map["recursive_feature_elimination"]=selected




The score ***********   0.9816666666666667
The best estimator is --------->>   Pipeline(steps=[('rfe',
                 RFE(estimator=RandomForestClassifier(),
                     n_features_to_select=25, step=0.5)),
                ('rfc', RandomForestClassifier())])


--- > Total Fwd Packets
--- > Total Backward Packets
--- > Total Length of Fwd Packets
--- > Fwd Packet Length Max
--- > Fwd Packet Length Mean
--- > Bwd Packet Length Max
--- > Bwd Packet Length Mean
--- > Bwd Packet Length Std
--- > Fwd Header Length
--- > Bwd Header Length
--- > Bwd Packets/s
--- > Max Packet Length
--- > Packet Length Mean
--- > Packet Length Std
--- > Packet Length Variance
--- > PSH Flag Count
--- > Average Packet Size
--- > Avg Fwd Segment Size
--- > Avg Bwd Segment Size
--- > Fwd Header Length.1
--- > Subflow Fwd Packets
--- > Subflow Fwd Bytes
--- > Subflow Bwd Bytes
--- > Init_Win_bytes_forward
--- > Init_Win_bytes_backward
---selected RFE columns    ['Total Fwd Packets', 'Total Backward Pack

In [130]:
X_train_rfe=X_train[selected]
X_train_rfe.shape
X_test_rfe=X_test[selected]
################################################

pipe4 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfc', RandomForestClassifier())
        ])

param_grid4 = dict(
                  
                  rfc__random_state=[0],
                  rfc__criterion=['gini'] 
                   
                  )

grid4 = GridSearchCV(pipe4, param_grid=param_grid4, cv=10)
grid4.fit(X_train_rfe, y_train)
print(f'The score ***********   {grid4.score(X_test_rfe,y_test)}')

print(f'The best estimator is --------->>   {grid4.best_estimator_}')





The score ***********   0.9833333333333333
The best estimator is --------->>   Pipeline(steps=[('rfc', RandomForestClassifier(random_state=0))])


In [131]:
#Permutation importance
from sklearn.inspection import permutation_importance
forest = RandomForestClassifier(n_estimators=200,random_state=0)

forest.fit(X_train,y_train)
result =permutation_importance(
    estimator=forest,
    X=X_test,
    y=y_test,
    scoring='accuracy',
    n_repeats=50,
    random_state=0
)

importances=result.importances


indices=np.argsort(result['importances_mean'])[::-1]

print(f' indices  --->   {indices}')




 indices  --->   [65 66 23  9  4 11 34 53 50 17 64 51  6 15  1 36 67 63  5 24 25 33 10 54
 37 73  8 75 76  2 12 16 72 43 26 74 56 29 30 31 32 57 60 44 71 58 59 70
 49 69 48 55  0 61 42 19 45 38 20 68  3 47 41 27 18 52 14 46 22 62  7 28
 35 21 40 13 39]


In [132]:
## Top 18 attributes

selected=[]
for i in range(0,18):
  selected.append(features_without_target[indices[i]])
print (selected)  


['Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'Fwd IAT Min', 'Bwd Packet Length Max', 'Total Length of Bwd Packets', 'Bwd Packet Length Mean', 'Bwd Header Length', 'Avg Bwd Segment Size', 'Down/Up Ratio', 'Flow IAT Max', 'Subflow Bwd Bytes', 'Average Packet Size', 'Fwd Packet Length Min', 'Flow IAT Mean', 'Total Fwd Packets', 'Bwd Packets/s', 'act_data_pkt_fwd', 'Subflow Bwd Packets']


In [133]:
features_map["permutation_importance"] = selected

In [134]:
X_train_pi=X_train[selected]
X_train_pi.shape
X_test_pi=X_test[selected]
################################################

pipe5 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfc', RandomForestClassifier())
        ])

param_grid5 = dict(
                  
                  rfc__random_state=[0],
                  rfc__criterion=['gini'] 
                   
                  )

grid5 = GridSearchCV(pipe5, param_grid=param_grid5, cv=10)
grid5.fit(X_train_pi, y_train)
print(f'The score ***********   {grid5.score(X_test_pi,y_test)}')
#print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid5.best_estimator_}')



The score ***********   0.9766666666666667
The best estimator is --------->>   Pipeline(steps=[('rfc', RandomForestClassifier(random_state=0))])


In [135]:
# for j in gaures:
#     for idx, x in enumerate(df.columns):
#         if idx == j:
#             print(x)
    

In [136]:
print("")
print(features_map)


{'variance_threshold': ['Flow Duration', 'Bwd Packet Length Max', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow IAT Std', 'Flow IAT Max', 'Fwd IAT Total', 'Fwd IAT Std', 'Fwd IAT Max', 'Bwd IAT Total', 'Bwd IAT Std', 'Bwd IAT Max', 'Fwd PSH Flags', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'Average Packet Size', 'Avg Bwd Segment Size', 'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'min_seg_size_forward', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min'], 'lasso': array(['Fwd IAT Max', 'PSH Flag Count', 'ACK Flag Count'], dtype='<U27'), 'random_forest_feature_importance': ['Packet Length Std', 'Avg Bwd Segment Size', 'Init_Win_bytes_forward', 'Packet Length Mean', 'Bwd Packet Length Std', 'Bwd Packet Length Mean', 'Average Packet Size', 'Init_Win_bytes_backward', 'Fwd Header Length.1', 'Bwd Packet Length Max', 'Max Packet Length', 'P

In [140]:
## write extracted columns from different selection into files
f = open("data/features_with_all_labels.txt", "a")
for i in features_map:
    f.write("\n")
    f.write(i)
    f.write("------------------")
    f.write(",".join(features_map[i]))
f.close()