In [82]:
## all the imports
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [83]:
## read the packet dataset
df = pd.read_csv('./data/CICIDS2017_sample.csv')
features_map = {}

In [84]:
#priniting the dataset
df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,4,2,0,37,0,31,6,18.500000,17.677670,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,142377,46,62,1325,105855,570,0,28.804348,111.407285,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,118873,23,28,1169,45025,570,0,50.826087,156.137367,2896,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,143577,43,55,1301,107289,570,0,30.255814,115.178969,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,143745,49,59,1331,110185,570,0,27.163265,108.067176,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,234,2,2,64,232,32,32,32.000000,0.000000,116,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
56657,133288,2,2,94,482,47,47,47.000000,0.000000,241,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
56658,11507694,5,4,450,3525,450,0,90.000000,201.246118,3525,...,32,893.0,0.0,893,893,6503640.0,0.0,6503640,6503640,DoS
56659,11507707,8,6,416,11632,416,0,52.000000,147.078211,5792,...,32,897.0,0.0,897,897,6503122.0,0.0,6503122,6503122,DoS


In [85]:
## finding the unique labels/targets
df['Label'].unique()

array(['BENIGN', 'DoS', 'PortScan', 'Bot', 'Infiltration', 'WebAttack',
       'BruteForce'], dtype=object)

In [86]:
## handle inf values, first replace with NaN
## replace empty/infinity cells with mean values
df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [87]:
## missing values in the following columns and imputing with mean
df.isnull().sum()
for col in df.columns:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].mean())
        print(col)

Flow Bytes/s
Flow Packets/s


In [88]:
# feature without Label/target
features_without_target = list(df.dtypes[df.dtypes != 'object'].index)
features_without_target
for cl in features_without_target:
    print(f'column is {cl} ----- min value is  {df[cl].min()} :::::  max value is   {df[cl].max()}')

column is Flow Duration ----- min value is  0 :::::  max value is   119999998
column is Total Fwd Packets ----- min value is  1 :::::  max value is   6930
column is Total Backward Packets ----- min value is  0 :::::  max value is   9877
column is Total Length of Fwd Packets ----- min value is  0 :::::  max value is   2866110
column is Total Length of Bwd Packets ----- min value is  0 :::::  max value is   21500000
column is Fwd Packet Length Max ----- min value is  0 :::::  max value is   23360
column is Fwd Packet Length Min ----- min value is  0 :::::  max value is   1983
column is Fwd Packet Length Mean ----- min value is  0.0 :::::  max value is   5940.857143
column is Fwd Packet Length Std ----- min value is  0.0 :::::  max value is   7049.469004000001
column is Bwd Packet Length Max ----- min value is  0 :::::  max value is   11632
column is Bwd Packet Length Min ----- min value is  0 :::::  max value is   1448
column is Bwd Packet Length Mean ----- min value is  0.0 :::::  max v

In [89]:
# Assigning numerical Label values with LabelEncoder
labelencoder = LabelEncoder()
df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])

  df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])


In [90]:
df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,4,2,0,37,0,31,6,18.500000,17.677670,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
1,142377,46,62,1325,105855,570,0,28.804348,111.407285,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
2,118873,23,28,1169,45025,570,0,50.826087,156.137367,2896,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
3,143577,43,55,1301,107289,570,0,30.255814,115.178969,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
4,143745,49,59,1331,110185,570,0,27.163265,108.067176,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,234,2,2,64,232,32,32,32.000000,0.000000,116,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
56657,133288,2,2,94,482,47,47,47.000000,0.000000,241,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
56658,11507694,5,4,450,3525,450,0,90.000000,201.246118,3525,...,32,893.0,0.0,893,893,6503640.0,0.0,6503640,6503640,3
56659,11507707,8,6,416,11632,416,0,52.000000,147.078211,5792,...,32,897.0,0.0,897,897,6503122.0,0.0,6503122,6503122,3


In [91]:
## Scale the dataset, until now all the missing and inf handled
scaling = MinMaxScaler()
df_scaled = pd.DataFrame(scaling.fit_transform(df[features_without_target]))
df_scaled.columns=features_without_target
df_scaled['Label']=df['Label']

In [92]:
## Min man scaled dataset
df_scaled

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,3.333333e-08,0.000144,0.000000,0.000013,0.000000,0.001327,0.003026,0.003114,0.002508,0.000000,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0
1,1.186475e-03,0.006494,0.006277,0.000462,0.004923,0.024401,0.000000,0.004849,0.015804,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0
2,9.906083e-04,0.003175,0.002835,0.000408,0.002094,0.024401,0.000000,0.008555,0.022149,0.248968,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0
3,1.196475e-03,0.006061,0.005568,0.000454,0.004990,0.024401,0.000000,0.005093,0.016339,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0
4,1.197875e-03,0.006927,0.005973,0.000464,0.005125,0.024401,0.000000,0.004572,0.015330,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,1.950000e-06,0.000144,0.000202,0.000022,0.000011,0.001370,0.016137,0.005386,0.000000,0.009972,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0
56657,1.110733e-03,0.000144,0.000202,0.000033,0.000022,0.002012,0.023701,0.007911,0.000000,0.020719,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0
56658,9.589745e-02,0.000577,0.000405,0.000157,0.000164,0.019264,0.000000,0.015149,0.028548,0.303043,...,0.533333,0.000009,0.0,0.000009,0.000009,0.054652,0.0,0.054652,0.054652,3
56659,9.589756e-02,0.001010,0.000607,0.000145,0.000541,0.017808,0.000000,0.008753,0.020864,0.497937,...,0.533333,0.000009,0.0,0.000009,0.000009,0.054648,0.0,0.054648,0.054648,3


In [93]:
## after label encoder, printing the label/target values
df_scaled['Label'].unique()

array([0, 3, 5, 1, 4, 6, 2])

In [94]:
## write the scaled doc for testing
df_scaled.to_csv('./data/minmaxscaled_test.csv',index=0)

In [95]:
## 4000 samples
##using the 4000 samples to work on
df_sample = df_scaled.sample(n=4000)
df_sample.shape

(4000, 78)

In [96]:
## storing the df as csv to work in classification
df_sample.to_csv('./data/final_data_all_labels.csv',index=0)

In [97]:
## the df_sample datatset with 4000 rows
df_sample

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
48618,7.121764e-01,0.001010,0.000607,0.000122,5.393023e-04,0.015026,0.000000,0.007385,0.017604,0.497937,...,0.533333,0.000020,0.0,0.000020,0.000020,0.716807,0.000000,0.716807,0.716807,3
19776,1.100000e-06,0.000144,0.000000,0.000013,0.000000e+00,0.001584,0.000000,0.003114,0.003711,0.000000,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
36899,1.952333e-04,0.000000,0.000101,0.000016,3.581395e-06,0.001926,0.022693,0.007575,0.000000,0.006620,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
27553,4.452895e-01,0.000144,0.000202,0.000036,1.381395e-05,0.002483,0.023197,0.008753,0.001204,0.015990,...,0.533333,0.000307,0.0,0.000307,0.000307,0.448739,0.000000,0.448739,0.448739,0
1024,6.053253e-01,0.001010,0.000506,0.000020,5.398605e-04,0.000856,0.000000,0.001178,0.000802,0.753095,...,0.333333,0.000010,0.0,0.000010,0.000010,0.305042,0.586006,0.543697,0.065990,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26075,1.128042e-03,0.000000,0.000101,0.000016,6.651163e-06,0.001926,0.022693,0.007575,0.000000,0.012294,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
14569,4.416667e-07,0.000000,0.000101,0.000000,2.790698e-07,0.000000,0.000000,0.000000,0.000000,0.000516,...,0.666667,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,5
1854,4.665850e-03,0.000289,0.000607,0.000009,5.398605e-04,0.000856,0.000000,0.001459,0.001456,0.494756,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3
45981,7.134503e-01,0.000866,0.000709,0.000119,5.393023e-04,0.013870,0.000000,0.008224,0.017216,0.746905,...,0.333333,0.000109,0.0,0.000109,0.000109,0.710924,0.000000,0.710924,0.710924,3


In [98]:
df_sample['Label'].unique()

array([3, 0, 5, 2, 1, 6, 4])

### apply random forest classification on the 4000 dataset

In [99]:
## Result without any feature selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

y=df_sample['Label']
X=df_sample.drop('Label', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30,random_state=1)

clf = RandomForestClassifier(random_state=0) 
 

clf.fit(X_train, y_train)
 

y_pred = clf.predict(X_test)
 

from sklearn import metrics 
print()
 

print("ACCURACY OF THE MODEL without any feature selection: ", metrics.accuracy_score(y_test, y_pred))


ACCURACY OF THE MODEL without any feature selection:  0.9791666666666666


In [100]:
y_test.unique()

array([0, 3, 5, 2, 1, 6, 4])

### Feature selection process phase, From now on we will be using different feature selection algorithm to reduce the number
### of columns.
#### The algorithms are variance threshold, lasso regression, random forest classifer, recursive feature elimination, permutation importance
#### From the output of the feature extraction we construct text files,with important column names 

In [101]:
## Variance Threshold
##VARIANCE THRESHOLD

pipe1 = Pipeline([
        # ('scale', MinMaxScaler()),
        ('vt', VarianceThreshold()),
        ('rfc',  RandomForestClassifier())])

param_grid1 = dict(
                  vt__threshold=[0,0.01,0.02,0.03,0.04,0.05],
                  rfc__random_state=[0] 
                  )

grid1 = GridSearchCV(pipe1, param_grid=param_grid1, cv=10, n_jobs=1, scoring= 'accuracy')
grid1.fit(X_train, y_train)
print(f'The score ***********   {grid1.score(X_test,y_test)}')
print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid1.best_estimator_}')



The score ***********   0.9816666666666667
{'mean_fit_time': array([0.37598407, 0.27855189, 0.24361439, 0.2425451 , 0.25552602,
       0.2214859 ]), 'std_fit_time': array([0.0086724 , 0.0127082 , 0.0044203 , 0.00747251, 0.00551536,
       0.0042889 ]), 'mean_score_time': array([0.01128902, 0.00932393, 0.00755963, 0.00716169, 0.00748196,
       0.00817995]), 'std_score_time': array([0.00625438, 0.00413472, 0.00487273, 0.00262347, 0.00323954,
       0.00383319]), 'param_rfc__random_state': masked_array(data=[0, 0, 0, 0, 0, 0],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_vt__threshold': masked_array(data=[0, 0.01, 0.02, 0.03, 0.04, 0.05],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'rfc__random_state': 0, 'vt__threshold': 0}, {'rfc__random_state': 0, 'vt__threshold': 0.01}, {'rfc__random_state': 0, 'vt__threshold': 0.02}, {'rfc__random_

In [102]:
grid1.best_estimator_.named_steps["vt"].get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True, False, False,  True,  True,  True,  True,
        True, False, False, False, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])

In [103]:
## selecting the important columns
selected=[]
idx=0
toSelect=grid1.best_estimator_.named_steps["vt"].get_support()
for i in toSelect:
  if(i == True):
    print(f'--- > {features_without_target[idx]}')
    selected.append(features_without_target[idx])

  idx=idx+1  

print(f'---selected variance threshold columns    {selected}')
print(f" number of features selected by variance threshold -->> {len(selected)}")
features_map["variance_threshold"] = selected 

--- > Flow Duration
--- > Total Fwd Packets
--- > Total Backward Packets
--- > Total Length of Fwd Packets
--- > Total Length of Bwd Packets
--- > Fwd Packet Length Max
--- > Fwd Packet Length Min
--- > Fwd Packet Length Mean
--- > Fwd Packet Length Std
--- > Bwd Packet Length Max
--- > Bwd Packet Length Min
--- > Bwd Packet Length Mean
--- > Bwd Packet Length Std
--- > Flow Bytes/s
--- > Flow Packets/s
--- > Flow IAT Mean
--- > Flow IAT Std
--- > Flow IAT Max
--- > Flow IAT Min
--- > Fwd IAT Total
--- > Fwd IAT Mean
--- > Fwd IAT Std
--- > Fwd IAT Max
--- > Fwd IAT Min
--- > Bwd IAT Total
--- > Bwd IAT Mean
--- > Bwd IAT Std
--- > Bwd IAT Max
--- > Bwd IAT Min
--- > Fwd PSH Flags
--- > Fwd Header Length
--- > Bwd Header Length
--- > Fwd Packets/s
--- > Bwd Packets/s
--- > Min Packet Length
--- > Max Packet Length
--- > Packet Length Mean
--- > Packet Length Std
--- > Packet Length Variance
--- > FIN Flag Count
--- > SYN Flag Count
--- > PSH Flag Count
--- > ACK Flag Count
--- > URG Fl

In [104]:
y_train

31211    6
56355    0
35351    0
35436    0
24257    0
        ..
5583     3
21481    0
14298    5
46926    3
8620     5
Name: Label, Length: 2800, dtype: int32

In [105]:
##Lasso
## LASSO
from sklearn.linear_model import Lasso

pipe2 = Pipeline([
        # ('scale', MinMaxScaler()),
        ('lasso', Lasso())])


param_grid2 = dict(
                  lasso__alpha=np.arange(0.1,10,0.1)
                   
                  )

grid2 = GridSearchCV(pipe2, param_grid=param_grid2, cv=10, n_jobs=1, scoring= "neg_mean_squared_error")
grid2.fit(X_train, y_train)


print(f'The best estimator is --------->>   {grid2.best_estimator_}')
print()
print()
print(f' best params --->>  {grid2.best_params_}')
print()
coefficients = grid2.best_estimator_.named_steps['lasso'].coef_

print(f' the coefficients --->>  {coefficients}')


importance = np.abs(coefficients)

impr_features=np.array(features_without_target)[importance > 0]
print()
print()


print(f' important features   --->>  {impr_features}')

print()
print()

redundant_features=np.array(features_without_target)[importance == 0]
print(f' redundant features --->>  {redundant_features}')


The best estimator is --------->>   Pipeline(steps=[('lasso', Lasso(alpha=0.1))])


 best params --->>  {'lasso__alpha': 0.1}

 the coefficients --->>  [ 0.         -0.         -0.          0.         -0.         -0.
 -0.         -0.         -0.          0.         -0.          0.
  0.         -0.          0.          0.          0.          0.
 -0.          0.          0.          0.          0.49420892 -0.
 -0.         -0.          0.          0.         -0.         -0.
  0.          0.          0.         -0.         -0.          0.
  0.         -0.          0.          0.          0.          0.
  0.         -0.          0.          1.32358488  0.17071758 -0.
  0.          0.         -0.          0.         -0.          0.
 -0.          0.          0.          0.          0.          0.
  0.         -0.          0.         -0.         -0.          0.
  0.         -0.          0.         -0.         -0.         -0.
 -0.          0.          0.          0.          0.        ]


 imp

In [106]:
features_map["lasso"]=impr_features

In [107]:
### Now using random forest using the important features

df_lasso=df_sample
df_lasso = df_lasso.drop(redundant_features, axis=1)

#df_lasso.shape
#df_lasso.head()

y=df_lasso['Label']
X=df_lasso.drop('Label', axis=1)
X_train_ll, X_test_ll, y_train_ll, y_test_ll = train_test_split(X, y, test_size = 0.30,random_state=1)

clf = RandomForestClassifier(random_state=0) 

clf.fit(X_train_ll, y_train_ll)

y_pred = clf.predict(X_test_ll)
 

from sklearn import metrics 
print()
 

print("ACCURACY OF THE MODEL with lasso features: ", metrics.accuracy_score(y_test_ll, y_pred))


ACCURACY OF THE MODEL with lasso features:  0.8333333333333334


In [108]:
df_lasso

Unnamed: 0,Fwd IAT Max,PSH Flag Count,ACK Flag Count,Label
48618,0.716807,0.0,1.0,3
19776,0.000001,0.0,1.0,0
36899,0.000000,0.0,0.0,0
27553,0.448739,0.0,0.0,0
1024,0.543697,0.0,1.0,3
...,...,...,...,...
26075,0.000000,0.0,0.0,0
14569,0.000000,1.0,0.0,5
1854,0.000005,1.0,0.0,3
45981,0.710924,0.0,0.0,3


In [109]:
### Random forest feature importance



pipe3 = Pipeline([
        # ('scale', MinMaxScaler()),
        ('rfc',  RandomForestClassifier())])

param_grid3 = dict(
                  
                  rfc__criterion = ['gini','entropy'] ,
                  
                  rfc__random_state= [0]
                  )

grid3 = GridSearchCV(pipe3, param_grid=param_grid3, cv=10, n_jobs=1, scoring= 'accuracy')
grid3.fit(X_train, y_train)
print()
print()
print(f'The score ***********   {grid3.score(X_test,y_test)}')
print()
print()
print(grid3.cv_results_)
print()
print()
print(f'The best estimator is --------->>   {grid3.best_estimator_}')

print()
print()

importances=grid3.best_estimator_.named_steps["rfc"].feature_importances_


indices=np.argsort(importances)[::-1]

print(f' importances  --->   {importances}')
print()
print(f' indices  --->   {indices}')

print()
importantFeature=grid3.best_estimator_.named_steps["rfc"].feature_importances_

df_show=pd.DataFrame(importantFeature,index=features_without_target,columns=['importance']).sort_values('importance',ascending=False)
df_show.head(18)






The score ***********   0.9791666666666666


{'mean_fit_time': array([0.32455561, 0.42943323]), 'std_fit_time': array([0.00799581, 0.01555941]), 'mean_score_time': array([0.00660098, 0.00674379]), 'std_score_time': array([0.00405219, 0.00420019]), 'param_rfc__criterion': masked_array(data=['gini', 'entropy'],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'param_rfc__random_state': masked_array(data=[0, 0],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'params': [{'rfc__criterion': 'gini', 'rfc__random_state': 0}, {'rfc__criterion': 'entropy', 'rfc__random_state': 0}], 'split0_test_score': array([0.99642857, 0.99642857]), 'split1_test_score': array([0.98214286, 0.97857143]), 'split2_test_score': array([0.99642857, 0.99285714]), 'split3_test_score': array([0.97857143, 0.97857143]), 'split4_test_score': array([0.98928571, 0.98928571]), 'split5_test_score': array([0.98214286, 0.975     ]), 'split6_test_score': ar

Unnamed: 0,importance
Packet Length Std,0.066987
Init_Win_bytes_forward,0.054504
Packet Length Mean,0.046386
Bwd Packet Length Mean,0.04508
Avg Bwd Segment Size,0.042466
Bwd Packet Length Max,0.034295
Fwd Header Length.1,0.033401
Bwd Packet Length Std,0.03333
Average Packet Size,0.033146
Init_Win_bytes_backward,0.032556


In [110]:
selected=[]
for i in range (0,18):
  selected.append(features_without_target[indices[i]])
selected
features_map["random_forest_feature_importance"]= selected

In [111]:
### use the above features... Tuesday

X_train_pi=X_train[selected]
X_train_pi.shape
X_test_pi=X_test[selected]
################################################

pipe3 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfc', RandomForestClassifier())
        ])

param_grid3 = dict(
                  
                  rfc__random_state=[0],
                  rfc__criterion=['gini'] 
                   
                  )

grid3 = GridSearchCV(pipe3, param_grid=param_grid3, cv=10)
grid3.fit(X_train_pi, y_train)
print(f'The score ***********   {grid3.score(X_test_pi,y_test)}')
#print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid3.best_estimator_}')



The score ***********   0.9908333333333333
The best estimator is --------->>   Pipeline(steps=[('rfc', RandomForestClassifier(random_state=0))])


In [112]:
## Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

pipe4 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfe', RFE(RandomForestClassifier())),
        ('rfc', RandomForestClassifier())
        ])

param_grid4 = dict(
                  
                  rfe__n_features_to_select=[18,5,25],
                  rfe__step=[0.5] 
                   
                  )

grid4 = GridSearchCV(pipe4, param_grid=param_grid4, cv=10)
grid4.fit(X_train, y_train)
print(f'The score ***********   {grid4.score(X_test,y_test)}')
#print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid4.best_estimator_}')


#print(f'{grid4.best_estimator_.named_steps["rfe"].support_}')

toSelect=grid4.best_estimator_.named_steps["rfe"].support_
idx=0
print()
print()
selected=[]
for i in toSelect:
  if(i== True):
    print(f'--- > {features_without_target[idx]}')
    selected.append(features_without_target[idx])

  idx=idx+1  

print(f'---selected RFE columns    {selected}')
features_map["recursive_feature_elimination"]=selected




The score ***********   0.9891666666666666
The best estimator is --------->>   Pipeline(steps=[('rfe',
                 RFE(estimator=RandomForestClassifier(),
                     n_features_to_select=18, step=0.5)),
                ('rfc', RandomForestClassifier())])


--- > Total Length of Fwd Packets
--- > Fwd Packet Length Max
--- > Bwd Packet Length Max
--- > Bwd Packet Length Min
--- > Bwd Packet Length Mean
--- > Bwd Packet Length Std
--- > Fwd Header Length
--- > Bwd Packets/s
--- > Max Packet Length
--- > Packet Length Mean
--- > Packet Length Std
--- > Packet Length Variance
--- > Average Packet Size
--- > Avg Bwd Segment Size
--- > Fwd Header Length.1
--- > Subflow Fwd Bytes
--- > Init_Win_bytes_forward
--- > Init_Win_bytes_backward
---selected RFE columns    ['Total Length of Fwd Packets', 'Fwd Packet Length Max', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Fwd Header Length', 'Bwd Packets/s', 'Max Packet Length', '

In [113]:
X_train_rfe=X_train[selected]
X_train_rfe.shape
X_test_rfe=X_test[selected]
################################################

pipe4 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfc', RandomForestClassifier())
        ])

param_grid4 = dict(
                  
                  rfc__random_state=[0],
                  rfc__criterion=['gini'] 
                   
                  )

grid4 = GridSearchCV(pipe4, param_grid=param_grid4, cv=10)
grid4.fit(X_train_rfe, y_train)
print(f'The score ***********   {grid4.score(X_test_rfe,y_test)}')

print(f'The best estimator is --------->>   {grid4.best_estimator_}')





The score ***********   0.99
The best estimator is --------->>   Pipeline(steps=[('rfc', RandomForestClassifier(random_state=0))])


In [114]:
#Permutation importance
from sklearn.inspection import permutation_importance
forest = RandomForestClassifier(n_estimators=200,random_state=0)

forest.fit(X_train,y_train)
result =permutation_importance(
    estimator=forest,
    X=X_test,
    y=y_test,
    scoring='accuracy',
    n_repeats=50,
    random_state=0
)

importances=result.importances


indices=np.argsort(result['importances_mean'])[::-1]

print(f' indices  --->   {indices}')




 indices  --->   [65 23 66  2 61 28 67 12  1 22 63  8 19 38 18 68 76 59 44 72 71 70 69 27
 60 58 30 31 32 43 29 48 57 56 55 49 75 73 74  4 34 26  0 64 47 50 45 52
 20 24 62 16 46 53 21 25 37 39 42  9  3 11 10  5 17  6 15 51  7 40 41 33
 54 36 13 14 35]


In [115]:
## Top 18 attributes

selected=[]
for i in range(0,18):
  selected.append(features_without_target[indices[i]])
print (selected)  


['Init_Win_bytes_forward', 'Fwd IAT Min', 'Init_Win_bytes_backward', 'Total Backward Packets', 'Subflow Fwd Packets', 'Bwd IAT Min', 'act_data_pkt_fwd', 'Bwd Packet Length Std', 'Total Fwd Packets', 'Fwd IAT Max', 'Subflow Bwd Packets', 'Fwd Packet Length Std', 'Fwd IAT Total', 'Max Packet Length', 'Flow IAT Min', 'min_seg_size_forward', 'Idle Min', 'Bwd Avg Packets/Bulk']


In [116]:
features_map["permutation_importance"] = selected

In [117]:
X_train_pi=X_train[selected]
X_train_pi.shape
X_test_pi=X_test[selected]
################################################

pipe5 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfc', RandomForestClassifier())
        ])

param_grid5 = dict(
                  
                  rfc__random_state=[0],
                  rfc__criterion=['gini'] 
                   
                  )

grid5 = GridSearchCV(pipe5, param_grid=param_grid5, cv=10)
grid5.fit(X_train_pi, y_train)
print(f'The score ***********   {grid5.score(X_test_pi,y_test)}')
#print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid5.best_estimator_}')



The score ***********   0.99
The best estimator is --------->>   Pipeline(steps=[('rfc', RandomForestClassifier(random_state=0))])


In [118]:
# for j in gaures:
#     for idx, x in enumerate(df.columns):
#         if idx == j:
#             print(x)
    

In [119]:
print("")
print(features_map)


{'variance_threshold': ['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'Down/Up Ratio', 'Average Packet Size', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Fwd Heade

In [120]:
## write extracted columns from different selection into files
f = open("data/features_with_all_labels.txt","w")
for i in features_map:
    f.write("\n")
    f.write(i)
    f.write("------------------")
    f.write(",".join(features_map[i]))
f.close()