In [64]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [65]:
df = pd.read_csv('./data/CICIDS2017_sample.csv')
features_map = {}

In [66]:
df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,4,2,0,37,0,31,6,18.500000,17.677670,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,142377,46,62,1325,105855,570,0,28.804348,111.407285,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,118873,23,28,1169,45025,570,0,50.826087,156.137367,2896,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,143577,43,55,1301,107289,570,0,30.255814,115.178969,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,143745,49,59,1331,110185,570,0,27.163265,108.067176,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,234,2,2,64,232,32,32,32.000000,0.000000,116,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
56657,133288,2,2,94,482,47,47,47.000000,0.000000,241,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
56658,11507694,5,4,450,3525,450,0,90.000000,201.246118,3525,...,32,893.0,0.0,893,893,6503640.0,0.0,6503640,6503640,DoS
56659,11507707,8,6,416,11632,416,0,52.000000,147.078211,5792,...,32,897.0,0.0,897,897,6503122.0,0.0,6503122,6503122,DoS


In [67]:
df['Label'].unique()

array(['BENIGN', 'DoS', 'PortScan', 'Bot', 'Infiltration', 'WebAttack',
       'BruteForce'], dtype=object)

In [68]:
## handle inf values, first replace with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [69]:
df.replace(['DoS', 'PortScan', 'Bot', 'Infiltration', 'WebAttack',
       'BruteForce'], "Attack",inplace=True)

In [70]:
df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,4,2,0,37,0,31,6,18.500000,17.677670,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,142377,46,62,1325,105855,570,0,28.804348,111.407285,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,118873,23,28,1169,45025,570,0,50.826087,156.137367,2896,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,143577,43,55,1301,107289,570,0,30.255814,115.178969,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,143745,49,59,1331,110185,570,0,27.163265,108.067176,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,234,2,2,64,232,32,32,32.000000,0.000000,116,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
56657,133288,2,2,94,482,47,47,47.000000,0.000000,241,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
56658,11507694,5,4,450,3525,450,0,90.000000,201.246118,3525,...,32,893.0,0.0,893,893,6503640.0,0.0,6503640,6503640,Attack
56659,11507707,8,6,416,11632,416,0,52.000000,147.078211,5792,...,32,897.0,0.0,897,897,6503122.0,0.0,6503122,6503122,Attack


In [71]:
## missing values in the following columns and imputing with mean
df.isnull().sum()
for col in df.columns:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].mean())
        print(col)

Flow Bytes/s
Flow Packets/s


In [72]:
# feature without Label/target
features_without_target = list(df.dtypes[df.dtypes != 'object'].index)
features_without_target
for cl in features_without_target:
    print(f'column is {cl} ----- min value is  {df[cl].min()} :::::  max value is   {df[cl].max()}')

column is Flow Duration ----- min value is  0 :::::  max value is   119999998
column is Total Fwd Packets ----- min value is  1 :::::  max value is   6930
column is Total Backward Packets ----- min value is  0 :::::  max value is   9877
column is Total Length of Fwd Packets ----- min value is  0 :::::  max value is   2866110
column is Total Length of Bwd Packets ----- min value is  0 :::::  max value is   21500000
column is Fwd Packet Length Max ----- min value is  0 :::::  max value is   23360
column is Fwd Packet Length Min ----- min value is  0 :::::  max value is   1983
column is Fwd Packet Length Mean ----- min value is  0.0 :::::  max value is   5940.857143
column is Fwd Packet Length Std ----- min value is  0.0 :::::  max value is   7049.469004000001
column is Bwd Packet Length Max ----- min value is  0 :::::  max value is   11632
column is Bwd Packet Length Min ----- min value is  0 :::::  max value is   1448
column is Bwd Packet Length Mean ----- min value is  0.0 :::::  max v

In [73]:
# Assigning numerical Label values with LabelEncoder
labelencoder = LabelEncoder()
df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])

  df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])


In [74]:
df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,4,2,0,37,0,31,6,18.500000,17.677670,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,1
1,142377,46,62,1325,105855,570,0,28.804348,111.407285,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,1
2,118873,23,28,1169,45025,570,0,50.826087,156.137367,2896,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1
3,143577,43,55,1301,107289,570,0,30.255814,115.178969,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,1
4,143745,49,59,1331,110185,570,0,27.163265,108.067176,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,234,2,2,64,232,32,32,32.000000,0.000000,116,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1
56657,133288,2,2,94,482,47,47,47.000000,0.000000,241,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1
56658,11507694,5,4,450,3525,450,0,90.000000,201.246118,3525,...,32,893.0,0.0,893,893,6503640.0,0.0,6503640,6503640,0
56659,11507707,8,6,416,11632,416,0,52.000000,147.078211,5792,...,32,897.0,0.0,897,897,6503122.0,0.0,6503122,6503122,0


In [75]:
## Scale the dataset, until now all the missing and inf handled
scaling = MinMaxScaler()
df_scaled = pd.DataFrame(scaling.fit_transform(df[features_without_target]))
df_scaled_pca = df_scaled 
df_scaled.columns=features_without_target
df_scaled['Label']=df['Label']

In [76]:
df_scaled

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,3.333333e-08,0.000144,0.000000,0.000013,0.000000,0.001327,0.003026,0.003114,0.002508,0.000000,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
1,1.186475e-03,0.006494,0.006277,0.000462,0.004923,0.024401,0.000000,0.004849,0.015804,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
2,9.906083e-04,0.003175,0.002835,0.000408,0.002094,0.024401,0.000000,0.008555,0.022149,0.248968,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
3,1.196475e-03,0.006061,0.005568,0.000454,0.004990,0.024401,0.000000,0.005093,0.016339,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
4,1.197875e-03,0.006927,0.005973,0.000464,0.005125,0.024401,0.000000,0.004572,0.015330,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,1.950000e-06,0.000144,0.000202,0.000022,0.000011,0.001370,0.016137,0.005386,0.000000,0.009972,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
56657,1.110733e-03,0.000144,0.000202,0.000033,0.000022,0.002012,0.023701,0.007911,0.000000,0.020719,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
56658,9.589745e-02,0.000577,0.000405,0.000157,0.000164,0.019264,0.000000,0.015149,0.028548,0.303043,...,0.533333,0.000009,0.0,0.000009,0.000009,0.054652,0.0,0.054652,0.054652,0
56659,9.589756e-02,0.001010,0.000607,0.000145,0.000541,0.017808,0.000000,0.008753,0.020864,0.497937,...,0.533333,0.000009,0.0,0.000009,0.000009,0.054648,0.0,0.054648,0.054648,0


In [77]:
df_scaled['Label'].unique()

array([1, 0])

In [78]:
## write the scaled doc for testing
df_scaled.to_csv('./data/minmaxscaled_test.csv',index=0)

In [79]:
df_scaled

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,3.333333e-08,0.000144,0.000000,0.000013,0.000000,0.001327,0.003026,0.003114,0.002508,0.000000,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
1,1.186475e-03,0.006494,0.006277,0.000462,0.004923,0.024401,0.000000,0.004849,0.015804,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
2,9.906083e-04,0.003175,0.002835,0.000408,0.002094,0.024401,0.000000,0.008555,0.022149,0.248968,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
3,1.196475e-03,0.006061,0.005568,0.000454,0.004990,0.024401,0.000000,0.005093,0.016339,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
4,1.197875e-03,0.006927,0.005973,0.000464,0.005125,0.024401,0.000000,0.004572,0.015330,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,1.950000e-06,0.000144,0.000202,0.000022,0.000011,0.001370,0.016137,0.005386,0.000000,0.009972,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
56657,1.110733e-03,0.000144,0.000202,0.000033,0.000022,0.002012,0.023701,0.007911,0.000000,0.020719,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
56658,9.589745e-02,0.000577,0.000405,0.000157,0.000164,0.019264,0.000000,0.015149,0.028548,0.303043,...,0.533333,0.000009,0.0,0.000009,0.000009,0.054652,0.0,0.054652,0.054652,0
56659,9.589756e-02,0.001010,0.000607,0.000145,0.000541,0.017808,0.000000,0.008753,0.020864,0.497937,...,0.533333,0.000009,0.0,0.000009,0.000009,0.054648,0.0,0.054648,0.054648,0


In [80]:
## 4000 samples
df_sample = df_scaled.sample(n=4000)
df_sample.shape

(4000, 78)

In [81]:
df_sample.to_csv('./data/final_data_two_labels.csv',index=0)

In [82]:
df_sample

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
6985,6.604203e-02,0.000577,0.000000,1.046715e-05,0.000000e+00,0.000257,0.003026,0.001010,0.000000,0.000000,...,0.333333,0.000010,0.000000,0.000010,0.00001,0.066589,0.000000,0.066589,0.066589,0
40008,2.677800e-03,0.000144,0.000101,1.535182e-05,2.790698e-07,0.001627,0.003026,0.003703,0.003210,0.000516,...,0.333333,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,1
28329,4.237667e-04,0.000144,0.000202,2.581897e-05,5.395349e-06,0.001584,0.018659,0.006228,0.000000,0.004986,...,0.533333,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,1
7514,5.439740e-01,0.001299,0.001012,1.517737e-04,3.237209e-05,0.016567,0.000000,0.007322,0.017123,0.058803,...,0.333333,0.000944,0.004183,0.003172,0.00048,0.084034,0.000369,0.084874,0.084031,1
10171,3.583333e-07,0.000000,0.000101,6.978099e-07,2.790698e-07,0.000086,0.001009,0.000337,0.000000,0.000516,...,0.400000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17151,5.493000e-04,0.000000,0.000101,1.465401e-05,6.418605e-06,0.001798,0.021180,0.007070,0.000000,0.011864,...,0.333333,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,1
51604,8.353488e-01,0.000866,0.000607,1.301416e-04,5.393023e-04,0.015711,0.000000,0.008969,0.019626,0.374398,...,0.333333,0.000010,0.000000,0.000010,0.00001,0.840336,0.000000,0.840336,0.840336,0
21434,4.469917e-04,0.000000,0.000101,1.570072e-05,1.655814e-05,0.001926,0.022693,0.007575,0.000000,0.030605,...,0.333333,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,1
22886,2.042475e-03,0.000000,0.000101,1.849196e-05,6.744186e-06,0.002269,0.026727,0.008921,0.000000,0.012466,...,0.533333,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,1


In [83]:
df_sample['Label'].unique()

array([0, 1])

In [84]:
## Result without any feature selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

y=df_sample['Label']
X=df_sample.drop('Label', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30,random_state=1)

clf = RandomForestClassifier(random_state=0) 
 

clf.fit(X_train, y_train)
 

y_pred = clf.predict(X_test)
 

from sklearn import metrics 
print()
 

print("ACCURACY OF THE MODEL without any feature selection: ", metrics.accuracy_score(y_test, y_pred))


ACCURACY OF THE MODEL without any feature selection:  0.9883333333333333


In [85]:
y_test.unique()

array([0, 1])

In [86]:
## Variance Threshold
##VARIANCE THRESHOLD

pipe1 = Pipeline([
        # ('scale', MinMaxScaler()),
        ('vt', VarianceThreshold()),
        ('rfc',  RandomForestClassifier())])

param_grid1 = dict(
                  vt__threshold=[0,0.01,0.02,0.03,0.04,0.05],
                  rfc__random_state=[0] 
                  )

grid1 = GridSearchCV(pipe1, param_grid=param_grid1, cv=10, n_jobs=1, scoring= 'accuracy')
grid1.fit(X_train, y_train)
print(f'The score ***********   {grid1.score(X_test,y_test)}')
print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid1.best_estimator_}')

The score ***********   0.9875
{'mean_fit_time': array([0.35475457, 0.27020197, 0.24855559, 0.25516489, 0.26014073,
       0.22535186]), 'std_fit_time': array([0.00801019, 0.00504871, 0.01300336, 0.0137005 , 0.00594678,
       0.00774628]), 'mean_score_time': array([0.00793312, 0.00820003, 0.00665612, 0.009551  , 0.00769951,
       0.00743444]), 'std_score_time': array([0.00380275, 0.00240416, 0.0034758 , 0.00496265, 0.00376282,
       0.00300662]), 'param_rfc__random_state': masked_array(data=[0, 0, 0, 0, 0, 0],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_vt__threshold': masked_array(data=[0, 0.01, 0.02, 0.03, 0.04, 0.05],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'rfc__random_state': 0, 'vt__threshold': 0}, {'rfc__random_state': 0, 'vt__threshold': 0.01}, {'rfc__random_state': 0, 'vt__threshold': 0.02}, {'rfc__random_state': 0, '

In [87]:
grid1.best_estimator_.named_steps["vt"].get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True, False, False, False, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])

In [91]:
selected=[]
idx=0
toSelect=grid1.best_estimator_.named_steps["vt"].get_support()
for i in toSelect:
  if(i == True):
    print(f'--- > {features_without_target[idx]}')
    selected.append(features_without_target[idx])

  idx=idx+1  

print(f'---selected variance threshold columns    {selected}')
print(f" number of features selected by variance threshold -->> {len(selected)}")
features_map["variance_threshold"] = selected 

--- > Flow Duration
--- > Total Fwd Packets
--- > Total Backward Packets
--- > Total Length of Fwd Packets
--- > Total Length of Bwd Packets
--- > Fwd Packet Length Max
--- > Fwd Packet Length Min
--- > Fwd Packet Length Mean
--- > Fwd Packet Length Std
--- > Bwd Packet Length Max
--- > Bwd Packet Length Min
--- > Bwd Packet Length Mean
--- > Bwd Packet Length Std
--- > Flow Bytes/s
--- > Flow Packets/s
--- > Flow IAT Mean
--- > Flow IAT Std
--- > Flow IAT Max
--- > Flow IAT Min
--- > Fwd IAT Total
--- > Fwd IAT Mean
--- > Fwd IAT Std
--- > Fwd IAT Max
--- > Fwd IAT Min
--- > Bwd IAT Total
--- > Bwd IAT Mean
--- > Bwd IAT Std
--- > Bwd IAT Max
--- > Bwd IAT Min
--- > Fwd PSH Flags
--- > Fwd Header Length
--- > Bwd Header Length
--- > Fwd Packets/s
--- > Bwd Packets/s
--- > Min Packet Length
--- > Max Packet Length
--- > Packet Length Mean
--- > Packet Length Std
--- > Packet Length Variance
--- > FIN Flag Count
--- > SYN Flag Count
--- > RST Flag Count
--- > PSH Flag Count
--- > ACK Fl

In [92]:
y_train

20056    0
7948     1
51717    0
13686    0
40217    0
        ..
29764    0
52112    1
10737    0
34794    1
37717    1
Name: Label, Length: 2800, dtype: int32

In [93]:
##Lasso
## LASSO
from sklearn.linear_model import Lasso

pipe2 = Pipeline([
        # ('scale', MinMaxScaler()),
        ('lasso', Lasso())])


param_grid2 = dict(
                  lasso__alpha=np.arange(0.1,10,0.1)
                   
                  )

grid2 = GridSearchCV(pipe2, param_grid=param_grid2, cv=10, n_jobs=1, scoring= "neg_mean_squared_error")
grid2.fit(X_train, y_train)


print(f'The best estimator is --------->>   {grid2.best_estimator_}')
print()
print()
print(f' best params --->>  {grid2.best_params_}')
print()
coefficients = grid2.best_estimator_.named_steps['lasso'].coef_

print(f' the coefficients --->>  {coefficients}')


importance = np.abs(coefficients)

impr_features=np.array(features_without_target)[importance > 0]
print()
print()


print(f' important features   --->>  {impr_features}')

print()
print()

redundant_features=np.array(features_without_target)[importance == 0]
print(f' redundant features --->>  {redundant_features}')


The best estimator is --------->>   Pipeline(steps=[('lasso', Lasso(alpha=0.1))])


 best params --->>  {'lasso__alpha': 0.1}

 the coefficients --->>  [-0.  0.  0.  0.  0.  0.  0.  0.  0. -0.  0. -0. -0.  0. -0. -0. -0. -0.
 -0. -0. -0. -0. -0.  0.  0. -0. -0. -0.  0.  0.  0.  0.  0.  0.  0. -0.
 -0.  0. -0. -0. -0. -0. -0.  0.  0. -0. -0.  0.  0.  0.  0. -0.  0. -0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. -0.  0.  0. -0. -0.  0. -0.
 -0. -0. -0. -0. -0.]


 important features   --->>  []


 redundant features --->>  ['Flow Duration' 'Total Fwd Packets' 'Total Backward Packets'
 'Total Length of Fwd Packets' 'Total Length of Bwd Packets'
 'Fwd Packet Length Max' 'Fwd Packet Length Min' 'Fwd Packet Length Mean'
 'Fwd Packet Length Std' 'Bwd Packet Length Max' 'Bwd Packet Length Min'
 'Bwd Packet Length Mean' 'Bwd Packet Length Std' 'Flow Bytes/s'
 'Flow Packets/s' 'Flow IAT Mean' 'Flow IAT Std' 'Flow IAT Max'
 'Flow IAT Min' 'Fwd IAT Total' 'Fwd IAT Mean' 'Fwd IAT Std' 'Fwd IAT Ma

In [94]:
features_map["lasso"]=impr_features

In [95]:
### Now using random forest using the important features
if len(impr_features)!=0:
    df_lasso=df_sample
    df_lasso = df_lasso.drop(redundant_features, axis=1)
    
    #df_lasso.shape
    #df_lasso.head()
    
    y=df_lasso['Label']
    X=df_lasso.drop('Label', axis=1)
    X_train_ll, X_test_ll, y_train_ll, y_test_ll = train_test_split(X, y, test_size = 0.30,random_state=1)
    
    clf = RandomForestClassifier(random_state=0) 
    
    clf.fit(X_train_ll, y_train_ll)
    
    y_pred = clf.predict(X_test_ll)
     
    
    from sklearn import metrics 
    print()
     
    
    print("ACCURACY OF THE MODEL with lasso features: ", metrics.accuracy_score(y_test_ll, y_pred))

In [97]:
if len(impr_features)!=0:
    df_lasso

In [98]:
### Random forest feature importance



pipe3 = Pipeline([
        # ('scale', MinMaxScaler()),
        ('rfc',  RandomForestClassifier())])

param_grid3 = dict(
                  
                  rfc__criterion = ['gini','entropy'] ,
                  
                  rfc__random_state= [0]
                  )

grid3 = GridSearchCV(pipe3, param_grid=param_grid3, cv=10, n_jobs=1, scoring= 'accuracy')
grid3.fit(X_train, y_train)
print()
print()
print(f'The score ***********   {grid3.score(X_test,y_test)}')
print()
print()
print(grid3.cv_results_)
print()
print()
print(f'The best estimator is --------->>   {grid3.best_estimator_}')

print()
print()

importances=grid3.best_estimator_.named_steps["rfc"].feature_importances_


indices=np.argsort(importances)[::-1]

print(f' importances  --->   {importances}')
print()
print(f' indices  --->   {indices}')

print()
importantFeature=grid3.best_estimator_.named_steps["rfc"].feature_importances_

df_show=pd.DataFrame(importantFeature,index=features_without_target,columns=['importance']).sort_values('importance',ascending=False)
df_show.head(18)




The score ***********   0.9883333333333333


{'mean_fit_time': array([0.33281543, 0.36244385]), 'std_fit_time': array([0.00991418, 0.01352533]), 'mean_score_time': array([0.00609698, 0.00712535]), 'std_score_time': array([0.00566779, 0.00275135]), 'param_rfc__criterion': masked_array(data=['gini', 'entropy'],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'param_rfc__random_state': masked_array(data=[0, 0],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'params': [{'rfc__criterion': 'gini', 'rfc__random_state': 0}, {'rfc__criterion': 'entropy', 'rfc__random_state': 0}], 'split0_test_score': array([0.99285714, 0.98214286]), 'split1_test_score': array([0.96785714, 0.975     ]), 'split2_test_score': array([0.97142857, 0.975     ]), 'split3_test_score': array([0.98214286, 0.98928571]), 'split4_test_score': array([0.98928571, 0.98571429]), 'split5_test_score': array([0.975, 0.975]), 'split6_test_score': array([0.978

Unnamed: 0,importance
Init_Win_bytes_forward,0.097333
Init_Win_bytes_backward,0.045848
Min Packet Length,0.04361
Packet Length Std,0.041329
Fwd Packet Length Min,0.041099
Avg Bwd Segment Size,0.039153
Bwd Packet Length Min,0.038558
Bwd Packet Length Mean,0.034415
Fwd Packet Length Max,0.030756
Packet Length Mean,0.030233


In [99]:
selected=[]
for i in range (0,18):
  selected.append(features_without_target[indices[i]])
selected
features_map["random_forest_feature_importance"]= selected

In [100]:
### use the above features... Tuesday

X_train_pi=X_train[selected]
X_train_pi.shape
X_test_pi=X_test[selected]
################################################

pipe3 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfc', RandomForestClassifier())
        ])

param_grid3 = dict(
                  
                  rfc__random_state=[0],
                  rfc__criterion=['gini'] 
                   
                  )

grid3 = GridSearchCV(pipe3, param_grid=param_grid3, cv=10)
grid3.fit(X_train_pi, y_train)
print(f'The score ***********   {grid3.score(X_test_pi,y_test)}')
#print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid3.best_estimator_}')

The score ***********   0.9941666666666666
The best estimator is --------->>   Pipeline(steps=[('rfc', RandomForestClassifier(random_state=0))])


In [101]:
## Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

pipe4 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfe', RFE(RandomForestClassifier())),
        ('rfc', RandomForestClassifier())
        ])

param_grid4 = dict(
                  
                  rfe__n_features_to_select=[18,5,25],
                  rfe__step=[0.5] 
                   
                  )

grid4 = GridSearchCV(pipe4, param_grid=param_grid4, cv=10)
grid4.fit(X_train, y_train)
print(f'The score ***********   {grid4.score(X_test,y_test)}')
#print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid4.best_estimator_}')


#print(f'{grid4.best_estimator_.named_steps["rfe"].support_}')

toSelect=grid4.best_estimator_.named_steps["rfe"].support_
idx=0
print()
print()
selected=[]
for i in toSelect:
  if(i== True):
    print(f'--- > {features_without_target[idx]}')
    selected.append(features_without_target[idx])

  idx=idx+1  

print(f'---selected RFE columns    {selected}')
features_map["recursive_feature_elimination"]=selected


The score ***********   0.9916666666666667
The best estimator is --------->>   Pipeline(steps=[('rfe',
                 RFE(estimator=RandomForestClassifier(),
                     n_features_to_select=18, step=0.5)),
                ('rfc', RandomForestClassifier())])


--- > Total Length of Fwd Packets
--- > Fwd Packet Length Max
--- > Fwd Packet Length Mean
--- > Bwd Packet Length Min
--- > Bwd Packet Length Mean
--- > Bwd Packet Length Std
--- > Fwd Header Length
--- > Min Packet Length
--- > Max Packet Length
--- > Packet Length Mean
--- > Packet Length Std
--- > Packet Length Variance
--- > Average Packet Size
--- > Avg Bwd Segment Size
--- > Fwd Header Length.1
--- > Subflow Fwd Bytes
--- > Init_Win_bytes_forward
--- > Init_Win_bytes_backward
---selected RFE columns    ['Total Length of Fwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Mean', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Fwd Header Length', 'Min Packet Length', 'Max Packet 

In [102]:
X_train_rfe=X_train[selected]
X_train_rfe.shape
X_test_rfe=X_test[selected]
################################################

pipe4 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfc', RandomForestClassifier())
        ])

param_grid4 = dict(
                  
                  rfc__random_state=[0],
                  rfc__criterion=['gini'] 
                   
                  )

grid4 = GridSearchCV(pipe4, param_grid=param_grid4, cv=10)
grid4.fit(X_train_rfe, y_train)
print(f'The score ***********   {grid4.score(X_test_rfe,y_test)}')

print(f'The best estimator is --------->>   {grid4.best_estimator_}')



The score ***********   0.9916666666666667
The best estimator is --------->>   Pipeline(steps=[('rfc', RandomForestClassifier(random_state=0))])


In [103]:
#Permutation importance
from sklearn.inspection import permutation_importance
forest = RandomForestClassifier(n_estimators=200,random_state=0)

forest.fit(X_train,y_train)
result =permutation_importance(
    estimator=forest,
    X=X_test,
    y=y_test,
    scoring='accuracy',
    n_repeats=50,
    random_state=0
)

importances=result.importances


indices=np.argsort(result['importances_mean'])[::-1]

print(f' indices  --->   {indices}')




 indices  --->   [65 66 23 45  9 41 29  3 46 68 38 12 39 18 11 40 76 73 10 69  6 58 72 71
 32 31 30 28 70 27 26 25 24 22 21 75 59  8 60 57 56 55 74 48 49 61 44 43
 42 19  7 53  1  4 33 20 37 52 67 62 16 35  5 50 47 64  2 54 51 63 34 36
 14 13 15 17  0]


In [104]:
## Top 18 attributes

selected=[]
for i in range(0,18):
  selected.append(features_without_target[indices[i]])
print (selected)  


['Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'Fwd IAT Min', 'PSH Flag Count', 'Bwd Packet Length Max', 'Packet Length Variance', 'Fwd PSH Flags', 'Total Length of Fwd Packets', 'ACK Flag Count', 'min_seg_size_forward', 'Max Packet Length', 'Bwd Packet Length Std', 'Packet Length Mean', 'Flow IAT Min', 'Bwd Packet Length Mean', 'Packet Length Std', 'Idle Min', 'Idle Mean']


In [105]:
features_map["permutation_importance"] = selected

In [106]:
X_train_pi=X_train[selected]
X_train_pi.shape
X_test_pi=X_test[selected]
################################################

pipe5 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfc', RandomForestClassifier())
        ])

param_grid5 = dict(
                  
                  rfc__random_state=[0],
                  rfc__criterion=['gini'] 
                   
                  )

grid5 = GridSearchCV(pipe5, param_grid=param_grid5, cv=10)
grid5.fit(X_train_pi, y_train)
print(f'The score ***********   {grid5.score(X_test_pi,y_test)}')
#print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid5.best_estimator_}')

The score ***********   0.9941666666666666
The best estimator is --------->>   Pipeline(steps=[('rfc', RandomForestClassifier(random_state=0))])


In [107]:
# for j in gaures:
#     for idx, x in enumerate(df.columns):
#         if idx == j:
#             print(x)
    

In [108]:
print("")
print(features_map)


{'variance_threshold': ['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'ECE Flag Count', 'Down/Up Ratio', 'Average Packet Size', 'Avg Fwd Segment Size'

In [109]:
## write extracted columns from different selection into files
f = open("data/features_with_two_labels.txt","w")
for i in features_map:
    f.write("\n")
    f.write(i)
    f.write("------------------")
    f.write(",".join(features_map[i]))
f.close()

In [110]:
#  try pca

X = df_sample.iloc[:,:-1]
y = df_sample.iloc[:,-1]


In [111]:
from sklearn.decomposition import PCA
pca= PCA(n_components=18)
pca.fit(X)
x_pca=pca.transform(X)
x_pca.shape
    

(4000, 18)

In [112]:
df_sample_pca = pd.DataFrame(x_pca)

In [113]:
df_sample_pca["Label"] = df_scaled["Label"]

In [114]:
df_sample_pca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,Label
0,-0.143753,-0.782227,-0.055468,0.065188,-0.018130,-0.113716,-0.243861,0.006122,-0.073843,-0.011362,-0.193028,-0.032397,-0.018387,-0.002426,0.009735,-0.029356,0.023656,-0.007800,1
1,-0.331393,-0.883429,-0.174530,0.279142,1.201023,0.336282,0.245202,-0.177226,0.296793,-0.062374,-0.161110,-0.047008,-0.029037,0.054116,0.040903,-0.198770,0.125681,-0.001151,1
2,-0.470115,-0.155329,0.254311,-0.425293,-0.012344,-0.217913,0.135217,0.105302,0.099356,0.032333,0.038704,0.006909,-0.004991,-0.034499,-0.037582,0.044955,-0.011845,0.005155,1
3,-0.109564,0.625984,0.458123,0.411551,0.008078,0.175520,-0.122121,0.466561,-0.144026,-0.097631,0.037290,-0.171365,0.008826,-0.062001,0.061098,-0.035839,0.035015,-0.037644,1
4,-0.676691,0.464214,0.111799,0.163930,-0.030680,-0.064653,0.053822,-0.008131,-0.313954,-0.019238,-0.019592,0.017496,-0.033681,-0.035608,-0.003461,-0.011992,-0.003990,-0.009252,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,-0.448555,-0.143391,0.227876,-0.465167,-0.010435,-0.199570,0.121895,0.136651,0.043588,0.000977,-0.027237,-0.003681,0.025090,0.041129,0.000564,-0.087721,0.042184,0.004611,0
3996,2.112167,-0.259376,-0.117269,0.346154,-0.094924,-0.349367,0.173809,-0.037016,-0.088626,0.008185,-0.024201,-0.106393,0.013070,0.141838,-0.015168,-0.103481,0.022220,-0.004240,0
3997,-0.406098,-0.111646,0.167673,-0.507146,-0.004153,-0.195307,0.128072,0.152166,0.061530,0.012389,-0.024681,-0.012648,0.023248,0.074446,-0.003947,-0.096878,0.007510,0.015943,0
3998,-0.450077,-0.140715,0.227652,-0.443935,-0.009276,-0.215448,0.138329,0.112996,0.107237,0.038744,0.039460,0.001784,-0.006609,-0.013341,-0.038836,0.044309,-0.026410,0.011852,0


In [115]:
## write the scaled doc for testing
df_sample_pca.to_csv('./data/sample_pca_test.csv',index=0)