In [228]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [229]:
df = pd.read_csv('./data/CICIDS2017_sample.csv')
features_map = {}

In [230]:
df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,4,2,0,37,0,31,6,18.500000,17.677670,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,142377,46,62,1325,105855,570,0,28.804348,111.407285,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,118873,23,28,1169,45025,570,0,50.826087,156.137367,2896,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,143577,43,55,1301,107289,570,0,30.255814,115.178969,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,143745,49,59,1331,110185,570,0,27.163265,108.067176,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,234,2,2,64,232,32,32,32.000000,0.000000,116,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
56657,133288,2,2,94,482,47,47,47.000000,0.000000,241,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
56658,11507694,5,4,450,3525,450,0,90.000000,201.246118,3525,...,32,893.0,0.0,893,893,6503640.0,0.0,6503640,6503640,DoS
56659,11507707,8,6,416,11632,416,0,52.000000,147.078211,5792,...,32,897.0,0.0,897,897,6503122.0,0.0,6503122,6503122,DoS


In [231]:
df['Label'].unique()

array(['BENIGN', 'DoS', 'PortScan', 'Bot', 'Infiltration', 'WebAttack',
       'BruteForce'], dtype=object)

In [232]:
## handle inf values, first replace with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [233]:
df.replace(['DoS', 'PortScan', 'Bot', 'Infiltration', 'WebAttack',
       'BruteForce'], "Attack",inplace=True)

In [234]:
df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,4,2,0,37,0,31,6,18.500000,17.677670,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,142377,46,62,1325,105855,570,0,28.804348,111.407285,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,118873,23,28,1169,45025,570,0,50.826087,156.137367,2896,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,143577,43,55,1301,107289,570,0,30.255814,115.178969,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,143745,49,59,1331,110185,570,0,27.163265,108.067176,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,234,2,2,64,232,32,32,32.000000,0.000000,116,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
56657,133288,2,2,94,482,47,47,47.000000,0.000000,241,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
56658,11507694,5,4,450,3525,450,0,90.000000,201.246118,3525,...,32,893.0,0.0,893,893,6503640.0,0.0,6503640,6503640,Attack
56659,11507707,8,6,416,11632,416,0,52.000000,147.078211,5792,...,32,897.0,0.0,897,897,6503122.0,0.0,6503122,6503122,Attack


In [235]:
## missing values in the following columns and imputing with mean
df.isnull().sum()
for col in df.columns:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].mean())
        print(col)

Flow Bytes/s
Flow Packets/s


In [236]:
# feature without Label/target
features_without_target = list(df.dtypes[df.dtypes != 'object'].index)
features_without_target
for cl in features_without_target:
    print(f'column is {cl} ----- min value is  {df[cl].min()} :::::  max value is   {df[cl].max()}')

column is Flow Duration ----- min value is  0 :::::  max value is   119999998
column is Total Fwd Packets ----- min value is  1 :::::  max value is   6930
column is Total Backward Packets ----- min value is  0 :::::  max value is   9877
column is Total Length of Fwd Packets ----- min value is  0 :::::  max value is   2866110
column is Total Length of Bwd Packets ----- min value is  0 :::::  max value is   21500000
column is Fwd Packet Length Max ----- min value is  0 :::::  max value is   23360
column is Fwd Packet Length Min ----- min value is  0 :::::  max value is   1983
column is Fwd Packet Length Mean ----- min value is  0.0 :::::  max value is   5940.857143
column is Fwd Packet Length Std ----- min value is  0.0 :::::  max value is   7049.469004000001
column is Bwd Packet Length Max ----- min value is  0 :::::  max value is   11632
column is Bwd Packet Length Min ----- min value is  0 :::::  max value is   1448
column is Bwd Packet Length Mean ----- min value is  0.0 :::::  max v

In [237]:
# Assigning numerical Label values with LabelEncoder
labelencoder = LabelEncoder()
df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])

  df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])


In [238]:
df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,4,2,0,37,0,31,6,18.500000,17.677670,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,1
1,142377,46,62,1325,105855,570,0,28.804348,111.407285,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,1
2,118873,23,28,1169,45025,570,0,50.826087,156.137367,2896,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1
3,143577,43,55,1301,107289,570,0,30.255814,115.178969,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,1
4,143745,49,59,1331,110185,570,0,27.163265,108.067176,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,234,2,2,64,232,32,32,32.000000,0.000000,116,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1
56657,133288,2,2,94,482,47,47,47.000000,0.000000,241,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1
56658,11507694,5,4,450,3525,450,0,90.000000,201.246118,3525,...,32,893.0,0.0,893,893,6503640.0,0.0,6503640,6503640,0
56659,11507707,8,6,416,11632,416,0,52.000000,147.078211,5792,...,32,897.0,0.0,897,897,6503122.0,0.0,6503122,6503122,0


In [239]:
## Scale the dataset, until now all the missing and inf handled
scaling = MinMaxScaler()
df_scaled = pd.DataFrame(scaling.fit_transform(df[features_without_target]))
df_scaled_pca = df_scaled 
df_scaled.columns=features_without_target
df_scaled['Label']=df['Label']

In [240]:
df_scaled

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,3.333333e-08,0.000144,0.000000,0.000013,0.000000,0.001327,0.003026,0.003114,0.002508,0.000000,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
1,1.186475e-03,0.006494,0.006277,0.000462,0.004923,0.024401,0.000000,0.004849,0.015804,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
2,9.906083e-04,0.003175,0.002835,0.000408,0.002094,0.024401,0.000000,0.008555,0.022149,0.248968,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
3,1.196475e-03,0.006061,0.005568,0.000454,0.004990,0.024401,0.000000,0.005093,0.016339,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
4,1.197875e-03,0.006927,0.005973,0.000464,0.005125,0.024401,0.000000,0.004572,0.015330,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,1.950000e-06,0.000144,0.000202,0.000022,0.000011,0.001370,0.016137,0.005386,0.000000,0.009972,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
56657,1.110733e-03,0.000144,0.000202,0.000033,0.000022,0.002012,0.023701,0.007911,0.000000,0.020719,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
56658,9.589745e-02,0.000577,0.000405,0.000157,0.000164,0.019264,0.000000,0.015149,0.028548,0.303043,...,0.533333,0.000009,0.0,0.000009,0.000009,0.054652,0.0,0.054652,0.054652,0
56659,9.589756e-02,0.001010,0.000607,0.000145,0.000541,0.017808,0.000000,0.008753,0.020864,0.497937,...,0.533333,0.000009,0.0,0.000009,0.000009,0.054648,0.0,0.054648,0.054648,0


In [241]:
df_scaled['Label'].unique()

array([1, 0])

In [242]:
## write the scaled doc for testing
df_scaled.to_csv('./data/minmaxscaled_test.csv',index=0)

In [243]:
df_scaled

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,3.333333e-08,0.000144,0.000000,0.000013,0.000000,0.001327,0.003026,0.003114,0.002508,0.000000,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
1,1.186475e-03,0.006494,0.006277,0.000462,0.004923,0.024401,0.000000,0.004849,0.015804,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
2,9.906083e-04,0.003175,0.002835,0.000408,0.002094,0.024401,0.000000,0.008555,0.022149,0.248968,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
3,1.196475e-03,0.006061,0.005568,0.000454,0.004990,0.024401,0.000000,0.005093,0.016339,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
4,1.197875e-03,0.006927,0.005973,0.000464,0.005125,0.024401,0.000000,0.004572,0.015330,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,1.950000e-06,0.000144,0.000202,0.000022,0.000011,0.001370,0.016137,0.005386,0.000000,0.009972,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
56657,1.110733e-03,0.000144,0.000202,0.000033,0.000022,0.002012,0.023701,0.007911,0.000000,0.020719,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
56658,9.589745e-02,0.000577,0.000405,0.000157,0.000164,0.019264,0.000000,0.015149,0.028548,0.303043,...,0.533333,0.000009,0.0,0.000009,0.000009,0.054652,0.0,0.054652,0.054652,0
56659,9.589756e-02,0.001010,0.000607,0.000145,0.000541,0.017808,0.000000,0.008753,0.020864,0.497937,...,0.533333,0.000009,0.0,0.000009,0.000009,0.054648,0.0,0.054648,0.054648,0


In [244]:
## 4000 samples
df_sample = df_scaled.sample(n=4000)
df_sample.shape

(4000, 78)

In [245]:
df_sample

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
11021,2.750000e-07,0.000000,0.000101,6.978099e-07,2.790698e-07,0.000086,0.001009,0.000337,0.000000,0.000516,...,0.400000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
16986,5.036750e-04,0.000000,0.000101,1.430510e-05,6.651163e-06,0.001755,0.020676,0.006901,0.000000,0.012294,...,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1
56583,3.454136e-01,0.001732,0.001417,9.057573e-04,3.704186e-04,0.024187,0.000000,0.033613,0.037167,0.147610,...,0.333333,0.013101,0.057446,0.035142,0.003647,0.073024,0.026671,0.084034,0.051425,1
39539,5.405667e-04,0.000144,0.000202,3.000583e-05,9.674419e-06,0.001841,0.021684,0.007238,0.000000,0.008941,...,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1
48711,2.500000e-08,0.000144,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.533333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3703,9.064250e-03,0.000289,0.000506,9.071529e-06,5.398605e-04,0.000856,0.000000,0.001459,0.001456,0.871303,...,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
52092,9.582583e-04,0.000722,0.000506,4.169414e-04,1.563256e-04,0.036130,0.000000,0.033525,0.046458,0.234783,...,0.533333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1
40704,1.749930e-01,0.000144,0.000101,3.140145e-06,0.000000e+00,0.000385,0.000000,0.000757,0.000903,0.000000,...,0.533333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
25269,2.605083e-04,0.000144,0.000202,2.442335e-05,1.813953e-05,0.001498,0.017650,0.005891,0.000000,0.016764,...,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1


In [246]:
df_sample['Label'].unique()

array([0, 1])

In [247]:
## Result without any feature selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

y=df_sample['Label']
X=df_sample.drop('Label', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30,random_state=1)

clf = RandomForestClassifier(random_state=0) 
 

clf.fit(X_train, y_train)
 

y_pred = clf.predict(X_test)
 

from sklearn import metrics 
print()
 

print("ACCURACY OF THE MODEL without any feature selection: ", metrics.accuracy_score(y_test, y_pred))


ACCURACY OF THE MODEL without any feature selection:  0.9908333333333333


In [248]:
y_test.unique()

array([0, 1])

In [249]:
## Variance Threshold
##VARIANCE THRESHOLD

pipe1 = Pipeline([
        # ('scale', MinMaxScaler()),
        ('vt', VarianceThreshold()),
        ('rfc',  RandomForestClassifier())])

param_grid1 = dict(
                  vt__threshold=[0,0.01,0.02,0.03,0.04,0.05],
                  rfc__random_state=[0] 
                  )

grid1 = GridSearchCV(pipe1, param_grid=param_grid1, cv=10, n_jobs=1, scoring= 'accuracy')
grid1.fit(X_train, y_train)
print(f'The score ***********   {grid1.score(X_test,y_test)}')
print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid1.best_estimator_}')

The score ***********   0.99
{'mean_fit_time': array([0.41308548, 0.2839678 , 0.24879148, 0.25486648, 0.27272871,
       0.22775006]), 'std_fit_time': array([0.03262719, 0.01134917, 0.01012172, 0.01238517, 0.01215814,
       0.0109493 ]), 'mean_score_time': array([0.00847661, 0.00708714, 0.00954211, 0.00696278, 0.00826547,
       0.00763564]), 'std_score_time': array([0.00261225, 0.00029998, 0.00353552, 0.00013002, 0.00139946,
       0.00163909]), 'param_rfc__random_state': masked_array(data=[0, 0, 0, 0, 0, 0],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_vt__threshold': masked_array(data=[0, 0.01, 0.02, 0.03, 0.04, 0.05],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'rfc__random_state': 0, 'vt__threshold': 0}, {'rfc__random_state': 0, 'vt__threshold': 0.01}, {'rfc__random_state': 0, 'vt__threshold': 0.02}, {'rfc__random_state': 0, 'vt

In [250]:
grid1.best_estimator_.named_steps["vt"].get_support()

array([ True, False, False, False, False, False, False, False, False,
        True, False,  True,  True, False, False, False,  True,  True,
       False,  True, False,  True,  True, False,  True, False,  True,
        True, False,  True, False, False, False, False, False, False,
       False, False,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True, False, False, False,  True, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False,  True,  True, False,  True, False, False, False,
       False,  True, False,  True,  True])

In [251]:
selected=[]
idx=0
toSelect=grid1.best_estimator_.named_steps["vt"].get_support()
for i in toSelect:
  if(i == True):
    print(f'--- > {features_without_target[idx]}')
    selected.append(features_without_target[idx])

  idx=idx+1  

print(f'---selected variance threshold columns    {selected}')
print(f" number of features selected by variance threshold -->> {len(selected)}")
features_map["variance_threshold"] = selected 

--- > Flow Duration
--- > Bwd Packet Length Max
--- > Bwd Packet Length Mean
--- > Bwd Packet Length Std
--- > Flow IAT Std
--- > Flow IAT Max
--- > Fwd IAT Total
--- > Fwd IAT Std
--- > Fwd IAT Max
--- > Bwd IAT Total
--- > Bwd IAT Std
--- > Bwd IAT Max
--- > Fwd PSH Flags
--- > Max Packet Length
--- > Packet Length Mean
--- > Packet Length Std
--- > Packet Length Variance
--- > FIN Flag Count
--- > SYN Flag Count
--- > PSH Flag Count
--- > ACK Flag Count
--- > URG Flag Count
--- > Average Packet Size
--- > Avg Bwd Segment Size
--- > Init_Win_bytes_forward
--- > Init_Win_bytes_backward
--- > min_seg_size_forward
--- > Idle Mean
--- > Idle Max
--- > Idle Min
---selected variance threshold columns    ['Flow Duration', 'Bwd Packet Length Max', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow IAT Std', 'Flow IAT Max', 'Fwd IAT Total', 'Fwd IAT Std', 'Fwd IAT Max', 'Bwd IAT Total', 'Bwd IAT Std', 'Bwd IAT Max', 'Fwd PSH Flags', 'Max Packet Length', 'Packet Length Mean', 'Packet Len

In [252]:
y_train

28978    0
23664    1
51141    0
50762    0
16064    0
        ..
22817    1
53064    1
36509    1
28851    1
46183    0
Name: Label, Length: 2800, dtype: int32

In [253]:
##Lasso
## LASSO
from sklearn.linear_model import Lasso

pipe2 = Pipeline([
        # ('scale', MinMaxScaler()),
        ('lasso', Lasso())])


param_grid2 = dict(
                  lasso__alpha=np.arange(0.1,10,0.1)
                   
                  )

grid2 = GridSearchCV(pipe2, param_grid=param_grid2, cv=10, n_jobs=1, scoring= "neg_mean_squared_error")
grid2.fit(X_train, y_train)


print(f'The best estimator is --------->>   {grid2.best_estimator_}')
print()
print()
print(f' best params --->>  {grid2.best_params_}')
print()
coefficients = grid2.best_estimator_.named_steps['lasso'].coef_

print(f' the coefficients --->>  {coefficients}')


importance = np.abs(coefficients)

impr_features=np.array(features_without_target)[importance > 0]
print()
print()


print(f' important features   --->>  {impr_features}')

print()
print()

redundant_features=np.array(features_without_target)[importance == 0]
print(f' redundant features --->>  {redundant_features}')


The best estimator is --------->>   Pipeline(steps=[('lasso', Lasso(alpha=0.1))])


 best params --->>  {'lasso__alpha': 0.1}

 the coefficients --->>  [-0. -0. -0. -0.  0.  0.  0.  0.  0. -0.  0. -0. -0.  0. -0. -0. -0. -0.
  0. -0. -0. -0. -0.  0.  0.  0. -0. -0.  0.  0.  0.  0.  0. -0. -0. -0.
 -0.  0. -0. -0. -0. -0. -0.  0.  0. -0. -0.  0.  0.  0.  0. -0.  0. -0.
 -0.  0.  0.  0.  0.  0.  0. -0. -0. -0.  0. -0.  0. -0. -0. -0.  0.  0.
 -0. -0. -0. -0. -0.]


 important features   --->>  []


 redundant features --->>  ['Flow Duration' 'Total Fwd Packets' 'Total Backward Packets'
 'Total Length of Fwd Packets' 'Total Length of Bwd Packets'
 'Fwd Packet Length Max' 'Fwd Packet Length Min' 'Fwd Packet Length Mean'
 'Fwd Packet Length Std' 'Bwd Packet Length Max' 'Bwd Packet Length Min'
 'Bwd Packet Length Mean' 'Bwd Packet Length Std' 'Flow Bytes/s'
 'Flow Packets/s' 'Flow IAT Mean' 'Flow IAT Std' 'Flow IAT Max'
 'Flow IAT Min' 'Fwd IAT Total' 'Fwd IAT Mean' 'Fwd IAT Std' 'Fwd IAT Ma

In [254]:
features_map["lasso"]=impr_features

In [255]:
### Now using random forest using the important features
if len(impr_features)!=0:
    df_lasso=df_sample
    df_lasso = df_lasso.drop(redundant_features, axis=1)
    
    #df_lasso.shape
    #df_lasso.head()
    
    y=df_lasso['Label']
    X=df_lasso.drop('Label', axis=1)
    X_train_ll, X_test_ll, y_train_ll, y_test_ll = train_test_split(X, y, test_size = 0.30,random_state=1)
    
    clf = RandomForestClassifier(random_state=0) 
    
    clf.fit(X_train_ll, y_train_ll)
    
    y_pred = clf.predict(X_test_ll)
     
    
    from sklearn import metrics 
    print()
     
    
    print("ACCURACY OF THE MODEL with lasso features: ", metrics.accuracy_score(y_test_ll, y_pred))

In [256]:
df_lasso

Unnamed: 0,Label
42592,0
38825,1
48350,0
20485,1
35244,1
...,...
25597,1
49797,0
12911,0
20592,1


In [257]:
### Random forest feature importance



pipe3 = Pipeline([
        # ('scale', MinMaxScaler()),
        ('rfc',  RandomForestClassifier())])

param_grid3 = dict(
                  
                  rfc__criterion = ['gini','entropy'] ,
                  
                  rfc__random_state= [0]
                  )

grid3 = GridSearchCV(pipe3, param_grid=param_grid3, cv=10, n_jobs=1, scoring= 'accuracy')
grid3.fit(X_train, y_train)
print()
print()
print(f'The score ***********   {grid3.score(X_test,y_test)}')
print()
print()
print(grid3.cv_results_)
print()
print()
print(f'The best estimator is --------->>   {grid3.best_estimator_}')

print()
print()

importances=grid3.best_estimator_.named_steps["rfc"].feature_importances_


indices=np.argsort(importances)[::-1]

print(f' importances  --->   {importances}')
print()
print(f' indices  --->   {indices}')

print()
importantFeature=grid3.best_estimator_.named_steps["rfc"].feature_importances_

df_show=pd.DataFrame(importantFeature,index=features_without_target,columns=['importance']).sort_values('importance',ascending=False)
df_show.head(18)




The score ***********   0.9908333333333333


{'mean_fit_time': array([0.34718132, 0.36751647]), 'std_fit_time': array([0.01626509, 0.01026276]), 'mean_score_time': array([0.00807867, 0.00747223]), 'std_score_time': array([0.00258492, 0.00127782]), 'param_rfc__criterion': masked_array(data=['gini', 'entropy'],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'param_rfc__random_state': masked_array(data=[0, 0],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'params': [{'rfc__criterion': 'gini', 'rfc__random_state': 0}, {'rfc__criterion': 'entropy', 'rfc__random_state': 0}], 'split0_test_score': array([0.975, 0.975]), 'split1_test_score': array([0.98571429, 0.98571429]), 'split2_test_score': array([0.97857143, 0.97857143]), 'split3_test_score': array([0.98214286, 0.97857143]), 'split4_test_score': array([0.98571429, 0.98571429]), 'split5_test_score': array([0.99285714, 0.98928571]), 'split6_test_score': array([0.975

Unnamed: 0,importance
Init_Win_bytes_forward,0.091661
Packet Length Std,0.056531
Init_Win_bytes_backward,0.049709
Fwd Packet Length Min,0.044937
Min Packet Length,0.038962
Fwd Packet Length Max,0.037341
Bwd Packet Length Min,0.033828
Average Packet Size,0.033777
Avg Fwd Segment Size,0.032003
Packet Length Variance,0.030099


In [258]:
selected=[]
for i in range (0,18):
  selected.append(features_without_target[indices[i]])
selected
features_map["random_forest_feature_importance"]= selected

In [259]:
### use the above features... Tuesday

X_train_pi=X_train[selected]
X_train_pi.shape
X_test_pi=X_test[selected]
################################################

pipe3 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfc', RandomForestClassifier())
        ])

param_grid3 = dict(
                  
                  rfc__random_state=[0],
                  rfc__criterion=['gini'] 
                   
                  )

grid3 = GridSearchCV(pipe3, param_grid=param_grid3, cv=10)
grid3.fit(X_train_pi, y_train)
print(f'The score ***********   {grid3.score(X_test_pi,y_test)}')
#print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid3.best_estimator_}')

The score ***********   0.9883333333333333
The best estimator is --------->>   Pipeline(steps=[('rfc', RandomForestClassifier(random_state=0))])


In [260]:
## Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

pipe4 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfe', RFE(RandomForestClassifier())),
        ('rfc', RandomForestClassifier())
        ])

param_grid4 = dict(
                  
                  rfe__n_features_to_select=[18,5,25],
                  rfe__step=[0.5] 
                   
                  )

grid4 = GridSearchCV(pipe4, param_grid=param_grid4, cv=10)
grid4.fit(X_train, y_train)
print(f'The score ***********   {grid4.score(X_test,y_test)}')
#print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid4.best_estimator_}')


#print(f'{grid4.best_estimator_.named_steps["rfe"].support_}')

toSelect=grid4.best_estimator_.named_steps["rfe"].support_
idx=0
print()
print()
selected=[]
for i in toSelect:
  if(i== True):
    print(f'--- > {features_without_target[idx]}')
    selected.append(features_without_target[idx])

  idx=idx+1  

print(f'---selected RFE columns    {selected}')
features_map["recursive_feature_elimination"]=selected


The score ***********   0.9891666666666666
The best estimator is --------->>   Pipeline(steps=[('rfe',
                 RFE(estimator=RandomForestClassifier(),
                     n_features_to_select=25, step=0.5)),
                ('rfc', RandomForestClassifier())])


--- > Total Length of Fwd Packets
--- > Fwd Packet Length Max
--- > Fwd Packet Length Min
--- > Fwd Packet Length Mean
--- > Bwd Packet Length Max
--- > Bwd Packet Length Min
--- > Bwd Packet Length Mean
--- > Bwd Packet Length Std
--- > Fwd IAT Std
--- > Fwd IAT Max
--- > Fwd Header Length
--- > Bwd Packets/s
--- > Min Packet Length
--- > Max Packet Length
--- > Packet Length Mean
--- > Packet Length Std
--- > Packet Length Variance
--- > URG Flag Count
--- > Average Packet Size
--- > Avg Fwd Segment Size
--- > Avg Bwd Segment Size
--- > Fwd Header Length.1
--- > Subflow Fwd Bytes
--- > Init_Win_bytes_forward
--- > Init_Win_bytes_backward
---selected RFE columns    ['Total Length of Fwd Packets', 'Fwd Packet Length Ma

In [261]:
X_train_rfe=X_train[selected]
X_train_rfe.shape
X_test_rfe=X_test[selected]
################################################

pipe4 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfc', RandomForestClassifier())
        ])

param_grid4 = dict(
                  
                  rfc__random_state=[0],
                  rfc__criterion=['gini'] 
                   
                  )

grid4 = GridSearchCV(pipe4, param_grid=param_grid4, cv=10)
grid4.fit(X_train_rfe, y_train)
print(f'The score ***********   {grid4.score(X_test_rfe,y_test)}')

print(f'The best estimator is --------->>   {grid4.best_estimator_}')



The score ***********   0.9883333333333333
The best estimator is --------->>   Pipeline(steps=[('rfc', RandomForestClassifier(random_state=0))])


In [262]:
#Permutation importance
from sklearn.inspection import permutation_importance
forest = RandomForestClassifier(n_estimators=200,random_state=0)

forest.fit(X_train,y_train)
result =permutation_importance(
    estimator=forest,
    X=X_test,
    y=y_test,
    scoring='accuracy',
    n_repeats=50,
    random_state=0
)

importances=result.importances


indices=np.argsort(result['importances_mean'])[::-1]

print(f' indices  --->   {indices}')




 indices  --->   [65 66 47 33 23 68 54 34 35 36 17 16 14 46 61 12 51 18 28 15 40 52 62  8
  2  3 50  5 37 63 53 45  7 38  6  1 10  4 41 39 64 20 11 22 21 27 24 25
 26 76 29 56 70 69 67 60 73 59 58 57 30 55 72 49 48 44 74 43 42 75 32 31
 71  0 13  9 19]


In [263]:
## Top 18 attributes

selected=[]
for i in range(0,18):
  selected.append(features_without_target[indices[i]])
print (selected)  


['Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'URG Flag Count', 'Fwd Header Length', 'Fwd IAT Min', 'min_seg_size_forward', 'Fwd Header Length.1', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Flow IAT Max', 'Flow IAT Std', 'Flow Packets/s', 'ACK Flag Count', 'Subflow Fwd Packets', 'Bwd Packet Length Std', 'Average Packet Size', 'Flow IAT Min']


In [264]:
features_map["permutation_importance"] = selected

In [265]:
X_train_pi=X_train[selected]
X_train_pi.shape
X_test_pi=X_test[selected]
################################################

pipe5 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfc', RandomForestClassifier())
        ])

param_grid5 = dict(
                  
                  rfc__random_state=[0],
                  rfc__criterion=['gini'] 
                   
                  )

grid5 = GridSearchCV(pipe5, param_grid=param_grid5, cv=10)
grid5.fit(X_train_pi, y_train)
print(f'The score ***********   {grid5.score(X_test_pi,y_test)}')
#print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid5.best_estimator_}')

The score ***********   0.9908333333333333
The best estimator is --------->>   Pipeline(steps=[('rfc', RandomForestClassifier(random_state=0))])


In [266]:
# for j in gaures:
#     for idx, x in enumerate(df.columns):
#         if idx == j:
#             print(x)
    

In [267]:
print("")
print(features_map)


{'variance_threshold': ['Flow Duration', 'Bwd Packet Length Max', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow IAT Std', 'Flow IAT Max', 'Fwd IAT Total', 'Fwd IAT Std', 'Fwd IAT Max', 'Bwd IAT Total', 'Bwd IAT Std', 'Bwd IAT Max', 'Fwd PSH Flags', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'Average Packet Size', 'Avg Bwd Segment Size', 'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'min_seg_size_forward', 'Idle Mean', 'Idle Max', 'Idle Min'], 'lasso': array([], dtype='<U27'), 'random_forest_feature_importance': ['Init_Win_bytes_forward', 'Packet Length Std', 'Init_Win_bytes_backward', 'Fwd Packet Length Min', 'Min Packet Length', 'Fwd Packet Length Max', 'Bwd Packet Length Min', 'Average Packet Size', 'Avg Fwd Segment Size', 'Packet Length Variance', 'Bwd Packet Length Std', 'Max Packet Length', 'Total Length of Fwd Packets', 'Subfl

In [268]:
## write extracted columns from different selection into files
f = open("data/features_with_two_labels.txt", "a")
for i in features_map:
    f.write("\n")
    f.write(i)
    f.write("------------------")
    f.write(",".join(features_map[i]))
f.close()

In [269]:
#  try pca

X = df_sample.iloc[:,:-1]
y = df_sample.iloc[:,-1]


In [270]:
from sklearn.decomposition import PCA
pca= PCA(n_components=18)
pca.fit(X)
x_pca=pca.transform(X)
x_pca.shape
    

(4000, 18)

In [271]:
df_sample_pca = pd.DataFrame(x_pca)

In [272]:
df_sample_pca["Label"] = df_scaled["Label"]

In [273]:
df_sample_pca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,Label
0,-0.667574,0.460211,-0.126408,0.155071,-0.034847,-0.067689,-0.045420,-0.004823,-0.312945,-0.027911,-0.022825,-0.036596,0.005145,-0.011081,-0.017778,0.012429,-0.013084,0.021788,1
1,-0.436386,-0.145174,-0.211989,-0.476807,-0.024507,-0.190141,-0.075739,0.182591,0.042333,0.004822,0.042016,0.006413,-0.022735,0.006721,0.051449,0.082873,-0.011579,0.016291,1
2,-0.102347,0.697668,-0.132979,0.213728,0.028864,0.130085,0.191755,0.240649,-0.150049,-0.041221,-0.021601,0.010744,-0.045645,0.026951,0.061763,0.015513,-0.000476,0.023471,1
3,-0.444176,-0.151238,-0.223732,-0.469535,-0.025959,-0.191071,-0.075432,0.179412,0.039352,0.002800,0.042149,0.004833,-0.022019,0.006020,0.044886,0.086589,-0.007689,0.022241,1
4,-0.294665,-0.809548,0.118946,0.024524,-0.015453,-0.122584,0.313961,-0.163844,-0.020292,0.084463,-0.085543,0.072566,0.140437,-0.007629,-0.005742,-0.041474,0.006310,-0.015328,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,0.213490,1.155844,1.215711,-0.450003,0.136563,0.218120,0.107377,0.051902,-0.038247,0.071949,0.069303,-0.005570,0.057506,0.008180,-0.201774,0.086173,0.097156,0.068618,0
3996,-0.459320,0.699295,0.223037,0.140002,0.011660,0.006872,-0.020332,-0.116121,0.143719,0.023510,-0.029484,-0.044480,-0.012909,0.006350,-0.032776,0.049548,0.046549,-0.091527,0
3997,-0.053941,-0.959324,0.039060,0.336155,1.191555,0.112925,-0.278901,0.117978,-0.009856,-0.083719,0.075970,-0.120199,-0.021759,0.095115,-0.098136,-0.100407,0.025766,-0.037964,0
3998,-0.427115,-0.138075,-0.198141,-0.485394,-0.022957,-0.189409,-0.076138,0.186356,0.046015,0.006843,0.041796,0.008319,-0.023390,0.006951,0.057916,0.079253,-0.018659,0.009004,0


In [274]:
## write the scaled doc for testing
df_sample_pca.to_csv('./data/sample_pca_test.csv',index=0)