In [79]:
## all the necessary import
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [80]:
## read the packet data
df = pd.read_csv('./data/CICIDS2017_sample.csv')
features_map = {}

In [81]:
df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,4,2,0,37,0,31,6,18.500000,17.677670,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,142377,46,62,1325,105855,570,0,28.804348,111.407285,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,118873,23,28,1169,45025,570,0,50.826087,156.137367,2896,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,143577,43,55,1301,107289,570,0,30.255814,115.178969,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,143745,49,59,1331,110185,570,0,27.163265,108.067176,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,234,2,2,64,232,32,32,32.000000,0.000000,116,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
56657,133288,2,2,94,482,47,47,47.000000,0.000000,241,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
56658,11507694,5,4,450,3525,450,0,90.000000,201.246118,3525,...,32,893.0,0.0,893,893,6503640.0,0.0,6503640,6503640,DoS
56659,11507707,8,6,416,11632,416,0,52.000000,147.078211,5792,...,32,897.0,0.0,897,897,6503122.0,0.0,6503122,6503122,DoS


In [82]:
## printing the toal unique labels
df['Label'].unique()

array(['BENIGN', 'DoS', 'PortScan', 'Bot', 'Infiltration', 'WebAttack',
       'BruteForce'], dtype=object)

In [83]:
## handle inf values, first replace with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [84]:
df.replace(['DoS', 'PortScan', 'Bot', 'Infiltration', 'WebAttack',
       'BruteForce'], "Attack",inplace=True)

In [85]:
df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,4,2,0,37,0,31,6,18.500000,17.677670,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,142377,46,62,1325,105855,570,0,28.804348,111.407285,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,118873,23,28,1169,45025,570,0,50.826087,156.137367,2896,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,143577,43,55,1301,107289,570,0,30.255814,115.178969,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,143745,49,59,1331,110185,570,0,27.163265,108.067176,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,234,2,2,64,232,32,32,32.000000,0.000000,116,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
56657,133288,2,2,94,482,47,47,47.000000,0.000000,241,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
56658,11507694,5,4,450,3525,450,0,90.000000,201.246118,3525,...,32,893.0,0.0,893,893,6503640.0,0.0,6503640,6503640,Attack
56659,11507707,8,6,416,11632,416,0,52.000000,147.078211,5792,...,32,897.0,0.0,897,897,6503122.0,0.0,6503122,6503122,Attack


In [86]:
## missing values in the following columns and imputing with mean
df.isnull().sum()
for col in df.columns:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].mean())
        print(col)

Flow Bytes/s
Flow Packets/s


In [87]:
# feature without Label/target
features_without_target = list(df.dtypes[df.dtypes != 'object'].index)
features_without_target
for cl in features_without_target:
    print(f'column is {cl} ----- min value is  {df[cl].min()} :::::  max value is   {df[cl].max()}')

column is Flow Duration ----- min value is  0 :::::  max value is   119999998
column is Total Fwd Packets ----- min value is  1 :::::  max value is   6930
column is Total Backward Packets ----- min value is  0 :::::  max value is   9877
column is Total Length of Fwd Packets ----- min value is  0 :::::  max value is   2866110
column is Total Length of Bwd Packets ----- min value is  0 :::::  max value is   21500000
column is Fwd Packet Length Max ----- min value is  0 :::::  max value is   23360
column is Fwd Packet Length Min ----- min value is  0 :::::  max value is   1983
column is Fwd Packet Length Mean ----- min value is  0.0 :::::  max value is   5940.857143
column is Fwd Packet Length Std ----- min value is  0.0 :::::  max value is   7049.469004000001
column is Bwd Packet Length Max ----- min value is  0 :::::  max value is   11632
column is Bwd Packet Length Min ----- min value is  0 :::::  max value is   1448
column is Bwd Packet Length Mean ----- min value is  0.0 :::::  max v

In [88]:
# Assigning numerical Label values with LabelEncoder
labelencoder = LabelEncoder()
df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])

  df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])


In [89]:
df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,4,2,0,37,0,31,6,18.500000,17.677670,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,1
1,142377,46,62,1325,105855,570,0,28.804348,111.407285,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,1
2,118873,23,28,1169,45025,570,0,50.826087,156.137367,2896,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1
3,143577,43,55,1301,107289,570,0,30.255814,115.178969,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,1
4,143745,49,59,1331,110185,570,0,27.163265,108.067176,4344,...,20,0.0,0.0,0,0,0.0,0.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,234,2,2,64,232,32,32,32.000000,0.000000,116,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1
56657,133288,2,2,94,482,47,47,47.000000,0.000000,241,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1
56658,11507694,5,4,450,3525,450,0,90.000000,201.246118,3525,...,32,893.0,0.0,893,893,6503640.0,0.0,6503640,6503640,0
56659,11507707,8,6,416,11632,416,0,52.000000,147.078211,5792,...,32,897.0,0.0,897,897,6503122.0,0.0,6503122,6503122,0


In [90]:
## Scale the dataset, until now all the missing and inf handled
scaling = MinMaxScaler()
df_scaled = pd.DataFrame(scaling.fit_transform(df[features_without_target]))
df_scaled_pca = df_scaled 
df_scaled.columns=features_without_target
df_scaled['Label']=df['Label']

In [91]:
df_scaled

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,3.333333e-08,0.000144,0.000000,0.000013,0.000000,0.001327,0.003026,0.003114,0.002508,0.000000,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
1,1.186475e-03,0.006494,0.006277,0.000462,0.004923,0.024401,0.000000,0.004849,0.015804,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
2,9.906083e-04,0.003175,0.002835,0.000408,0.002094,0.024401,0.000000,0.008555,0.022149,0.248968,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
3,1.196475e-03,0.006061,0.005568,0.000454,0.004990,0.024401,0.000000,0.005093,0.016339,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
4,1.197875e-03,0.006927,0.005973,0.000464,0.005125,0.024401,0.000000,0.004572,0.015330,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,1.950000e-06,0.000144,0.000202,0.000022,0.000011,0.001370,0.016137,0.005386,0.000000,0.009972,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
56657,1.110733e-03,0.000144,0.000202,0.000033,0.000022,0.002012,0.023701,0.007911,0.000000,0.020719,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
56658,9.589745e-02,0.000577,0.000405,0.000157,0.000164,0.019264,0.000000,0.015149,0.028548,0.303043,...,0.533333,0.000009,0.0,0.000009,0.000009,0.054652,0.0,0.054652,0.054652,0
56659,9.589756e-02,0.001010,0.000607,0.000145,0.000541,0.017808,0.000000,0.008753,0.020864,0.497937,...,0.533333,0.000009,0.0,0.000009,0.000009,0.054648,0.0,0.054648,0.054648,0


In [92]:
df_scaled['Label'].unique()

array([1, 0])

In [93]:
## write the scaled doc for testing
df_scaled.to_csv('./data/minmaxscaled_test.csv',index=0)

In [94]:
df_scaled

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,3.333333e-08,0.000144,0.000000,0.000013,0.000000,0.001327,0.003026,0.003114,0.002508,0.000000,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
1,1.186475e-03,0.006494,0.006277,0.000462,0.004923,0.024401,0.000000,0.004849,0.015804,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
2,9.906083e-04,0.003175,0.002835,0.000408,0.002094,0.024401,0.000000,0.008555,0.022149,0.248968,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
3,1.196475e-03,0.006061,0.005568,0.000454,0.004990,0.024401,0.000000,0.005093,0.016339,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
4,1.197875e-03,0.006927,0.005973,0.000464,0.005125,0.024401,0.000000,0.004572,0.015330,0.373453,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56656,1.950000e-06,0.000144,0.000202,0.000022,0.000011,0.001370,0.016137,0.005386,0.000000,0.009972,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
56657,1.110733e-03,0.000144,0.000202,0.000033,0.000022,0.002012,0.023701,0.007911,0.000000,0.020719,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
56658,9.589745e-02,0.000577,0.000405,0.000157,0.000164,0.019264,0.000000,0.015149,0.028548,0.303043,...,0.533333,0.000009,0.0,0.000009,0.000009,0.054652,0.0,0.054652,0.054652,0
56659,9.589756e-02,0.001010,0.000607,0.000145,0.000541,0.017808,0.000000,0.008753,0.020864,0.497937,...,0.533333,0.000009,0.0,0.000009,0.000009,0.054648,0.0,0.054648,0.054648,0


In [95]:
## 4000 samples
df_sample = df_scaled.sample(n=4000)
df_sample.shape

(4000, 78)

In [96]:
df_sample.to_csv('./data/final_data_two_labels.csv',index=0)

In [97]:
df_sample

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
32110,1.969333e-04,0.000144,0.000202,0.000023,9.767442e-06,0.001413,0.016641,0.005555,0.000000,0.009027,...,0.533333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
23058,3.484308e-03,0.000289,0.000405,0.000204,4.390698e-05,0.024743,0.000000,0.032767,0.047094,0.080124,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
40815,9.070804e-01,0.000433,0.000304,0.000124,0.000000e+00,0.015026,0.000000,0.014981,0.024780,0.000000,...,0.533333,0.000053,0.0,0.000053,0.000053,0.915966,0.0,0.915966,0.915966,0
50202,8.226165e-01,0.001010,0.000709,0.000106,5.393023e-04,0.013014,0.000000,0.006396,0.015247,0.497937,...,0.533333,0.000010,0.0,0.000010,0.000010,0.829412,0.0,0.829412,0.829412,0
50204,8.226160e-01,0.000722,0.000607,0.000130,5.393023e-04,0.015154,0.000000,0.010436,0.020297,0.622421,...,0.333333,0.000110,0.0,0.000110,0.000110,0.829412,0.0,0.829412,0.829412,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5601,1.530294e-02,0.000577,0.000202,0.004050,1.209302e-06,0.496361,0.000000,0.390752,0.735389,0.001719,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1
20930,4.425087e-02,0.000866,0.000405,0.000215,7.627907e-06,0.022132,0.000000,0.014837,0.026951,0.013067,...,0.333333,0.001047,0.0,0.001047,0.001047,0.043743,0.0,0.043743,0.043743,1
14156,8.333333e-07,0.000000,0.000101,0.000000,2.790698e-07,0.000000,0.000000,0.000000,0.000000,0.000516,...,0.666667,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0
19160,1.982333e-04,0.000000,0.000101,0.000017,2.232558e-06,0.002055,0.024206,0.008080,0.000000,0.004127,...,0.333333,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1


In [98]:
df_sample['Label'].unique()

array([1, 0])

In [99]:
## Result without any feature selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

y=df_sample['Label']
X=df_sample.drop('Label', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30,random_state=1)

clf = RandomForestClassifier(random_state=0) 
 

clf.fit(X_train, y_train)
 

y_pred = clf.predict(X_test)
 

from sklearn import metrics 
print()
 

print("ACCURACY OF THE MODEL without any feature selection: ", metrics.accuracy_score(y_test, y_pred))


ACCURACY OF THE MODEL without any feature selection:  0.9783333333333334


In [100]:
y_test.unique()

array([0, 1])

### Feature selection process phase, From now on we will be using different feature selection algorithm to reduce the number
### of columns.
#### The algorithms are variance threshold, lasso regression, random forest classifer, recursive feature elimination, permutation importance
#### From the output of the feature extraction we construct text files,with important column names

In [101]:
## Variance Threshold
##VARIANCE THRESHOLD

pipe1 = Pipeline([
        # ('scale', MinMaxScaler()),
        ('vt', VarianceThreshold()),
        ('rfc',  RandomForestClassifier())])

param_grid1 = dict(
                  vt__threshold=[0,0.01,0.02,0.03,0.04,0.05],
                  rfc__random_state=[0] 
                  )

grid1 = GridSearchCV(pipe1, param_grid=param_grid1, cv=10, n_jobs=1, scoring= 'accuracy')
grid1.fit(X_train, y_train)
print(f'The score ***********   {grid1.score(X_test,y_test)}')
print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid1.best_estimator_}')

The score ***********   0.9833333333333333
{'mean_fit_time': array([0.61108437, 0.46123827, 0.39509933, 0.40209329, 0.38230603,
       0.36107242]), 'std_fit_time': array([0.08226494, 0.01462209, 0.03031526, 0.03820406, 0.00721025,
       0.02534792]), 'mean_score_time': array([0.01308846, 0.01213429, 0.01132586, 0.01222916, 0.01093967,
       0.01137631]), 'std_score_time': array([0.00311934, 0.00264854, 0.00249808, 0.00333665, 0.00304696,
       0.00368213]), 'param_rfc__random_state': masked_array(data=[0, 0, 0, 0, 0, 0],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_vt__threshold': masked_array(data=[0, 0.01, 0.02, 0.03, 0.04, 0.05],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'rfc__random_state': 0, 'vt__threshold': 0}, {'rfc__random_state': 0, 'vt__threshold': 0.01}, {'rfc__random_state': 0, 'vt__threshold': 0.02}, {'rfc__random_

In [102]:
grid1.best_estimator_.named_steps["vt"].get_support()

array([ True, False, False, False, False, False, False, False, False,
        True, False,  True,  True, False, False, False,  True,  True,
       False,  True, False,  True,  True, False,  True, False,  True,
        True, False,  True, False, False, False, False, False, False,
       False, False,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True, False, False, False,  True, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False,  True,  True, False,  True, False, False, False,
       False,  True, False,  True,  True])

In [103]:
selected=[]
idx=0
toSelect=grid1.best_estimator_.named_steps["vt"].get_support()
for i in toSelect:
  if(i == True):
    print(f'--- > {features_without_target[idx]}')
    selected.append(features_without_target[idx])

  idx=idx+1  

print(f'---selected variance threshold columns    {selected}')
print(f" number of features selected by variance threshold -->> {len(selected)}")
features_map["variance_threshold"] = selected 

--- > Flow Duration
--- > Bwd Packet Length Max
--- > Bwd Packet Length Mean
--- > Bwd Packet Length Std
--- > Flow IAT Std
--- > Flow IAT Max
--- > Fwd IAT Total
--- > Fwd IAT Std
--- > Fwd IAT Max
--- > Bwd IAT Total
--- > Bwd IAT Std
--- > Bwd IAT Max
--- > Fwd PSH Flags
--- > Max Packet Length
--- > Packet Length Mean
--- > Packet Length Std
--- > Packet Length Variance
--- > FIN Flag Count
--- > SYN Flag Count
--- > PSH Flag Count
--- > ACK Flag Count
--- > URG Flag Count
--- > Average Packet Size
--- > Avg Bwd Segment Size
--- > Init_Win_bytes_forward
--- > Init_Win_bytes_backward
--- > min_seg_size_forward
--- > Idle Mean
--- > Idle Max
--- > Idle Min
---selected variance threshold columns    ['Flow Duration', 'Bwd Packet Length Max', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow IAT Std', 'Flow IAT Max', 'Fwd IAT Total', 'Fwd IAT Std', 'Fwd IAT Max', 'Bwd IAT Total', 'Bwd IAT Std', 'Bwd IAT Max', 'Fwd PSH Flags', 'Max Packet Length', 'Packet Length Mean', 'Packet Len

In [104]:
y_train

45413    0
53771    1
28897    0
52967    1
25750    1
        ..
52279    1
6337     1
13761    0
24057    1
49832    0
Name: Label, Length: 2800, dtype: int32

In [105]:
##Lasso
## LASSO
from sklearn.linear_model import Lasso

pipe2 = Pipeline([
        # ('scale', MinMaxScaler()),
        ('lasso', Lasso())])


param_grid2 = dict(
                  lasso__alpha=np.arange(0.1,10,0.1)
                   
                  )

grid2 = GridSearchCV(pipe2, param_grid=param_grid2, cv=10, n_jobs=1, scoring= "neg_mean_squared_error")
grid2.fit(X_train, y_train)


print(f'The best estimator is --------->>   {grid2.best_estimator_}')
print()
print()
print(f' best params --->>  {grid2.best_params_}')
print()
coefficients = grid2.best_estimator_.named_steps['lasso'].coef_

print(f' the coefficients --->>  {coefficients}')


importance = np.abs(coefficients)
try:
    impr_features=np.array(features_without_target)[importance > 0]
    print()
    print()
    
    
    print(f' important features   --->>  {impr_features}')
    
    print()
    print()
    
    redundant_features=np.array(features_without_target)[importance == 0]
    print(f' redundant features --->>  {redundant_features}')
except:
    print("important features is not determined by lasso for 2 labels")
    impr_features = []

The best estimator is --------->>   Pipeline(steps=[('lasso', Lasso(alpha=0.1))])


 best params --->>  {'lasso__alpha': 0.1}

 the coefficients --->>  [-0.  0.  0.  0. -0.  0.  0.  0.  0. -0.  0. -0. -0.  0. -0. -0. -0. -0.
 -0. -0. -0. -0. -0.  0.  0. -0. -0. -0.  0.  0.  0.  0.  0. -0.  0. -0.
 -0.  0. -0. -0. -0. -0. -0.  0.  0. -0. -0.  0.  0.  0.  0. -0.  0. -0.
 -0.  0.  0.  0.  0.  0.  0.  0.  0.  0. -0. -0.  0.  0. -0.  0.  0.  0.
 -0. -0. -0. -0. -0.]


 important features   --->>  []


 redundant features --->>  ['Flow Duration' 'Total Fwd Packets' 'Total Backward Packets'
 'Total Length of Fwd Packets' 'Total Length of Bwd Packets'
 'Fwd Packet Length Max' 'Fwd Packet Length Min' 'Fwd Packet Length Mean'
 'Fwd Packet Length Std' 'Bwd Packet Length Max' 'Bwd Packet Length Min'
 'Bwd Packet Length Mean' 'Bwd Packet Length Std' 'Flow Bytes/s'
 'Flow Packets/s' 'Flow IAT Mean' 'Flow IAT Std' 'Flow IAT Max'
 'Flow IAT Min' 'Fwd IAT Total' 'Fwd IAT Mean' 'Fwd IAT Std' 'Fwd IAT Ma

In [106]:
### Now using random forest using the important features
if len(impr_features)!=0:
    df_lasso=df_sample
    df_lasso = df_lasso.drop(redundant_features, axis=1)
    
    #df_lasso.shape
    #df_lasso.head()
    
    y=df_lasso['Label']
    X=df_lasso.drop('Label', axis=1)
    X_train_ll, X_test_ll, y_train_ll, y_test_ll = train_test_split(X, y, test_size = 0.30,random_state=1)
    
    clf = RandomForestClassifier(random_state=0) 
    
    clf.fit(X_train_ll, y_train_ll)
    
    y_pred = clf.predict(X_test_ll)
     
    
    from sklearn import metrics 
    print()
     
    
    print("ACCURACY OF THE MODEL with lasso features: ", metrics.accuracy_score(y_test_ll, y_pred))

In [107]:
if len(impr_features)!=0:
    df_lasso

In [108]:
### Random forest feature importance



pipe3 = Pipeline([
        # ('scale', MinMaxScaler()),
        ('rfc',  RandomForestClassifier())])

param_grid3 = dict(
                  
                  rfc__criterion = ['gini','entropy'] ,
                  
                  rfc__random_state= [0]
                  )

grid3 = GridSearchCV(pipe3, param_grid=param_grid3, cv=10, n_jobs=1, scoring= 'accuracy')
grid3.fit(X_train, y_train)
print()
print()
print(f'The score ***********   {grid3.score(X_test,y_test)}')
print()
print()
print(grid3.cv_results_)
print()
print()
print(f'The best estimator is --------->>   {grid3.best_estimator_}')

print()
print()

importances=grid3.best_estimator_.named_steps["rfc"].feature_importances_


indices=np.argsort(importances)[::-1]

print(f' importances  --->   {importances}')
print()
print(f' indices  --->   {indices}')

print()
importantFeature=grid3.best_estimator_.named_steps["rfc"].feature_importances_

df_show=pd.DataFrame(importantFeature,index=features_without_target,columns=['importance']).sort_values('importance',ascending=False)
df_show.head(18)




The score ***********   0.9783333333333334


{'mean_fit_time': array([0.58268604, 0.58930089]), 'std_fit_time': array([0.06253572, 0.04385824]), 'mean_score_time': array([0.01214938, 0.01411994]), 'std_score_time': array([0.00469579, 0.00435472]), 'param_rfc__criterion': masked_array(data=['gini', 'entropy'],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'param_rfc__random_state': masked_array(data=[0, 0],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'params': [{'rfc__criterion': 'gini', 'rfc__random_state': 0}, {'rfc__criterion': 'entropy', 'rfc__random_state': 0}], 'split0_test_score': array([0.98571429, 0.98571429]), 'split1_test_score': array([0.975, 0.975]), 'split2_test_score': array([0.98571429, 0.98214286]), 'split3_test_score': array([0.98214286, 0.98214286]), 'split4_test_score': array([0.98214286, 0.98571429]), 'split5_test_score': array([0.99285714, 0.98928571]), 'split6_test_score': array([0.985

Unnamed: 0,importance
Init_Win_bytes_forward,0.09346
Fwd Packet Length Min,0.047495
Init_Win_bytes_backward,0.047088
Min Packet Length,0.04521
Bwd Packet Length Mean,0.037871
Packet Length Std,0.037753
Bwd Packet Length Min,0.03511
Avg Bwd Segment Size,0.03356
Fwd Packet Length Max,0.031457
Bwd Packet Length Std,0.030864


In [109]:
selected=[]
for i in range (0,18):
  selected.append(features_without_target[indices[i]])
selected
features_map["random_forest_feature_importance"]= selected

In [110]:
### use the above features... Tuesday

X_train_pi=X_train[selected]
X_train_pi.shape
X_test_pi=X_test[selected]
################################################

pipe3 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfc', RandomForestClassifier())
        ])

param_grid3 = dict(
                  
                  rfc__random_state=[0],
                  rfc__criterion=['gini'] 
                   
                  )

grid3 = GridSearchCV(pipe3, param_grid=param_grid3, cv=10)
grid3.fit(X_train_pi, y_train)
print(f'The score ***********   {grid3.score(X_test_pi,y_test)}')
#print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid3.best_estimator_}')

The score ***********   0.98
The best estimator is --------->>   Pipeline(steps=[('rfc', RandomForestClassifier(random_state=0))])


In [111]:
## Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

pipe4 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfe', RFE(RandomForestClassifier())),
        ('rfc', RandomForestClassifier())
        ])

param_grid4 = dict(
                  
                  rfe__n_features_to_select=[18,5,25],
                  rfe__step=[0.5] 
                   
                  )

grid4 = GridSearchCV(pipe4, param_grid=param_grid4, cv=10)
grid4.fit(X_train, y_train)
print(f'The score ***********   {grid4.score(X_test,y_test)}')
#print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid4.best_estimator_}')


#print(f'{grid4.best_estimator_.named_steps["rfe"].support_}')

toSelect=grid4.best_estimator_.named_steps["rfe"].support_
idx=0
print()
print()
selected=[]
for i in toSelect:
  if(i== True):
    print(f'--- > {features_without_target[idx]}')
    selected.append(features_without_target[idx])

  idx=idx+1  

print(f'---selected RFE columns    {selected}')
features_map["recursive_feature_elimination"]=selected


The score ***********   0.9816666666666667
The best estimator is --------->>   Pipeline(steps=[('rfe',
                 RFE(estimator=RandomForestClassifier(),
                     n_features_to_select=18, step=0.5)),
                ('rfc', RandomForestClassifier())])


--- > Total Length of Fwd Packets
--- > Fwd Packet Length Max
--- > Fwd Packet Length Min
--- > Fwd Packet Length Mean
--- > Bwd Packet Length Min
--- > Bwd Packet Length Mean
--- > Bwd Packet Length Std
--- > Fwd Header Length
--- > Min Packet Length
--- > Packet Length Mean
--- > Packet Length Std
--- > Packet Length Variance
--- > Average Packet Size
--- > Avg Bwd Segment Size
--- > Fwd Header Length.1
--- > Subflow Fwd Bytes
--- > Init_Win_bytes_forward
--- > Init_Win_bytes_backward
---selected RFE columns    ['Total Length of Fwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Fwd Header Length', 'Min

In [112]:
X_train_rfe=X_train[selected]
X_train_rfe.shape
X_test_rfe=X_test[selected]
################################################

pipe4 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfc', RandomForestClassifier())
        ])

param_grid4 = dict(
                  
                  rfc__random_state=[0],
                  rfc__criterion=['gini'] 
                   
                  )

grid4 = GridSearchCV(pipe4, param_grid=param_grid4, cv=10)
grid4.fit(X_train_rfe, y_train)
print(f'The score ***********   {grid4.score(X_test_rfe,y_test)}')

print(f'The best estimator is --------->>   {grid4.best_estimator_}')



The score ***********   0.9791666666666666
The best estimator is --------->>   Pipeline(steps=[('rfc', RandomForestClassifier(random_state=0))])


In [113]:
#Permutation importance
from sklearn.inspection import permutation_importance
forest = RandomForestClassifier(n_estimators=200,random_state=0)

forest.fit(X_train,y_train)
result =permutation_importance(
    estimator=forest,
    X=X_test,
    y=y_test,
    scoring='accuracy',
    n_repeats=50,
    random_state=0
)

importances=result.importances


indices=np.argsort(result['importances_mean'])[::-1]

print(f' indices  --->   {indices}')




 indices  --->   [65 23 66 28 40 26 45 25 20 64 75 43 12 32 31 30 29 42 44 48 49 76 58 70
 55 74 56 57 59 60 72 71 46 73 69 68 21  9  8 27 24 67  6 50 37 51 10 22
 11 52 39 47 19 18  3 16 41 53 38  2 62  1  7 13  0 54  5  4 15 61 63 17
 33 36 34 35 14]


In [114]:
## Top 18 attributes

selected=[]
for i in range(0,18):
  selected.append(features_without_target[indices[i]])
print (selected)  


['Init_Win_bytes_forward', 'Fwd IAT Min', 'Init_Win_bytes_backward', 'Bwd IAT Min', 'Packet Length Std', 'Bwd IAT Std', 'PSH Flag Count', 'Bwd IAT Mean', 'Fwd IAT Mean', 'Subflow Bwd Bytes', 'Idle Max', 'SYN Flag Count', 'Bwd Packet Length Std', 'Bwd URG Flags', 'Fwd URG Flags', 'Bwd PSH Flags', 'Fwd PSH Flags', 'FIN Flag Count']


In [115]:
features_map["permutation_importance"] = selected

In [116]:
X_train_pi=X_train[selected]
X_train_pi.shape
X_test_pi=X_test[selected]
################################################

pipe5 = Pipeline([
        # ('scale', StandardScaler()),
        ('rfc', RandomForestClassifier())
        ])

param_grid5 = dict(
                  
                  rfc__random_state=[0],
                  rfc__criterion=['gini'] 
                   
                  )

grid5 = GridSearchCV(pipe5, param_grid=param_grid5, cv=10)
grid5.fit(X_train_pi, y_train)
print(f'The score ***********   {grid5.score(X_test_pi,y_test)}')
#print(grid1.cv_results_)
print(f'The best estimator is --------->>   {grid5.best_estimator_}')

The score ***********   0.9791666666666666
The best estimator is --------->>   Pipeline(steps=[('rfc', RandomForestClassifier(random_state=0))])


In [117]:
# for j in gaures:
#     for idx, x in enumerate(df.columns):
#         if idx == j:
#             print(x)
    

In [118]:
print("")
print(features_map)


{'variance_threshold': ['Flow Duration', 'Bwd Packet Length Max', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow IAT Std', 'Flow IAT Max', 'Fwd IAT Total', 'Fwd IAT Std', 'Fwd IAT Max', 'Bwd IAT Total', 'Bwd IAT Std', 'Bwd IAT Max', 'Fwd PSH Flags', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'Average Packet Size', 'Avg Bwd Segment Size', 'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'min_seg_size_forward', 'Idle Mean', 'Idle Max', 'Idle Min'], 'random_forest_feature_importance': ['Init_Win_bytes_forward', 'Fwd Packet Length Min', 'Init_Win_bytes_backward', 'Min Packet Length', 'Bwd Packet Length Mean', 'Packet Length Std', 'Bwd Packet Length Min', 'Avg Bwd Segment Size', 'Fwd Packet Length Max', 'Bwd Packet Length Std', 'Fwd Header Length', 'Fwd Header Length.1', 'Average Packet Size', 'Total Length of Fwd Packets', 'Packet Length Me

In [119]:
## write extracted columns from different selection into files
f = open("data/features_with_two_labels.txt","w")
for i in features_map:
    f.write("\n")
    f.write(i)
    f.write("------------------")
    f.write(",".join(features_map[i]))
f.close()

In [120]:
## apply pca on the sample dataset.
## we will be using the converted pca dataset to train our ensemble model
## and use the unclassified packet to classify after our model is trained  with the pca dataset
X = df_sample.iloc[:,:-1]
y = df_sample.iloc[:,-1]


In [121]:
## pca with 18 components
from sklearn.decomposition import PCA
pca= PCA(n_components=18)
pca.fit(X)
x_pca=pca.transform(X)
x_pca.shape
    

(4000, 18)

In [122]:
df_sample_pca = pd.DataFrame(x_pca)

In [123]:
df_sample_pca["Label"] = df_scaled["Label"]

In [124]:
df_sample_pca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,Label
0,-0.408905,-0.159307,-0.230454,-0.455372,0.056797,-0.196815,0.019011,-0.205289,0.092246,-0.046188,0.018825,0.020911,-0.003364,-0.012256,-0.052942,0.038146,0.008284,-0.007477,1
1,-0.539372,0.546249,0.041799,0.080600,-0.019903,-0.017423,-0.042803,0.008006,-0.221979,0.057061,-0.047408,0.011112,0.007335,0.043446,0.018192,-0.035558,0.013701,0.051241,1
2,1.733715,0.906429,-1.533672,0.974148,-0.178074,0.180908,-0.174684,0.110237,-0.106085,-0.570437,-0.338356,0.240534,0.339024,0.098736,-0.367856,-0.290621,-0.062947,-0.085403,1
3,2.068454,-0.304433,0.141371,0.415920,-0.005061,-0.441809,-0.168966,-0.122801,-0.027873,-0.029569,0.054650,-0.004729,-0.111452,0.008960,-0.044779,0.005351,0.052300,0.002084,1
4,2.429798,0.624495,-0.768942,-0.548996,-0.072233,0.485596,-0.076786,0.309215,0.004538,-0.052508,-0.066062,0.026297,0.045563,-0.065278,-0.105723,-0.031118,-0.025736,0.018376,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,-0.132174,0.841328,0.655542,-0.212868,0.038786,0.250933,0.124704,0.005859,-0.173765,-0.150177,-0.203967,0.102157,0.143629,0.698327,0.100057,0.155157,1.299533,0.132145,0
3996,-0.532794,0.555259,-0.095646,0.263294,-0.018719,-0.069300,-0.107351,0.064225,0.009482,0.038753,-0.044321,-0.077280,0.031167,0.051528,0.083249,-0.200439,0.049484,-0.026142,0
3997,-0.667760,0.536180,-0.099454,0.266482,-0.019140,-0.069673,-0.150289,0.057599,0.121772,-0.083555,0.043511,-0.083464,0.023645,-0.042068,-0.048759,0.054590,0.000797,0.016756,0
3998,-0.412459,-0.166992,-0.240050,-0.470200,0.049389,-0.183063,0.045612,-0.197898,0.020010,0.013300,-0.031264,0.013438,0.003908,0.031358,0.023607,-0.095272,-0.003672,0.040023,0


In [125]:
## write the scaled doc for testing
df_sample_pca.to_csv('./data/sample_pca_test.csv',index=0)