In [18]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [19]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [20]:
tf.test.is_built_with_cuda()

True

In [21]:
dataset = pd.read_csv("./MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv")
feature_list = dataset.columns.values

In [22]:
dataset.shape  
dataset = dataset.replace(np.inf,np.nan)    # replacing inf with nan
dataset = dataset.fillna(dataset.mean(numeric_only=True)) # ghen converting nan to mean values

In [23]:
X = dataset.iloc[:,:-1].values
print("shape of X",X.shape)

Y = dataset.iloc[:,-1].values
print("shape of Y",Y.shape)

shape of X (225745, 78)
shape of Y (225745,)


In [24]:
print("is NaN present:",np.any(np.isnan(X)))  # to check whether the array contains nan
print("is inf present:",np.any(np.isinf(X)))  # to check whether the array contains inf
X[X < 0] = 0   # to replace all negative values with zeros

is NaN present: False
is inf present: False


In [25]:
from sklearn.feature_selection import SelectKBest  # feature selection
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif

In [26]:
bestfeatures = SelectKBest(score_func = mutual_info_classif, k=50)
fit = bestfeatures.fit(X,Y)
#create df for scores
dfscores = pd.DataFrame(fit.scores_)
#create df for column names
dfcolumns = pd.DataFrame(feature_list)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)

#naming the dataframe columns
featureScores.columns = ['Selected_columns','Score_chi2']  
#print 50 best features
print(featureScores.nlargest(50,'Score_chi2')) 


                Selected_columns  Score_chi2
4    Total Length of Fwd Packets    0.652321
63             Subflow Fwd Bytes    0.652271
52           Average Packet Size    0.563413
5    Total Length of Bwd Packets    0.548778
65             Subflow Bwd Bytes    0.548703
12        Bwd Packet Length Mean    0.544560
34             Fwd Header Length    0.544465
54          Avg Bwd Segment Size    0.544073
55           Fwd Header Length.1    0.543935
0               Destination Port    0.538541
10         Bwd Packet Length Max    0.528925
66        Init_Win_bytes_forward    0.496586
53          Avg Fwd Segment Size    0.490605
8         Fwd Packet Length Mean    0.489709
6          Fwd Packet Length Max    0.485860
35             Bwd Header Length    0.475500
23                   Fwd IAT Max    0.460042
20                 Fwd IAT Total    0.457025
21                  Fwd IAT Mean    0.447676
2              Total Fwd Packets    0.410077
62           Subflow Fwd Packets    0.409837
22        

In [27]:
# print(featureScores.nlargest(50,'Score_chi2').Selected_columns.values)
featureScore_after_filter = featureScores.nlargest(50,'Score_chi2')
print(featureScore_after_filter.index[0])
count = 0
ind = []
for i in featureScore_after_filter.Score_chi2:
    if i < 0.2:
        ind.append(featureScore_after_filter.index[count])
    count = count + 1
featureScore_after_filter = featureScore_after_filter.drop(ind,axis = 0)  # contains all the filtered features
X = pd.DataFrame(X)
X = X.loc[:,featureScore_after_filter.index] # contains data after filter from feature selection
print(X)

4
          4     63   52   5    65   12    34   54    55       0   ...  \
0       12.0  12.0  9.0  0.0  0.0  0.0  40.0  0.0  40.0  54865.0  ...   
1        6.0   6.0  9.0  6.0  6.0  6.0  20.0  6.0  20.0  55054.0  ...   
2        6.0   6.0  9.0  6.0  6.0  6.0  20.0  6.0  20.0  55055.0  ...   
3        6.0   6.0  9.0  6.0  6.0  6.0  20.0  6.0  20.0  46236.0  ...   
4       12.0  12.0  9.0  0.0  0.0  0.0  40.0  0.0  40.0  54863.0  ...   
...      ...   ...  ...  ...  ...  ...   ...  ...   ...      ...  ...   
225740   6.0   6.0  9.0  6.0  6.0  6.0  20.0  6.0  20.0  61374.0  ...   
225741   6.0   6.0  9.0  6.0  6.0  6.0  20.0  6.0  20.0  61378.0  ...   
225742   6.0   6.0  9.0  6.0  6.0  6.0  20.0  6.0  20.0  61375.0  ...   
225743  12.0  12.0  9.0  0.0  0.0  0.0  40.0  0.0  40.0  61323.0  ...   
225744   6.0   6.0  9.0  6.0  6.0  6.0  20.0  6.0  20.0  61326.0  ...   

                   36   39     1             14   11     18   17  \
0       666666.666700  6.0    3.0  4.000000e+06  0.0 

In [28]:
from sklearn.preprocessing import LabelEncoder
labelencoder_y= LabelEncoder()  
Y = labelencoder_y.fit_transform(Y)  

In [29]:
from sklearn.model_selection import train_test_split  
x_train, x_test, y_train, y_test= train_test_split(X, Y, test_size= 0.2, random_state=0)

In [30]:
from sklearn.preprocessing import StandardScaler  # scaling of the data

scaler_X = StandardScaler()
x_train_scaled = scaler_X.fit_transform(x_train) # preprocessed training data
x_test_scaled = scaler_X.fit_transform(x_test) # preprocessed testing data

In [31]:
from tensorflow import keras
from tensorflow.keras import layers
model = keras.Sequential([
    keras.layers.Dense(64, input_shape=(45,), activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(2, activation='sigmoid')
])
 

In [32]:
# Compiling the model
model.compile(optimizer='adam',
              loss=keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])
 

In [33]:
# fitting the model
model.fit(x_train_scaled, y_train, epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x21ecd365790>

In [34]:
model.evaluate(x_test_scaled, y_test,64)



[0.0032083799596875906, 0.9992469549179077]

In [54]:
y_pred = model.predict(x_test_scaled,64)
y_pred = np.argmax(y_pred,axis = 1)
y_pred

array([0, 0, 1, ..., 0, 0, 1], dtype=int64)

In [55]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9992469379166758