In [63]:
import json
import csv
import pandas as pd
import numpy as np
import scipy as sci
import keras
from keras.models import Sequential
from keras import regularizers
from keras.layers.core import Dense, Activation
from keras.layers import Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping, ModelCheckpoint
import io
import requests
import tensorflow as tf
from scipy import sparse
from sklearn.model_selection import train_test_split
import sklearn.feature_extraction.text as sk_text
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, f1_score
import collections
from sklearn import preprocessing
import matplotlib.pyplot as plt
import shutil
import os

In [37]:
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, collections.Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

In [38]:
network_df= pd.read_csv('network_intrusion_data.csv', delimiter =",")

In [39]:
network_df.columns = [
'duration',
'protocol_type',
'service',
'flag',
'src_bytes',
'dst_bytes',
'land',
'wrong_fragment',
'urgent',
'hot',
'num_failed_logins',
'logged_in',
'num_compromised',
'root_shell',
'su_attempted',
'num_root',
'num_file_creations',
'num_shells',
'num_access_files',
'num_outbound_cmds',
'is_host_login',
'is_guest_login',
'count',
'srv_count',
'serror_rate',
'srv_serror_rate',
'rerror_rate',
'srv_rerror_rate',
'same_srv_rate',
'diff_srv_rate',
'srv_diff_host_rate',
'dst_host_count',
'dst_host_srv_count',
'dst_host_same_srv_rate',
'dst_host_diff_srv_rate',
'dst_host_same_src_port_rate',
'dst_host_srv_diff_host_rate',
'dst_host_serror_rate',
'dst_host_srv_serror_rate',
'dst_host_rerror_rate',
'dst_host_srv_rerror_rate',
'outcome'
]

In [40]:
network_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome
0,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,59,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [41]:
network_df.shape

(494020, 42)

In [42]:
network_df.drop_duplicates(keep='first', inplace=True)

In [43]:
network_df.shape

(145585, 42)

In [44]:
network_df.dtypes

duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate          

In [46]:
#Normalize numeric features

def normalize_numeric_minmax(df, name):
    if(df[name].max() > 0):
        df[name] = ((df[name] - df[name].min()) / (df[name].max() - df[name].min())).astype(np.float32)
    else:
        df[name] = df[name].astype(np.float32)

normalize_numeric_minmax(network_df,"duration")
encode_text_dummy(network_df,"protocol_type") 
encode_text_dummy(network_df,"service") 
encode_text_dummy(network_df,"flag")
normalize_numeric_minmax(network_df,"src_bytes") 
normalize_numeric_minmax(network_df,"dst_bytes")
encode_text_dummy(network_df,"land")
normalize_numeric_minmax(network_df,"wrong_fragment") 
normalize_numeric_minmax(network_df,"urgent")
normalize_numeric_minmax(network_df,"hot")
normalize_numeric_minmax(network_df,"num_failed_logins")
encode_text_dummy(network_df,"logged_in")
normalize_numeric_minmax(network_df,"num_compromised")
normalize_numeric_minmax(network_df,"root_shell")
normalize_numeric_minmax(network_df,"su_attempted")
normalize_numeric_minmax(network_df,"num_root")
normalize_numeric_minmax(network_df,"num_file_creations")
normalize_numeric_minmax(network_df,"num_shells")
normalize_numeric_minmax(network_df,"num_access_files")
normalize_numeric_minmax(network_df,"num_outbound_cmds")
encode_text_dummy(network_df,"is_host_login")
encode_text_dummy(network_df,"is_guest_login")
normalize_numeric_minmax(network_df,"count") 
normalize_numeric_minmax(network_df,"srv_count") 
normalize_numeric_minmax(network_df,"serror_rate") 
normalize_numeric_minmax(network_df,"srv_serror_rate") 
normalize_numeric_minmax(network_df,"rerror_rate") 
normalize_numeric_minmax(network_df,"srv_rerror_rate") 
normalize_numeric_minmax(network_df,"same_srv_rate") 
normalize_numeric_minmax(network_df,"diff_srv_rate") 
normalize_numeric_minmax(network_df,"srv_diff_host_rate") 
normalize_numeric_minmax(network_df,"dst_host_count") 
normalize_numeric_minmax(network_df,"dst_host_srv_count") 
normalize_numeric_minmax(network_df,"dst_host_same_srv_rate") 
normalize_numeric_minmax(network_df,"dst_host_diff_srv_rate") 
normalize_numeric_minmax(network_df,"dst_host_same_src_port_rate") 
normalize_numeric_minmax(network_df,"dst_host_srv_diff_host_rate") 
normalize_numeric_minmax(network_df,"dst_host_serror_rate") 
normalize_numeric_minmax(network_df,"dst_host_srv_serror_rate") 
normalize_numeric_minmax(network_df,"dst_host_rerror_rate") 
normalize_numeric_minmax(network_df,"dst_host_srv_rerror_rate") 

In [47]:
network_df.dtypes

duration                       float32
src_bytes                      float32
dst_bytes                      float32
wrong_fragment                 float32
urgent                         float32
hot                            float32
num_failed_logins              float32
num_compromised                float32
root_shell                     float32
su_attempted                   float32
num_root                       float32
num_file_creations             float32
num_shells                     float32
num_access_files               float32
num_outbound_cmds              float32
count                          float32
srv_count                      float32
serror_rate                    float32
srv_serror_rate                float32
rerror_rate                    float32
srv_rerror_rate                float32
same_srv_rate                  float32
diff_srv_rate                  float32
srv_diff_host_rate             float32
dst_host_count                 float32
dst_host_srv_count       

In [48]:
# create a function called encodeLabelBinarizer

encodeLabelBinary = lambda x: 0 if x == 'normal.' else 1

In [49]:
network_df['outcome'] = network_df['outcome'].apply(encodeLabelBinary)

In [50]:
x,y=to_xy(network_df,'outcome')

In [51]:
x.shape

(145585, 121)

In [52]:
y.shape

(145585, 2)

In [53]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)

In [54]:
x_train.shape

(116468, 121)

In [55]:
y_train.shape

(116468, 2)

In [56]:
x_test.shape

(29117, 121)

In [57]:
y_test.shape

(29117, 2)

# CNN

In [58]:
# we now reshape the x_train and x_test to image form used in CNN 2D
x_train = x_train.reshape(x_train.shape[0], 1, 121, 1)
x_test = x_test.reshape(x_test.shape[0], 1, 121, 1)

In [59]:
x_train.shape

(116468, 1, 121, 1)

In [60]:
x_test.shape

(29117, 1, 121, 1)

In [61]:
x_train[0:5]

array([[[[0.0000000e+00],
         [2.9853948e-07],
         [1.4954996e-04],
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00],
         [2.7397260e-02],
         [3.1311154e-02],
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00],
         [1.0000000e+00],
         [0.0000000e+00],
         [1.2000000e-01],
         [6.1176473e-01],
         [1.0000000e+00],
         [1.0000000e+00],
         [0.0000000e+00],
         [9.9999998e-03],
         [2.0000000e-02],
         [9.9999998e-03],
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00],
         [1.0000000e+00],
         [0.0000000e+00],
         [0.0000000e+00],
         [0.

In [67]:
# finally the much avaited CNN 2D

cnn = Sequential()

# Conv2D layer 1
cnn.add(Conv2D(41, kernel_size=(1, 3), strides=(1, 1), padding='valid',
                 activation='relu',
                 input_shape=(1,121,1)))

In [68]:
cnn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 1, 119, 41)        164       
Total params: 164
Trainable params: 164
Non-trainable params: 0
_________________________________________________________________


In [69]:
cnn.add(Conv2D(64, (1, 3), activation='relu'))

cnn.add(MaxPooling2D(pool_size=(1, 2), strides=None))

cnn.add(Dropout(0.25))

In [70]:
cnn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 1, 119, 41)        164       
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 1, 117, 64)        7936      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 1, 58, 64)         0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1, 58, 64)         0         
Total params: 8,100
Trainable params: 8,100
Non-trainable params: 0
_________________________________________________________________


In [73]:
cnn.add(Dense(128, activation='relu'))

cnn.add(Dropout(0.5))

cnn.add(Dense(2, activation='softmax'))

In [74]:
cnn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 1, 119, 41)        164       
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 1, 117, 64)        7936      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 1, 58, 64)         0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1, 58, 64)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 3712)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               475264    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
__________

In [75]:
cnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [77]:
cnn.fit(x_train, y_train,     
          batch_size=128,
          epochs=10,
          verbose=2,
          validation_data=(x_test, y_test))

Train on 116468 samples, validate on 29117 samples
Epoch 1/10
 - 24s - loss: 0.0388 - acc: 0.9884 - val_loss: 0.0089 - val_acc: 0.9972
Epoch 2/10
 - 24s - loss: 0.0129 - acc: 0.9963 - val_loss: 0.0070 - val_acc: 0.9982
Epoch 3/10
 - 24s - loss: 0.0104 - acc: 0.9971 - val_loss: 0.0058 - val_acc: 0.9984
Epoch 4/10
 - 25s - loss: 0.0088 - acc: 0.9976 - val_loss: 0.0052 - val_acc: 0.9985
Epoch 5/10
 - 25s - loss: 0.0085 - acc: 0.9978 - val_loss: 0.0055 - val_acc: 0.9981
Epoch 6/10
 - 25s - loss: 0.0080 - acc: 0.9978 - val_loss: 0.0045 - val_acc: 0.9985
Epoch 7/10
 - 24s - loss: 0.0071 - acc: 0.9980 - val_loss: 0.0055 - val_acc: 0.9985
Epoch 8/10
 - 26s - loss: 0.0066 - acc: 0.9982 - val_loss: 0.0035 - val_acc: 0.9989
Epoch 9/10
 - 23s - loss: 0.0064 - acc: 0.9982 - val_loss: 0.0049 - val_acc: 0.9986
Epoch 10/10
 - 23s - loss: 0.0063 - acc: 0.9983 - val_loss: 0.0041 - val_acc: 0.9987


<keras.callbacks.History at 0x1a1ae19748>

In [78]:
# Evaluate Accuracy in Keras
score = cnn.evaluate(x_test, y_test, verbose=0)
score

[0.00407639158677397, 0.9986605762956349]

In [79]:
print('Test loss: {}'.format(score[0]))
print('Test accuracy: {}'.format(score[1]))

Test loss: 0.00407639158677397
Test accuracy: 0.9986605762956349
