## Mini-Project 2: Network Intrusion Detector


#### CSC 180  Intelligent Systems (Fall 2019)
#### Derrek Gass, Alexander Lee, Jimmy Le
#### 10-11-2019

In [50]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure, show

import collections

import io
import requests
import shutil
import os, json
import csv

from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import sklearn.feature_extraction.text as sk_text
from sklearn.metrics import confusion_matrix, classification_report


import numpy as np
import pandas as pd
import dask.dataframe as dd

import tensorflow as tf
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import optimizers, regularizers




# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, collections.Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

    
# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    

# Plot an ROC. pred - the predictions, y - the expected output.
def plot_roc(pred,y):
    fpr, tpr, thresholds = roc_curve(y, pred)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
    plt.show()
    

In [2]:
from dask.distributed import Client, progress
client = Client()

In [3]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:50162  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 8.59 GB


In [4]:
#load csv into dask dataframe
#df = dd.read_csv('data/network_intrusion_data.csv')
#load csv into dask dataframe
df = dd.read_csv('data/network_intrusion_onehot.csv')
X = dd.read_csv('data/network_intrusion_normalized.csv')


In [57]:
#load csv into pandas dataframe
df = pd.read_csv('data/network_intrusion_data.csv')

In [41]:
df.columns = [
 'duration',
 'protocol_type',
 'service',
 'flag',
 'src_bytes',
 'dst_bytes',
 'land',
 'wrong_fragment',
 'urgent',
 'hot',
 'num_failed_logins',
 'logged_in',
 'num_compromised',
 'root_shell',
 'su_attempted',
 'num_root',
 'num_file_creations',
 'num_shells',
 'num_access_files',
 'num_outbound_cmds',
 'is_host_login',
 'is_guest_login',
 'count',
 'srv_count',
 'serror_rate',
 'srv_serror_rate',
 'rerror_rate',
 'srv_rerror_rate',
 'same_srv_rate',
 'diff_srv_rate',
 'srv_diff_host_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_rerror_rate',
 'outcome'
]

In [42]:
df = df.drop_duplicates(subset=None, keep='first', inplace=False)

In [43]:
from dask_ml.preprocessing import Categorizer, DummyEncoder

In [None]:
df = df.categorize()
df.dtypes

In [None]:
encoder = DummyEncoder()
df = encoder.fit_transform(df)

In [None]:
df.compute()

In [71]:
df.head(20)

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,outcome_imap.,outcome_satan.,outcome_phf.,outcome_nmap.,outcome_multihop.,outcome_warezmaster.,outcome_warezclient.,outcome_spy.,outcome_rootkit.,intruder_status
0,0,239,486,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,235,1337,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,219,1337,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,217,2032,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,217,2032,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,0,212,1940,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6,0,159,4087,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,0,210,151,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,0,212,786,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,0,210,624,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df['intruder_status']=df['outcome_normal.'].apply(lambda x: 0 if x ==1 else 1, meta=('outcome_normal.', 'int64'))


In [None]:
df.compute()

In [151]:
encode_csv = X_copy.to_csv(r'data/network_intrusion_normalized.csv', single_file=True, header=True, index=False)

In [8]:
X = X.drop('intruder_status', axis=1)

In [166]:
X['intruder_status']=y

In [9]:
print(len(X.columns))

141


In [10]:
y = df['intruder_status']

In [17]:
X_copy = X.to_dask_array()

In [21]:
X_copy.compute()

array([[-0.10785025, -0.00426104, -0.03903572, ...,  0.        ,
         0.        ,  0.        ],
       [-0.10785025, -0.00426324, -0.02504131, ...,  0.        ,
         0.        ,  0.        ],
       [-0.10785025, -0.00427203, -0.02504131, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.10785025, -0.00428082, -0.02729423, ...,  0.        ,
         0.        ,  0.        ],
       [-0.10785025, -0.00423248, -0.02729423, ...,  0.        ,
         0.        ,  0.        ],
       [-0.10785025, -0.00427203, -0.02673511, ...,  0.        ,
         0.        ,  0.        ]])

In [22]:
x = X_copy.compute().reshape((len(X), 1, len(X.columns), 1))

In [62]:
y_one_hot = tf.keras.utils.to_categorical(y, 2)

In [28]:
x

array([[[[-0.10785025],
         [-0.00426104],
         [-0.03903572],
         ...,
         [ 0.        ],
         [ 0.        ],
         [ 0.        ]]],


       [[[-0.10785025],
         [-0.00426324],
         [-0.02504131],
         ...,
         [ 0.        ],
         [ 0.        ],
         [ 0.        ]]],


       [[[-0.10785025],
         [-0.00427203],
         [-0.02504131],
         ...,
         [ 0.        ],
         [ 0.        ],
         [ 0.        ]]],


       ...,


       [[[-0.10785025],
         [-0.00428082],
         [-0.02729423],
         ...,
         [ 0.        ],
         [ 0.        ],
         [ 0.        ]]],


       [[[-0.10785025],
         [-0.00423248],
         [-0.02729423],
         ...,
         [ 0.        ],
         [ 0.        ],
         [ 0.        ]]],


       [[[-0.10785025],
         [-0.00427203],
         [-0.02673511],
         ...,
         [ 0.        ],
         [ 0.        ],
         [ 0.        ]]]])

In [95]:
from dask_ml.model_selection import train_test_split

In [31]:
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(x, y_one_hot, random_state=42)

In [45]:
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')

In [46]:
x_train.shape

(109188, 1, 141, 1)

In [44]:
y_train.shape

(109188, 2)

In [48]:
x_test.shape

(36397, 1, 141, 1)

In [63]:
y_test.shape

(36397, 2)

In [52]:
# define a CNN

cnn = Sequential()
cnn.add(Conv2D(64, kernel_size=(1, y_train.shape[1]), strides=(1, 1),
                 activation='relu',
                 input_shape=(1, x_train.shape[2], 1)))

# the above code is equivalent to 
# model.add(Conv1D(64, kernel_size=3, strides=1, activation='relu', input_shape=(128, 1)))

cnn.add(MaxPooling2D(pool_size=(1,2)))

cnn.add(Conv2D(x_train.shape[2], kernel_size=(1, y_train.shape[1]), strides=(1, 1),
                 activation='relu'))
cnn.add(MaxPooling2D(pool_size=(1,2)))
    
    
cnn.add(Flatten())
cnn.add(Dense(1024, activation="relu"))
cnn.add(Dropout(0.5))
cnn.add(Dense(y_train.shape[1], activation="softmax"))

# define optimizer and objective, compile cnn

cnn.compile(loss="categorical_crossentropy", optimizer="adam")

In [54]:
cnn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_8 (Conv2D)            (None, 1, 140, 64)        192       
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 1, 70, 64)         0         
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 1, 69, 141)        18189     
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 1, 34, 141)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 4794)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              4910080   
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
__________

In [68]:
import time

start_time = time.time()

# 1% of dataset
# Define batch_size and # of epochs
batch_size = 128

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-6, patience=5, verbose=1, mode='auto')
checkpointer = ModelCheckpoint(filepath="data/best_weights.hdf5", verbose=0, save_best_only=True) # save best model

cnn.fit(x_train[0:10000], y_train[0:10000],     
          batch_size=batch_size,
          callbacks=[monitor, checkpointer],
          epochs=100,
          verbose=2,
          validation_data=(x_test[0:2000], y_test[0:2000]))

cnn.load_weights('data/best_weights.hdf5')
elapsed_time = time.time() - start_time
print("Elapsed time: {}".format(hms_string(elapsed_time)))

Train on 10000 samples, validate on 2000 samples
Epoch 1/100
 - 27s - loss: 0.2189 - val_loss: 0.1010
Epoch 2/100
 - 27s - loss: 0.0654 - val_loss: 0.0449
Epoch 3/100
 - 27s - loss: 0.0225 - val_loss: 0.0094
Epoch 4/100
 - 27s - loss: 0.0098 - val_loss: 0.0040
Epoch 5/100
 - 27s - loss: 0.0046 - val_loss: 0.0077
Epoch 6/100
 - 27s - loss: 6.5174e-04 - val_loss: 0.0078
Epoch 7/100
 - 27s - loss: 2.3181e-04 - val_loss: 0.0076
Epoch 8/100
 - 26s - loss: 2.2010e-04 - val_loss: 0.0075
Epoch 9/100
 - 27s - loss: 1.7676e-04 - val_loss: 0.0065
Epoch 00009: early stopping
Elapsed time: 0:04:02.21


In [69]:
# evaluate() computes the loss and accuracy
score = cnn.evaluate(x_test[0:100], y_test[0:100], verbose=2)
score

 - 0s - loss: 3.9657e-04


0.00039657415356487036

In [70]:
y_true = np.argmax(y_test[0:500],axis=1)
pred = cnn.predict(x_test[0:500])
pred = np.argmax(pred,axis=1)


score = metrics.accuracy_score(y_true, pred)
print('Accuracy: {}'.format(score))


f1 = metrics.f1_score(y_true, pred, average='weighted')
print('Averaged F1: {}'.format(f1))

           
print(metrics.classification_report(y_true, pred))

Accuracy: 1.0
Averaged F1: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       298
           1       1.00      1.00      1.00       202

    accuracy                           1.00       500
   macro avg       1.00      1.00      1.00       500
weighted avg       1.00      1.00      1.00       500

