In [45]:
import json
import csv
import pandas as pd
import numpy as np
import scipy as sci
import keras
from keras.models import Sequential
from keras import regularizers
from keras.layers.core import Dense, Activation
from keras.layers import Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping, ModelCheckpoint
import io
import requests
import tensorflow as tf
from scipy import sparse
from sklearn.model_selection import train_test_split
import sklearn.feature_extraction.text as sk_text
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, f1_score
import collections
from sklearn import preprocessing
import matplotlib.pyplot as plt
import shutil
import os

# Data PreProcessing

In [46]:
#Read network_intrusion_data.csv file and load data into network_df dataframe 
network_df= pd.read_csv('network_intrusion_data.csv')

In [47]:
#Drop any row with missing values
network_df = network_df.dropna()

In [48]:
#Add column headers to the data in the dataframe
network_df.columns = [
'duration',
'protocol_type',
'service',
'flag',
'src_bytes',
'dst_bytes',
'land',
'wrong_fragment',
'urgent',
'hot',
'num_failed_logins',
'logged_in',
'num_compromised',
'root_shell',
'su_attempted',
'num_root',
'num_file_creations',
'num_shells',
'num_access_files',
'num_outbound_cmds',
'is_host_login',
'is_guest_login',
'count',
'srv_count',
'serror_rate',
'srv_serror_rate',
'rerror_rate',
'srv_rerror_rate',
'same_srv_rate',
'diff_srv_rate',
'srv_diff_host_rate',
'dst_host_count',
'dst_host_srv_count',
'dst_host_same_srv_rate',
'dst_host_diff_srv_rate',
'dst_host_same_src_port_rate',
'dst_host_srv_diff_host_rate',
'dst_host_serror_rate',
'dst_host_srv_serror_rate',
'dst_host_rerror_rate',
'dst_host_srv_rerror_rate',
'outcome'
]

In [33]:
#Select only relevant columns for processing
features_df = network_df[['duration',
'protocol_type',
'service',
'flag',
'src_bytes',
'dst_bytes',
'land',
'wrong_fragment',
'urgent',
                          
'hot',
'num_failed_logins',
'logged_in',
'num_compromised',
'root_shell',
'su_attempted',
'num_root',
'num_file_creations',
'num_shells',
'num_access_files',
'num_outbound_cmds',
'is_host_login',
'is_guest_login',
                          
'count',
'srv_count',
'serror_rate',
'srv_serror_rate',
'rerror_rate',
'srv_rerror_rate',
'same_srv_rate',
'diff_srv_rate',
'srv_diff_host_rate',
'dst_host_count',
'dst_host_srv_count',
'dst_host_same_srv_rate',
'dst_host_diff_srv_rate',
'dst_host_same_src_port_rate',
'dst_host_srv_diff_host_rate',
'dst_host_serror_rate',
'dst_host_srv_serror_rate',
'dst_host_rerror_rate',
'dst_host_srv_rerror_rate']]

In [34]:
label_df = network_df[['outcome']]

In [35]:
#Normalize numeric features

def normalize_numeric_minmax(df, name):
    if(df[name].max() > 0):
        df[name] = ((df[name] - df[name].min()) / (df[name].max() - df[name].min())).astype(np.float32)
    else:
        df[name] = df[name].astype(np.float32)
    
normalize_numeric_minmax(features_df,"duration") 
normalize_numeric_minmax(features_df,"src_bytes") 
normalize_numeric_minmax(features_df,"dst_bytes") 
normalize_numeric_minmax(features_df,"wrong_fragment") 
normalize_numeric_minmax(features_df,"urgent") 

normalize_numeric_minmax(features_df,"hot") 
normalize_numeric_minmax(features_df,"num_failed_logins") 
normalize_numeric_minmax(features_df,"num_compromised") 
normalize_numeric_minmax(features_df,"num_root") 
normalize_numeric_minmax(features_df,"num_file_creations") 
normalize_numeric_minmax(features_df,"num_shells") 
normalize_numeric_minmax(features_df,"num_access_files") 
normalize_numeric_minmax(features_df,"num_outbound_cmds") 

normalize_numeric_minmax(features_df,"count") 
normalize_numeric_minmax(features_df,"srv_count") 
normalize_numeric_minmax(features_df,"serror_rate") 
normalize_numeric_minmax(features_df,"srv_serror_rate") 
normalize_numeric_minmax(features_df,"rerror_rate") 
normalize_numeric_minmax(features_df,"srv_rerror_rate")  
normalize_numeric_minmax(features_df,"same_srv_rate") 
normalize_numeric_minmax(features_df,"diff_srv_rate") 
normalize_numeric_minmax(features_df,"srv_diff_host_rate") 
normalize_numeric_minmax(features_df,"dst_host_count") 
normalize_numeric_minmax(features_df,"dst_host_srv_count") 
normalize_numeric_minmax(features_df,"dst_host_same_srv_rate") 
normalize_numeric_minmax(features_df,"dst_host_diff_srv_rate") 
normalize_numeric_minmax(features_df,"dst_host_same_src_port_rate") 
normalize_numeric_minmax(features_df,"dst_host_srv_diff_host_rate") 
normalize_numeric_minmax(features_df,"dst_host_serror_rate") 
normalize_numeric_minmax(features_df,"dst_host_srv_serror_rate") 
normalize_numeric_minmax(features_df,"dst_host_rerror_rate") 
normalize_numeric_minmax(features_df,"dst_host_srv_rerror_rate") 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [36]:
# one-hot cooding of categorical columns

def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name]).astype(np.float32)
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

encode_text_dummy(features_df,"protocol_type") 
encode_text_dummy(features_df,"service") 
encode_text_dummy(features_df,"flag") 
encode_text_dummy(features_df,"land") 

encode_text_dummy(features_df,"logged_in") 
encode_text_dummy(features_df,"root_shell") 
encode_text_dummy(features_df,"su_attempted") 
encode_text_dummy(features_df,"is_host_login") 
encode_text_dummy(features_df,"is_guest_login") 

In [37]:
features_df = features_df[0:10000]

In [38]:
label_df = label_df[0:10000]

In [39]:
# create a function called encodeLabelBinarizer

encodeLabelBinary = lambda x: 0 if x == 'normal.' else 1


In [40]:
label_df['outcome'] = label_df['outcome'].apply(encodeLabelBinary)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [41]:
x_train, x_test, y_train, y_test = train_test_split(features_df, label_df['outcome'] , test_size=0.2, random_state=42)

# Training and Prediction using Regression and Classification

** Logistic Regression **

In [42]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

Log_reg_model = LogisticRegression()

Log_reg_model.fit(x_train, y_train)

y_pred_logistic = Log_reg_model.predict(x_test)

In [43]:
# RMS value

score_logistic = np.sqrt(mean_squared_error(y_test, y_pred_logistic))
print("Root Mean Squared Error: %.2f" % score_logistic)
print('R2 score: %.2f' % r2_score(y_test, y_pred_logistic))

Root Mean Squared Error: 0.07
R2 score: 0.97


** KNN **

In [16]:
#implementing Nearest Neighbor

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)

knn.fit(x_train, y_train) 

y_pred_knn = knn.predict(x_test)

In [17]:
# Metrics
from sklearn import metrics

score_knn_acc = metrics.accuracy_score(y_test, y_pred_knn)
print("Accuracy score: {}".format(score_knn_acc))

score_knn_precision = metrics.precision_score(y_test, y_pred_knn, average= "weighted")
print("Precision score: {}".format(score_knn_precision))

score_knn_recall = metrics.recall_score(y_test, y_pred_knn, average= "weighted")
print("Recall score: {}".format(score_knn_recall))

score_knn_f1 = metrics.f1_score(y_test, y_pred_knn, average= "weighted")
print("F1 score: {}".format(score_knn_f1))

Accuracy score: 0.998
Precision score: 0.9980051085568328
Recall score: 0.998
F1 score: 0.9979966916164151


** SVM **

In [18]:
# SVM
from sklearn.svm import SVC

svm_model = SVC(kernel="linear")

svm_model.fit(x_train, y_train)

y_pred_svm = svm_model.predict(x_test)

In [19]:
# Metrics

score_svm_acc = metrics.accuracy_score(y_test, y_pred_svm)
print("Accuracy score: {}".format(score_svm_acc))

score_svm_precision = metrics.precision_score(y_test, y_pred_svm, average= "weighted")
print("Precision score: {}".format(score_svm_precision))

score_svm_recall = metrics.recall_score(y_test, y_pred_svm, average= "weighted")
print("Recall score: {}".format(score_svm_recall))

score_svm_f1 = metrics.f1_score(y_test, y_pred_svm, average= "weighted")
print("F1 score: {}".format(score_svm_f1))

Accuracy score: 0.9985
Precision score: 0.998502875399361
Recall score: 0.9985
F1 score: 0.9984981422199071


** Gaussian Naive Bayes **

In [20]:
#GNB
from sklearn.naive_bayes import GaussianNB
mnb_model = GaussianNB()

mnb_model.fit(x_train, y_train)

y_pred_gnb = mnb_model.predict(x_test)

In [21]:
# Metrics

score_gnb_acc = metrics.accuracy_score(y_test, y_pred_gnb)
print("Accuracy score: {}".format(score_gnb_acc))

score_gnb_precision = metrics.precision_score(y_test, y_pred_gnb, average= "weighted")
print("Precision score: {}".format(score_gnb_precision))

score_gnb_recall = metrics.recall_score(y_test, y_pred_gnb, average= "weighted")
print("Recall score: {}".format(score_gnb_recall))

score_gnb_f1 = metrics.f1_score(y_test, y_pred_gnb, average= "weighted")
print("F1 score: {}".format(score_gnb_f1))

Accuracy score: 0.996
Precision score: 0.9960094405594406
Recall score: 0.996
F1 score: 0.9960032745786752


# Training and Prediction using Fully Connected Neural Network

In [23]:
# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
import collections
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, collections.Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

In [24]:
x,y=to_xy(features_df,'outcome')

In [25]:
x_train_tf, x_test_tf, y_train_tf, y_test_tf = train_test_split(x,y, test_size=0.2, random_state=42)

In [26]:
x_train.shape

(8000, 124)

** ReLU, adam, 4 hidden layers **


In [27]:
# set up checkpointer and complete this
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential

checkpointer_relu = ModelCheckpoint(filepath="./best_weights_relu_4l.hdf5", verbose=1, save_best_only=True)

# Training and Prediction using CNN

In [44]:
# we now reshape the x_train and x_test to image form used in CNN 2D
x_train = x_train_tf.reshape(x_train_tf.shape[0], 1, 121, 1)
x_test = x_test_tf.reshape(x_test_tf.shape[0], 1, 121, 1)

ValueError: cannot reshape array of size 992000 into shape (8000,1,121,1)

In [None]:
x_train.shape

In [None]:
x_test

In [None]:
#CNN 2D

cnn = Sequential()

# Conv2D layer 1
cnn.add(Conv2D(41, kernel_size=(1, 3), strides=(1, 1), padding='valid',
                 activation='relu',
                 input_shape=(1,121,1)))

In [None]:
cnn.add(Conv2D(64, (1, 3), activation='relu'))

cnn.add(MaxPooling2D(pool_size=(1, 2), strides=None))

cnn.add(Dropout(0.25))

In [None]:
cnn.summary()

In [None]:
cnn.add(Dense(128, activation='relu'))

cnn.add(Dropout(0.5))

cnn.add(Dense(2, activation='softmax'))

In [None]:
cnn.summary()

In [None]:
cnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
cnn.fit(x_train, y_train,     
          batch_size=128,
          epochs=10,
          verbose=2,
          validation_data=(x_test, y_test))

In [None]:
# Evaluate Accuracy in Keras
score = cnn.evaluate(x_test, y_test, verbose=0)
score

In [None]:
print('Test loss: {}'.format(score[0]))
print('Test accuracy: {}'.format(score[1]))

# Additional Features

**Remove Redundant Records **

In [None]:
network_df.drop_duplicates(keep='first', inplace=True)

In [None]:
network_df

In [None]:
#Select only relevant columns for processing
features_ad_df = network_df[['duration',
'protocol_type',
'service',
'flag',
'src_bytes',
'dst_bytes',
'land',
'wrong_fragment',
'urgent',
                          
'hot',
'num_failed_logins',
'logged_in',
'num_compromised',
'root_shell',
'su_attempted',
'num_root',
'num_file_creations',
'num_shells',
'num_access_files',
'num_outbound_cmds',
'is_host_login',
'is_guest_login',
                          
'count',
'srv_count',
'serror_rate',
'srv_serror_rate',
'rerror_rate',
'srv_rerror_rate',
'same_srv_rate',
'diff_srv_rate',
'srv_diff_host_rate',
'dst_host_count',
'dst_host_srv_count',
'dst_host_same_srv_rate',
'dst_host_diff_srv_rate',
'dst_host_same_src_port_rate',
'dst_host_srv_diff_host_rate',
'dst_host_serror_rate',
'dst_host_srv_serror_rate',
'dst_host_rerror_rate',
'dst_host_srv_rerror_rate']]

In [None]:
label_ad_df = network_df[['outcome']]

In [None]:
#Normalize numeric features


def normalize_numeric_minmax(df, name):
    if(df[name].max() > 0):
        df[name] = ((df[name] - df[name].min()) / (df[name].max() - df[name].min())).astype(np.float32)
    else:
        df[name] = df[name].astype(np.float32)
    
normalize_numeric_minmax(features_ad_df,"duration") 
normalize_numeric_minmax(features_ad_df,"src_bytes") 
normalize_numeric_minmax(features_ad_df,"dst_bytes") 
normalize_numeric_minmax(features_ad_df,"wrong_fragment") 
normalize_numeric_minmax(features_ad_df,"urgent") 

normalize_numeric_minmax(features_ad_df,"hot") 
normalize_numeric_minmax(features_ad_df,"num_failed_logins") 
normalize_numeric_minmax(features_ad_df,"num_compromised") 
normalize_numeric_minmax(features_ad_df,"num_root") 
normalize_numeric_minmax(features_ad_df,"num_file_creations") 
normalize_numeric_minmax(features_ad_df,"num_shells") 
normalize_numeric_minmax(features_ad_df,"num_access_files") 
normalize_numeric_minmax(features_ad_df,"num_outbound_cmds") 

normalize_numeric_minmax(features_ad_df,"count") 
normalize_numeric_minmax(features_ad_df,"srv_count") 
normalize_numeric_minmax(features_ad_df,"serror_rate") 
normalize_numeric_minmax(features_ad_df,"srv_serror_rate") 
normalize_numeric_minmax(features_ad_df,"rerror_rate") 
normalize_numeric_minmax(features_ad_df,"srv_rerror_rate") 
normalize_numeric_minmax(features_ad_df,"same_srv_rate") 
normalize_numeric_minmax(features_ad_df,"diff_srv_rate") 
normalize_numeric_minmax(features_ad_df,"srv_diff_host_rate") 
normalize_numeric_minmax(features_ad_df,"dst_host_count") 
normalize_numeric_minmax(features_ad_df,"dst_host_srv_count") 
normalize_numeric_minmax(features_ad_df,"dst_host_same_srv_rate") 
normalize_numeric_minmax(features_ad_df,"dst_host_diff_srv_rate") 
normalize_numeric_minmax(features_ad_df,"dst_host_same_src_port_rate") 
normalize_numeric_minmax(features_ad_df,"dst_host_srv_diff_host_rate") 
normalize_numeric_minmax(features_ad_df,"dst_host_serror_rate") 
normalize_numeric_minmax(features_ad_df,"dst_host_srv_serror_rate") 
normalize_numeric_minmax(features_ad_df,"dst_host_rerror_rate") 
normalize_numeric_minmax(features_ad_df,"dst_host_srv_rerror_rate") 



In [None]:
# one-hot cooding of categorical columns

def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name]).astype(np.float32)
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

encode_text_dummy(features_ad_df,"protocol_type") 
encode_text_dummy(features_ad_df,"service") 
encode_text_dummy(features_ad_df,"flag") 
encode_text_dummy(features_ad_df,"land") 

encode_text_dummy(features_ad_df,"logged_in") 
encode_text_dummy(features_ad_df,"root_shell") 
encode_text_dummy(features_ad_df,"su_attempted") 
encode_text_dummy(features_ad_df,"is_host_login") 
encode_text_dummy(features_ad_df,"is_guest_login") 

In [None]:
#Reduce the dataset size
features_ad_df = features_ad_df[0:40000]
label_ad_df = label_ad_df[0:40000]

In [None]:
# create a function called encodeLabelBinarizer

encodeLabelBinary = lambda x: 0 if x == 'normal.' else 1


In [None]:
label_ad_df['outcome'] = label_ad_df['outcome'].apply(encodeLabelBinary)

In [None]:
x_ad_train, x_ad_test, y_ad_train, y_ad_test = train_test_split(features_ad_df, label_ad_df['outcome'] , test_size=0.2, random_state=42)

In [None]:
features_ad_df.shape

**Logistic Regression **

In [None]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

Log_reg_model = LogisticRegression()

Log_reg_model.fit(x_ad_train, y_ad_train)

y_ad_pred_logistic = Log_reg_model.predict(x_ad_test)

In [None]:
# RMS value

score_logistic = np.sqrt(mean_squared_error(y_ad_test, y_ad_pred_logistic))
print("Root Mean Squared Error: %.2f" % score_logistic)
print('R2 score: %.2f' % r2_score(y_ad_test, y_ad_pred_logistic))

** KNN **

In [None]:
#implementing Nearest Neighbor

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)

knn.fit(x_ad_train, y_ad_train) 

y_ad_pred_knn = knn.predict(x_ad_test)

In [None]:
# Metrics
from sklearn import metrics

score_ad_knn_acc = metrics.accuracy_score(y_ad_test, y_ad_pred_knn)
print("Accuracy score: {}".format(score_ad_knn_acc))

score_ad_knn_precision = metrics.precision_score(y_ad_test, y_ad_pred_knn, average= "weighted")
print("Precision score: {}".format(score_ad_knn_precision))

score_ad_knn_recall = metrics.recall_score(y_ad_test, y_ad_pred_knn, average= "weighted")
print("Recall score: {}".format(score_ad_knn_recall))

score_ad_knn_f1 = metrics.f1_score(y_ad_test, y_ad_pred_knn, average= "weighted")
print("F1 score: {}".format(score_ad_knn_f1))

** SVM **

In [None]:
# SVM
from sklearn.svm import SVC

svm_model = SVC(kernel="linear")

svm_model.fit(x_ad_train, y_ad_train)

y_ad_pred_svm = svm_model.predict(x_ad_test)

In [None]:
# Metrics

score_ad_svm_acc = metrics.accuracy_score(y_ad_test, y_ad_pred_svm)
print("Accuracy score: {}".format(score_ad_svm_acc))

score_ad_svm_precision = metrics.precision_score(y_ad_test, y_ad_pred_svm, average= "weighted")
print("Precision score: {}".format(score_ad_svm_precision))

score_ad_svm_recall = metrics.recall_score(y_ad_test, y_ad_pred_svm, average= "weighted")
print("Recall score: {}".format(score_ad_svm_recall))

score_ad_svm_f1 = metrics.f1_score(y_ad_test, y_ad_pred_svm, average= "weighted")
print("F1 score: {}".format(score_ad_svm_f1))

** Gaussian Naive Bayes **

In [None]:
#GNB
from sklearn.naive_bayes import GaussianNB
mnb_model = GaussianNB()

mnb_model.fit(x_ad_train, y_ad_train)

y_ad_pred_gnb = mnb_model.predict(x_ad_test)

In [None]:
# Metrics

score_ad_gnb_acc = metrics.accuracy_score(y_ad_test, y_ad_pred_gnb)
print("Accuracy score: {}".format(score_ad_gnb_acc))

score_ad_gnb_precision = metrics.precision_score(y_ad_test, y_ad_pred_gnb, average= "weighted")
print("Precision score: {}".format(score_ad_gnb_precision))

score_ad_gnb_recall = metrics.recall_score(y_ad_test, y_ad_pred_gnb, average= "weighted")
print("Recall score: {}".format(score_ad_gnb_recall))

score_ad_gnb_f1 = metrics.f1_score(y_ad_test, y_ad_pred_gnb, average= "weighted")
print("F1 score: {}".format(score_ad_gnb_f1))

** Feature Importance Analysis **

In [None]:
#Read network_intrusion_data.csv file and load data into network_df dataframe 
network_ad_df= pd.read_csv('network_intrusion_data.csv')

In [None]:
#Drop any row with missing values
network_ad_df = network_df.dropna()

In [None]:
#Add column headers to the data in the dataframe
network_ad_df.columns = [
'duration',
'protocol_type',
'service',
'flag',
'src_bytes',
'dst_bytes',
'land',
'wrong_fragment',
'urgent',
'hot',
'num_failed_logins',
'logged_in',
'num_compromised',
'root_shell',
'su_attempted',
'num_root',
'num_file_creations',
'num_shells',
'num_access_files',
'num_outbound_cmds',
'is_host_login',
'is_guest_login',
'count',
'srv_count',
'serror_rate',
'srv_serror_rate',
'rerror_rate',
'srv_rerror_rate',
'same_srv_rate',
'diff_srv_rate',
'srv_diff_host_rate',
'dst_host_count',
'dst_host_srv_count',
'dst_host_same_srv_rate',
'dst_host_diff_srv_rate',
'dst_host_same_src_port_rate',
'dst_host_srv_diff_host_rate',
'dst_host_serror_rate',
'dst_host_srv_serror_rate',
'dst_host_rerror_rate',
'dst_host_srv_rerror_rate',
'outcome'
]

In [None]:
#Select input columns
features_ad_df = network_ad_df[['duration',
'protocol_type',
'service',
'flag',
'src_bytes',
'dst_bytes',
'land',
'wrong_fragment',
'urgent',
                          
'hot',
'num_failed_logins',
'logged_in',
'num_compromised',
'root_shell',
'su_attempted',
'num_root',
'num_file_creations',
'num_shells',
'num_access_files',
'num_outbound_cmds',
'is_host_login',
'is_guest_login',
                          
'count',
'srv_count',
'serror_rate',
'srv_serror_rate',
'rerror_rate',
'srv_rerror_rate',
'same_srv_rate',
'diff_srv_rate',
'srv_diff_host_rate',
'dst_host_count',
'dst_host_srv_count',
'dst_host_same_srv_rate',
'dst_host_diff_srv_rate',
'dst_host_same_src_port_rate',
'dst_host_srv_diff_host_rate',
'dst_host_serror_rate',
'dst_host_srv_serror_rate',
'dst_host_rerror_rate',
'dst_host_srv_rerror_rate']]

In [None]:
label_ad_df = network_ad_df[['outcome']]

In [None]:
#Normalize numeric features

def normalize_numeric_minmax(df, name):
    if(df[name].max() > 0):
        df[name] = ((df[name] - df[name].min()) / (df[name].max() - df[name].min())).astype(np.float32)
    else:
        df[name] = df[name].astype(np.float32)
    
normalize_numeric_minmax(features_ad_df,"duration") 
normalize_numeric_minmax(features_ad_df,"src_bytes") 
normalize_numeric_minmax(features_ad_df,"dst_bytes") 
normalize_numeric_minmax(features_ad_df,"wrong_fragment") 
normalize_numeric_minmax(features_ad_df,"urgent") 

normalize_numeric_minmax(features_ad_df,"hot") 
normalize_numeric_minmax(features_ad_df,"num_failed_logins") 
normalize_numeric_minmax(features_ad_df,"num_compromised") 
normalize_numeric_minmax(features_ad_df,"num_root") 
normalize_numeric_minmax(features_ad_df,"num_file_creations") 
normalize_numeric_minmax(features_ad_df,"num_shells") 
normalize_numeric_minmax(features_ad_df,"num_access_files") 
normalize_numeric_minmax(features_ad_df,"num_outbound_cmds") 

normalize_numeric_minmax(features_ad_df,"count") 
normalize_numeric_minmax(features_ad_df,"srv_count") 
normalize_numeric_minmax(features_ad_df,"serror_rate") 
normalize_numeric_minmax(features_ad_df,"srv_serror_rate") 
normalize_numeric_minmax(features_ad_df,"rerror_rate") 
normalize_numeric_minmax(features_ad_df,"srv_rerror_rate") 
normalize_numeric_minmax(features_ad_df,"same_srv_rate") 
normalize_numeric_minmax(features_ad_df,"diff_srv_rate") 
normalize_numeric_minmax(features_ad_df,"srv_diff_host_rate") 
normalize_numeric_minmax(features_ad_df,"dst_host_count") 
normalize_numeric_minmax(features_ad_df,"dst_host_srv_count") 
normalize_numeric_minmax(features_ad_df,"dst_host_same_srv_rate") 
normalize_numeric_minmax(features_ad_df,"dst_host_diff_srv_rate") 
normalize_numeric_minmax(features_ad_df,"dst_host_same_src_port_rate") 
normalize_numeric_minmax(features_ad_df,"dst_host_srv_diff_host_rate") 
normalize_numeric_minmax(features_ad_df,"dst_host_serror_rate") 
normalize_numeric_minmax(features_ad_df,"dst_host_srv_serror_rate") 
normalize_numeric_minmax(features_ad_df,"dst_host_rerror_rate") 
normalize_numeric_minmax(features_ad_df,"dst_host_srv_rerror_rate") 



In [None]:
# one-hot cooding of categorical columns

def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name]).astype(np.float32)
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

encode_text_dummy(features_ad_df,"protocol_type") 
encode_text_dummy(features_ad_df,"service") 
encode_text_dummy(features_ad_df,"flag") 
encode_text_dummy(features_ad_df,"land") 

encode_text_dummy(features_ad_df,"logged_in") 
encode_text_dummy(features_ad_df,"root_shell") 
encode_text_dummy(features_ad_df,"su_attempted") 
encode_text_dummy(features_ad_df,"is_host_login") 
encode_text_dummy(features_ad_df,"is_guest_login") 

In [None]:
#Reduce the dataset size
features_ad_df = features_ad_df[0:40000]
label_ad_df = label_ad_df[0:40000]

In [None]:
# create a function called encodeLabelBinarizer

encodeLabelBinary = lambda x: 0 if x == 'normal.' else 1


In [None]:
label_ad_df['outcome'] = label_ad_df['outcome'].apply(encodeLabelBinary)

In [None]:
# Feature Importance
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier

extraTreeClassifier = ExtraTreesClassifier()
extraTreeClassifier.fit(features_ad_df, label_ad_df['outcome'])
# display the relative importance of each attribute
print(extraTreeClassifier.feature_importances_)

In [None]:
feature_importance = extraTreeClassifier.feature_importances_
plt.figure(figsize=(16, 6))
plt.yscale('log', nonposy='clip')
plt.bar(range(len(feature_importance)), feature_importance, align='center')
plt.xticks(range(len(feature_importance)), features_ad_df, rotation='vertical')
plt.title('Feature importance')
plt.ylabel('Importance')
plt.xlabel('Features')
plt.show()

In [None]:
##Choose only relevant features

from sklearn.feature_selection import SelectFromModel
model = SelectFromModel(extraTreeClassifier, prefit=True)
X_new = model.transform(features_ad_df)
X_new.shape

In [None]:
X_new.shape

In [None]:
x_adf_train, x_adf_test, y_adf_train, y_adf_test = train_test_split(X_new, label_ad_df['outcome'] , test_size=0.2, random_state=42)

**Logistic Regression**

In [None]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

Log_reg_model = LogisticRegression()

Log_reg_model.fit(x_adf_train, y_adf_train)

y_adf_pred_logistic = Log_reg_model.predict(x_adf_test)

In [None]:
# RMS value

score_adf_logistic = np.sqrt(mean_squared_error(y_adf_test, y_adf_pred_logistic))
print("Root Mean Squared Error: %.2f" % score_adf_logistic)
print('R2 score: %.2f' % r2_score(y_adf_test, y_adf_pred_logistic))

** KNN ** 

In [None]:
#implementing Nearest Neighbor

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)

knn.fit(x_adf_train, y_adf_train) 

y_adf_pred_knn = knn.predict(x_adf_test)

In [None]:
# Metrics
from sklearn import metrics

score_adf_knn_acc = metrics.accuracy_score(y_adf_test, y_adf_pred_knn)
print("Accuracy score: {}".format(score_adf_knn_acc))

score_adf_knn_precision = metrics.precision_score(y_adf_test, y_adf_pred_knn, average= "weighted")
print("Precision score: {}".format(score_ad_knn_precision))

score_adf_knn_recall = metrics.recall_score(y_adf_test, y_adf_pred_knn, average= "weighted")
print("Recall score: {}".format(score_ad_knn_recall))

score_adf_knn_f1 = metrics.f1_score(y_adf_test, y_adf_pred_knn, average= "weighted")
print("F1 score: {}".format(score_adf_knn_f1))

** SVM ** 

In [None]:
# SVM
from sklearn.svm import SVC

svm_model = SVC(kernel="linear")

svm_model.fit(x_adf_train, y_adf_train)

y_adf_pred_svm = svm_model.predict(x_adf_test)

In [None]:
# Metrics

score_adf_svm_acc = metrics.accuracy_score(y_adf_test, y_adf_pred_svm)
print("Accuracy score: {}".format(score_adf_svm_acc))

score_adf_svm_precision = metrics.precision_score(y_adf_test, y_adf_pred_svm, average= "weighted")
print("Precision score: {}".format(score_adf_svm_precision))

score_adf_svm_recall = metrics.recall_score(y_adf_test, y_adf_pred_svm, average= "weighted")
print("Recall score: {}".format(score_ad_svm_recall))

score_adf_svm_f1 = metrics.f1_score(y_adf_test, y_adf_pred_svm, average= "weighted")
print("F1 score: {}".format(score_ad_svm_f1))

** Gaussian Naive Bayes **

In [None]:
#GNB
from sklearn.naive_bayes import GaussianNB
mnb_model = GaussianNB()

mnb_model.fit(x_adf_train, y_adf_train)

y_adf_pred_gnb = mnb_model.predict(x_adf_test)

In [None]:
# Metrics

score_adf_gnb_acc = metrics.accuracy_score(y_adf_test, y_adf_pred_gnb)
print("Accuracy score: {}".format(score_ad_gnb_acc))

score_adf_gnb_precision = metrics.precision_score(y_adf_test, y_adf_pred_gnb, average= "weighted")
print("Precision score: {}".format(score_ad_gnb_precision))

score_adf_gnb_recall = metrics.recall_score(y_adf_test, y_adf_pred_gnb, average= "weighted")
print("Recall score: {}".format(score_ad_gnb_recall))

score_adf_gnb_f1 = metrics.f1_score(y_adf_test, y_adf_pred_gnb, average= "weighted")
print("F1 score: {}".format(score_ad_gnb_f1))