In [48]:
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Data PreProcessing

In [49]:
#Read network_intrusion_data.csv file and load data into network_df dataframe 
network_df= pd.read_csv('network_intrusion_data.csv')
#network_df = network_df[0:10000]

In [50]:
network_df.drop_duplicates(keep='first', inplace=True)

In [51]:
#Add column headers to the data in the dataframe
network_df.columns = [
'duration',
'protocol_type',
'service',
'flag',
'src_bytes',
'dst_bytes',
'land',
'wrong_fragment',
'urgent',
'hot',
'num_failed_logins',
'logged_in',
'num_compromised',
'root_shell',
'su_attempted',
'num_root',
'num_file_creations',
'num_shells',
'num_access_files',
'num_outbound_cmds',
'is_host_login',
'is_guest_login',
'count',
'srv_count',
'serror_rate',
'srv_serror_rate',
'rerror_rate',
'srv_rerror_rate',
'same_srv_rate',
'diff_srv_rate',
'srv_diff_host_rate',
'dst_host_count',
'dst_host_srv_count',
'dst_host_same_srv_rate',
'dst_host_diff_srv_rate',
'dst_host_same_src_port_rate',
'dst_host_srv_diff_host_rate',
'dst_host_serror_rate',
'dst_host_srv_serror_rate',
'dst_host_rerror_rate',
'dst_host_srv_rerror_rate',
'outcome'
]

In [52]:
#Select only relevant columns for processing
network_input_df = network_df[['duration',
                         'protocol_type',
                         'service',
                         'src_bytes',
                         'dst_bytes',
                         'flag',
                         'land',
                         'wrong_fragment',
                         'urgent',
                         'count',
                         'srv_count',
                         'serror_rate',
                         'srv_serror_rate',
                         'rerror_rate',
                         'srv_rerror_rate',
                         'same_srv_rate',
                         'diff_srv_rate',
                         'srv_diff_host_rate',
                         'dst_host_count',
                         'dst_host_srv_count',
                         'dst_host_same_srv_rate',
                         'dst_host_diff_srv_rate',
                         'dst_host_same_src_port_rate',
                         'dst_host_srv_diff_host_rate',
                         'dst_host_serror_rate',
                         'dst_host_srv_serror_rate',
                         'dst_host_rerror_rate',
                         'dst_host_srv_rerror_rate']]

In [53]:
outcome_df = network_df[['outcome']]

In [54]:
#Drop any row with missing values
network_input_df = network_input_df.dropna()

In [55]:
#Normalize numeric features

def normalize_numeric_minmax(df, name):
    df[name] = ((df[name] - df[name].min()) / (df[name].max() - df[name].min())).astype(np.float32)
    
    
normalize_numeric_minmax(network_input_df,"duration") 
normalize_numeric_minmax(network_input_df,"src_bytes") 
normalize_numeric_minmax(network_input_df,"dst_bytes") 
normalize_numeric_minmax(network_input_df,"wrong_fragment") 
normalize_numeric_minmax(network_input_df,"urgent") 
normalize_numeric_minmax(network_input_df,"count") 
normalize_numeric_minmax(network_input_df,"srv_count") 

normalize_numeric_minmax(network_input_df,"serror_rate") 
normalize_numeric_minmax(network_input_df,"srv_serror_rate") 
normalize_numeric_minmax(network_input_df,"rerror_rate") 
normalize_numeric_minmax(network_input_df,"srv_rerror_rate") 
normalize_numeric_minmax(network_input_df,"rerror_rate") 
normalize_numeric_minmax(network_input_df,"srv_rerror_rate") 
normalize_numeric_minmax(network_input_df,"same_srv_rate") 
normalize_numeric_minmax(network_input_df,"diff_srv_rate") 
normalize_numeric_minmax(network_input_df,"srv_diff_host_rate") 


normalize_numeric_minmax(network_input_df,"dst_host_count") 
normalize_numeric_minmax(network_input_df,"dst_host_srv_count") 
normalize_numeric_minmax(network_input_df,"dst_host_same_srv_rate") 
normalize_numeric_minmax(network_input_df,"dst_host_diff_srv_rate") 
normalize_numeric_minmax(network_input_df,"dst_host_same_src_port_rate") 
normalize_numeric_minmax(network_input_df,"dst_host_srv_diff_host_rate") 
normalize_numeric_minmax(network_input_df,"dst_host_serror_rate") 
normalize_numeric_minmax(network_input_df,"dst_host_srv_serror_rate") 
normalize_numeric_minmax(network_input_df,"dst_host_rerror_rate") 
normalize_numeric_minmax(network_input_df,"dst_host_srv_rerror_rate") 



In [56]:
network_input_df.dtypes

duration                       float32
protocol_type                   object
service                         object
src_bytes                      float32
dst_bytes                      float32
flag                            object
land                             int64
wrong_fragment                 float32
urgent                         float32
count                          float32
srv_count                      float32
serror_rate                    float32
srv_serror_rate                float32
rerror_rate                    float32
srv_rerror_rate                float32
same_srv_rate                  float32
diff_srv_rate                  float32
srv_diff_host_rate             float32
dst_host_count                 float32
dst_host_srv_count             float32
dst_host_same_srv_rate         float32
dst_host_diff_srv_rate         float32
dst_host_same_src_port_rate    float32
dst_host_srv_diff_host_rate    float32
dst_host_serror_rate           float32
dst_host_srv_serror_rate 

In [57]:
# one-hot cooding of categorical columns

def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

encode_text_dummy(network_input_df,"protocol_type") 
encode_text_dummy(network_input_df,"service") 
encode_text_dummy(network_input_df,"flag") 
encode_text_dummy(network_input_df,"land") 



In [58]:
network_input_df

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,...,flag-RSTOS0,flag-RSTR,flag-S0,flag-S1,flag-S2,flag-S3,flag-SF,flag-SH,land-0,land-1
0,0.0,3.446905e-07,0.000094,0.0,0.0,0.015656,0.015656,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0
1,0.0,3.389216e-07,0.000259,0.0,0.0,0.015656,0.015656,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0
2,0.0,3.158461e-07,0.000259,0.0,0.0,0.011742,0.011742,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0
3,0.0,3.129617e-07,0.000394,0.0,0.0,0.011742,0.011742,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0
4,0.0,3.129617e-07,0.000394,0.0,0.0,0.011742,0.011742,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0
5,0.0,3.057506e-07,0.000376,0.0,0.0,0.001957,0.003914,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0
6,0.0,2.293129e-07,0.000793,0.0,0.0,0.009785,0.009785,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0
7,0.0,3.028661e-07,0.000029,0.0,0.0,0.015656,0.015656,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0
8,0.0,3.057506e-07,0.000152,0.0,0.0,0.015656,0.015656,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0
9,0.0,3.028661e-07,0.000121,0.0,0.0,0.035225,0.035225,0.00,0.00,0.0,...,0,0,0,0,0,0,1,0,1,0


In [11]:
network_input_df = network_input_df[0:10000]

In [12]:
outcome_df = outcome_df[0:10000]

In [59]:
# create a function called encodeLabelBinarizer

encodeLabelBinary = lambda x: 0 if x == 'normal.' else 1


In [60]:
outcome_df['outcome'] = outcome_df['outcome'].apply(encodeLabelBinary)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [61]:
outcome_df['outcome']

0         0
1         0
2         0
3         0
4         0
5         0
6         0
7         0
8         0
9         0
10        0
11        0
12        0
13        0
14        0
15        0
16        0
17        0
18        0
19        0
20        0
21        0
22        0
23        0
24        0
25        0
26        0
27        0
28        0
29        0
         ..
493990    0
493991    0
493992    0
493993    0
493994    0
493995    0
493996    0
493997    0
493998    0
493999    0
494000    0
494001    0
494002    0
494003    0
494004    0
494005    0
494006    0
494007    0
494008    0
494009    0
494010    0
494011    0
494012    0
494013    0
494014    0
494015    0
494016    0
494017    0
494018    0
494019    0
Name: outcome, Length: 145585, dtype: int64

In [58]:
#input_matrix = network_input_df.as_matrix()

In [62]:
x_train, x_test, y_train, y_test = train_test_split(network_input_df, outcome_df['outcome'] , test_size=0.2, random_state=42)

In [63]:
x_train.shape

(116468, 106)

In [64]:
y_train.shape

(116468,)

In [65]:
x_test.shape

(29117, 106)

In [66]:
y_test.shape

(29117,)

# Training and Prediction using Regression and Classification

** Logistic Regression **

In [67]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

Log_reg_model = LogisticRegression()

Log_reg_model.fit(x_train, y_train)

y_pred_logistic = Log_reg_model.predict(x_test)

In [68]:
# RMS value

score_logistic = np.sqrt(mean_squared_error(y_test, y_pred_logistic))
print("Root Mean Squared Error: %.2f" % score_logistic)
print('R2 score: %.2f' % r2_score(y_test, y_pred_logistic))

Root Mean Squared Error: 0.12
R2 score: 0.94


** KNN **

In [66]:
#implementing Nearest Neighbor

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)

knn.fit(x_train, y_train) 

y_pred_knn = knn.predict(x_test)

In [71]:
# Metrics
from sklearn import metrics

score_knn_acc = metrics.accuracy_score(y_test, y_pred_knn)
print("Accuracy score: {}".format(score_knn_acc))

score_knn_precision = metrics.precision_score(y_test, y_pred_knn, average= "weighted")
print("Precision score: {}".format(score_knn_precision))

score_knn_recall = metrics.recall_score(y_test, y_pred_knn, average= "weighted")
print("Recall score: {}".format(score_knn_recall))

score_knn_f1 = metrics.f1_score(y_test, y_pred_knn, average= "weighted")
print("F1 score: {}".format(score_knn_f1))

NameError: name 'y_pred_knn' is not defined

** SVM **

In [68]:
# SVM
from sklearn.svm import SVC

svm_model = SVC(kernel="linear")

svm_model.fit(x_train, y_train)

y_pred_svm = svm_model.predict(x_test)

In [69]:
# Metrics

score_svm_acc = metrics.accuracy_score(y_test, y_pred_svm)
print("Accuracy score: {}".format(score_svm_acc))

score_svm_precision = metrics.precision_score(y_test, y_pred_svm, average= "weighted")
print("Precision score: {}".format(score_svm_precision))

score_svm_recall = metrics.recall_score(y_test, y_pred_svm, average= "weighted")
print("Recall score: {}".format(score_svm_recall))

score_svm_f1 = metrics.f1_score(y_test, y_pred_svm, average= "weighted")
print("F1 score: {}".format(score_svm_f1))

Accuracy score: 0.998
Precision score: 0.9980051085568328
Recall score: 0.998
F1 score: 0.9979966916164151


** Gaussian Naive Bayes **

In [69]:
#GNB
from sklearn.naive_bayes import GaussianNB
mnb_model = GaussianNB()

mnb_model.fit(x_train, y_train)

y_pred_gnb = mnb_model.predict(x_test)

In [74]:
# Metrics

score_gnb_acc = metrics.accuracy_score(y_test, y_pred_gnb)
print("Accuracy score: {}".format(score_gnb_acc))

score_gnb_precision = metrics.precision_score(y_test, y_pred_gnb, average= "weighted")
print("Precision score: {}".format(score_gnb_precision))

score_gnb_recall = metrics.recall_score(y_test, y_pred_gnb, average= "weighted")
print("Recall score: {}".format(score_gnb_recall))

score_gnb_f1 = metrics.f1_score(y_test, y_pred_gnb, average= "weighted")
print("F1 score: {}".format(score_gnb_f1))

Accuracy score: 0.9173678606999347
Precision score: 0.9262402012109815
Recall score: 0.9173678606999347
F1 score: 0.915327464019793


** Fully Connected Neural Network **

** Neural Network Regression **

In [72]:
# set up checkpointer
from keras.callbacks import EarlyStopping, ModelCheckpoint
checkpointer_relu = ModelCheckpoint(filepath="./best_weights_sigmoid_4l.hdf5", verbose=1, save_best_only=True)

In [73]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation

for i in range(10):
    print(i)
    
    # Build network
    model_reg_relu = Sequential()

    model_reg_relu.add(Dense(80, input_dim=x_train.shape[1], activation='sigmoid'))  
    model_reg_relu.add(Dense(60, activation='sigmoid')) # Hidden 2
    model_reg_relu.add(Dense(20, activation='sigmoid')) # Hidden 3
    model_reg_relu.add(Dense(10, activation='sigmoid')) # Hidden 4
    model_reg_relu.add(Dense(1)) # Output
    model_reg_relu.compile(loss='mean_squared_error', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
    model_reg_relu.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor,checkpointer_relu],verbose=2,epochs=100) 
    
print('Training finished...Loading the best model') 
print()
model_reg_relu.load_weights('./best_weights_sigmoid_4l.hdf5')

0
Train on 8000 samples, validate on 2000 samples
Epoch 1/100
 - 1s - loss: 0.1162 - val_loss: 0.0047

Epoch 00001: val_loss improved from inf to 0.00468, saving model to ./best_weights_sigmoid_4l.hdf5
Epoch 2/100
 - 0s - loss: 0.0020 - val_loss: 0.0030

Epoch 00002: val_loss improved from 0.00468 to 0.00301, saving model to ./best_weights_sigmoid_4l.hdf5
Epoch 3/100
 - 0s - loss: 0.0011 - val_loss: 0.0026

Epoch 00003: val_loss improved from 0.00301 to 0.00261, saving model to ./best_weights_sigmoid_4l.hdf5
Epoch 4/100
 - 0s - loss: 8.1853e-04 - val_loss: 0.0026

Epoch 00004: val_loss improved from 0.00261 to 0.00260, saving model to ./best_weights_sigmoid_4l.hdf5
Epoch 5/100
 - 1s - loss: 7.4710e-04 - val_loss: 0.0026

Epoch 00005: val_loss improved from 0.00260 to 0.00255, saving model to ./best_weights_sigmoid_4l.hdf5
Epoch 6/100
 - 0s - loss: 7.0814e-04 - val_loss: 0.0026

Epoch 00006: val_loss did not improve from 0.00255
Epoch 7/100
 - 0s - loss: 6.5821e-04 - val_loss: 0.0024

E

In [74]:
# Predict stars
pred_reg = model_reg_relu.predict(x_test)

In [75]:
# Measure RMSE error.  RMSE is common for regression.
score_reg = np.sqrt(mean_squared_error(y_test,pred_reg))
print("Final score (RMSE): {}".format(score_reg))
print('R2 score: %.2f' % r2_score(y_test, pred_reg))

Final score (RMSE): 0.047878782711463695
R2 score: 0.99


** Neural Network Classification **

In [77]:
# set up checkpointer
checkpointer_classification = ModelCheckpoint(filepath="./best_weights_softmax.hdf5", verbose=1, save_best_only=True)

In [85]:
print(y_train.shape[0])

8000


In [86]:
#Tensorflow classification
for i in range(10):
    print(i)
    
    # Build network
    model_classification = Sequential()
    model_classification.add(Dense(100, input_dim=x_train.shape[1], activation='relu')) # Hidden 1
    model_classification.add(Dense(50, activation='relu')) # Hidden 2
    model_classification.add(Dense(y_train.shape[0], activation='softmax')) # Output
    model_classification.compile(loss='categorical_crossentropy', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=2, mode='auto')
    model_classification.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor,checkpointer_classification],verbose=2,epochs=100)
    
print('Training finished...Loading the best model') 
print()
model_classification.load_weights('./best_weights_softmax.hdf5')

0


ValueError: Error when checking target: expected dense_147 to have shape (8000,) but got array with shape (1,)

In [None]:
pred_class = model_classification.predict(x_test)
print("Shape: {}".format(pred_class.shape))
print(pred_class)

In [None]:
predict_stars = np.argmax(pred_class,axis=1)
true_stars = np.argmax(y_test,axis=1)

In [None]:
# Metrics

score_tf_acc_stopping = metrics.accuracy_score(true_stars, predict_stars)
print("Accuracy score: {}".format(score_tf_acc_stopping))

score_tf_precision_stopping = metrics.precision_score(true_stars, predict_stars, average= "weighted")
print("Precision score: {}".format(score_tf_precision_stopping))

score_tf_recall_stopping = metrics.recall_score(true_stars, predict_stars, average= "weighted")
print("Recall score: {}".format(score_tf_recall_stopping))

score_tf_f1_stopping = metrics.f1_score(true_stars, predict_stars, average= "weighted")
print("F1 score: {}".format(score_tf_f1_stopping))