In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

import tensorflow as tf
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers
from keras.layers import BatchNormalization

import matplotlib.pyplot as plt

In [None]:
# Read the raw records from the CSV file that sits next to the notebook.
csv_path = 'kddcup99_csv.csv'
df = pd.read_csv(csv_path)

In [None]:
# Summarise the frame: df.info() lists every column with its non-null count
# (to spot missing values) and its dtype (to tell object/categorical columns
# apart from numerical ones).
df.info()

In [None]:
# Show the rows that repeat an earlier row (first occurrences are not listed,
# because DataFrame.duplicated() defaults to keep='first').
duplicate_rows = df[df.duplicated()]
print(duplicate_rows)

# Keep one representative of every duplicated record. keep=False would throw
# away *all* copies, including the first occurrence, which silently removes
# legitimate records instead of de-duplicating them. Rebinding (rather than
# inplace=True) keeps the cell safe to re-run.
df = df.drop_duplicates(keep='first')

In [None]:
# Print the distinct values of protocol_type, flag and urgent for inspection.
# (The encoding cell further down one-hot encodes protocol_type, service and flag.)
for column_name in ('protocol_type', 'flag', 'urgent'):
    print(df[column_name].unique())

In [None]:
# Work on an independent copy: plain assignment would only create a second
# name for the same object, so the in-place label rewrite performed in the
# encoding cell would also modify `df`.
df_copy = df.copy()

In [None]:
onehotencode = OneHotEncoder()
labelencode = LabelEncoder()

# Binary target: every label other than 'normal' is collapsed to 'attack',
# then label-encoded (0 = attack, 1 = normal).
# NOTE: this rewrites df_copy['label'] in place, so the cell must not be
# re-run without first re-creating df_copy.
df_copy.loc[df_copy['label'] != 'normal', 'label'] = 'attack'
df_copy['label'] = labelencode.fit_transform(df_copy['label'])

# One-hot encode each categorical column and wrap the result in a DataFrame.
# The column labels are taken from the encoder's own `categories_` right after
# each fit: the encoder orders its output columns by `categories_`, whereas
# Series.unique() returns values in order of first appearance, so labelling
# with unique() attaches the wrong category name to the indicator columns.
X_protocol_type = onehotencode.fit_transform(df_copy.protocol_type.values.reshape(-1, 1)).toarray()
df_protocol = pd.DataFrame(X_protocol_type, columns=onehotencode.categories_[0])

X_service = onehotencode.fit_transform(df_copy.service.values.reshape(-1, 1)).toarray()
df_service = pd.DataFrame(X_service, columns=onehotencode.categories_[0])

X_flag = onehotencode.fit_transform(df_copy.flag.values.reshape(-1, 1)).toarray()
df_flag = pd.DataFrame(X_flag, columns=onehotencode.categories_[0])

lst_proto = df_protocol.columns
lst_service = df_service.columns
lst_flag = df_flag.columns

# Start from the numerical columns (the raw categorical ones are removed) ...
df_onehot_encoded = df_copy.drop(['protocol_type', 'flag', 'service'], axis=1)

# ... and append the indicator columns. `.values` is used so that rows are
# matched by position rather than by index label (the encoded frames carry a
# fresh 0..n-1 index, df_copy may not).
# NOTE(review): an indicator whose name equals an already present column
# overwrites that column; prefix the names if the category sets can overlap.
for column_name in lst_proto:
    df_onehot_encoded[column_name] = df_protocol[column_name].values
print(df_onehot_encoded.isna().sum().sum())

for column_name in lst_service:
    df_onehot_encoded[column_name] = df_service[column_name].values

for column_name in lst_flag:
    df_onehot_encoded[column_name] = df_flag[column_name].values


In [None]:
# Test set is 20% of the total data.
# Cross-validation set is 20% of the remaining training data.
data_split_pct = 0.2

# Fixed seed so that the splits (and every metric derived from them) are the
# same on each Restart & Run All.
split_seed = 42

# Test set
df_train_onehot_encoded, df_test_onehot_encoded = train_test_split(
    df_onehot_encoded, test_size=data_split_pct, random_state=split_seed)
# Cross-validation set
df_train_onehot_encoded, df_cv_onehot_encoded = train_test_split(
    df_train_onehot_encoded, test_size=data_split_pct, random_state=split_seed)

In [None]:
# One-class training: keep only the training rows whose encoded label is not 0
# (per the encoding cell's comment, 0 = attack and 1 = normal).
is_not_attack = df_train_onehot_encoded['label'] != 0
df_train_onehot_encoded_normal = df_train_onehot_encoded.loc[is_not_attack]

# Feature-only frames: the label column is removed from each split.
df_train_onehot_encoded_normal_wo_label = df_train_onehot_encoded_normal.drop(columns=['label'])
df_test_onehot_encoded_wo_label = df_test_onehot_encoded.drop(columns=['label'])
df_cv_onehot_encoded_wo_label = df_cv_onehot_encoded.drop(columns=['label'])

In [None]:
# Standardise the features. The scaler is fitted on the normal-only training
# rows and the same fitted scaler is then applied to all three splits.
scaler = StandardScaler()
scaler.fit(df_train_onehot_encoded_normal_wo_label)

(df_train_sel_col_wo_label_rescaled,
 df_test_sel_col_wo_label_rescaled,
 df_cv_sel_col_wo_label_rescaled) = (
    scaler.transform(features)
    for features in (df_train_onehot_encoded_normal_wo_label,
                     df_test_onehot_encoded_wo_label,
                     df_cv_onehot_encoded_wo_label)
)

In [None]:
# Autoencoder architecture and training hyper-parameters.
epoch = 500
batch_size = 2000
# In this cell `learning_rate` is only used as the L1 activity-regularisation
# coefficient of the first Dense layer.
learning_rate = 1e-3
encoding_dim = 64
hidden_dim = encoding_dim // 2
input_dim = df_train_sel_col_wo_label_rescaled.shape[1]

# input_dim -> encoding_dim -> hidden_dim -> encoding_dim -> input_dim,
# with batch normalisation after every hidden Dense layer.
input_layer = Input(shape=(input_dim,))

wide_in = Dense(
    encoding_dim,
    activation="relu",
    activity_regularizer=regularizers.l1(learning_rate),
)(input_layer)
wide_in = BatchNormalization()(wide_in)
bottleneck = Dense(hidden_dim, activation="relu")(wide_in)
encoder = BatchNormalization()(bottleneck)  # encoded features

wide_out = Dense(encoding_dim, activation="relu")(encoder)
wide_out = BatchNormalization()(wide_out)
decoder = Dense(input_dim, activation="linear")(wide_out)  # decoded features

autoencoder = Model(inputs=input_layer, outputs=decoder)
# Snapshot of the untrained weights.
initial_weights = autoencoder.get_weights()
autoencoder.summary()

In [None]:
# Compile with mean squared error as both loss and reported metric, optimised
# with Adam, then train the network to reproduce its own (normal-only) input.
autoencoder.compile(optimizer='adam',
                    loss='mean_squared_error',
                    metrics=['mse'])

# Callbacks: write the best model seen so far to disk, and log to TensorBoard.
checkpoint_options = dict(filepath="autoencoder_classifier.h5",
                          save_best_only=True,
                          verbose=0)
cp = ModelCheckpoint(**checkpoint_options)

tensorboard_options = dict(log_dir='./logs',
                           histogram_freq=0,
                           write_graph=True,
                           write_images=True)
tb = TensorBoard(**tensorboard_options)

# Input and target are the same array; the cross-validation split is used as
# validation data in the same way.
training_run = autoencoder.fit(
    x=df_train_sel_col_wo_label_rescaled,
    y=df_train_sel_col_wo_label_rescaled,
    epochs=epoch,
    batch_size=batch_size,
    shuffle=True,
    validation_data=(df_cv_sel_col_wo_label_rescaled, df_cv_sel_col_wo_label_rescaled),
    verbose=1,
    callbacks=[cp, tb],
)
history = training_run.history

In [None]:
# Evaluate on the cross-validation set: reconstruct it with the autoencoder.
valid_x_predictions = autoencoder.predict(df_cv_sel_col_wo_label_rescaled)

# Per-row reconstruction error = mean of the squared differences between the
# scaled input features and their reconstruction.
squared_residuals = np.square(df_cv_sel_col_wo_label_rescaled - valid_x_predictions)
mse = squared_residuals.mean(axis=1)

# Pair each error with the true label; reset_index() moves the original row
# index into an 'index' column and leaves a 0..n-1 index.
error_df = pd.DataFrame({
    'Reconstruction_error': mse,
    'True_class': df_cv_onehot_encoded['label'],
}).reset_index()

In [None]:
# Precision and recall of the attack class (label 0) on the cross-validation
# set for reconstruction-error thresholds 0.0, 0.1, ..., 4.9.
def _percentage(part, whole):
    """Return part/whole as a percentage rounded to 2 decimals (NaN if whole is 0)."""
    return round(part / whole * 100, 2) if whole else float('nan')


precision = []
recall = []
threshold_cv = []
reconstruction_errors = error_df.Reconstruction_error.values

# Thresholds are derived from integers (k / 10) instead of repeatedly adding
# 0.1: accumulating 0.1 yields values such as 0.30000000000000004 and makes
# the number of iterations depend on floating-point drift.
for i in (step / 10 for step in range(50)):
    threshold_cv.append(i)
    # An error above the threshold is flagged as attack (0), otherwise normal (1).
    pred_y_cv = np.where(reconstruction_errors > i, 0, 1)
    # labels=[0, 1] pins the matrix to 2x2 (row/column 0 = attack) even when a
    # class is missing from the predictions or from the true labels.
    conf_matrix_cv = confusion_matrix(error_df.True_class, pred_y_cv, labels=[0, 1])
    #print(conf_matrix_cv)
    print('threshold = ', i)
    print('Accuracy = ', _percentage(conf_matrix_cv[0][0] + conf_matrix_cv[1][1], conf_matrix_cv.sum()))
    # Recall: detected attacks / all true attacks (row 0).
    Recall_cv = _percentage(conf_matrix_cv[0][0], conf_matrix_cv[0][1] + conf_matrix_cv[0][0])
    # Precision: detected attacks / all rows predicted as attack (column 0).
    Precision_cv = _percentage(conf_matrix_cv[0][0], conf_matrix_cv[1][0] + conf_matrix_cv[0][0])
    precision.append(Precision_cv)
    recall.append(Recall_cv)


In [None]:
# Precision and recall curves as a function of the reconstruction-error threshold.
fig, ax = plt.subplots()
for curve_values, curve_name in ((precision, "Precision"), (recall, "Recall")):
    ax.plot(threshold_cv, curve_values, label=curve_name, linewidth=5)
ax.set_title('Precision and recall for different threshold values')
ax.set_xlabel('Threshold')
ax.set_ylabel('Precision/Recall')
ax.legend()
#fig.savefig('recall&precisionVSthreshold.png')
plt.show()

In [None]:
# Threshold selected from the precision/recall graph above.
best_threshold = 1.1

# Reconstruction error on the test set, computed exactly like the
# cross-validation errors (this frame was previously never created, so the
# cell failed with a NameError on a fresh kernel).
test_x_predictions = autoencoder.predict(df_test_sel_col_wo_label_rescaled)
mse_test = np.mean(np.power(df_test_sel_col_wo_label_rescaled - test_x_predictions, 2), axis=1)
error_df_test = pd.DataFrame({'Reconstruction_error': mse_test,
                              'True_class': df_test_onehot_encoded['label']})
error_df_test = error_df_test.reset_index()

# An error above the threshold is flagged as attack (0), otherwise normal (1).
pred_y_test = np.where(error_df_test.Reconstruction_error.values > best_threshold, 0, 1)
# labels=[0, 1] pins the matrix to 2x2 with row/column 0 = attack.
conf_matrix = confusion_matrix(error_df_test.True_class, pred_y_test, labels=[0, 1])
print(conf_matrix)

true_attack_total = conf_matrix[0][1] + conf_matrix[0][0]
predicted_attack_total = conf_matrix[1][0] + conf_matrix[0][0]
sample_total = conf_matrix.sum()

# Each ratio falls back to NaN instead of dividing by zero.
print('Accuracy = ', round((conf_matrix[0][0] + conf_matrix[1][1]) / sample_total * 100, 2) if sample_total else float('nan'))
Recall = round(conf_matrix[0][0] / true_attack_total * 100, 2) if true_attack_total else float('nan')
Precision = round(conf_matrix[0][0] / predicted_attack_total * 100, 2) if predicted_attack_total else float('nan')
print('Recall = ', Recall)
print('Precision = ', Precision)
print('F1-Score = ', round((2 * Precision * Recall) / (Precision + Recall), 2) if (Precision + Recall) else float('nan'))
print('************************')

# Annotated confusion-matrix heat map drawn with matplotlib only (seaborn is
# not imported anywhere in this notebook).
Labels = ["Attack", "Normal"]
fig, ax = plt.subplots(figsize=(3, 3))
heatmap = ax.imshow(conf_matrix)
fig.colorbar(heatmap, ax=ax)
ax.set_xticks(range(len(Labels)))
ax.set_xticklabels(Labels)
ax.set_yticks(range(len(Labels)))
ax.set_yticklabels(Labels)
# Switch the annotation colour at half of the largest count so the numbers
# stay readable on both dark and light cells.
contrast_switch = conf_matrix.max() / 2
for row in range(conf_matrix.shape[0]):
    for col in range(conf_matrix.shape[1]):
        ax.text(col, row, format(conf_matrix[row, col], "d"),
                ha="center", va="center",
                color="black" if conf_matrix[row, col] > contrast_switch else "white")
ax.set_title("Confusion matrix")
ax.set_ylabel('True class')
ax.set_xlabel('Predicted class')
#fig.savefig('confusion_matrix.png')
plt.show()
