In [1]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import plot_model
from tensorflow.python.keras.utils.np_utils import to_categorical
import matplotlib.pyplot as plt
#from featureencodinglibrary import featureEncodingUsingLabelEncoder
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn.preprocessing import StandardScaler
#feature scaling library
#from featurescalinglibrary import featureScalingUsingStandardScalar


#Libraries for printing tables in readable format
from tabulate import tabulate

#Library for creating an excel sheet
import xlsxwriter

labelName = 'attack_type'	

In [2]:

#This function is used to perform label encoding on the categorical features in the given dataset
def featureEncodingUsingLabelEncoder(dataSetForFeatureEncoding):
    print("****** Start label encoding on the categorical features in the given dataset *****")

    #Extract the categorical features, leave the label
    categoricalColumnsInTheDataSet = dataSetForFeatureEncoding.drop([labelName],axis=1).select_dtypes(['object'])
    #Get the names of the categorical features
    categoricalColumnNames = categoricalColumnsInTheDataSet.columns.values
 
    print("****** Number of features before label encoding: ",len(dataSetForFeatureEncoding.columns))
    print("****** Number of categorical features in the dataset: ",len(categoricalColumnNames))
    print("****** Categorical feature names in the dataset: ",categoricalColumnNames)

    print('\n****** Here is the list of unique values present in each categorical feature in the dataset *****\n')
    labelEncoder = LabelEncoder() 
    for feature in categoricalColumnNames:
        uniq = np.unique(dataSetForFeatureEncoding[feature])
        print('\n{}: {} '.format(feature,len(uniq)))
        printList(dataSetForFeatureEncoding[feature].unique(),'distinct values')
        dataSetForFeatureEncoding[feature] = labelEncoder.fit_transform(dataSetForFeatureEncoding[feature]) 
    print("****** Number of features after label encoding: ",len(dataSetForFeatureEncoding.columns))    
    
    print("****** End label encoding on the categorical features in the given dataset *****\n")
    return dataSetForFeatureEncoding
	


In [3]:

def createExcelFromArray(array, fileName):
    workbook = xlsxwriter.Workbook(fileName)
    worksheet = workbook.add_worksheet()

    row = 0
    for col, data in enumerate(array):
        worksheet.write_row(col, row, data)

    workbook.close()

def printList (list,heading):
    for i in range(0, len(list)): 
        list[i] = str(list[i]) 
    if len(list)>0:
        print(tabulate([i.strip("[]").split(", ") for i in list], headers=[heading], tablefmt='orgtbl')+"\n")



In [4]:

def featureSelectionUsingExtraTreesClassifier(dataSetForFeatureSelection):
    print("\n****** Start performing feature selection using ExtraTreesClassifier *****")
    print("****** Falls under wrapper methods (feature importance) *****")


    #Applying feature encoding before applying the ExtraTreesClassification
    dataSetForFeatureSelection = featureEncodingUsingLabelEncoder(dataSetForFeatureSelection)
    dataSetAfterFeatuerSelection = dataSetForFeatureSelection
    #features = dataSetForFeatureSelection.iloc[:,0:len(dataSetForFeatureSelection.columns)-1]  
    features = dataSetForFeatureSelection.drop([labelName],axis=1)
    label = dataSetForFeatureSelection[labelName]

    labelencoder = LabelEncoder()
    labelTransformed = labelencoder.fit_transform(label)
	
    print("****** ExtraTreesClassification is in progress *****")
    #Train using ExtraTreesClassifier
    trainedforest = ExtraTreesClassifier(n_estimators=700).fit(features,labelTransformed)
    importances = trainedforest.feature_importances_ #array with importances of each feature
    idx = np.arange(0, features.shape[1]) #create an index array, with the number of features
    features_to_keep = idx[importances > np.mean(importances)] #only keep features whose importance is greater than the mean importance
    featureImportances = pd.Series(importances, index= features.columns)
    selectedFeatures = featureImportances.nlargest(len(features_to_keep))
    print("\n selectedFeatures after ExtraTreesClassification: ", selectedFeatures)
    print("****** Completed ExtraTreesClassification *****")

    #Plot the feature Importance to see which features have been considered as most important for our model to make its predictions
    #figure(num=None, figsize=(20, 22), dpi=80, facecolor='w', edgecolor='k')
    #selectedFeatures.plot(kind='barh')

    selectedFeaturesNames = selectedFeatures.keys()
    dataSetForFeatureSelection = dataSetForFeatureSelection.drop(selectedFeaturesNames,axis=1)
    dataSetAfterFeatuerSelection = dataSetAfterFeatuerSelection.drop(dataSetForFeatureSelection.columns, axis=1)
    dataSetAfterFeatuerSelection[labelName] = label
    
    numberOfFeaturesInTheDatasetAfterFeatureSelection = len(dataSetAfterFeatuerSelection.columns)    
    print('\n***** Number of columns in the dataSet after feature selection: ', len(dataSetAfterFeatuerSelection.columns))
    print('***** Columns in the dataSet after feature selection: \n', dataSetAfterFeatuerSelection.columns)
    print("****** End performing feature selection using ExtraTreesClassifier *****")
    return dataSetAfterFeatuerSelection



In [5]:
def featureEncodingUsingOneHotEncoder(dataSetForFeatureEncoding):
    print("****** Start one hot encoding on the categorical features in the given dataset *****")
    #Extract the categorical features, leave the label
    categoricalColumnsInTheDataSet = dataSetForFeatureEncoding.drop([labelName],axis=1).select_dtypes(['object'])
    #Get the names of the categorical features
    categoricalColumnNames = categoricalColumnsInTheDataSet.columns.values
    
    print("****** Number of features before one hot encoding: ",len(dataSetForFeatureEncoding.columns))
    print("****** Number of categorical features in the dataset: ",len(categoricalColumnNames))
    print("****** Categorical feature names in the dataset: ",categoricalColumnNames)
    
    print('\n****** Here is the list of unique values present in each categorical feature in the dataset *****\n')
    categoricalFeaturesInTheDataset = list(set(dataSetForFeatureEncoding.columns) - set(dataSetForFeatureEncoding._get_numeric_data().columns))
    numericalFeaturesInTheDataset = list(dataSetForFeatureEncoding._get_numeric_data().columns)
    for feature in categoricalFeaturesInTheDataset:
        uniq = np.unique(dataSetForFeatureEncoding[feature])
        print('\n{}: {} '.format(feature,len(uniq)))
        printList(dataSetForFeatureEncoding[feature].unique(),'distinct values')
        
    #Using get_dummies function to get the dummy variables for the categorical columns
    onHotEncodedDataSet=pd.get_dummies(dataSetForFeatureEncoding, columns=categoricalColumnNames, prefix=categoricalColumnNames)
    
    #Move the label column to the end
    label = onHotEncodedDataSet.pop(labelName)
    onHotEncodedDataSet[labelName] = label
    numberOfColumnsInOneHotEncodedDataset = len(onHotEncodedDataSet.columns)
    print("****** Number of features after one hot encoding: ",numberOfColumnsInOneHotEncodedDataset)

    print("****** End one hot encoding on the categorical features in the given dataset *****\n")
    return onHotEncodedDataSet


In [6]:
	
def featureScalingUsingStandardScalar(dataSetForFeatureScaling):
    print("****** Start feature scaling of the features present in the dataset using StandardScalar *****")

    numberOfColumnsInEncodedDataset = len(dataSetForFeatureScaling.columns)
    dataSetInArrayFormat = dataSetForFeatureScaling.values

    #Remove the label column from the dataset
    
    label = dataSetForFeatureScaling.pop(labelName)

    print(dataSetInArrayFormat)
    features = dataSetInArrayFormat[:,0:numberOfColumnsInEncodedDataset-1]
    print("\n****** Number of features in the dataset before performing scaling: ",np.size(features,1))
    print("\n****** Features in the dataset before performing scaling ***** \n",features)
    
    #Perform feature scaling
    scaler=StandardScaler()
    scaledFeatures=scaler.fit_transform(features)    
    print("\n****** Number of features in the dataset after performing scaling: ",np.size(scaledFeatures,1))
    print("\n****** Features in the dataset after performing scaling ***** \n",scaledFeatures)

    #Convert from array format to dataframe
    scaledFeatures = pd.DataFrame(scaledFeatures, columns=dataSetForFeatureScaling.columns)
    scaledFeatures = scaledFeatures.reset_index(drop=True)
    label = label.reset_index(drop=True)
    scaledFeatures[labelName]=label
    print("scaledFeatures.head(): ",scaledFeatures.head())
    print("scaledFeatures.shape: ",scaledFeatures.shape)
    
    print("\n****** End of feature scaling of the features present in the dataset using StandardScalar *****\n")
    return scaledFeatures


In [7]:
#Split the complete dataSet into training dataSet and testing dataSet
def splitCompleteDataSetIntoTrainingSetAndTestingSet(completeDataSet):
	label = completeDataSet[labelName]
	features = completeDataSet.drop(labelName,axis=1)
	featuresInPreProcessedTrainingDataSet,featuresInPreProcessedTestingDataSet,labelInPreProcessedTrainingDataSet,labelInPreProcessedTestingDataSet=train_test_split(features,label,test_size=0.4, random_state=42)
	print("features.shape: ",features.shape)
	print("label.shape: ",label.shape)
	print("features: ",features)
	print("label: ",label)
	return featuresInPreProcessedTrainingDataSet,featuresInPreProcessedTestingDataSet,labelInPreProcessedTrainingDataSet,labelInPreProcessedTestingDataSet




In [8]:
def performPreprocessing(trainingDataSet, testingDataSet, arrayOfModels):
    for i in range(0,len(arrayOfModels)):
        print('***************************************************************************************************************************')
        print('********************************************* Building Model-', i ,' As Below *************************************************')
        print('\t -- Feature Selection: \t ', arrayOfModels[i][0], ' \n\t -- Feature Encoding: \t ', arrayOfModels[i][1], ' \n\t -- Feature Scaling: \t ', arrayOfModels[i][2], '\n')

        #Combining the test and training datasets for preprocessing then together, because we observed that in sme datasets
        #the values in the categorical columns in test dataset and train dataset are being different this causes issues while
        #applying classification techniques
        completeDataSet = pd.concat(( trainingDataSet, testingDataSet ))

        #difficultyLevel = completeDataSet.pop('difficulty_level')
        
        print("completeDataSet.shape: ",completeDataSet.shape)
        print("completeDataSet.head: ",completeDataSet.head())

        #Feature Selection  
        if arrayOfModels[i][0] == 'ExtraTreesClassifier':
            #Perform feature selection using ExtraTreesClassifier
            completeDataSetAfterFeatuerSelection = featureSelectionUsingExtraTreesClassifier(completeDataSet)        
        #Feature Encoding        
 
        if arrayOfModels[i][1] == 'OneHotEncoder':
            #Perform OnHot encoding to convert categorical values into one-hot encoded features
            completeEncodedDataSet = featureEncodingUsingOneHotEncoder(completeDataSetAfterFeatuerSelection)

        #Feature Scaling        
 
        if arrayOfModels[i][2] == 'Standardization':
            #Perform Standardization to scale the features of the dataset into same range
            completeEncodedAndScaledDataset = featureScalingUsingStandardScalar(completeEncodedDataSet)
        
        #Split the complete dataSet into training dataSet and testing dataSet
        featuresInPreProcessedTrainingDataSet,featuresInPreProcessedTestingDataSet,labelInPreProcessedTrainingDataSet,labelInPreProcessedTestingDataSet = splitCompleteDataSetIntoTrainingSetAndTestingSet(completeEncodedAndScaledDataset)
        
        trainingEncodedAndScaledDataset = pd.concat([featuresInPreProcessedTrainingDataSet, labelInPreProcessedTrainingDataSet], axis=1, sort=False)
        testingEncodedAndScaledDataset = pd.concat([featuresInPreProcessedTestingDataSet, labelInPreProcessedTestingDataSet], axis=1, sort=False)
    
    return 	completeEncodedAndScaledDataset




In [9]:

arrayOfModels = [["ExtraTreesClassifier","OneHotEncoder","Standardization"]]


trainingFileNameWithAbsolutePath = r"C:\Users\PRAMILA\Downloads\NetworkIntrusionDetection-master\NetworkIntrusionDetection-master\Datasets\NSL-KDD\KDDTrain+_20Percent.csv"
testingFileNameWithAbsolutePath = r"C:\Users\PRAMILA\Downloads\NetworkIntrusionDetection-master\NetworkIntrusionDetection-master\Datasets\NSL-KDD\KDDTest-21.csv"

#Train data
trainingDataSet =  pd.read_csv(trainingFileNameWithAbsolutePath)
difficultyLevel = trainingDataSet.pop('difficulty_level')
label = trainingDataSet[labelName]
print(trainingDataSet.head())

#Test data
testingDataSet = pd.read_csv(testingFileNameWithAbsolutePath)
difficultyLevel = testingDataSet.pop('difficulty_level')
print(testingDataSet.head())



   Duration Protocol_type  Service Flag  Src_bytes  Dst_bytes  Land  \
0         0           udp    other   SF        146          0     0   
1         0           tcp  private   S0          0          0     0   
2         0           tcp     http   SF        232       8153     0   
3         0           tcp     http   SF        199        420     0   
4         0           tcp  private  REJ          0          0     0   

   Wrong_fragment  Urgent  Hot  ...  Dst_host_srv_count  \
0               0       0    0  ...                   1   
1               0       0    0  ...                  26   
2               0       0    0  ...                 255   
3               0       0    0  ...                 255   
4               0       0    0  ...                  19   

   Dst_host_same_srv_rate  Dst_host_diff_srv_rate  \
0                    0.00                    0.60   
1                    0.10                    0.05   
2                    1.00                    0.00   
3     

In [10]:
completeEncodedAndScaledDataset = performPreprocessing(trainingDataSet, testingDataSet, arrayOfModels)
print(completeEncodedAndScaledDataset.head())

***************************************************************************************************************************
********************************************* Building Model- 0  As Below *************************************************
	 -- Feature Selection: 	  ExtraTreesClassifier  
	 -- Feature Encoding: 	  OneHotEncoder  
	 -- Feature Scaling: 	  Standardization 

completeDataSet.shape:  (37041, 42)
completeDataSet.head:     Duration Protocol_type  Service Flag  Src_bytes  Dst_bytes  Land  \
0         0           udp    other   SF        146          0     0   
1         0           tcp  private   S0          0          0     0   
2         0           tcp     http   SF        232       8153     0   
3         0           tcp     http   SF        199        420     0   
4         0           tcp  private  REJ          0          0     0   

   Wrong_fragment  Urgent  Hot  ...  Dst_host_srv_count  \
0               0       0    0  ...                   1   
1            

In [11]:
x = completeEncodedAndScaledDataset.drop('attack_type',axis=1)
y = completeEncodedAndScaledDataset['attack_type']
print(x.shape, y.shape)
print('Number of unique values in label: ',len(np.unique(y)))
print('Unique values in label: ',np.unique(y))
#print(y.value_counts())
onehot = pd.get_dummies(y)
y = onehot.values
print(x.shape, y.shape)



(37041, 20) (37041,)
Number of unique values in label:  40
Unique values in label:  ['apache2' 'back' 'buffer_overflow' 'ftp_write' 'guess_passwd'
 'httptunnel' 'imap' 'ipsweep' 'land' 'loadmodule' 'mailbomb' 'mscan'
 'multihop' 'named' 'neptune' 'nmap' 'normal' 'perl' 'phf' 'pod'
 'portsweep' 'processtable' 'ps' 'rootkit' 'saint' 'satan' 'sendmail'
 'smurf' 'snmpgetattack' 'snmpguess' 'spy' 'sqlattack' 'teardrop'
 'udpstorm' 'warezclient' 'warezmaster' 'worm' 'xlock' 'xsnoop' 'xterm']
(37041, 20) (37041, 40)


In [12]:
'''
This function is used to define, compile and filt a neural network
'''
def nn_model(trainx, trainy, valx,valy,bt_size,epochs, layers):
  model = Sequential()
  model.add(Dense(layers[0],activation='relu', input_shape=(trainx.shape[1],)))
  for l in layers[1:]:
    model.add(Dense(l, activation='relu' ))
    model.add(Dropout(0.30))
  model.add(Dense(trainy.shape[1], activation='softmax'))
  model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
  hist=model.fit(trainx, trainy, batch_size=bt_size, epochs=epochs, shuffle=True, validation_data=(valx,valy), verbose=True)
  model.save('modelnew.h5')
  return hist



In [13]:
trainx, testx, trainy, testy = train_test_split(x,y, test_size=0.25, random_state=42)
layers=[trainx.shape[1],800,500,400,300,200,100,50,10]
print(trainx.shape[1])
print(trainx.columns.values)
print(trainx.iloc[0].values)
hist = nn_model(trainx, trainy, testx, testy,16,2,layers)

20
['Protocol_type' 'Service' 'Flag' 'Src_bytes' 'Logged_in' 'Count'
 'Srv_count' 'Serror_rate' 'Srv_serror_rate' 'Same_srv_rate'
 'Diff_srv_rate' 'Dst_host_count' 'Dst_host_srv_count'
 'Dst_host_same_srv_rate' 'Dst_host_diff_srv_rate'
 'Dst_host_same_src_port_rate' 'Dst_host_srv_diff_host_rate'
 'Dst_host_serror_rate' 'Dst_host_srv_serror_rate' 'Dst_host_rerror_rate']
[-0.15478617  1.61854439  0.73536923 -0.0112619  -0.73291357 -0.6746598
 -0.2458976  -0.55658419 -0.35972625  0.72192361 -0.3773118  -0.1323311
 -0.8948217  -1.17034772 -0.36938667 -0.46836203  0.72011226 -0.56138989
 -0.54981386 -0.48776502]
Epoch 1/2
Epoch 2/2


In [14]:
print(trainy.shape[1])
print(trainy[0])

40
[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]


In [15]:
import tensorflow

# """
['Protocol_type' 'Service' 'Flag' 'Src_bytes' 'Logged_in' 'Count'
 'Srv_count' 'Serror_rate' 'Srv_serror_rate' 'Rerror_rate' 'Same_srv_rate'
 'Diff_srv_rate' 'Dst_host_count' 'Dst_host_srv_count'
 'Dst_host_same_srv_rate' 'Dst_host_diff_srv_rate'
 'Dst_host_same_src_port_rate' 'Dst_host_serror_rate'
 'Dst_host_srv_serror_rate' 'Dst_host_rerror_rate']
"""

"""
n label:  40
Unique values in label:  [
    0         1            2             3           4
 'apache2' 'back' 'buffer_overflow' 'ftp_write' 'guess_passwd'
    5           6        7        8        9
 'httptunnel' 'imap' 'ipsweep' 'land' 'loadmodule' 
  10          11        12        13     14
 'mailbomb' 'mscan' 'multihop' 'named' 'neptune' 
 15        16      17      18   19
 'nmap' 'normal' 'perl' 'phf' 'pod'
 20             21           22    23         24
 'portsweep' 'processtable' 'ps' 'rootkit' 'saint' 
  25        26          27        28          29
 'satan' 'sendmail' 'smurf' 'snmpgetattack' 'snmpguess' 
 30         31       32         33          34
 'spy' 'sqlattack' 'teardrop' 'udpstorm' 'warezclient' 
 35               36      37      38     39
 'warezmaster' 'worm' 'xlock' 'xsnoop' 'xterm']
"""

In [16]:
result=['apache2','back', 'buffer_overflow', 'ftp_write', 'guess_passwd',
 'httptunnel', 'imap', 'ipsweep', 'land', 'loadmodule', 'mailbomb', 'mscan',
 'multihop', 'named', 'neptune', 'nmap', 'normal', 'perl', 'phf', 'pod',
 'portsweep', 'processtable', 'ps', 'rootkit', 'saint', 'satan', 'sendmail',
 'smurf', 'snmpgetattack', 'snmpguess', 'spy', 'sqlattack', 'teardrop',
 'udpstorm', 'warezclient', 'warezmaster', 'worm', 'xlock', 'xsnoop', 'xterm']

In [17]:
print(type(test_data))

NameError: name 'test_data' is not defined

In [None]:
model =tensorflow.keras.models.load_model('modelnew.h5')
test_data = np.array([[-0.15478617,1.61854439,  0.73536923, -0.0112619 , -0.73291357, -0.6746598,-0.2458976 , -0.55658419, -0.35972625 ,-0.4608058,   0.72192361 ,-0.3773118,-0.1323311 , -0.8948217,  -1.17034772, -0.36938667, -0.46836203 ,0.72011226, -0.56138989, -0.54981386 ,-0.48776502]])
o=model.predict(test_data, batch_size=1)
print(o)
print(len(o[0]))
print(o.argmax())
print(result[int(o.argmax())])

In [None]:
scaler=StandardScaler()
features = dataSetInArrayFormat[:,0:numberOfColumnsInEncodedDataset-1]

#Perform feature scaling
scaler=StandardScaler()
df1=scaler.fit_transform(np.array(df1))
df1=scaler.fit_transform(np.array(df1))