In [12]:
import pandas as pd
import tensorflow as tf
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore
from tensorflow.keras.utils import get_file

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
#from tensorflow.keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

#tf.debugging.set_log_device_placement(True)

In [13]:
try:
    #path = get_file('kddcup.data.gz', origin='http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz')
    #path = get_file('kddcup.data_10_percent.gz', origin='http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz')
    path = './nslkdd/KDDTrain+.txt'
except:
    print('Error downloading')
    raise
    
df = pd.read_csv(path, header=None)

print("Read {} rows.".format(len(df)))
#df = df.sample(frac=0.01, replace=False) # Uncomment this line to sample only 1% of the dataset
df.dropna(inplace=True,axis=1) # For now, just drop NA's (rows with missing values)


df.columns = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'outcome',
    'difficulty_rating'
]

Read 125973 rows.


In [14]:
# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd
    
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

In [15]:
# Now encode the feature vector

encode_numeric_zscore(df, 'duration')
encode_text_dummy(df, 'protocol_type')
encode_text_dummy(df, 'service')
encode_text_dummy(df, 'flag')
encode_numeric_zscore(df, 'src_bytes')
encode_numeric_zscore(df, 'dst_bytes')
encode_text_dummy(df, 'land')
encode_numeric_zscore(df, 'wrong_fragment')
encode_numeric_zscore(df, 'urgent')
encode_numeric_zscore(df, 'hot')
encode_numeric_zscore(df, 'num_failed_logins')
encode_text_dummy(df, 'logged_in')
encode_numeric_zscore(df, 'num_compromised')
encode_numeric_zscore(df, 'root_shell')
encode_numeric_zscore(df, 'su_attempted')
encode_numeric_zscore(df, 'num_root')
encode_numeric_zscore(df, 'num_file_creations')
encode_numeric_zscore(df, 'num_shells')
encode_numeric_zscore(df, 'num_access_files')
encode_numeric_zscore(df, 'num_outbound_cmds')
encode_text_dummy(df, 'is_host_login')
encode_text_dummy(df, 'is_guest_login')
encode_numeric_zscore(df, 'count')
encode_numeric_zscore(df, 'srv_count')
encode_numeric_zscore(df, 'serror_rate')
encode_numeric_zscore(df, 'srv_serror_rate')
encode_numeric_zscore(df, 'rerror_rate')
encode_numeric_zscore(df, 'srv_rerror_rate')
encode_numeric_zscore(df, 'same_srv_rate')
encode_numeric_zscore(df, 'diff_srv_rate')
encode_numeric_zscore(df, 'srv_diff_host_rate')
encode_numeric_zscore(df, 'dst_host_count')
encode_numeric_zscore(df, 'dst_host_srv_count')
encode_numeric_zscore(df, 'dst_host_same_srv_rate')
encode_numeric_zscore(df, 'dst_host_diff_srv_rate')
encode_numeric_zscore(df, 'dst_host_same_src_port_rate')
encode_numeric_zscore(df, 'dst_host_srv_diff_host_rate')
encode_numeric_zscore(df, 'dst_host_serror_rate')
encode_numeric_zscore(df, 'dst_host_srv_serror_rate')
encode_numeric_zscore(df, 'dst_host_rerror_rate')
encode_numeric_zscore(df, 'dst_host_srv_rerror_rate')

# display 5 rows

df.dropna(inplace=True,axis=1)
df[0:5]
# This is the numeric feature vector, as it goes to the neural net

df_trainer = None

print(df['outcome'].nunique())
print(df.groupby('outcome')['outcome'].count())


23
outcome
back                 956
buffer_overflow       30
ftp_write              8
guess_passwd          53
imap                  11
ipsweep             3599
land                  18
loadmodule             9
multihop               7
neptune            41214
nmap                1493
normal             67343
perl                   3
phf                    4
pod                  201
portsweep           2931
rootkit               10
satan               3633
smurf               2646
spy                    2
teardrop             892
warezclient          890
warezmaster           20
Name: outcome, dtype: int64


In [16]:
num_classes = 0
while num_classes != 23: ###I need a sample that contains all the types of attacks, otherwise I cannot classify some of them

    df_trainer = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
    
    # Convert to numpy - Classification
    x_columns = df_trainer.columns.drop(['outcome','difficulty_rating'])
    x = df_trainer[x_columns].values
    dummies = pd.get_dummies(df_trainer['outcome']) # Classification
    #dummies = pd.get_dummies(df['outcome']) # Classification

    outcomes = dummies.columns
    num_classes = len(outcomes)
    y = dummies.values


In [17]:
print(df_trainer.groupby('outcome')['outcome'].count())
print(df_trainer.head())

outcome
back                 99
buffer_overflow       2
ftp_write             2
guess_passwd          6
imap                  1
ipsweep             338
land                  2
loadmodule            1
multihop              2
neptune            4120
nmap                134
normal             6709
perl                  1
phf                   1
pod                  22
portsweep           301
rootkit               1
satan               395
smurf               264
spy                   1
teardrop            100
warezclient          93
warezmaster           2
Name: outcome, dtype: int64
       duration  src_bytes  dst_bytes  wrong_fragment    urgent        hot  \
38085 -0.110249  -0.007762  -0.004919       -0.089486 -0.007736  -0.095075   
25407 -0.110249  -0.007710  -0.004724       -0.089486 -0.007736  -0.095075   
53643 -0.110249  -0.007762  -0.004919       -0.089486 -0.007736  -0.095075   
19883 -0.109865  -0.007548  -0.004309       -0.089486 -0.007736  12.928372   
49816 -0.110249  -0.00

In [18]:
def baseline_model():
    model = Sequential()
    model.add(Dense(32, input_dim=x.shape[1], activation='linear'))
    model.add(Dense(32, input_dim=x.shape[1], activation='linear'))
    model.add(Dense(y.shape[1], activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

estimator = KerasClassifier(build_fn=baseline_model, epochs=5, batch_size=50, verbose=0)

In [19]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25, random_state=42)

In [20]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(64,64,),max_iter=50,verbose=True)
mlp.fit(x_train, y_train)

Iteration 1, loss = 8.96907568
Iteration 2, loss = 1.11829688
Iteration 3, loss = 0.58966816
Iteration 4, loss = 0.43957917
Iteration 5, loss = 0.36132636
Iteration 6, loss = 0.31517468
Iteration 7, loss = 0.27995554
Iteration 8, loss = 0.25246804
Iteration 9, loss = 0.22802262
Iteration 10, loss = 0.20742925
Iteration 11, loss = 0.18633664
Iteration 12, loss = 0.16655498
Iteration 13, loss = 0.14845235
Iteration 14, loss = 0.13196549
Iteration 15, loss = 0.11904668
Iteration 16, loss = 0.10758298
Iteration 17, loss = 0.09947519
Iteration 18, loss = 0.09183095
Iteration 19, loss = 0.08630136
Iteration 20, loss = 0.08131677
Iteration 21, loss = 0.07597667
Iteration 22, loss = 0.07311726
Iteration 23, loss = 0.06885225
Iteration 24, loss = 0.06636278
Iteration 25, loss = 0.06321704
Iteration 26, loss = 0.06097287
Iteration 27, loss = 0.05912293
Iteration 28, loss = 0.05930900
Iteration 29, loss = 0.05585196
Iteration 30, loss = 0.05373133
Iteration 31, loss = 0.05245718
Iteration 32, los



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(64, 64), learning_rate='constant',
              learning_rate_init=0.001, max_iter=50, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=True, warm_start=False)

In [21]:
from sklearn.metrics import classification_report, confusion_matrix
predictions = mlp.predict(x_test)
#print(confusion_matrix(y_test, predictions))
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       1.00      0.95      0.98        22
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.92      1.00      0.96        80
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         2
           9       1.00      1.00      1.00      1050
          10       0.94      0.86      0.90        36
          11       0.98      0.99      0.99      1659
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         1
          14       1.00      1.00      1.00         6
          15       1.00      0.96      0.98        77
          16       0.00      0.00      0.00         1
          17       1.00    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [22]:
from tensorflow.keras.callbacks import EarlyStopping

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
estimator.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=2,epochs=50)

Train on 9447 samples, validate on 3150 samples
Epoch 1/50
9447/9447 - 11s - loss: 0.6644 - accuracy: 0.8667 - val_loss: 0.2356 - val_accuracy: 0.9419
Epoch 2/50
9447/9447 - 10s - loss: 0.1782 - accuracy: 0.9488 - val_loss: 0.1666 - val_accuracy: 0.9597
Epoch 3/50
9447/9447 - 11s - loss: 0.1330 - accuracy: 0.9596 - val_loss: 0.1401 - val_accuracy: 0.9632
Epoch 4/50
9447/9447 - 10s - loss: 0.1122 - accuracy: 0.9638 - val_loss: 0.1282 - val_accuracy: 0.9717
Epoch 5/50
9447/9447 - 9s - loss: 0.0953 - accuracy: 0.9695 - val_loss: 0.1316 - val_accuracy: 0.9711
Epoch 6/50
9447/9447 - 9s - loss: 0.0840 - accuracy: 0.9730 - val_loss: 0.1483 - val_accuracy: 0.9752
Epoch 7/50
9447/9447 - 11s - loss: 0.0774 - accuracy: 0.9758 - val_loss: 0.1771 - val_accuracy: 0.9765
Epoch 8/50
9447/9447 - 9s - loss: 0.0705 - accuracy: 0.9775 - val_loss: 0.2099 - val_accuracy: 0.9737
Epoch 9/50
9447/9447 - 11s - loss: 0.0694 - accuracy: 0.9776 - val_loss: 0.2418 - val_accuracy: 0.9781
Epoch 00009: early stopping


<tensorflow.python.keras.callbacks.History at 0x250aef2bb48>

In [23]:
import numpy as np
from sklearn import metrics

pred = estimator.predict(x_test)

In [24]:
y_eval = np.argmax(y_test, axis=1)
print(y_eval)
score = metrics.accuracy_score(y_eval, pred)
print("Validation score: {}".format(score))

[15  9 11 ...  9 11 11]
Validation score: 0.9780952380952381


In [25]:
# Convert to numpy - Classification
x_columns = df.columns.drop(['outcome','difficulty_rating'])
x = df[x_columns].values
dummies = pd.get_dummies(df['outcome']) # Classification
outcomes = dummies.columns
num_classes = len(outcomes)
y = dummies.values

print(num_classes)

23


In [26]:
y_eval_full = np.argmax(y, axis=1)
full_pred = estimator.predict(x)
score = metrics.accuracy_score(y_eval_full, full_pred)
print("Validation score for entire dataset: {}".format(score))


Validation score for entire dataset: 0.9788446730648631
