In [1]:
import pandas as pd
import tensorflow as tf
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore
from tensorflow.keras.utils import get_file

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
#from tensorflow.keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

#tf.debugging.set_log_device_placement(True)

In [2]:
try:
    #path = get_file('kddcup.data.gz', origin='http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz')
    #path = get_file('kddcup.data_10_percent.gz', origin='http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz')
    path = './nslkdd/KDDTrain+.txt'
except:
    print('Error downloading')
    raise
    
df = pd.read_csv(path, header=None)

print("Read {} rows.".format(len(df)))
#df = df.sample(frac=0.01, replace=False) # Uncomment this line to sample only 1% of the dataset
df.dropna(inplace=True,axis=1) # For now, just drop NA's (rows with missing values)


df.columns = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'outcome',
    'difficulty_rating'
]

Read 125973 rows.


In [3]:
# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd
    
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

In [4]:
# Now encode the feature vector

encode_numeric_zscore(df, 'duration')
encode_text_dummy(df, 'protocol_type')
encode_text_dummy(df, 'service')
encode_text_dummy(df, 'flag')
encode_numeric_zscore(df, 'src_bytes')
encode_numeric_zscore(df, 'dst_bytes')
encode_text_dummy(df, 'land')
encode_numeric_zscore(df, 'wrong_fragment')
encode_numeric_zscore(df, 'urgent')
encode_numeric_zscore(df, 'hot')
encode_numeric_zscore(df, 'num_failed_logins')
encode_text_dummy(df, 'logged_in')
encode_numeric_zscore(df, 'num_compromised')
encode_numeric_zscore(df, 'root_shell')
encode_numeric_zscore(df, 'su_attempted')
encode_numeric_zscore(df, 'num_root')
encode_numeric_zscore(df, 'num_file_creations')
encode_numeric_zscore(df, 'num_shells')
encode_numeric_zscore(df, 'num_access_files')
encode_numeric_zscore(df, 'num_outbound_cmds')
encode_text_dummy(df, 'is_host_login')
encode_text_dummy(df, 'is_guest_login')
encode_numeric_zscore(df, 'count')
encode_numeric_zscore(df, 'srv_count')
encode_numeric_zscore(df, 'serror_rate')
encode_numeric_zscore(df, 'srv_serror_rate')
encode_numeric_zscore(df, 'rerror_rate')
encode_numeric_zscore(df, 'srv_rerror_rate')
encode_numeric_zscore(df, 'same_srv_rate')
encode_numeric_zscore(df, 'diff_srv_rate')
encode_numeric_zscore(df, 'srv_diff_host_rate')
encode_numeric_zscore(df, 'dst_host_count')
encode_numeric_zscore(df, 'dst_host_srv_count')
encode_numeric_zscore(df, 'dst_host_same_srv_rate')
encode_numeric_zscore(df, 'dst_host_diff_srv_rate')
encode_numeric_zscore(df, 'dst_host_same_src_port_rate')
encode_numeric_zscore(df, 'dst_host_srv_diff_host_rate')
encode_numeric_zscore(df, 'dst_host_serror_rate')
encode_numeric_zscore(df, 'dst_host_srv_serror_rate')
encode_numeric_zscore(df, 'dst_host_rerror_rate')
encode_numeric_zscore(df, 'dst_host_srv_rerror_rate')

# display 5 rows

df.dropna(inplace=True,axis=1)
df[0:5]
# This is the numeric feature vector, as it goes to the neural net

df_trainer = None

print(df['outcome'].nunique())
print(df.groupby('outcome')['outcome'].count())


23
outcome
back                 956
buffer_overflow       30
ftp_write              8
guess_passwd          53
imap                  11
ipsweep             3599
land                  18
loadmodule             9
multihop               7
neptune            41214
nmap                1493
normal             67343
perl                   3
phf                    4
pod                  201
portsweep           2931
rootkit               10
satan               3633
smurf               2646
spy                    2
teardrop             892
warezclient          890
warezmaster           20
Name: outcome, dtype: int64


In [5]:
num_classes = 0
while num_classes != 23: ###I need a sample that contains all the types of attacks, otherwise I cannot classify some of them

    df_trainer = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
    
    # Convert to numpy - Classification
    x_columns = df_trainer.columns.drop(['outcome','difficulty_rating'])
    x = df_trainer[x_columns].values
    dummies = pd.get_dummies(df_trainer['outcome']) # Classification
    #dummies = pd.get_dummies(df['outcome']) # Classification

    outcomes = dummies.columns
    num_classes = len(outcomes)
    y = dummies.values


In [8]:
print(df_trainer.groupby('outcome')['outcome'].count())
print(df_trainer.head())

outcome
back                 89
buffer_overflow       5
ftp_write             1
guess_passwd          6
imap                  2
ipsweep             384
land                  2
loadmodule            1
multihop              1
neptune            4091
nmap                147
normal             6745
perl                  1
phf                   1
pod                  21
portsweep           291
rootkit               2
satan               358
smurf               262
spy                   1
teardrop             90
warezclient          94
warezmaster           2
Name: outcome, dtype: int64
        duration  src_bytes  dst_bytes  wrong_fragment    urgent       hot  \
40068  -0.110249  -0.007762  -0.004919       -0.089486 -0.007736 -0.095075   
117806 -0.110249  -0.007762  -0.004919       -0.089486 -0.007736 -0.095075   
49753  -0.110249  -0.007711  -0.004393       -0.089486 -0.007736 -0.095075   
20685  -0.110249  -0.007755  -0.004892       -0.089486 -0.007736 -0.095075   
2793   -0.110249  -0.0

In [7]:
def baseline_model():
    model = Sequential()
    model.add(Dense(32, input_dim=x.shape[1], activation='linear'))
    model.add(Dense(32, input_dim=x.shape[1], activation='linear'))
    model.add(Dense(y.shape[1], activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

estimator = KerasClassifier(build_fn=baseline_model, epochs=5, batch_size=50, verbose=0)

In [8]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25, random_state=42)

In [9]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(64,64,),max_iter=50,verbose=True)
mlp.fit(x_train, y_train)

Iteration 1, loss = 9.13316983
Iteration 2, loss = 1.10341638
Iteration 3, loss = 0.58622171
Iteration 4, loss = 0.42617125
Iteration 5, loss = 0.35652697
Iteration 6, loss = 0.31286368
Iteration 7, loss = 0.27849616
Iteration 8, loss = 0.24891004
Iteration 9, loss = 0.22031929
Iteration 10, loss = 0.19524708
Iteration 11, loss = 0.17108333
Iteration 12, loss = 0.14910900
Iteration 13, loss = 0.13262478
Iteration 14, loss = 0.11868741
Iteration 15, loss = 0.10883525
Iteration 16, loss = 0.10029824
Iteration 17, loss = 0.09410164
Iteration 18, loss = 0.08836339
Iteration 19, loss = 0.08470695
Iteration 20, loss = 0.07939677
Iteration 21, loss = 0.07628634
Iteration 22, loss = 0.07274443
Iteration 23, loss = 0.06988907
Iteration 24, loss = 0.06780811
Iteration 25, loss = 0.06520805
Iteration 26, loss = 0.06287977
Iteration 27, loss = 0.06121192
Iteration 28, loss = 0.05891338
Iteration 29, loss = 0.05795267
Iteration 30, loss = 0.05703427
Iteration 31, loss = 0.05441126
Iteration 32, los



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(64, 64), learning_rate='constant',
              learning_rate_init=0.001, max_iter=50, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=True, warm_start=False)

In [10]:
from sklearn.metrics import classification_report, confusion_matrix
predictions = mlp.predict(x_test)
#print(confusion_matrix(y_test, predictions))
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       1.00      1.00      1.00         1
           4       0.00      0.00      0.00         1
           5       0.93      0.99      0.96        92
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       1.00      1.00      1.00      1019
          10       0.97      0.92      0.94        36
          11       0.99      0.99      0.99      1712
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         0
          14       1.00      1.00      1.00         7
          15       1.00      0.97      0.99        74
          16       0.00      0.00      0.00         2
          17       0.99    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [11]:
from tensorflow.keras.callbacks import EarlyStopping

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
estimator.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=2,epochs=50)

Train on 9447 samples, validate on 3150 samples
Epoch 1/50
9447/9447 - 15s - loss: 0.6204 - accuracy: 0.8796 - val_loss: 0.2454 - val_accuracy: 0.9463
Epoch 2/50
9447/9447 - 15s - loss: 0.1953 - accuracy: 0.9462 - val_loss: 0.1661 - val_accuracy: 0.9517
Epoch 3/50
9447/9447 - 14s - loss: 0.1440 - accuracy: 0.9548 - val_loss: 0.1345 - val_accuracy: 0.9625
Epoch 4/50
9447/9447 - 15s - loss: 0.1157 - accuracy: 0.9632 - val_loss: 0.1218 - val_accuracy: 0.9660
Epoch 5/50


KeyboardInterrupt: 

In [None]:
import numpy as np
from sklearn import metrics

pred = estimator.predict(x_test)

In [None]:
y_eval = np.argmax(y_test, axis=1)
print(y_eval)
score = metrics.accuracy_score(y_eval, pred)
print("Validation score: {}".format(score))

In [None]:
# Convert to numpy - Classification
x_columns = df.columns.drop(['outcome','difficulty_rating'])
x = df[x_columns].values
dummies = pd.get_dummies(df['outcome']) # Classification
outcomes = dummies.columns
num_classes = len(outcomes)
y = dummies.values

print(num_classes)

In [None]:
y_eval_full = np.argmax(y, axis=1)
full_pred = estimator.predict(x)
score = metrics.accuracy_score(y_eval_full, full_pred)
print("Validation score for entire dataset: {}".format(score))
