In [1]:
import pandas as pd
import tensorflow as tf
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore
from tensorflow.keras.utils import get_file

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
#from tensorflow.keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

#tf.debugging.set_log_device_placement(True)

In [2]:
try:
    #path = get_file('kddcup.data.gz', origin='http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz')
    #path = get_file('kddcup.data_10_percent.gz', origin='http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz')
    path = './nslkdd/KDDTest+.txt'
except:
    print('Error downloading')
    raise
    
df = pd.read_csv(path, header=None)

print("Read {} rows.".format(len(df)))

df.dropna(inplace=True,axis=1) # For now, just drop NA's (rows with missing values)


df.columns = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'outcome',
    'difficulty_rating'
]

Read 22544 rows.


In [3]:
print(df.groupby('outcome')['outcome'].count())

outcome
apache2             737
back                359
buffer_overflow      20
ftp_write             3
guess_passwd       1231
httptunnel          133
imap                  1
ipsweep             141
land                  7
loadmodule            2
mailbomb            293
mscan               996
multihop             18
named                17
neptune            4657
nmap                 73
normal             9711
perl                  2
phf                   2
pod                  41
portsweep           157
processtable        685
ps                   15
rootkit              13
saint               319
satan               735
sendmail             14
smurf               665
snmpgetattack       178
snmpguess           331
sqlattack             2
teardrop             12
udpstorm              2
warezmaster         944
worm                  2
xlock                 9
xsnoop                4
xterm                13
Name: outcome, dtype: int64


In [4]:
DOS_TYPES = ('back','land','neptune','pod','smurf','teardrop')
U2R_TYPES = ('buffer_overflow','loadmodule','perl','rootkit')
R2L_TYPES = ('ftp_write','guess_passwd','imap','multihop','phf','spy','warezclient','warezmaster')
PROBE_TYPES = ('ipsweep','nmap','portsweep','satan')

print(len(DOS_TYPES + U2R_TYPES + R2L_TYPES + PROBE_TYPES))

for i, row in df.iterrows():
    val = 'normal'
    old_val = row['outcome'].split('.')[0]
    if old_val in DOS_TYPES:
        val = 'dos'
    elif old_val in U2R_TYPES:
        val = 'u2r'
    elif old_val in R2L_TYPES:
        val = 'r2l'
    elif old_val in PROBE_TYPES:
        val = 'probe'
    df.at[i,'outcome'] = val       
        
print(df['outcome'].value_counts())

22
normal    13461
dos        5741
r2l        2199
probe      1106
u2r          37
Name: outcome, dtype: int64


In [5]:
# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd
    
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

In [6]:
# Now encode the feature vector

encode_numeric_zscore(df, 'duration')
encode_text_dummy(df, 'protocol_type')
encode_text_dummy(df, 'service')
encode_text_dummy(df, 'flag')
encode_numeric_zscore(df, 'src_bytes')
encode_numeric_zscore(df, 'dst_bytes')
encode_text_dummy(df, 'land')
encode_numeric_zscore(df, 'wrong_fragment')
encode_numeric_zscore(df, 'urgent')
encode_numeric_zscore(df, 'hot')
encode_numeric_zscore(df, 'num_failed_logins')
encode_text_dummy(df, 'logged_in')
encode_numeric_zscore(df, 'num_compromised')
encode_numeric_zscore(df, 'root_shell')
encode_numeric_zscore(df, 'su_attempted')
encode_numeric_zscore(df, 'num_root')
encode_numeric_zscore(df, 'num_file_creations')
encode_numeric_zscore(df, 'num_shells')
encode_numeric_zscore(df, 'num_access_files')
encode_numeric_zscore(df, 'num_outbound_cmds')
encode_text_dummy(df, 'is_host_login')
encode_text_dummy(df, 'is_guest_login')
encode_numeric_zscore(df, 'count')
encode_numeric_zscore(df, 'srv_count')
encode_numeric_zscore(df, 'serror_rate')
encode_numeric_zscore(df, 'srv_serror_rate')
encode_numeric_zscore(df, 'rerror_rate')
encode_numeric_zscore(df, 'srv_rerror_rate')
encode_numeric_zscore(df, 'same_srv_rate')
encode_numeric_zscore(df, 'diff_srv_rate')
encode_numeric_zscore(df, 'srv_diff_host_rate')
encode_numeric_zscore(df, 'dst_host_count')
encode_numeric_zscore(df, 'dst_host_srv_count')
encode_numeric_zscore(df, 'dst_host_same_srv_rate')
encode_numeric_zscore(df, 'dst_host_diff_srv_rate')
encode_numeric_zscore(df, 'dst_host_same_src_port_rate')
encode_numeric_zscore(df, 'dst_host_srv_diff_host_rate')
encode_numeric_zscore(df, 'dst_host_serror_rate')
encode_numeric_zscore(df, 'dst_host_srv_serror_rate')
encode_numeric_zscore(df, 'dst_host_rerror_rate')
encode_numeric_zscore(df, 'dst_host_srv_rerror_rate')

# display 5 rows

df.dropna(inplace=True,axis=1)
df[0:5]
# This is the numeric feature vector, as it goes to the neural net

df_trainer = None





In [7]:
num_classes = 0
while num_classes != 5: ###I need a sample that contains all the types of attacks, otherwise I cannot classify some of them

    df_trainer = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
    
    # Convert to numpy - Classification
    x_columns = df_trainer.columns.drop('outcome')
    x = df_trainer[x_columns].values
    dummies = pd.get_dummies(df_trainer['outcome']) # Classification
    #dummies = pd.get_dummies(df['outcome']) # Classification

    outcomes = dummies.columns
    num_classes = len(outcomes)
    y = dummies.values


In [8]:
print(df_trainer.groupby('outcome')['outcome'].count())
print(df_trainer.head())

outcome
dos        562
normal    1364
probe      113
r2l        209
u2r          6
Name: outcome, dtype: int64
       duration  src_bytes  dst_bytes  wrong_fragment    urgent       hot  \
11456 -0.155531  -0.021988  -0.096894       -0.059103 -0.019459 -0.113519   
14183 -0.155531  -0.021986  -0.096894       -0.059103 -0.019459 -0.113519   
5197  -0.155531  -0.019856  -0.096894       -0.059103 -0.019459 -0.113519   
6562  -0.154109  -0.021988  -0.096187       -0.059103 -0.019459 -0.113519   
5181  -0.155531  -0.021766  -0.090013       -0.059103 -0.019459 -0.113519   

       num_failed_logins  num_compromised  root_shell  su_attempted  ...  \
11456          -0.143996        -0.016493   -0.049452     -0.012637  ...   
14183          -0.143996        -0.016493   -0.049452     -0.012637  ...   
5197           -0.143996        -0.016493   -0.049452     -0.012637  ...   
6562           -0.143996        -0.016493   -0.049452     -0.012637  ...   
5181           -0.143996        -0.016493   -0

In [9]:
def baseline_model():
    model = Sequential()
    model.add(Dense(32, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(32, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(y.shape[1], activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

estimator = KerasClassifier(build_fn=baseline_model, epochs=5, batch_size=50, verbose=0)

In [10]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25, random_state=42)

In [11]:
#from sklearn.neural_network import MLPClassifier
#mlp = MLPClassifier(hidden_layer_sizes=(64,64,),max_iter=50,verbose=True)
#mlp.fit(x_train, y_train)

In [12]:
#from sklearn.metrics import classification_report, confusion_matrix
#predictions = mlp.predict(x_test)
#print(confusion_matrix(y_test, predictions))
#print(classification_report(y_test,predictions))

In [13]:
from tensorflow.keras.callbacks import EarlyStopping

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
estimator.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=2,epochs=50)

Train on 1690 samples, validate on 564 samples
Epoch 1/50
1690/1690 - 1s - loss: 1.5775 - accuracy: 0.3988 - val_loss: 0.9767 - val_accuracy: 0.6152
Epoch 2/50
1690/1690 - 0s - loss: 0.8091 - accuracy: 0.7225 - val_loss: 0.7157 - val_accuracy: 0.7553
Epoch 3/50
1690/1690 - 0s - loss: 0.6044 - accuracy: 0.7787 - val_loss: 0.5585 - val_accuracy: 0.7606
Epoch 4/50
1690/1690 - 0s - loss: 0.4613 - accuracy: 0.8189 - val_loss: 0.4274 - val_accuracy: 0.8333
Epoch 5/50
1690/1690 - 0s - loss: 0.3498 - accuracy: 0.8840 - val_loss: 0.3402 - val_accuracy: 0.8812
Epoch 6/50
1690/1690 - 0s - loss: 0.2765 - accuracy: 0.9142 - val_loss: 0.2854 - val_accuracy: 0.9043
Epoch 7/50
1690/1690 - 0s - loss: 0.2273 - accuracy: 0.9314 - val_loss: 0.2432 - val_accuracy: 0.9184
Epoch 8/50
1690/1690 - 0s - loss: 0.1907 - accuracy: 0.9391 - val_loss: 0.2048 - val_accuracy: 0.9326
Epoch 9/50
1690/1690 - 0s - loss: 0.1577 - accuracy: 0.9450 - val_loss: 0.1799 - val_accuracy: 0.9486
Epoch 10/50
1690/1690 - 0s - loss: 

<tensorflow.python.keras.callbacks.History at 0x144ae1d6108>

In [14]:
import numpy as np
from sklearn import metrics

pred = estimator.predict(x_test)

In [15]:
y_eval = np.argmax(y_test, axis=1)
print(y_eval)
score = metrics.accuracy_score(y_eval, pred)
print("Validation score: {}".format(score))

[0 1 1 1 1 0 0 1 0 1 3 0 1 1 1 1 1 0 1 0 1 0 1 2 1 2 1 1 1 3 0 1 0 0 1 1 1
 0 0 0 1 0 0 1 1 3 0 3 1 1 2 1 1 1 1 0 1 1 1 1 1 0 3 0 1 1 0 2 2 1 1 0 1 2
 1 1 1 0 1 1 0 1 1 1 1 1 1 3 0 3 0 0 1 1 3 3 0 1 0 1 1 1 1 2 1 0 0 1 0 1 0
 1 1 0 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 3 0 1 0 1 1 0 1 1 0 1 1 0 1 0 1 0
 1 0 1 1 0 1 0 1 1 1 1 1 1 1 2 1 1 0 1 1 0 1 0 1 0 1 0 1 0 1 0 3 1 1 1 3 1
 1 1 2 3 1 1 2 1 1 1 1 1 0 1 1 1 0 3 1 1 0 3 0 2 0 1 1 1 1 1 2 3 2 1 1 1 1
 1 0 3 1 1 0 1 1 1 1 3 1 0 3 0 1 1 3 0 1 1 0 1 3 1 3 1 1 1 1 0 0 3 1 1 0 0
 3 1 1 3 0 1 1 1 1 2 1 1 1 1 3 1 1 1 3 0 1 1 1 1 1 1 1 1 0 3 1 3 3 1 4 1 0
 3 1 1 3 1 1 1 3 0 1 1 1 3 1 2 1 1 2 2 1 1 2 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 3 3 0 1 0 3 0 1 1 3 2 4 0 0 2 0 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 3 0
 3 1 1 1 1 0 2 1 0 1 1 0 1 1 1 1 1 0 0 1 1 1 2 0 1 1 1 0 1 1 3 1 1 1 3 1 1
 1 1 1 1 0 3 0 0 1 2 2 1 1 1 1 0 1 1 1 0 1 1 1 0 3 3 1 3 2 2 1 1 0 1 1 1 1
 3 3 1 1 1 1 1 2 1 0 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 0 0 0 1 1 1 2 1 0 0 0 0
 1 0 1 1 1 1 0 1 1 1 1 1 

In [16]:
# Convert to numpy - Classification
x_columns = df.columns.drop('outcome')
x = df[x_columns].values
dummies = pd.get_dummies(df['outcome']) # Classification
outcomes = dummies.columns
num_classes = len(outcomes)
y = dummies.values

print(num_classes)

5


In [17]:
y_eval_full = np.argmax(y, axis=1)
full_pred = estimator.predict(x)
score = metrics.accuracy_score(y_eval_full, full_pred)
print("Validation score for entire dataset: {}".format(score))


Validation score for entire dataset: 0.977333215046132
