In [30]:
import pandas as pd
import tensorflow as tf
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore
from tensorflow.keras.utils import get_file

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

#tf.debugging.set_log_device_placement(True)

FULL_DATASET = 'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz'
TEN_PERCENT_DATASET = 'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz'


In [38]:
def process_kdd99_df(df):
    length = len(df)
    print('Processing {} rows'.format(length))
    df.columns = [
        'duration',
        'protocol_type',
        'service',
        'flag',
        'src_bytes',
        'dst_bytes',
        'land',
        'wrong_fragment',
        'urgent',
        'hot',
        'num_failed_logins',
        'logged_in',
        'num_compromised',
        'root_shell',
        'su_attempted',
        'num_root',
        'num_file_creations',
        'num_shells',
        'num_access_files',
        'num_outbound_cmds',
        'is_host_login',
        'is_guest_login',
        'count',
        'srv_count',
        'serror_rate',
        'srv_serror_rate',
        'rerror_rate',
        'srv_rerror_rate',
        'same_srv_rate',
        'diff_srv_rate',
        'srv_diff_host_rate',
        'dst_host_count',
        'dst_host_srv_count',
        'dst_host_same_srv_rate',
        'dst_host_diff_srv_rate',
        'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate',
        'dst_host_serror_rate',
        'dst_host_srv_serror_rate',
        'dst_host_rerror_rate',
        'dst_host_srv_rerror_rate',
        'outcome'
    ]
    df.dropna(inplace=True,axis=1)
    print('{} fields dropped due to missing data.'.format(length-len(df)))
    print('{} unique outcomes present (should be 23 for KDD99)'.format(df['outcome'].nunique()))
    
    return df

    
def download_to_df(url):
    try:
        path = get_file(url.split('/')[-1], origin=url)
    except:
        print('Error downloading dataset')
        raise
    return pd.read_csv(path, header=None)

def generate_training_set(df, frac=0.1, num_outcomes=23):
    trainer = None
    while (trainer is None or trainer['outcome'].nunique() != num_outcomes):
        trainer = df.sample(frac=frac, replace=False)
    return trainer

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd
    
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)
    
def encode_kdd99_df(df):
    length = len(df)
    
    encode_numeric_zscore(df, 'duration')
    encode_text_dummy(df, 'protocol_type')
    encode_text_dummy(df, 'service')
    encode_text_dummy(df, 'flag')
    encode_numeric_zscore(df, 'src_bytes')
    encode_numeric_zscore(df, 'dst_bytes')
    encode_text_dummy(df, 'land')
    encode_numeric_zscore(df, 'wrong_fragment')
    encode_numeric_zscore(df, 'urgent')
    encode_numeric_zscore(df, 'hot')
    encode_numeric_zscore(df, 'num_failed_logins')
    encode_text_dummy(df, 'logged_in')
    encode_numeric_zscore(df, 'num_compromised')
    encode_numeric_zscore(df, 'root_shell')
    encode_numeric_zscore(df, 'su_attempted')
    encode_numeric_zscore(df, 'num_root')
    encode_numeric_zscore(df, 'num_file_creations')
    encode_numeric_zscore(df, 'num_shells')
    encode_numeric_zscore(df, 'num_access_files')
    encode_numeric_zscore(df, 'num_outbound_cmds')
    encode_text_dummy(df, 'is_host_login')
    encode_text_dummy(df, 'is_guest_login')
    encode_numeric_zscore(df, 'count')
    encode_numeric_zscore(df, 'srv_count')
    encode_numeric_zscore(df, 'serror_rate')
    encode_numeric_zscore(df, 'srv_serror_rate')
    encode_numeric_zscore(df, 'rerror_rate')
    encode_numeric_zscore(df, 'srv_rerror_rate')
    encode_numeric_zscore(df, 'same_srv_rate')
    encode_numeric_zscore(df, 'diff_srv_rate')
    encode_numeric_zscore(df, 'srv_diff_host_rate')
    encode_numeric_zscore(df, 'dst_host_count')
    encode_numeric_zscore(df, 'dst_host_srv_count')
    encode_numeric_zscore(df, 'dst_host_same_srv_rate')
    encode_numeric_zscore(df, 'dst_host_diff_srv_rate')
    encode_numeric_zscore(df, 'dst_host_same_src_port_rate')
    encode_numeric_zscore(df, 'dst_host_srv_diff_host_rate')
    encode_numeric_zscore(df, 'dst_host_serror_rate')
    encode_numeric_zscore(df, 'dst_host_srv_serror_rate')
    encode_numeric_zscore(df, 'dst_host_rerror_rate')
    encode_numeric_zscore(df, 'dst_host_srv_rerror_rate')
    
    df.dropna(inplace=True, axis=1)
    print('{} rows dropped due to incomplete data'.format(length - len(df)))
    return df
    

In [36]:
df = download_to_df(FULL_DATASET)
df_10 = download_to_df(TEN_PERCENT_DATASET)

df = process_kdd99_df(df)
df_10 = process_kdd99_df(df_10)

Processing 4898431 rows
0 fields dropped due to missing data.
23 unique outcomes present \(should be 23 for KDD99\)
Processing 494021 rows
0 fields dropped due to missing data.
23 unique outcomes present \(should be 23 for KDD99\)


In [39]:
df_trainer = generate_training_set(df)

KeyboardInterrupt: 

In [23]:
num_classes = 0
while num_classes != 23: ###I need a sample that contains all the types of attacks, otherwise I cannot classify some of them

    df_trainer = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
    
    dummies = pd.get_dummies(df_trainer['outcome']) # Classification
    outcomes = dummies.columns
    num_classes = len(outcomes)
    
# Convert to numpy - Classification
x_columns = df_trainer.columns.drop('outcome')
x = df_trainer[x_columns].values

y = dummies.values


KeyboardInterrupt: 

In [25]:
print(df_trainer.groupby('outcome')['outcome'].count())
print(df_trainer.head())
print(df['outcome'].nunique())

outcome
back.                  229
buffer_overflow.         4
ftp_write.               1
guess_passwd.            5
imap.                    2
ipsweep.              1308
land.                    1
multihop.                2
neptune.            106864
nmap.                  228
normal.              97127
pod.                    21
portsweep.            1092
satan.                1609
smurf.              281161
spy.                     1
teardrop.               95
warezclient.            90
warezmaster.             3
Name: outcome, dtype: int64
         duration  src_bytes  dst_bytes  wrong_fragment    urgent       hot  \
4081094 -0.066833  -0.001396  -0.001696       -0.015139 -0.001103 -0.026521   
4676900 -0.066833  -0.001949  -0.001696       -0.015139 -0.001103 -0.026521   
4622669 -0.066833  -0.001949  -0.001696       -0.015139 -0.001103 -0.026521   
3948765 -0.066833  -0.001396  -0.001696       -0.015139 -0.001103 -0.026521   
1722296 -0.066833  -0.000853  -0.001696       -0.015139 

In [None]:
def baseline_model():
    model = Sequential()
    model.add(Dense(64, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(64, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(64, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(y.shape[1], activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

estimator = KerasClassifier(build_fn=baseline_model, epochs=5, batch_size=50, verbose=0)

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25, random_state=42)

In [None]:
#from sklearn.neural_network import MLPClassifier
#mlp = MLPClassifier(hidden_layer_sizes=(64,64,),max_iter=50,verbose=True)
#mlp.fit(x_train, y_train)

In [None]:
#from sklearn.metrics import classification_report, confusion_matrix
#predictions = mlp.predict(x_test)
#print(confusion_matrix(y_test, predictions))
#print(classification_report(y_test,predictions))

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ProgbarLogger

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
progress = ProgbarLogger(count_mode='steps')
estimator.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor,progress],verbose=2,epochs=50)

In [None]:
import numpy as np
from sklearn import metrics

pred = estimator.predict(x_test)

In [None]:
y_eval = np.argmax(y_test, axis=1)
print(y_eval)
score = metrics.accuracy_score(y_eval, pred)
print("Validation score: {}".format(score))

In [None]:
# Convert to numpy - Classification
x_columns = df.columns.drop('outcome')
x = df[x_columns].values
dummies = pd.get_dummies(df['outcome']) # Classification
outcomes = dummies.columns
num_classes = len(outcomes)
y = dummies.values

print(num_classes)

In [None]:
y_eval_full = np.argmax(y, axis=1)
full_pred = estimator.predict(x)
score = metrics.accuracy_score(y_eval_full, full_pred)
print("Validation score for entire dataset: {}".format(score))
