# Deep Learning

## preprocessing training dataset

In [302]:
import os
import numpy as np
import tensorflow as tf
import pandas as pd
input_file_dir = "/Users/ninglee/Documents/IntrutionDection/datasets"
train_file_name = "kddcup.data_10_percent.txt"
test_file_name = "corrected.txt"
header_file_name = "header.txt"
train_files = os.path.join(input_file_dir, train_file_name)
test_files = os.path.join(input_file_dir, test_file_name)
header_files = os.path.join(input_file_dir, header_file_name)

In [303]:
with open(header_files, 'r') as f:
    header = f.readline().strip().split(',')
train_dataset = pd.read_csv(train_files)
test_dataset = pd.read_csv(test_files)
train_dataset.columns = header
test_dataset.columns = header

In [304]:
train_dataset_size = train_dataset.shape[0]
test_dataset_size = test_dataset.shape[0]
train_dataset = pd.concat([train_dataset, test_dataset], axis=0)
print train_dataset_size, test_dataset_size

494020 311028


In [305]:
def labels_map(label):
    label = str(label).split('.')[0]
    if label == 'normal':
        return 0
    if label in ['ipsweep', 'mscan', 'nmap', 'portsweep', 'saint', 'satan']: #PROBE
        return 1
    if label in ['apache2', 'back', 'land', 'mailbomb', 'neptune', 'pod', 'processtable', 'smurf', 'teardrop', 'udpstorm']: #DOS
        return 2
    if label in ['buffer_overflow', 'httptunnel', 'loadmodule', 'perl', 'ps', 'rootkit', 'sqlattack', 'xterm']: #U2R
        return 3
    if label in ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'named', 'phf', 'sendmail', 'snmpgetattack', 'snmpguess', 'spy', 'warezclient', 'warezmaster', 'worm', 'xlock', 'xsnoop']: #R2L
        return 4
    
train_dataset['labels'] = train_dataset['labels'].apply(labels_map)
labels_dummies = pd.get_dummies(train_dataset['labels'], prefix='label')
train_dataset = pd.concat([train_dataset,labels_dummies], axis=1)

In [306]:
train_dataset.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,labels,label_0,label_1,label_2,label_3,label_4
0,0,tcp,http,SF,239,486,0,0,0,0,...,0.0,0.0,0.0,0.0,0,1,0,0,0,0
1,0,tcp,http,SF,235,1337,0,0,0,0,...,0.0,0.0,0.0,0.0,0,1,0,0,0,0
2,0,tcp,http,SF,219,1337,0,0,0,0,...,0.0,0.0,0.0,0.0,0,1,0,0,0,0
3,0,tcp,http,SF,217,2032,0,0,0,0,...,0.0,0.0,0.0,0.0,0,1,0,0,0,0
4,0,tcp,http,SF,217,2032,0,0,0,0,...,0.0,0.0,0.0,0.0,0,1,0,0,0,0


In [307]:
protocal_type_dummies = pd.get_dummies(train_dataset.protocol_type, prefix='protocol_type')
service_dummies = pd.get_dummies(train_dataset.service, prefix='service')
flag_dummies = pd.get_dummies(train_dataset.flag, prefix='flag')
train_dataset = pd.concat([train_dataset, protocal_type_dummies, service_dummies, flag_dummies], axis=1)

In [308]:
max1 = train_dataset.src_bytes.max(); min1 = train_dataset.src_bytes.min();
max2 = train_dataset.dst_bytes.max(); min2 = train_dataset.dst_bytes.min();
train_dataset['src_bytes_norm'] = (train_dataset.src_bytes - min1) / float(max1 - min1)
train_dataset['dst_bytes_norm'] = (train_dataset.dst_bytes - min2) / float(max2 - min2)

In [309]:
train_dataset = train_dataset.drop(['protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes','labels'], axis=1)

In [310]:
train_dataset = train_dataset.astype('float')
# train_dataset = (train_dataset - train_dataset.min()) / (train_dataset.max() - train_dataset.min())

## preprocessing test dataset

In [311]:
sub_train_dataset = train_dataset.iloc[train_dataset_size:, :].sample(n=50000)
sub_test_dataset = train_dataset.iloc[:train_dataset_size, :].sample(n=10000)
sub_train_labels = sub_train_dataset[['label_0', 'label_1', 'label_2', 'label_3', 'label_4']]
sub_test_labels = sub_test_dataset[['label_0', 'label_1', 'label_2', 'label_3', 'label_4']]
sub_train_dataset.drop(['label_0', 'label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
sub_test_dataset.drop(['label_0', 'label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)

In [312]:
test_dataset = train_dataset.iloc[train_dataset_size:,:]
train_dataset = train_dataset.iloc[:train_dataset_size, :]
train_labels = train_dataset[['label_0', 'label_1', 'label_2', 'label_3', 'label_4']]
test_labels = test_dataset[['label_0', 'label_1', 'label_2', 'label_3', 'label_4']]
train_dataset.drop(['label_0', 'label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
test_dataset.drop(['label_0', 'label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)

In [313]:
print train_dataset.shape, train_labels.shape
print test_dataset.shape, test_labels.shape

(494020, 119) (494020, 5)
(311028, 119) (311028, 5)


In [314]:
print sub_train_dataset.shape, sub_train_labels.shape
print sub_test_dataset.shape, sub_test_labels.shape

(50000, 119) (50000, 5)
(10000, 119) (10000, 5)


In [315]:
sub_train_dataset.describe()

Unnamed: 0,duration,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,...,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,src_bytes_norm,dst_bytes_norm
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,19.49686,2e-05,0.00064,6e-05,0.0172,0.00256,0.17576,0.00812,0.0002,0.0,...,0.0,0.0027,0.05808,0.00016,4e-05,0.00086,0.80078,0.00016,2.360905e-06,0.000136
std,475.794869,0.004472,0.038466,0.01,0.529707,0.051706,0.38062,0.751895,0.014141,0.0,...,0.0,0.051892,0.233897,0.012648,0.006324,0.029313,0.399418,0.012648,3.775728e-05,0.002167
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.514331e-07,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7.499542e-07,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.488371e-06,0.0
max,53771.0,1.0,3.0,2.0,101.0,3.0,1.0,165.0,1.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.005605842,0.359027


## model1: MLP