# Using the 10% Subset data


Although, the proposed data set still suffers from some
of the problems discussed by McHugh [4] and may not be
a perfect representative of existing real networks, because
of the lack of public data sets for network-based IDSs, we
believe it still can be applied as an effective benchmark data
set to help researchers compare different intrusion detection
methods.

- source: A Detailed Analysis of the KDD CUP 99 Data Set 
- Mahbod Tavallaee, Ebrahim Bagheri, Wei Lu, and Ali A. Ghorbani


In [None]:
import tensorflow as tf
import sys
import sklearn as sk
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib as pltLib
import sklearn.feature_extraction.text as sk_text
from scipy.stats import zscore

from collections.abc import Sequence
from sklearn import preprocessing
import matplotlib.pyplot as plt
import shutil
import os

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import optimizers
import io
import requests
from sklearn import metrics

# Useful Functions for Pre-processing data used for Tensorflow

In [None]:
# One-hot encode a text column (i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    """Replace column ``name`` of ``df`` with one indicator column per value.

    The frame is modified in place and nothing is returned. Each new column
    is called ``"<name>-<value>"`` and is appended at the end of the frame;
    the original column is then dropped.
    """
    one_hot = pd.get_dummies(df[name])
    for category, indicator in one_hot.items():
        df["{}-{}".format(name, category)] = indicator
    # Drop the source column only after all indicators have been attached.
    df.drop(columns=name, inplace=True)


# Encode text values to integer indexes (i.e. [0],[1],[2] for blue,green,red).
def encode_text_index(df, name):
    """Replace column ``name`` of ``df`` with integer class codes, in place.

    The codes come from scikit-learn's ``LabelEncoder``, so they start at 0
    and follow the sorted order of the distinct values.

    Returns the encoder's ``classes_`` array, i.e. the original values
    positioned so that ``classes_[code]`` recovers the value for a code.
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(df[name])
    df[name] = codes
    return encoder.classes_


# Standardise a numeric column to z-scores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Rewrite column ``name`` of ``df`` as ``(value - mean) / sd``, in place.

    ``mean`` and ``sd`` default to the column's own mean and (pandas, sample)
    standard deviation; pass them explicitly to reuse statistics computed
    elsewhere. Nothing is returned.

    Note that a standard deviation of zero is not special-cased: the
    division is carried out as-is, so a constant column ends up as NaN/inf.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread


# Fill the missing values of a column with that column's median
def missing_median(df, name):
    """Replace every missing value in ``df[name]`` with the column median.

    The column is reassigned on ``df`` itself; nothing is returned.
    """
    column = df[name]
    df[name] = column.fillna(column.median())


# Fill the missing values of a column with a caller-supplied default
def missing_default(df, name, default_value):
    """Replace every missing value in ``df[name]`` with ``default_value``.

    The column is reassigned on ``df`` itself; nothing is returned.
    """
    filled = df[name].fillna(default_value)
    df[name] = filled


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    """Split ``df`` into a feature matrix and a target array.

    Every column except ``target`` becomes a feature. All outputs are cast
    to ``float32`` because TensorFlow prefers 32-bit inputs.

    Args:
        df: Source frame; the feature columns must be convertible to float.
        target: Name of the column to predict.

    Returns:
        ``(x, y)`` where ``x`` has one row per record and one column per
        feature. If the target column has an integer dtype (of any width,
        signed or unsigned) the task is treated as classification and ``y``
        is the one-hot encoding of the target, with one column per distinct
        value in sorted order. Otherwise the task is treated as regression
        and ``y`` is the target column itself as a 1-D array.
    """
    feature_columns = [column for column in df.columns if column != target]
    features = df[feature_columns].values.astype(np.float32)

    # Any integer dtype means "class labels". Checking only int64/int32 would
    # silently send narrower codes (e.g. int8 category codes) down the
    # regression path.
    if pd.api.types.is_integer_dtype(df[target].dtype):
        # Classification
        labels = pd.get_dummies(df[target]).values.astype(np.float32)
    else:
        # Regression
        labels = df[target].values.astype(np.float32)
    return features, labels

# Format an elapsed time in seconds as H:MM:SS.ss
def hms_string(sec_elapsed):
    """Return ``sec_elapsed`` (seconds) formatted as ``"H:MM:SS.ss"``.

    Hours are not padded; minutes are zero-padded to two digits and seconds
    to two integer digits plus two decimals.
    """
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(hours, minutes, seconds)


# Plot predicted against expected values for a regression model.
def chart_regression(pred, y, sort=True):
    """Draw the expected and predicted series on one matplotlib chart.

    ``pred`` supplies the predictions and ``y.flatten()`` the expected
    values. When ``sort`` is true the pairs are ordered by expected value
    before plotting. The chart is shown immediately; nothing is returned.
    """
    frame = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        frame.sort_values(by=['y'],inplace=True)
    # Expected first, then prediction, so the legend order is unchanged.
    for column, legend_label in (('y', 'expected'), ('pred', 'prediction')):
        plt.plot(frame[column].tolist(), label=legend_label)
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Drop rows whose value lies sd or more standard deviations from the column mean
def remove_outliers(df, name, sd):
    """Remove, in place, the rows of ``df`` that are outliers in ``name``.

    A row is an outlier when the absolute distance between its value and the
    column mean is at least ``sd`` times the column's standard deviation.
    Rows are dropped by index label; nothing is returned.
    """
    column = df[name]
    distance = (column - column.mean()).abs()
    threshold = sd * column.std()
    outlier_labels = df.index[distance >= threshold]
    df.drop(outlier_labels, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    """Linearly rescale column ``name`` of ``df`` in place.

    Values are mapped so that ``data_low`` lands on ``normalized_low`` and
    ``data_high`` lands on ``normalized_high``. Nothing is returned.

    Args:
        df: Frame to modify.
        name: Column to rescale.
        normalized_low: Output value for ``data_low``.
        normalized_high: Output value for ``data_high``.
        data_low: Input lower bound; defaults to the column minimum.
        data_high: Input upper bound; defaults to the column maximum.

    If ``data_high`` equals ``data_low`` the division is carried out as-is,
    so the column ends up as NaN/inf.
    """
    column = df[name]
    # Resolve each bound on its own: a caller may supply only one of them,
    # and a supplied bound must never be overwritten by the data's extreme.
    if data_low is None:
        data_low = column.min()
    if data_high is None:
        data_high = column.max()

    unit_scaled = (column - data_low) / (data_high - data_low)
    df[name] = unit_scaled * (normalized_high - normalized_low) + normalized_low

In [None]:
# Directory holding the input CSV, relative to the current working directory
data_path = './data/'

In [None]:
# Load the intrusion data set.
# NOTE(review): read_csv treats the first line as the header by default, and
# the resulting column labels ('0', 'tcp', 'http', 'SF', '181', ...) look like
# the values of a data record rather than feature names. Presumably the file
# has no header row and its first record is being consumed as column names.
# Confirm before relying on these labels; later cells refer to them by name.
df = pd.read_csv(os.path.join(data_path, "network_intrusion_data.csv"))

In [None]:
df

Unnamed: 0,0,tcp,http,SF,181,5450,0.1,0.2,0.3,0.4,...,9.1,1.00.1,0.00.6,0.11.1,0.00.7,0.00.8,0.00.9,0.00.10,0.00.11,normal.
0,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.00,0.00,0.00,0.0,0.0,normal.
1,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
2,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
3,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,59,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494015,0,tcp,http,SF,310,1881,0,0,0,0,...,255,1.0,0.0,0.01,0.05,0.00,0.01,0.0,0.0,normal.
494016,0,tcp,http,SF,282,2286,0,0,0,0,...,255,1.0,0.0,0.17,0.05,0.00,0.01,0.0,0.0,normal.
494017,0,tcp,http,SF,203,1200,0,0,0,0,...,255,1.0,0.0,0.06,0.05,0.06,0.01,0.0,0.0,normal.
494018,0,tcp,http,SF,291,1200,0,0,0,0,...,255,1.0,0.0,0.04,0.05,0.04,0.01,0.0,0.0,normal.


# Drop records with missing values

In [None]:
# Keep only the rows that have no missing value in any column
df = df.dropna()

In [None]:
df

Unnamed: 0,0,tcp,http,SF,181,5450,0.1,0.2,0.3,0.4,...,9.1,1.00.1,0.00.6,0.11.1,0.00.7,0.00.8,0.00.9,0.00.10,0.00.11,normal.
0,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.00,0.00,0.00,0.0,0.0,normal.
1,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
2,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
3,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,59,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494015,0,tcp,http,SF,310,1881,0,0,0,0,...,255,1.0,0.0,0.01,0.05,0.00,0.01,0.0,0.0,normal.
494016,0,tcp,http,SF,282,2286,0,0,0,0,...,255,1.0,0.0,0.17,0.05,0.00,0.01,0.0,0.0,normal.
494017,0,tcp,http,SF,203,1200,0,0,0,0,...,255,1.0,0.0,0.06,0.05,0.06,0.01,0.0,0.0,normal.
494018,0,tcp,http,SF,291,1200,0,0,0,0,...,255,1.0,0.0,0.04,0.05,0.04,0.01,0.0,0.0,normal.


# Drop redundant records

In [None]:
# Keep the first occurrence of each fully identical row and discard the rest
df = df.drop_duplicates()

In [None]:
# Select the rows whose column labelled '0' equals 1.
# NOTE(review): this filters on a feature column (presumably the KDD
# `duration` field; confirm), not on the label column 'normal.'.
test_df = df.loc[df['0'] == 1]

# Showing records where column '0' equals 1 (not necessarily cyber attacks: the rows displayed below are all labelled `normal.`)

In [None]:
test_df

Unnamed: 0,0,tcp,http,SF,181,5450,0.1,0.2,0.3,0.4,...,9.1,1.00.1,0.00.6,0.11.1,0.00.7,0.00.8,0.00.9,0.00.10,0.00.11,normal.
215,1,tcp,smtp,SF,1550,326,0,0,0,0,...,72,0.67,0.05,0.01,0.00,0.00,0.00,0.0,0.0,normal.
723,1,tcp,smtp,SF,1596,329,0,0,0,0,...,2,1.00,0.00,0.50,0.00,0.00,0.00,0.0,0.0,normal.
727,1,tcp,smtp,SF,1148,327,0,0,0,0,...,34,0.81,0.05,0.02,0.00,0.00,0.00,0.0,0.0,normal.
728,1,tcp,smtp,SF,3591,338,0,0,0,0,...,41,0.70,0.20,0.10,0.05,0.00,0.00,0.0,0.0,normal.
729,1,tcp,smtp,SF,1572,337,0,0,0,0,...,50,0.80,0.10,0.05,0.04,0.00,0.00,0.0,0.0,normal.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491787,1,tcp,smtp,SF,1316,331,0,0,0,0,...,168,0.64,0.10,0.03,0.01,0.00,0.00,0.0,0.0,normal.
491788,1,tcp,smtp,SF,839,331,0,0,0,0,...,171,0.63,0.08,0.02,0.01,0.00,0.00,0.0,0.0,normal.
491789,1,tcp,smtp,SF,1211,336,0,0,0,0,...,164,0.56,0.08,0.02,0.01,0.00,0.00,0.0,0.0,normal.
491799,1,tcp,smtp,SF,1219,330,0,0,0,0,...,169,0.58,0.04,0.01,0.01,0.01,0.01,0.0,0.0,normal.


# Checking if this tcp column is lying to me

In [None]:
df.loc[df['tcp'] != 'tcp']

Unnamed: 0,0,tcp,http,SF,181,5450,0.1,0.2,0.3,0.4,...,9.1,1.00.1,0.00.6,0.11.1,0.00.7,0.00.8,0.00.9,0.00.10,0.00.11,normal.
212,0,udp,domain_u,SF,33,0,0,0,0,0,...,14,0.18,0.06,0.18,0.00,0.0,0.0,0.0,0.0,normal.
213,0,udp,domain_u,SF,30,0,0,0,0,0,...,15,0.17,0.06,0.17,0.00,0.0,0.0,0.0,0.0,normal.
216,0,udp,domain_u,SF,30,0,0,0,0,0,...,21,0.18,0.04,0.18,0.00,0.0,0.0,0.0,0.0,normal.
218,0,udp,domain_u,SF,31,0,0,0,0,0,...,26,0.27,0.20,0.27,0.08,0.0,0.0,0.0,0.0,normal.
223,0,udp,domain_u,SF,33,0,0,0,0,0,...,35,0.20,0.08,0.20,0.06,0.0,0.0,0.0,0.0,normal.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491635,0,udp,other,SF,147,147,0,0,0,0,...,2,0.01,0.71,0.96,0.00,0.0,0.0,0.0,0.0,normal.
491636,0,udp,other,SF,147,147,0,0,0,0,...,2,0.01,0.70,0.95,0.00,0.0,0.0,0.0,0.0,normal.
491637,0,udp,other,SF,147,147,0,0,0,0,...,1,0.00,0.70,0.94,0.00,0.0,0.0,0.0,0.0,normal.
491769,0,udp,domain_u,SF,34,0,0,0,0,0,...,9,0.06,0.04,0.06,0.00,0.0,0.0,0.0,0.0,normal.


# For non-target features, let's do one hot encoding

In [None]:
# Rename the column currently labelled 'tcp' to a descriptive feature name
df = df.rename(columns = {'tcp' : 'network_protocol_type'})

In [None]:
# One-hot encode the protocol column in place: it is replaced by one
# 'network_protocol_type-<value>' indicator column per distinct protocol
encode_text_dummy(df, 'network_protocol_type')

In [None]:
df

Unnamed: 0,0,http,SF,181,5450,0.1,0.2,0.3,0.4,0.5,...,0.11.1,0.00.7,0.00.8,0.00.9,0.00.10,0.00.11,normal.,network_protocol_type-icmp,network_protocol_type-tcp,network_protocol_type-udp
0,0,http,SF,239,486,0,0,0,0,0,...,0.05,0.00,0.00,0.00,0.0,0.0,normal.,0,1,0
1,0,http,SF,235,1337,0,0,0,0,0,...,0.03,0.00,0.00,0.00,0.0,0.0,normal.,0,1,0
2,0,http,SF,219,1337,0,0,0,0,0,...,0.03,0.00,0.00,0.00,0.0,0.0,normal.,0,1,0
3,0,http,SF,217,2032,0,0,0,0,0,...,0.02,0.00,0.00,0.00,0.0,0.0,normal.,0,1,0
4,0,http,SF,217,2032,0,0,0,0,0,...,0.02,0.00,0.00,0.00,0.0,0.0,normal.,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494015,0,http,SF,310,1881,0,0,0,0,0,...,0.01,0.05,0.00,0.01,0.0,0.0,normal.,0,1,0
494016,0,http,SF,282,2286,0,0,0,0,0,...,0.17,0.05,0.00,0.01,0.0,0.0,normal.,0,1,0
494017,0,http,SF,203,1200,0,0,0,0,0,...,0.06,0.05,0.06,0.01,0.0,0.0,normal.,0,1,0
494018,0,http,SF,291,1200,0,0,0,0,0,...,0.04,0.05,0.04,0.01,0.0,0.0,normal.,0,1,0


In [None]:
df.loc[df['http'] != 'http']

Unnamed: 0,0,http,SF,181,5450,0.1,0.2,0.3,0.4,0.5,...,0.11.1,0.00.7,0.00.8,0.00.9,0.00.10,0.00.11,normal.,network_protocol_type-icmp,network_protocol_type-tcp,network_protocol_type-udp
205,0,smtp,SF,3366,329,0,0,0,0,0,...,0.12,0.00,0.0,0.00,0.0,0.0,normal.,0,1,0
206,0,smtp,SF,1666,328,0,0,0,0,0,...,0.06,0.00,0.0,0.00,0.0,0.0,normal.,0,1,0
207,0,smtp,SF,751,279,0,0,0,0,0,...,0.04,0.00,0.0,0.00,0.0,0.0,normal.,0,1,0
208,0,finger,SF,9,140,0,0,0,0,0,...,0.03,0.00,0.0,0.00,0.0,0.0,normal.,0,1,0
209,0,smtp,SF,620,329,0,0,0,0,0,...,0.02,0.00,0.0,0.00,0.0,0.0,normal.,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491805,0,ftp_data,SF,245,0,0,0,0,0,0,...,0.08,0.00,0.0,0.00,0.0,0.0,normal.,0,1,0
491806,0,ftp_data,SF,8766,0,0,0,0,0,0,...,0.11,0.00,0.0,0.00,0.0,0.0,normal.,0,1,0
491807,1,smtp,SF,1737,329,0,0,0,0,0,...,0.00,0.01,0.0,0.01,0.0,0.0,normal.,0,1,0
491808,22,ftp,SF,319,1054,0,0,0,6,0,...,0.00,0.00,0.0,0.00,0.0,0.0,normal.,0,1,0


# Changing label and encoding for http

In [None]:
# Rename the column currently labelled 'http' to a descriptive feature name
df = df.rename(columns = {'http' : 'application_protocol_type'})

In [None]:
# One-hot encode the service column in place: it is replaced by one
# 'application_protocol_type-<value>' indicator column per distinct service
encode_text_dummy(df, 'application_protocol_type')

In [None]:
df

Unnamed: 0,0,SF,181,5450,0.1,0.2,0.3,0.4,0.5,1,...,application_protocol_type-telnet,application_protocol_type-tftp_u,application_protocol_type-tim_i,application_protocol_type-time,application_protocol_type-urh_i,application_protocol_type-urp_i,application_protocol_type-uucp,application_protocol_type-uucp_path,application_protocol_type-vmnet,application_protocol_type-whois
0,0,SF,239,486,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,SF,235,1337,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,SF,219,1337,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,SF,217,2032,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,SF,217,2032,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494015,0,SF,310,1881,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
494016,0,SF,282,2286,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
494017,0,SF,203,1200,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
494018,0,SF,291,1200,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df.loc[df['SF'] != 'SF']

Unnamed: 0,0,SF,181,5450,0.1,0.2,0.3,0.4,0.5,1,...,application_protocol_type-telnet,application_protocol_type-tftp_u,application_protocol_type-tim_i,application_protocol_type-time,application_protocol_type-urh_i,application_protocol_type-urp_i,application_protocol_type-uucp,application_protocol_type-uucp_path,application_protocol_type-vmnet,application_protocol_type-whois
1230,0,S1,228,29200,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1241,0,S1,212,9156,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1592,0,REJ,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1593,0,REJ,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1594,0,REJ,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493440,0,S1,209,21900,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
493618,0,S0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
493651,0,S2,164,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
493866,0,RSTO,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


# Changing label and encoding for SF

In [None]:
# Rename the column currently labelled 'SF' to a descriptive feature name
df = df.rename(columns = {'SF' : 'connection_state'})

In [None]:
# One-hot encode the connection-state column in place: it is replaced by one
# 'connection_state-<value>' indicator column per distinct state
encode_text_dummy(df, 'connection_state')

In [None]:
df

Unnamed: 0,0,181,5450,0.1,0.2,0.3,0.4,0.5,1,0.6,...,connection_state-REJ,connection_state-RSTO,connection_state-RSTOS0,connection_state-RSTR,connection_state-S0,connection_state-S1,connection_state-S2,connection_state-S3,connection_state-SF,connection_state-SH
0,0,239,486,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,0,235,1337,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,0,219,1337,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,0,217,2032,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,217,2032,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494015,0,310,1881,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
494016,0,282,2286,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
494017,0,203,1200,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
494018,0,291,1200,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [None]:
df.iloc[:, 0:39]

Unnamed: 0,0,181,5450,0.1,0.2,0.3,0.4,0.5,1,0.6,...,9.1,1.00.1,0.00.6,0.11.1,0.00.7,0.00.8,0.00.9,0.00.10,0.00.11,normal.
0,0,239,486,0,0,0,0,0,1,0,...,19,1.0,0.0,0.05,0.00,0.00,0.00,0.0,0.0,normal.
1,0,235,1337,0,0,0,0,0,1,0,...,29,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
2,0,219,1337,0,0,0,0,0,1,0,...,39,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
3,0,217,2032,0,0,0,0,0,1,0,...,49,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal.
4,0,217,2032,0,0,0,0,0,1,0,...,59,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494015,0,310,1881,0,0,0,0,0,1,0,...,255,1.0,0.0,0.01,0.05,0.00,0.01,0.0,0.0,normal.
494016,0,282,2286,0,0,0,0,0,1,0,...,255,1.0,0.0,0.17,0.05,0.00,0.01,0.0,0.0,normal.
494017,0,203,1200,0,0,0,0,0,1,0,...,255,1.0,0.0,0.06,0.05,0.06,0.01,0.0,0.0,normal.
494018,0,291,1200,0,0,0,0,0,1,0,...,255,1.0,0.0,0.04,0.05,0.04,0.01,0.0,0.0,normal.


In [None]:
df.loc[df['normal.'] == 'normal.']

Unnamed: 0,0,181,5450,0.1,0.2,0.3,0.4,0.5,1,0.6,...,connection_state-REJ,connection_state-RSTO,connection_state-RSTOS0,connection_state-RSTR,connection_state-S0,connection_state-S1,connection_state-S2,connection_state-S3,connection_state-SF,connection_state-SH
0,0,239,486,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,0,235,1337,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,0,219,1337,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,0,217,2032,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,217,2032,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494015,0,310,1881,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
494016,0,282,2286,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
494017,0,203,1200,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
494018,0,291,1200,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [None]:
# Select every row whose label is anything other than 'normal.'.
# This rebinds test_df, replacing whatever it previously held.
test_df = df.loc[df['normal.'] != 'normal.']

In [None]:
test_df.iloc[:, 0:39]

Unnamed: 0,0,181,5450,0.1,0.2,0.3,0.4,0.5,1,0.6,...,9.1,1.00.1,0.00.6,0.11.1,0.00.7,0.00.8,0.00.9,0.00.10,0.00.11,normal.
743,184,1511,2957,0,0,0,3,0,1,2,...,3,1.00,0.00,1.00,0.67,0.0,0.00,0.0,0.0,buffer_overflow.
744,305,1735,2766,0,0,0,3,0,1,2,...,4,1.00,0.00,0.50,0.50,0.0,0.00,0.0,0.0,buffer_overflow.
4048,79,281,1301,0,0,0,2,0,1,1,...,10,1.00,0.00,1.00,0.30,0.0,0.00,0.0,0.1,loadmodule.
4112,25,269,2333,0,0,0,0,0,1,0,...,2,0.03,0.06,0.01,0.00,0.0,0.00,0.0,0.0,perl.
7600,0,0,0,0,0,0,0,0,0,0,...,6,1.00,0.00,0.20,0.33,1.0,0.83,0.0,0.0,neptune.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490959,0,28,0,0,3,0,0,0,0,0,...,96,0.38,0.01,0.38,0.00,0.0,0.00,0.0,0.0,teardrop.
490960,0,28,0,0,3,0,0,0,0,0,...,97,0.38,0.01,0.38,0.00,0.0,0.00,0.0,0.0,teardrop.
490961,0,28,0,0,3,0,0,0,0,0,...,98,0.38,0.01,0.38,0.00,0.0,0.00,0.0,0.0,teardrop.
490962,0,28,0,0,3,0,0,0,0,0,...,99,0.39,0.01,0.39,0.00,0.0,0.00,0.0,0.0,teardrop.


In [None]:
# Rename the label column (currently labelled 'normal.') to 'attack_type'
df = df.rename(columns = {'normal.' : 'attack_type'})

In [None]:
df

Unnamed: 0,0,181,5450,0.1,0.2,0.3,0.4,0.5,1,0.6,...,connection_state-REJ,connection_state-RSTO,connection_state-RSTOS0,connection_state-RSTR,connection_state-S0,connection_state-S1,connection_state-S2,connection_state-S3,connection_state-SF,connection_state-SH
0,0,239,486,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,0,235,1337,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,0,219,1337,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,0,217,2032,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,217,2032,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494015,0,310,1881,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
494016,0,282,2286,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
494017,0,203,1200,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
494018,0,291,1200,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
