In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [15]:
def remove_dump_values(data, cols):
    """Replace the '-' placeholder with the string 'None' in the given columns.

    Args:
        data: DataFrame holding the columns to clean.
        cols: iterable of column labels to process.

    Returns:
        A new DataFrame with the replacement applied to every column in
        ``cols``; all other columns are carried over as they are. The frame
        passed in is not modified.
    """
    # Copy first: assigning columns on the argument itself would silently
    # rewrite the caller's frame and make re-running a cell non-idempotent.
    cleaned = data.copy()
    for col in cols:
        cleaned[col] = np.where(cleaned[col] == '-', 'None', cleaned[col])
    return cleaned

def normalize(data):
    """Shift values by their minimum and divide by the population std.

    Computes ``(data - min) / std`` with ``ddof=0``. Note that this is neither
    plain min-max scaling nor a z-score: the minimum is subtracted but the
    standard deviation is used as the scale.

    Args:
        data: a DataFrame (statistics are taken per column) or any other
            array-like accepted by ``np.min`` / ``np.std``.

    Returns:
        Object of the same kind as ``data`` holding the rescaled values.
        A column with zero spread yields NaN/inf because of the division.
    """
    if isinstance(data, pd.DataFrame):
        # np.min / np.std forward axis=None to the DataFrame methods, and
        # pandas >= 2.0 reduces min over both axes in that case, returning one
        # scalar for the whole frame. Pin axis=0 so each column is scaled with
        # its own statistics; ddof=0 is what np.std passes.
        return (data - data.min(axis=0)) / data.std(axis=0, ddof=0)
    return (data - np.min(data)) / np.std(data)

def preprocess(data):
    """Clean, one-hot encode and rescale a raw UNSW-NB15 frame.

    Steps, in order:
      1. replace the '-' placeholder in every object column
         (``remove_dump_values``);
      2. one-hot encode every object column except ``attack_cat``;
      3. cast every numeric column except ``id`` and ``label`` to float and
         rescale it with ``normalize``;
      4. move ``attack_cat`` and ``label`` to the end, so the layout is
         ``id`` + numeric features, one-hot features, then the two targets.

    The scaling statistics are computed from the frame that is passed in, so
    whatever rows it contains all contribute to them.

    Args:
        data: DataFrame with an object column ``attack_cat`` and numeric
            columns ``id`` and ``label``.

    Returns:
        A new DataFrame; ``data`` itself is left unmodified.

    Raises:
        KeyError: if ``attack_cat`` is not among the object columns, or
            ``id`` / ``label`` are not among the numeric columns.
    """
    target_cols = ['attack_cat', 'label']

    cols_cat = data.select_dtypes('object').columns
    # Public replacement for the private DataFrame._get_numeric_data().
    cols_numeric = data.select_dtypes(include=['number', 'bool']).columns

    # remove dump value -- on a copy, so the caller's frame is never modified
    data_bin = remove_dump_values(data.copy(), cols_cat)

    # remove unnecessary features
    cols_cat = cols_cat.drop(['attack_cat'])
    cols_numeric = cols_numeric.drop(['id', 'label'])

    # one hot encoding category feature; dtype is pinned so the indicators are
    # 0/1 integers even on pandas versions whose default dummy dtype is bool
    data_bin_hot = pd.get_dummies(data_bin, columns=cols_cat, dtype=np.uint8)

    # normalize numeric features
    data_bin_hot[cols_numeric] = data_bin_hot[cols_numeric].astype('float')
    data_bin_hot[cols_numeric] = normalize(data_bin_hot[cols_numeric])

    # Put the targets last by name rather than by hard-coded position, so the
    # ordering stays correct if the number of input columns ever changes.
    feature_cols = [col for col in data_bin_hot.columns if col not in target_cols]
    return data_bin_hot[feature_cols + target_cols]

In [16]:
# Read both raw UNSW-NB15 CSV files, then show their shapes as the cell output.
raw_tr, raw_tt = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/raw/UNSW_NB15_training-set.csv',
        '../../data/raw/UNSW_NB15_testing-set.csv',
    )
)
raw_tr.shape, raw_tt.shape

((82332, 45), (175341, 45))

In [17]:
# proto values that occur in raw_tt but never in raw_tr
set(raw_tt['proto'].unique()).difference(raw_tr['proto'].unique())

{'icmp', 'rtp'}

In [18]:
# state values that occur in raw_tt but never in raw_tr
set(raw_tt['state'].unique()).difference(raw_tr['state'].unique())

{'ECO', 'PAR', 'URN', 'no'}

In [19]:
# Stack both raw frames into one and renumber the rows 0..n-1.
raw_data = pd.concat([raw_tr, raw_tt], ignore_index=True)

In [20]:
# Run the preprocessing pipeline on the combined frame, then display the
# resulting (rows, columns) shape as the cell output.
processed_data = preprocess(raw_data)
processed_data.shape

(257673, 199)

In [21]:
# Preview the processed frame as the cell output.
processed_data

Unnamed: 0,id,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,...,state_ECO,state_FIN,state_INT,state_PAR,state_REQ,state_RST,state_URN,state_no,attack_cat,label
0,1,1.841222e-06,0.007356,0.000000,0.002716,0.000000,0.566962,2.478337,0.000000,0.971102,...,0,0,1,0,0,0,0,0,Normal,0
1,2,1.339070e-06,0.007356,0.000000,0.010002,0.000000,0.779572,2.478337,0.000000,4.743422,...,0,0,1,0,0,0,0,0,Normal,0
2,3,8.369190e-07,0.007356,0.000000,0.006008,0.000000,1.247316,2.478337,0.000000,4.600204,...,0,0,1,0,0,0,0,0,Normal,0
3,4,1.004303e-06,0.007356,0.000000,0.005041,0.000000,1.039430,2.478337,0.000000,3.230480,...,0,0,1,0,0,0,0,0,Normal,0
4,5,1.673838e-06,0.007356,0.000000,0.012096,0.000000,0.623658,2.478337,0.000000,4.578667,...,0,0,1,0,0,0,0,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257668,175337,1.506454e-06,0.007356,0.000000,0.000518,0.000000,0.692953,2.478337,0.000000,0.272796,...,0,0,1,0,0,0,0,0,Generic,1
257669,175338,8.465637e-02,0.066202,0.071438,0.003430,0.002421,0.000210,2.478337,2.234797,0.000048,...,0,1,0,0,0,0,0,0,Shellcode,1
257670,175339,1.506454e-06,0.007356,0.000000,0.000518,0.000000,0.692953,2.478337,0.000000,0.272796,...,0,0,1,0,0,0,0,0,Generic,1
257671,175340,1.506454e-06,0.007356,0.000000,0.000518,0.000000,0.692953,2.478337,0.000000,0.272796,...,0,0,1,0,0,0,0,0,Generic,1


In [22]:
# Hold out 20 % as the test set, then split the remainder again with the same
# ratio and seed to obtain the validation set (about 64 % / 16 % / 20 % overall).
# NOTE(review): neither split passes stratify=, so class proportions of
# attack_cat are not guaranteed to match across the three sets.
split_opts = {'test_size': 0.2, 'random_state': 88}
tr, tt = train_test_split(processed_data, **split_opts)
tr, val = train_test_split(tr, **split_opts)
tr.shape, val.shape, tt.shape

((164910, 199), (41228, 199), (51535, 199))

In [23]:
# Write each split to its CSV file without the index column.
split_outputs = {
    '../../data/attack_classification/training-set.csv': tr,
    '../../data/attack_classification/validation-set.csv': val,
    '../../data/attack_classification/testing-set.csv': tt,
}
for out_path, split_frame in split_outputs.items():
    split_frame.to_csv(out_path, index=False)