In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [25]:
def remove_dump_values(data, cols):
    """Replace the ``'-'`` placeholder with the string ``'None'`` in ``cols``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified; the replacement is done on a copy.
    cols : iterable of column labels
        Columns of ``data`` to scan for the ``'-'`` placeholder.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` in which every cell of ``cols`` equal to ``'-'``
        holds ``'None'``. Columns without any ``'-'`` keep their dtype.
    """
    cleaned = data.copy()
    for col in cols:
        is_placeholder = cleaned[col] == '-'
        # Only write when something matches: rebuilding the whole column via
        # np.where(..., 'None', column) would promote numeric columns to
        # strings even though they can never contain '-'.
        if is_placeholder.any():
            cleaned.loc[is_placeholder, col] = 'None'
    return cleaned


def preprocess(data):
    """Clean, one-hot encode and rescale a raw record frame.

    Steps:
      1. Replace the ``'-'`` placeholder with ``'None'`` in the object
         (text) columns.
      2. One-hot encode every object column except ``attack_cat``.
      3. Cast every numeric column except ``id`` and ``label`` to float and
         rescale it column-wise as ``(x - min) / std`` (population std).

    Parameters
    ----------
    data : pandas.DataFrame
        Raw records. The caller's frame is left untouched; all work is done
        on a copy.

    Returns
    -------
    pandas.DataFrame
        Encoded and rescaled frame. ``id``, ``label`` and ``attack_cat`` are
        passed through without encoding or scaling when present.
    """
    text_cols = data.select_dtypes('object').columns
    # Public equivalent of the private ``DataFrame._get_numeric_data``.
    numeric_cols = data.select_dtypes(include=['number', 'bool']).columns

    # Replace the placeholder value. Only text columns can hold '-', and
    # restricting the scan keeps numeric columns (notably 'id' and 'label')
    # from being turned into strings. A copy is passed so the caller's frame
    # is never mutated.
    data_bin = remove_dump_values(data.copy(), text_cols)

    # Columns that must be neither encoded nor scaled. errors='ignore' lets
    # the function run on frames that lack some of them.
    cols_cat = text_cols.drop(['attack_cat'], errors='ignore')
    cols_numeric = numeric_cols.drop(['id', 'label'], errors='ignore')

    # One-hot encode the categorical features.
    data_bin_hot = pd.get_dummies(data_bin, columns=cols_cat)

    # Rescale the numeric features.
    if len(cols_numeric) > 0:
        numeric = data_bin_hot[cols_numeric].astype('float')
        # axis=0 is spelled out: np.min(df) forwards axis=None, which newer
        # pandas interprets as "reduce over the whole frame" and would yield
        # a single global minimum instead of one per column.
        col_min = numeric.min(axis=0)
        # ddof=0 matches np.std's default (population standard deviation).
        col_std = numeric.std(axis=0, ddof=0)
        # Constant columns have std == 0; divide by 1 so they become 0.0
        # instead of NaN.
        col_std = col_std.mask(col_std == 0, 1.0)
        data_bin_hot[cols_numeric] = (numeric - col_min) / col_std

    return data_bin_hot

In [2]:
# Load the two raw UNSW-NB15 CSV files and show their shapes.
raw_tr = pd.read_csv('../data/raw/UNSW_NB15_training-set.csv')
raw_tt = pd.read_csv('../data/raw/UNSW_NB15_testing-set.csv')
# The trailing expression was fused into `raw_tt.shaperaw_tr`, which is not a
# DataFrame attribute; the cell is meant to display the pair of shapes.
raw_tr.shape, raw_tt.shape

((82332, 45), (175341, 45))

In [16]:
# 'proto' values that occur in raw_tt but never in raw_tr.
set(raw_tt['proto'].unique()).difference(raw_tr['proto'].unique())

{'icmp', 'rtp'}

In [17]:
# 'state' values that occur in raw_tt but never in raw_tr.
set(raw_tt['state'].unique()).difference(raw_tr['state'].unique())

{'ECO', 'PAR', 'URN', 'no'}

In [27]:
# Stack both raw files into one frame with a fresh 0..n-1 index, then run the
# shared preprocessing once so both get the same one-hot columns.
raw_data = pd.concat([raw_tr, raw_tt], ignore_index=True)
processed_data = preprocess(raw_data)
processed_data.shape

(257673, 199)

In [28]:
# Hold out 20% of all rows as the test set, then 20% of the remainder as the
# validation set. Both splits use the same fixed seed.
train_val, tt = train_test_split(processed_data, test_size=0.2, random_state=88)
tr, val = train_test_split(train_val, test_size=0.2, random_state=88)
tr.shape, val.shape, tt.shape

((164910, 199), (41228, 199), (51535, 199))

In [29]:
# Write each split to its CSV file, without the index column.
for split_frame, split_path in (
    (tr, '../data/processed/training-set.csv'),
    (val, '../data/processed/validation-set.csv'),
    (tt, '../data/processed/testing-set.csv'),
):
    split_frame.to_csv(split_path, index=False)