In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2

In [2]:
def remove_dump_values(data, cols):
    """Replace the '-' placeholder with the string 'None' in the given columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is modified in place.
    cols : iterable of column labels
        Columns to scan for the '-' placeholder.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenience.

    Notes
    -----
    Columns that do not contain '-' are left untouched. Rewriting every
    column through ``np.where(..., 'None', column)`` would coerce numeric
    columns to strings, because the result has to hold the 'None' literal.
    """
    for col in cols:
        is_placeholder = data[col] == '-'
        # Only rewrite columns that really contain the placeholder so that
        # numeric columns keep their dtype.
        if is_placeholder.any():
            data[col] = np.where(is_placeholder, 'None', data[col])
    return data


def preprocess(data):
    """Z-score normalise the numeric columns of ``data``.

    Every numeric (and boolean) column is cast to float and rescaled to
    ``(value - column mean) / column std`` using the population standard
    deviation (``ddof=0``). Non-numeric columns are passed through unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is NOT modified; the work is done on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``data``.

    Notes
    -----
    * The reductions use an explicit ``axis=0``. Calling ``np.mean`` on a
      DataFrame forwards ``axis=None``, which recent pandas versions reduce
      over both axes to one scalar instead of per-column statistics.
    * Columns with zero variance are mapped to 0.0 rather than dividing by
      zero.
    """
    out = data.copy()
    numeric_cols = out.select_dtypes(include=['number', 'bool']).columns
    if len(numeric_cols) == 0:
        return out

    numeric = out[numeric_cols].astype('float')
    col_mean = numeric.mean(axis=0)
    col_std = numeric.std(axis=0, ddof=0)
    # A constant column has std == 0; divide by 1 so it becomes all zeros.
    col_std = col_std.where(col_std != 0, 1.0)

    out[numeric_cols] = (numeric - col_mean) / col_std
    return out

In [3]:
# Load the two raw UNSW-NB15 partitions and show their shapes.
raw_tr, raw_tt = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/raw/UNSW_NB15_training-set.csv',
        '../../data/raw/UNSW_NB15_testing-set.csv',
    )
)
raw_tr.shape, raw_tt.shape

((82332, 45), (175341, 45))

In [4]:
# 'proto' values that occur in the testing file but not in the training file
set(raw_tt['proto'].unique()).difference(raw_tr['proto'].unique())

{'icmp', 'rtp'}

In [5]:
# 'state' values that occur in the testing file but not in the training file
set(raw_tt['state'].unique()).difference(raw_tr['state'].unique())

{'ECO', 'PAR', 'URN', 'no'}

In [6]:
# Stack both partitions into one frame with a fresh 0..n-1 index
raw_data = pd.concat([raw_tr, raw_tt], ignore_index=True)

In [7]:
# Column names of the combined dataset
cols = raw_data.columns

colNum = cols.to_numpy()
colSet = set(colNum)

colSet

{'ackdat',
 'attack_cat',
 'ct_dst_ltm',
 'ct_dst_sport_ltm',
 'ct_dst_src_ltm',
 'ct_flw_http_mthd',
 'ct_ftp_cmd',
 'ct_src_dport_ltm',
 'ct_src_ltm',
 'ct_srv_dst',
 'ct_srv_src',
 'ct_state_ttl',
 'dbytes',
 'dinpkt',
 'djit',
 'dload',
 'dloss',
 'dmean',
 'dpkts',
 'dtcpb',
 'dttl',
 'dur',
 'dwin',
 'id',
 'is_ftp_login',
 'is_sm_ips_ports',
 'label',
 'proto',
 'rate',
 'response_body_len',
 'sbytes',
 'service',
 'sinpkt',
 'sjit',
 'sload',
 'sloss',
 'smean',
 'spkts',
 'state',
 'stcpb',
 'sttl',
 'swin',
 'synack',
 'tcprtt',
 'trans_depth'}

In [8]:
def processFeatureData(data):
    """Return a copy of ``data`` with its categorical columns integer-coded.

    The 'proto', 'service' and 'state' columns are replaced by their
    ``pd.Categorical`` codes, stored as float64. All other columns, and the
    input frame itself, are left unchanged.
    """
    encoded = data.copy()
    for column in ('proto', 'service', 'state'):
        codes = pd.Categorical(data[column]).codes
        encoded[column] = codes.astype(np.float64)
    return encoded

In [9]:
# Split into features (x) and labels (y).
# The identifier column and both target columns are excluded from x.
x = processFeatureData(raw_data.drop(columns=['id', 'attack_cat', 'label']))
y = pd.Categorical(raw_data['attack_cat']).codes.astype(np.float64)

x.shape, y.shape

((257673, 42), (257673,))

In [10]:
# Univariate selector: chi-squared scores, keep the top 30 percent
selected = SelectPercentile(score_func=chi2, percentile=30)

In [11]:
# Keep only the top 30% best-scoring features
selected.fit(x, y)
x_new = selected.transform(x)

x_new.shape

(257673, 13)

In [12]:
# Names of the features that made the top-30% cut
selectedColumn = selected.get_feature_names_out()

selectedColumnSet = {feature_name for feature_name in selectedColumn}

selectedColumnSet

{'dbytes',
 'dinpkt',
 'djit',
 'dload',
 'dmean',
 'dtcpb',
 'rate',
 'response_body_len',
 'sbytes',
 'sinpkt',
 'sjit',
 'sload',
 'stcpb'}

In [13]:
# Every column that the selector did not keep
removeColumnSet = colSet.difference(selectedColumnSet)

removeColumnList = [*removeColumnSet]

removeColumnSet

{'ackdat',
 'attack_cat',
 'ct_dst_ltm',
 'ct_dst_sport_ltm',
 'ct_dst_src_ltm',
 'ct_flw_http_mthd',
 'ct_ftp_cmd',
 'ct_src_dport_ltm',
 'ct_src_ltm',
 'ct_srv_dst',
 'ct_srv_src',
 'ct_state_ttl',
 'dloss',
 'dpkts',
 'dttl',
 'dur',
 'dwin',
 'id',
 'is_ftp_login',
 'is_sm_ips_ports',
 'label',
 'proto',
 'service',
 'sloss',
 'smean',
 'spkts',
 'state',
 'sttl',
 'swin',
 'synack',
 'tcprtt',
 'trans_depth'}

In [14]:
# NOTE: despite the variable's name, 'attack_cat' is taken OUT of the removal
# set here, so the set holds the columns to drop *excluding* 'attack_cat'.
removeColumnSetWithAttackCat = removeColumnSet.difference({'attack_cat'})

removeColumnSetWithAttackCat

{'ackdat',
 'ct_dst_ltm',
 'ct_dst_sport_ltm',
 'ct_dst_src_ltm',
 'ct_flw_http_mthd',
 'ct_ftp_cmd',
 'ct_src_dport_ltm',
 'ct_src_ltm',
 'ct_srv_dst',
 'ct_srv_src',
 'ct_state_ttl',
 'dloss',
 'dpkts',
 'dttl',
 'dur',
 'dwin',
 'id',
 'is_ftp_login',
 'is_sm_ips_ports',
 'label',
 'proto',
 'service',
 'sloss',
 'smean',
 'spkts',
 'state',
 'sttl',
 'swin',
 'synack',
 'tcprtt',
 'trans_depth'}

In [15]:
# Drop the rejected columns; what remains is the selected features plus 'attack_cat'
new_raw_data = raw_data.drop(columns=list(removeColumnSetWithAttackCat))

new_raw_data

Unnamed: 0,sbytes,dbytes,rate,sload,dload,sinpkt,dinpkt,sjit,djit,stcpb,dtcpb,dmean,response_body_len,attack_cat
0,496,0,90909.090200,1.803636e+08,0.000000,0.011000,0.00000,0.000000,0.000000,0,0,0,0,Normal
1,1762,0,125000.000300,8.810000e+08,0.000000,0.008000,0.00000,0.000000,0.000000,0,0,0,0,Normal
2,1068,0,200000.005100,8.544000e+08,0.000000,0.005000,0.00000,0.000000,0.000000,0,0,0,0,Normal
3,900,0,166666.660800,6.000000e+08,0.000000,0.006000,0.00000,0.000000,0.000000,0,0,0,0,Normal
4,2126,0,100000.002500,8.504000e+08,0.000000,0.010000,0.00000,0.000000,0.000000,0,0,0,0,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257668,114,0,111111.107200,5.066666e+07,0.000000,0.009000,0.00000,0.000000,0.000000,0,0,0,0,Generic
257669,620,354,33.612649,8.826286e+03,4903.492188,54.400111,66.98057,3721.068786,120.177727,3518776216,3453092386,44,0,Shellcode
257670,114,0,111111.107200,5.066666e+07,0.000000,0.009000,0.00000,0.000000,0.000000,0,0,0,0,Generic
257671,114,0,111111.107200,5.066666e+07,0.000000,0.009000,0.00000,0.000000,0.000000,0,0,0,0,Generic


In [16]:
# Column labels grouped by dtype: object-typed vs. numeric
cols_cat = new_raw_data.select_dtypes(include=['object']).columns
cols_numeric = new_raw_data._get_numeric_data().columns

(cols_cat, cols_numeric)

(Index(['attack_cat'], dtype='object'),
 Index(['sbytes', 'dbytes', 'rate', 'sload', 'dload', 'sinpkt', 'dinpkt',
        'sjit', 'djit', 'stcpb', 'dtcpb', 'dmean', 'response_body_len'],
       dtype='object'))

In [17]:
# Normalise a copy: `preprocess` assigns into the frame it receives, and
# `new_raw_data` should keep the raw values shown in the earlier cell so that
# re-running cells out of order does not operate on already-scaled data.
processed_data = preprocess(new_raw_data.copy())
processed_data.shape

(257673, 14)

In [18]:
# Display the frame returned by preprocess() for a visual sanity check
processed_data

Unnamed: 0,sbytes,dbytes,rate,sload,dload,sinpkt,dinpkt,sjit,djit,stcpb,dtcpb,dmean,response_body_len,attack_cat
0,-0.046480,-0.098409,-0.002151,0.590935,-0.272850,-0.131793,-0.090412,-0.110522,-0.148150,-0.735580,-0.734888,-0.478859,-0.039675,Normal
1,-0.039194,-0.098409,0.210460,4.363255,-0.272850,-0.131793,-0.090412,-0.110522,-0.148150,-0.735580,-0.734888,-0.478859,-0.039675,Normal
2,-0.043188,-0.098409,0.678204,4.220037,-0.272850,-0.131794,-0.090412,-0.110522,-0.148150,-0.735580,-0.734888,-0.478859,-0.039675,Normal
3,-0.044155,-0.098409,0.470318,2.850314,-0.272850,-0.131794,-0.090412,-0.110522,-0.148150,-0.735580,-0.734888,-0.478859,-0.039675,Normal
4,-0.037100,-0.098409,0.054546,4.198501,-0.272850,-0.131793,-0.090412,-0.110522,-0.148150,-0.735580,-0.734888,-0.478859,-0.039675,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257668,-0.048678,-0.098409,0.123841,-0.107371,-0.272850,-0.131793,-0.090412,-0.110522,-0.148150,-0.735580,-0.734888,-0.478859,-0.039675,Generic
257669,-0.045766,-0.095988,-0.568903,-0.380119,-0.270817,-0.123936,-0.029190,-0.034635,-0.117572,1.837016,1.796938,-0.305659,-0.039675,Shellcode
257670,-0.048678,-0.098409,0.123841,-0.107371,-0.272850,-0.131793,-0.090412,-0.110522,-0.148150,-0.735580,-0.734888,-0.478859,-0.039675,Generic
257671,-0.048678,-0.098409,0.123841,-0.107371,-0.272850,-0.131793,-0.090412,-0.110522,-0.148150,-0.735580,-0.734888,-0.478859,-0.039675,Generic


In [19]:
# Two-stage split with a fixed seed: hold out 20% for testing, then hold out
# 20% of the remainder for validation.
split_opts = dict(test_size=0.2, random_state=88)
tr, tt = train_test_split(processed_data, **split_opts)
tr, val = train_test_split(tr, **split_opts)
tr.shape, val.shape, tt.shape

((164910, 14), (41228, 14), (51535, 14))

In [33]:
# Write the three splits to disk without the index column
for split_frame, csv_path in (
    (tr, '../../data/attack_classification/training-set.csv'),
    (val, '../../data/attack_classification/validation-set.csv'),
    (tt, '../../data/attack_classification/testing-set.csv'),
):
    split_frame.to_csv(csv_path, index=False)