In [1]:
import numpy as np
import pandas as pd
import pickle as pickle
import functions_NN

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)

In [2]:
# pull in train + test + features
# train and test are incorrectly labelled but for consitency I will stick with it
df_test = pd.read_csv('../../data/usnw_nb15/test.csv', index_col = 0)
df_train = pd.read_csv('../../data/usnw_nb15/train.csv', index_col = 0)
features = pd.read_csv('../../data/usnw_nb15/features.csv', encoding='cp1252')

In [3]:
# concat train and test together
df_concat = pd.concat([df_train, df_test], axis = 0).reset_index(drop=True)

In [4]:
# take out attack_cat and label so it does not get one hot encoded
df_train_noGT, train_GT = functions_NN.remove_ground_truth(df_train)
df_concat_noGT, concat_GT = functions_NN.remove_ground_truth(df_concat)
print(features)

    No.              Name      Type   \
0     1             srcip    nominal   
1     2             sport    integer   
2     3             dstip    nominal   
3     4            dsport    integer   
4     5             proto    nominal   
5     6             state    nominal   
6     7               dur      Float   
7     8            sbytes    Integer   
8     9            dbytes    Integer   
9    10              sttl    Integer   
10   11              dttl    Integer   
11   12             sloss    Integer   
12   13             dloss    Integer   
13   14           service    nominal   
14   15             Sload      Float   
15   16             Dload      Float   
16   17             Spkts    integer   
17   18             Dpkts    integer   
18   19              swin    integer   
19   20              dwin    integer   
20   21             stcpb    integer   
21   22             dtcpb    integer   
22   23           smeansz    integer   
23   24           dmeansz    integer   


In [5]:
# now we have 4 dfs
# first process train and concat dfs in preprocess() and pass in features
df_train_processed = functions_NN.preprocess(df_train_noGT, features)
df_train_nn = df_train_processed
df_concat_preprocessed = functions_NN.preprocess(df_concat_noGT, features)

In [6]:
# next, normalize the preprocessed data
# leaving nn out makes it not normalized ('nn')
df_train_normalized = functions_NN.normalization(df_train_processed, df_train_processed.columns)
df_concat_normalized = functions_NN.normalization(df_concat_preprocessed, df_concat_preprocessed.columns)

In [7]:
# now add the GT back into the dfs so that we can separate the normals from the attacks
# 82k normalized:
df_train_normalized_GT = pd.concat([df_train_normalized, train_GT], axis = 1)

# 82k not normalized
df_train_nn_GT = pd.concat([df_train_nn, train_GT], axis = 1)

# 250k normalized:
df_concat_normalized_GT = pd.concat([df_concat_normalized, concat_GT], axis = 1)

In [8]:
# isolate the normal traffic from the attack traffic
# 82k normalized:
df_train_normal, df_train_attack = functions_NN.separate_traffic_cats(df_train_normalized_GT)

# 82k not normalized:
df_train_normal_nn, df_train_attack_nn = functions_NN.separate_traffic_cats(df_train_nn_GT)

# 250k normalized
df_concat_normal, df_concat_attack = functions_NN.separate_traffic_cats(df_concat_normalized_GT)

In [9]:
# # map HDBSCAN into a new column attack_cat_bc using hdbscan_df_transformer

print('df_concat_normal shape: ' + str(df_concat_normal.shape))
df_concat_normal_500_10000 = df_train # ! POSSIBLY REVERT LATER
df_concat_normal_500_10000.reset_index(inplace=True, drop=True)
# # in the df_train_attack df, duplicate 'attack_cat' into the new column attack_cat_bc
df_concat_attack_500_10000 = df_concat_attack.copy()
df_concat_attack_500_10000['attack_cat_bc_500_10000'] = df_concat_attack_500_10000['attack_cat']
df_concat_attack_500_10000.reset_index(inplace=True, drop=True)
print('df_concat_normal_500_10000 shape: ' + str(df_concat_normal_500_10000.shape))
df_concat_normal_500_10000.head(500)

df_concat_normal shape: (93000, 197)
df_concat_normal_500_10000 shape: (82332, 42)


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
0,0.000011,udp,,INT,2,0,496,0,90909.090200,254,0,1.803636e+08,0.0000,0,0,0.011000,0.000000,0.000000,0.000000,0,0,0,0,0.000000,0.000000,0.000000,248,0,0,0,2,2,1,1,1,2,0,0,0,1,2,0
1,0.000008,udp,,INT,2,0,1762,0,125000.000300,254,0,8.810000e+08,0.0000,0,0,0.008000,0.000000,0.000000,0.000000,0,0,0,0,0.000000,0.000000,0.000000,881,0,0,0,2,2,1,1,1,2,0,0,0,1,2,0
2,0.000005,udp,,INT,2,0,1068,0,200000.005100,254,0,8.544000e+08,0.0000,0,0,0.005000,0.000000,0.000000,0.000000,0,0,0,0,0.000000,0.000000,0.000000,534,0,0,0,3,2,1,1,1,3,0,0,0,1,3,0
3,0.000006,udp,,INT,2,0,900,0,166666.660800,254,0,6.000000e+08,0.0000,0,0,0.006000,0.000000,0.000000,0.000000,0,0,0,0,0.000000,0.000000,0.000000,450,0,0,0,3,2,2,2,1,3,0,0,0,2,3,0
4,0.000010,udp,,INT,2,0,2126,0,100000.002500,254,0,8.504000e+08,0.0000,0,0,0.010000,0.000000,0.000000,0.000000,0,0,0,0,0.000000,0.000000,0.000000,1063,0,0,0,3,2,2,2,1,3,0,0,0,2,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.000003,pup,,INT,2,0,180,0,333333.321500,254,0,2.400000e+08,0.0000,0,0,0.003000,0.000000,0.000000,0.000000,0,0,0,0,0.000000,0.000000,0.000000,90,0,0,0,10,2,10,10,10,10,0,0,0,10,10,0
496,0.000015,udp,,INT,2,0,168,0,66666.668350,254,0,4.480000e+07,0.0000,0,0,0.015000,0.000000,0.000000,0.000000,0,0,0,0,0.000000,0.000000,0.000000,84,0,0,0,1,2,1,1,1,1,0,0,0,1,1,0
497,0.000003,pup,,INT,2,0,180,0,333333.321500,254,0,2.400000e+08,0.0000,0,0,0.003000,0.000000,0.000000,0.000000,0,0,0,0,0.000000,0.000000,0.000000,90,0,0,0,10,2,10,10,10,10,0,0,0,10,10,0
498,0.781325,tcp,,FIN,22,44,1186,40540,83.192015,62,252,1.160081e+04,405659.6250,6,20,37.205952,17.981908,1843.796052,1321.553845,255,293677215,1819468955,255,0.050492,0.007112,0.043380,54,921,0,0,1,1,1,1,1,1,0,0,0,2,1,0


In [10]:
# # concat the normal traffic and attack traffic back together
# # 82k normalized:
train_concat = pd.concat([df_train_normal, df_train_attack], axis = 0).reset_index(drop=True)

# # 82k not normalized:
train_concat_nn = pd.concat([df_train_normal_nn, df_train_attack_nn], axis = 0).reset_index(drop=True)

# # 250k normalized 100/500:
df_concat = pd.concat([df_concat_normal, df_concat_attack], axis = 0).reset_index(drop=True)

# # 250k normalized 250/5000:
# df_concat_250_5000 = pd.concat([df_concat_normal_250_5000, df_concat_attack], axis = 0).reset_index(drop=True)

# 250k normalized 500/10000:
df_concat_500_10000 = pd.concat([df_concat_normal_500_10000, df_concat_attack_500_10000], axis = 0).reset_index(drop=True)

In [11]:
df_concat_500_10000.head(500)

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,proto_3pc,proto_a/n,proto_aes-sp3-d,proto_any,proto_argus,proto_aris,proto_arp,proto_ax.25,proto_bbn-rcc,proto_bna,proto_br-sat-mon,proto_cbt,proto_cftp,proto_chaos,proto_compaq-peer,proto_cphb,proto_cpnx,proto_crtp,proto_crudp,proto_dcn,proto_ddp,proto_ddx,proto_dgp,proto_egp,proto_eigrp,proto_emcon,proto_encap,proto_etherip,proto_fc,proto_fire,proto_ggp,proto_gmtp,proto_gre,proto_hmp,proto_i-nlsp,proto_iatp,proto_ib,proto_icmp,proto_idpr,proto_idpr-cmtp,proto_idrp,proto_ifmp,proto_igmp,proto_igp,proto_il,proto_ip,proto_ipcomp,proto_ipcv,proto_ipip,proto_iplt,proto_ipnip,proto_ippc,proto_ipv6,proto_ipv6-frag,proto_ipv6-no,proto_ipv6-opts,proto_ipv6-route,proto_ipx-n-ip,proto_irtp,proto_isis,proto_iso-ip,proto_iso-tp4,proto_kryptolan,proto_l2tp,proto_larp,proto_leaf-1,proto_leaf-2,proto_merit-inp,proto_mfe-nsp,proto_mhrp,proto_micp,proto_mobile,proto_mtp,proto_mux,proto_narp,proto_netblt,proto_nsfnet-igp,proto_nvp,proto_ospf,proto_pgm,proto_pim,proto_pipe,proto_pnni,proto_pri-enc,proto_prm,proto_ptp,proto_pup,proto_pvp,proto_qnx,proto_rdp,proto_rsvp,proto_rtp,proto_rvd,proto_sat-expak,proto_sat-mon,proto_sccopmce,proto_scps,proto_sctp,proto_sdrp,proto_secure-vmtp,proto_sep,proto_skip,proto_sm,proto_smp,proto_snp,proto_sprite-rpc,proto_sps,proto_srp,proto_st2,proto_stp,proto_sun-nd,proto_swipe,proto_tcf,proto_tcp,proto_tlsp,proto_tp++,proto_trunk-1,proto_trunk-2,proto_ttp,proto_udp,proto_unas,proto_uti,proto_vines,proto_visa,proto_vmtp,proto_vrrp,proto_wb-expak,proto_wb-mon,proto_wsn,proto_xnet,proto_xns-idp,proto_xtp,proto_zero,service_dhcp,service_dns,service_ftp,service_ftp-data,service_http,service_irc,service_pop3,service_radius,service_smtp,service_snmp,service_ssh,service_ssl,state_ACC,state_CLO,state_CON,state_ECO,state_FIN,state_INT,state_PAR,state_REQ,state_RST,state_URN,state_no,attack_cat,label,attack_cat_bc_500_10000
0,0.000011,udp,,INT,2.0,0.0,496.0,0.0,90909.090200,254.0,0.0,1.803636e+08,0.0000,0.0,0.0,0.011000,0.000000,0.000000,0.000000,0.0,0.0,0.000000e+00,0.0,0.000000,0.000000,0.000000,248.0,0.0,0.0,0.0,2.0,2.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,1.0,2.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,0.000008,udp,,INT,2.0,0.0,1762.0,0.0,125000.000300,254.0,0.0,8.810000e+08,0.0000,0.0,0.0,0.008000,0.000000,0.000000,0.000000,0.0,0.0,0.000000e+00,0.0,0.000000,0.000000,0.000000,881.0,0.0,0.0,0.0,2.0,2.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,1.0,2.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,0.000005,udp,,INT,2.0,0.0,1068.0,0.0,200000.005100,254.0,0.0,8.544000e+08,0.0000,0.0,0.0,0.005000,0.000000,0.000000,0.000000,0.0,0.0,0.000000e+00,0.0,0.000000,0.000000,0.000000,534.0,0.0,0.0,0.0,3.0,2.0,1.0,1.0,1.0,3.0,0.0,0.0,0.0,1.0,3.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,0.000006,udp,,INT,2.0,0.0,900.0,0.0,166666.660800,254.0,0.0,6.000000e+08,0.0000,0.0,0.0,0.006000,0.000000,0.000000,0.000000,0.0,0.0,0.000000e+00,0.0,0.000000,0.000000,0.000000,450.0,0.0,0.0,0.0,3.0,2.0,2.0,2.0,1.0,3.0,0.0,0.0,0.0,2.0,3.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,0.000010,udp,,INT,2.0,0.0,2126.0,0.0,100000.002500,254.0,0.0,8.504000e+08,0.0000,0.0,0.0,0.010000,0.000000,0.000000,0.000000,0.0,0.0,0.000000e+00,0.0,0.000000,0.000000,0.000000,1063.0,0.0,0.0,0.0,3.0,2.0,2.0,2.0,1.0,3.0,0.0,0.0,0.0,2.0,3.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.000003,pup,,INT,2.0,0.0,180.0,0.0,333333.321500,254.0,0.0,2.400000e+08,0.0000,0.0,0.0,0.003000,0.000000,0.000000,0.000000,0.0,0.0,0.000000e+00,0.0,0.000000,0.000000,0.000000,90.0,0.0,0.0,0.0,10.0,2.0,10.0,10.0,10.0,10.0,0.0,0.0,0.0,10.0,10.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
496,0.000015,udp,,INT,2.0,0.0,168.0,0.0,66666.668350,254.0,0.0,4.480000e+07,0.0000,0.0,0.0,0.015000,0.000000,0.000000,0.000000,0.0,0.0,0.000000e+00,0.0,0.000000,0.000000,0.000000,84.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
497,0.000003,pup,,INT,2.0,0.0,180.0,0.0,333333.321500,254.0,0.0,2.400000e+08,0.0000,0.0,0.0,0.003000,0.000000,0.000000,0.000000,0.0,0.0,0.000000e+00,0.0,0.000000,0.000000,0.000000,90.0,0.0,0.0,0.0,10.0,2.0,10.0,10.0,10.0,10.0,0.0,0.0,0.0,10.0,10.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
498,0.781325,tcp,,FIN,22.0,44.0,1186.0,40540.0,83.192015,62.0,252.0,1.160081e+04,405659.6250,6.0,20.0,37.205952,17.981908,1843.796052,1321.553845,255.0,293677215.0,1.819469e+09,255.0,0.050492,0.007112,0.043380,54.0,921.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,2.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [12]:
# split the data using train_test_splitter
# 82k normalized:
train_concat_ready = functions_NN.train_test_splitter(train_concat)

# 82k not normalized:
train_concat_nn_ready = functions_NN.train_test_splitter(train_concat_nn)

# 250k normalized:
df_concat_split = functions_NN.train_test_splitter(df_concat)

# 250 normalized 250/5000:
# df_concat_250_5000_split = functions_NN.train_test_splitter(df_concat_250_5000)

# 250 normalized 500/10000:
df_concat_500_10000_split = functions_NN.train_test_splitter(df_concat_500_10000)

ValueError: Input y contains NaN.

In [None]:
# dump them to csv
train_concat_ready.to_csv('csv files/82k_normalized_cats_train_test_split.csv')
train_concat_nn_ready.to_csv('csv files/82k_not_normalized_cats_train_test_split.csv')
df_concat_split.to_csv('csv files/250k_normalized_cats_train_test_split_100_500.csv')
# df_concat_250_5000_split.to_csv('csv files/250k_normalized_cats_train_test_split_250_5000.csv')
df_concat_500_10000_split.to_csv('csv files/250k_normalized_cats_train_test_split_500_10000.csv')

NameError: name 'train_concat_ready' is not defined