In [1]:
import pandas as pd 
import os
import numpy as np 
import sys
sys.path.append('./../..')
sys.path.append('./..')
from tqdm import tqdm
import multiprocessing
from sklearn.preprocessing import StandardScaler
from itertools import combinations
from collections import Counter
from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
from sklearn.model_selection import train_test_split
from common_utils import utils 

# ----------------- 
# UNSW NB 15 data 
# ----------------

In [33]:
column_headers_file = './NUSW-NB15_features.csv'
features_df = pd.read_csv(column_headers_file,index_col=None,encoding='latin-1')

In [34]:
features_df.columns

Index(['No.', 'Name', 'Type ', 'Description'], dtype='object')

In [35]:
features_df = features_df[['Name','Type ']]
features_df = features_df.rename(columns={'Type ':'Type'})
features_df['Name']=features_df['Name'].apply(str.lower)
features_df['Type']=features_df['Type'].apply(str.lower)
features_df = features_df.append({'Name':'rate','Type':'float'},ignore_index=True)

In [36]:
invalid_columns = [
    'srcip','dstip','dsport','sport','stime','ltime'
]

In [37]:
columns = list(features_df['Name'])
for r in invalid_columns:
    print(r)
    columns.remove(r)


srcip
dstip
dsport
sport
stime
ltime


In [38]:
columns =[ _.replace(' ','') for _ in columns]


In [39]:
'res_bdy_len', 'dmeansz', 'dintpkt', 'smeansz', 'sintpkt'

('res_bdy_len', 'dmeansz', 'dintpkt', 'smeansz', 'sintpkt')

In [40]:
data_df = pd.read_csv('UNSW_NB15_training-set.csv', index_col=None)


In [41]:
replace_ = {
'dintpkt':'sinpkt',
'sintpkt':'dinpkt',
'smeansz':'smean',
'dmeansz': 'dmean',
'res_bdy_len' :'response_body_len',
'ct_src_ ltm': 'ct_src_ltm'
}
features_df.replace(to_replace = replace_,inplace=True)

In [42]:
data_df.columns

Index(['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label'],
      dtype='object')

In [43]:
Counter(data_df['attack_cat'])

Counter({'Normal': 56000,
         'Backdoor': 1746,
         'Analysis': 2000,
         'Fuzzers': 18184,
         'Shellcode': 1133,
         'Reconnaissance': 10491,
         'Exploits': 33393,
         'DoS': 12264,
         'Worms': 130,
         'Generic': 40000})

In [44]:
normal_classes = ['Normal']
anomaly_classes = [ _ for _ in set(data_df['attack_cat']) if _ not in ['Normal','Generic','Exploits','Fuzzers','DoS','Reconnaissance']]



In [45]:
normal_df = data_df.loc[data_df['attack_cat'].isin(normal_classes)]
anomaly_df = data_df.loc[data_df['attack_cat'].isin(anomaly_classes)]

In [46]:
len(anomaly_df), Counter(anomaly_df['attack_cat'])

(5009,
 Counter({'Backdoor': 1746,
          'Analysis': 2000,
          'Shellcode': 1133,
          'Worms': 130}))

In [47]:
normal_df['label'] = 0
anomaly_df['label'] = 1

if len(normal_df) < len(anomaly_df):                  
    anomaly_df = anomaly_df.sample (n=int(len(normal_df)))
master_df = normal_df.append(anomaly_df,ignore_index=True)
del master_df['id']
del master_df['attack_cat']
master_df = master_df.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [48]:
real_value_columns = []
categorical_columns = []
binary_columns = []

In [49]:
for column in master_df.columns:
    if column in list(features_df['Name']):
        _type = list(features_df.loc[features_df['Name']==column]['Type'])[0]
        if _type =='integer' or _type =='float':
            real_value_columns.append(column)
        elif _type =='binary':
            if column =='label':
                continue
            binary_columns.append(column)
        elif _type=='nominal':
            categorical_columns.append(column)

In [50]:
categorical_columns

['proto', 'service', 'state']

In [51]:
len(real_value_columns),real_value_columns

(37,
 ['dur',
  'spkts',
  'dpkts',
  'sbytes',
  'dbytes',
  'rate',
  'sttl',
  'dttl',
  'sload',
  'dload',
  'sloss',
  'dloss',
  'sinpkt',
  'dinpkt',
  'sjit',
  'djit',
  'swin',
  'stcpb',
  'dtcpb',
  'dwin',
  'tcprtt',
  'synack',
  'ackdat',
  'smean',
  'dmean',
  'trans_depth',
  'response_body_len',
  'ct_srv_src',
  'ct_state_ttl',
  'ct_dst_ltm',
  'ct_src_dport_ltm',
  'ct_dst_sport_ltm',
  'ct_dst_src_ltm',
  'ct_ftp_cmd',
  'ct_flw_http_mthd',
  'ct_src_ltm',
  'ct_srv_dst'])

In [52]:
binary_columns

['is_ftp_login', 'is_sm_ips_ports']

In [53]:

def replace_with_id( row , ref_dict, col):
    value =  row[col]
    if value not in ref_dict.keys():
        row[col] = None
    else:
        row[col] = ref_dict[value]
    return row

In [54]:
single_value_cols = []
target_columns = list(categorical_columns)+ list(binary_columns)
target_columns

['proto', 'service', 'state', 'is_ftp_login', 'is_sm_ips_ports']

In [55]:
entity_count = {}
for i in tqdm(range(len(target_columns))):
    
    column = target_columns[i]
    valid_values = sorted(set(master_df[column]))
    val2id_dict = { 
        e[1]:e[0] for e in enumerate(valid_values,0)
    }
    print(' --> ', column, 'Number of valid values', len(val2id_dict))
    
    if len(val2id_dict) == 1 :
        print(column )
        single_value_cols.append(column)
        #categorical_columns.remove(column)
        continue
        
    entity_count[column] = len(val2id_dict)
    if len(val2id_dict) == 2 : 
        continue
    master_df = master_df.parallel_apply(
        replace_with_id,
        axis=1,
        args = (val2id_dict, column,)
    )
        

  0%|          | 0/5 [00:00<?, ?it/s]

 -->  proto Number of valid values 133


 20%|██        | 1/5 [00:01<00:05,  1.47s/it]

 -->  service Number of valid values 11


 40%|████      | 2/5 [00:02<00:04,  1.45s/it]

 -->  state Number of valid values 9


 60%|██████    | 3/5 [00:04<00:02,  1.42s/it]

 -->  is_ftp_login Number of valid values 3


100%|██████████| 5/5 [00:05<00:00,  1.03s/it]

 -->  is_sm_ips_ports Number of valid values 2





In [26]:
discrete_columns = list(categorical_columns)+ list(binary_columns)

In [30]:
master_df = master_df.dropna()

In [56]:
# Normalize the values
def normalize_minmax(value, _max, _min):
    if type(value) == str:
        print('>', value)
    return (value - _min)/(_max -_min)

for column in real_value_columns:
    master_df
    _min = min(master_df.loc[master_df['label'] == 0])
    _max = max(master_df.loc[master_df['label'] == 0])
    if _max == _min: 
        continue
    print(column)
    master_df[column] = master_df[column].parallel_apply(normalize_minmax, args= (_max,_min, ))
    

dur
> >0.09097899999999999>>>>
  >  0.33234> 0.004626>1.0047820.121478 
>0.001037
 

 0.001059
 1.0838030.858152>0.647686
>

>> 
>   1.1070821.011978>0.001078>0.090099
 

>  >>0.0010949999999999998>>
>> 
>   > 0.273192999999999960.221439>  29.219688 0.60035699999999990.021119

> 0.183253>>0.034270999999999996>>
0.47800699999999996 0.935834
0.001106  
0.017412999999999998 

 
>0.303252
>0.0429420.9305370000000001>>
 
0.466939999999999971.844831 

>
 >>0.592684  3e-06

 1.394902
  10.145435
0.0
3e-060.21136
>0.8663379999999999


 
1.317599


TypeError: unsupported operand type(s) for -: 'float' and 'str'

0        0.121478
1        0.649902
2        1.623129
3        1.681642
4        0.449454
           ...   
61004    1.047423
61005    1.265776
61006    0.000005
61007    0.227193
61008    0.505762
Name: dur, Length: 61009, dtype: float64

In [None]:
master_df = master_df.dropna()

In [28]:
for s in single_value_cols:
    del master_df[s]
    try:
        discrete_columns.remove(s)
    except:
        pass
print(discrete_columns)

['proto', 'service', 'state', 'is_ftp_login', 'is_sm_ips_ports']


In [42]:
master_df.columns

Index(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'label'],
      dtype='object')

In [43]:
ordered_columns = discrete_columns + real_value_columns + ['label']
master_df = master_df[ordered_columns]

In [44]:
# ===========================================
# Keep 2 versions
# 1. one hot encoded
# 2. not one hot enocoded

def create_10_version( df, cat_columns):
    global real_value_columns
    label_Col = 'label'
    df1 = df.copy() 
    for cc in cat_columns:
        if entity_count[cc] == 2 :
            _drop_first = True
        else:
            _drop_first = False
        df1 = pd.get_dummies(df1, columns = [cc], drop_first=_drop_first)
    all_columns=list(df1.columns)
    disc_columns = [ c for c in all_columns if c != 'label' and c not in real_value_columns]
    ord_cols = disc_columns + real_value_columns + ['label']
    return df1[ord_cols]


master_df_1 = create_10_version( master_df, discrete_columns)

In [45]:
binary_columns

['is_ftp_login', 'is_sm_ips_ports']

In [46]:
# Save the files
from pathlib import Path
save_dir = 'processed'
path_obj = Path(save_dir)
path_obj.mkdir(exist_ok=True, parents=True)

In [47]:
# ===========================
# Write out the dimensionality of the columns into a text file
# ============================

col_name_list = []
dimensionality = []
data =[]
for c in discrete_columns:
    col_name_list.append(c)
    v = len(set(master_df[c]))
    dimensionality.append(v)
    data.append((c,v)) 
    
df_data_dimensions = pd.DataFrame(
    data = data,
    columns=['column','dimension']
)

df_data_dimensions

Unnamed: 0,column,dimension
0,proto,133
1,service,11
2,state,9
3,is_ftp_login,3
4,is_sm_ips_ports,2


In [48]:
# Save metadata
f_name = 'data_dimensions.csv'
f_path = os.path.join(save_dir, f_name )
df_data_dimensions.to_csv(f_path,index=False)

In [49]:
utils.save_csv(master_df_1, os.path.join(save_dir,'data_onehot.csv'))
utils.save_csv(master_df, os.path.join(save_dir,'data.csv'))

Size {:.3f} 50.26268672943115  MB 
Size {:.3f} 33.26605415344238  MB 


33.26605415344238

In [50]:
len(master_df_1.columns)-37-1

157

In [51]:
len(real_value_columns)

37

In [52]:
for r in real_value_columns:
    if r not in list(master_df.columns):
        print(r)

In [53]:
len(master_df_1)

61009

In [54]:
import pandas as pd
tmp = pd.read_csv('./processed/data.csv')
len(tmp.loc[tmp['label']==0])


56000

In [55]:
0.7 * 56000

39200.0