In [1]:
import pandas as pd 
import os
import numpy as np 
import sys
sys.path.append('./../..')
sys.path.append('./..')
from tqdm import tqdm
import multiprocessing
from sklearn.preprocessing import StandardScaler
from itertools import combinations
from collections import Counter
from pandarallel import pandarallel
pandarallel.initialize()
import warnings
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=FutureWarning)
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from common_utils import utils 

def normalize_minmax(value, _max, _min):
    return (value - _min)/(_max -_min)

def replace_with_id( row , ref_dict, col):
    value =  row[col]
    if value not in ref_dict.keys():
        row[col] = None
    else:
        row[col] = ref_dict[value]
    return row

def preprocess_data(df_normal, df_anomalies, categorical_columns, real_value_columns):
    master_df = df_normal.append(df_anomalies,ignore_index=True)
    master_df = master_df.dropna()
    # Delete columns with a single value
    for col in master_df.columns:
        count = len(set(master_df[col]))
        if count == 1 and col in categorical_columns:
            print(col, count)
            try:
                del master_df[col]
            except:
                pass
            categorical_columns.remove(col)

    # Order the columns
    label_col = 'label'
    ordered_columns = categorical_columns + real_value_columns + [label_col]
    master_df = master_df[ordered_columns]
    
    single_value_cols = []
    target_columns = list(categorical_columns)
    entity_count = {}

    for i in tqdm(range(len(target_columns))):

        column = target_columns[i]
        valid_values = sorted(set(master_df[column]))
        val2id_dict = { 
            e[1]:e[0] for e in enumerate(valid_values,0)
        }
        print(' --> ', column, 'Number of valid values', len(val2id_dict))

        if len(val2id_dict) == 1 :
            single_value_cols.append(column)
            continue

        entity_count[column] = len(val2id_dict)

        master_df = master_df.parallel_apply(
            replace_with_id,
            axis=1,
            args = (val2id_dict, column,)
        )
    cat_domain_dims = entity_count
    oneHot_encoder_list = []
    idx = 0
    for _ , dim in cat_domain_dims.items():
        if dim ==2 :
            _drop = 'first'
        else:
            _drop = None
        name = "oh_"+str(idx) 
        oh_encoder = OneHotEncoder(
            np.reshape( list(range(dim)),[1,-1] ),
            sparse=False,
            drop=_drop
        ) 
        oneHot_encoder_list.append((name, oh_encoder, [idx]))
        idx +=1
    column_encoder = ColumnTransformer(
        oneHot_encoder_list
    )

    num_categories = len(cat_domain_dims)
    samples_np = master_df.values
    samples_cat_part = samples_np[:,:num_categories]
    samples_real_part = samples_np[:,num_categories:]
    onehot_xformed = column_encoder.fit_transform(samples_cat_part)
    samples = np.concatenate([onehot_xformed, samples_real_part],axis=1)
    column_names = []
    for cat,dim in cat_domain_dims.items():
        if dim > 2:
            column_names += [ cat+str(_) for _ in range(dim)]
        else:
            column_names += [ cat+str(1) ]
    column_names += real_value_columns
    column_names += [label_col]
    oh_master_df = pd.DataFrame(samples, columns = column_names )
    return oh_master_df, categorical_columns, real_value_columns, cat_domain_dims



# Create train test sets 
def create_sets(
    df,
    save_dir,
    real_value_columns,
    num_sets=10,
    label_col = 'label',
    anomaly_label = 1,
    test_ratio = 0.5
):
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    normal_data = df.loc[df[label_col]!=anomaly_label]
    for set_id in range(1, num_sets+1):
        train, test = train_test_split(normal_data,test_size=test_ratio)
        anom =  pd.DataFrame(df.loc[df[label_col]==anomaly_label])
        # Save data 
        train_file = 'train_data_onehot.csv'
        test_file = 'test_data_onehot.csv'
        
        
        # Normalize the continuous values
        
        for column in real_value_columns:
            _min = min(train[column])
            _max = max(train[column])
            if _max == _min: 
                continue
            train[column] = train[column].parallel_apply(normalize_minmax, args= (_max,_min, ))
            test[column] = test[column].parallel_apply(normalize_minmax, args= (_max,_min, ))
            anom[column] = anom[column].parallel_apply(normalize_minmax, args= (_max,_min, ))
        del train[label_col]   
        del test[label_col] 
        del anom[label_col] 
        
        # Save the files 
        path = os.path.join(save_dir, 'set_' + str(set_id)  )
        if not os.path.exists(path):
            os.mkdir(path)

        train_fp = os.path.join( path, 'train.npz')
        test_fp = os.path.join( path, 'test.npz')
        anom_fp = os.path.join( path, 'anom.npz')
        sparse.save_npz(train_fp, sparse.csr_matrix(train.values))
        sparse.save_npz(test_fp, sparse.csr_matrix(test.values))
        sparse.save_npz(anom_fp, sparse.csr_matrix(anom.values))
    return
    

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
from sklearn.model_selection import train_test_split
from common_utils import utils 

# ----------------- 
# UNSW NB 15 data 
# ----------------

In [5]:
column_headers_file = './NUSW-NB15_features.csv'
features_df = pd.read_csv(column_headers_file,index_col=None,encoding='latin-1')
features_df.columns

features_df = features_df[['Name','Type ']]
features_df = features_df.rename(columns={'Type ':'Type'})
features_df['Name']=features_df['Name'].apply(str.lower)
features_df['Type']=features_df['Type'].apply(str.lower)
features_df = features_df.append({'Name':'rate','Type':'float'},ignore_index=True)

invalid_columns = [
    'srcip','dstip','dsport','sport','stime','ltime'
]

columns = list(features_df['Name'])
for r in invalid_columns:
    print(r)
    columns.remove(r)
columns =[ _.replace(' ','') for _ in columns]

data_df = pd.read_csv('UNSW_NB15_training-set.csv', index_col=None)
replace_ = {
'dintpkt':'sinpkt',
'sintpkt':'dinpkt',
'smeansz':'smean',
'dmeansz': 'dmean',
'res_bdy_len' :'response_body_len',
'ct_src_ ltm': 'ct_src_ltm'
}
features_df.replace(to_replace = replace_,inplace=True)

normal_classes = ['Normal']
anomaly_classes = [ _ for _ in set(data_df['attack_cat']) if _ not in ['Normal','Generic','Exploits','Fuzzers','DoS','Reconnaissance']]
df_normal = data_df.loc[data_df['attack_cat'].isin(normal_classes)]
df_anomaly = data_df.loc[data_df['attack_cat'].isin(anomaly_classes)]
print(len(df_anomaly), Counter(df_anomaly['attack_cat']))
df_normal['label'] = 0
df_anomaly['label'] = 1

if len(df_normal) < len(df_anomaly):                  
    df_anomaly = df_anomaly.sample (n=int(len(df_normal)))
master_df = df_normal.append(df_anomaly,ignore_index=True)

del master_df['id']
del master_df['attack_cat']
master_df = master_df.dropna()
real_value_columns = []
categorical_columns = []
binary_columns = []

for column in master_df.columns:
    if column in list(features_df['Name']):
        _type = list(features_df.loc[features_df['Name']==column]['Type'])[0]
        if _type =='integer' or _type =='float':
            real_value_columns.append(column)
        elif _type =='binary':
            if column =='label':
                continue
            binary_columns.append(column)
        elif _type=='nominal':
            categorical_columns.append(column)
categorical_columns = categorical_columns + binary_columns

srcip
dstip
dsport
sport
stime
ltime
5009 Counter({'Analysis': 2000, 'Backdoor': 1746, 'Shellcode': 1133, 'Worms': 130})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [6]:
oh_master_df, categorical_columns, real_value_columns, cat_domain_dims =  preprocess_data(df_normal, df_anomaly, categorical_columns, real_value_columns)
save_dir = 'processed_sets'
create_sets(
    df = oh_master_df,
    real_value_columns = real_value_columns,
    save_dir = save_dir,
    test_ratio=0.3
)   

  0%|          | 0/5 [00:00<?, ?it/s]

 -->  proto Number of valid values 133


 20%|██        | 1/5 [00:04<00:17,  4.39s/it]

 -->  service Number of valid values 11


 40%|████      | 2/5 [00:08<00:13,  4.37s/it]

 -->  state Number of valid values 9


 60%|██████    | 3/5 [00:13<00:08,  4.38s/it]

 -->  is_ftp_login Number of valid values 3


 80%|████████  | 4/5 [00:16<00:04,  4.13s/it]

 -->  is_sm_ips_ports Number of valid values 2


100%|██████████| 5/5 [00:19<00:00,  3.99s/it]
  if self.categories != 'auto':
  if self.categories == 'auto':
  if self.categories != 'auto':
  if self.categories == 'auto':
  if self.categories != 'auto':
  if self.categories == 'auto':
  if self.categories != 'auto':
  if self.categories == 'auto':
  if self.categories != 'auto':
  if self.categories == 'auto':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [7]:
# ===========================
# Write out the dimensionality of the columns into a csv file
# ============================
col_name_list = []
dimensionality = []
data =[]
for c,v in cat_domain_dims.items():
    col_name_list.append(c)
    dimensionality.append(v)
    data.append((c,v)) 
df_data_dimensions = pd.DataFrame(
    data = data,
    columns=['column','dimension']
)

# Save metadata
f_name = 'data_dimensions.csv'
f_path = os.path.join(save_dir, f_name )
df_data_dimensions.to_csv(f_path,index=False)