In [2]:
import pandas as pd 
import os
import numpy as np 
import sys
sys.path.append('./../..')
sys.path.append('./..')
from tqdm import tqdm
import multiprocessing
from sklearn.preprocessing import StandardScaler
from itertools import combinations
from collections import Counter
from pandarallel import pandarallel
pandarallel.initialize()
import warnings
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=FutureWarning)
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import arfff
from sklearn.model_selection import train_test_split
from common_utils import utils 

def normalize_minmax(value, _max, _min):
    return (value - _min)/(_max -_min)

def replace_with_id( row , ref_dict, col):
    value =  row[col]
    if value not in ref_dict.keys():
        row[col] = None
    else:
        row[col] = ref_dict[value]
    return row

def preprocess_data(df_normal, df_anomalies, categorical_columns, real_value_columns):
    master_df = df_normal.append(df_anomalies,ignore_index=True)
    master_df = master_df.dropna()
    # Delete columns with a single value
    for col in master_df.columns:
        count = len(set(master_df[col]))
        if count == 1 and col in categorical_columns:
            print(col, count)
            try:
                del master_df[col]
            except:
                pass
            categorical_columns.remove(col)

    # Order the columns
    label_col = 'label'
    ordered_columns = categorical_columns + real_value_columns + [label_col]
    master_df = master_df[ordered_columns]
    
    single_value_cols = []
    target_columns = list(categorical_columns)
    entity_count = {}

    for i in tqdm(range(len(target_columns))):

        column = target_columns[i]
        valid_values = sorted(set(master_df[column]))
        val2id_dict = { 
            e[1]:e[0] for e in enumerate(valid_values,0)
        }
        print(' --> ', column, 'Number of valid values', len(val2id_dict))

        if len(val2id_dict) == 1 :
            single_value_cols.append(column)
            continue

        entity_count[column] = len(val2id_dict)

        master_df = master_df.parallel_apply(
            replace_with_id,
            axis=1,
            args = (val2id_dict, column,)
        )
    cat_domain_dims = entity_count
    oneHot_encoder_list = []
    idx = 0
    for _ , dim in cat_domain_dims.items():
        if dim ==2 :
            _drop = 'first'
        else:
            _drop = None
        name = "oh_"+str(idx) 
        oh_encoder = OneHotEncoder(
            np.reshape( list(range(dim)),[1,-1] ),
            sparse=False,
            drop=_drop
        ) 
        oneHot_encoder_list.append((name, oh_encoder, [idx]))
        idx +=1
    column_encoder = ColumnTransformer(
        oneHot_encoder_list
    )

    num_categories = len(cat_domain_dims)
    samples_np = master_df.values
    samples_cat_part = samples_np[:,:num_categories]
    samples_real_part = samples_np[:,num_categories:]
    onehot_xformed = column_encoder.fit_transform(samples_cat_part)
    samples = np.concatenate([onehot_xformed, samples_real_part],axis=1)
    column_names = []
    for cat,dim in cat_domain_dims.items():
        if dim > 2:
            column_names += [ cat+str(_) for _ in range(dim)]
        else:
            column_names += [ cat+str(1) ]
    column_names += real_value_columns
    column_names += [label_col]
    oh_master_df = pd.DataFrame(samples, columns = column_names )
    return oh_master_df, categorical_columns, real_value_columns, cat_domain_dims



# Create train test sets 
def create_sets(
    df,
    save_dir,
    real_value_columns,
    num_sets=10,
    label_col = 'label',
    anomaly_label = 1,
    test_ratio = 0.5
):
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    normal_data = df.loc[df[label_col]!=anomaly_label]
    for set_id in range(1, num_sets+1):
        train, test = train_test_split(normal_data,test_size=test_ratio)
        anom =  pd.DataFrame(df.loc[df[label_col]==anomaly_label])
        # Save data 
        train_file = 'train_data_onehot.csv'
        test_file = 'test_data_onehot.csv'
        
        
        # Normalize the continuous values
        
        for column in real_value_columns:
            _min = min(train[column])
            _max = max(train[column])
            if _max == _min: 
                continue
            train[column] = train[column].parallel_apply(normalize_minmax, args= (_max,_min, ))
            test[column] = test[column].parallel_apply(normalize_minmax, args= (_max,_min, ))
            anom[column] = anom[column].parallel_apply(normalize_minmax, args= (_max,_min, ))
        del train[label_col]   
        del test[label_col] 
        del anom[label_col] 
        
        # Save the files 
        path = os.path.join(save_dir, 'set_' + str(set_id)  )
        if not os.path.exists(path):
            os.mkdir(path)

        train_fp = os.path.join( path, 'train.npz')
        test_fp = os.path.join( path, 'test.npz')
        anom_fp = os.path.join( path, 'anom.npz')
        sparse.save_npz(train_fp, sparse.csr_matrix(train.values))
        sparse.save_npz(test_fp, sparse.csr_matrix(test.values))
        sparse.save_npz(anom_fp, sparse.csr_matrix(anom.values))
    return
    

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
data = arff.loadarff('gureKddcup6percent.arff')
df = pd.DataFrame(data[0])
attributes_file = 'attributes.txt'
df_attributes = pd.read_csv(attributes_file, delimiter=';',header=None)
valid_columns = list(df_attributes[0])
df = df[valid_columns]
categorical_columns = []
real_value_columns = []
label_col = 'class'

for i,row in df_attributes.iterrows():
    c = row[0]
    t = row[1]
    if t == 'numeric':
        real_value_columns.append(c)
    else:
        if c == label_col:
            continue
        elif t == '{0,1}':
            categorical_columns.append(c)
        else:
            categorical_columns.append(c)
print(len(real_value_columns))
# Convert to string 
for c in list( categorical_columns + [label_col] ):
    df[c] = df[c].str.decode("utf-8")
df = df.dropna()

normal_class = ['normal']
df_normal =  df.loc[df['class'].isin(normal_class)]
df_anomaly = df.loc[~df['class'].isin(normal_class)]
print(len(df_normal),len(df_anomalies))

del df_anomaly['class']
del df_normal['class']
df_anomaly['label'] = 1
df_normal['label'] = 0

# Sample 100k rows

_SIZE_ = 100000
df_normal = df_normal.sample(_SIZE_)

master_df = df_normal.append(df_anomalies,ignore_index=True)
master_df = master_df.dropna()



NameError: name 'arff' is not defined

In [10]:
oh_master_df, categorical_columns, real_value_columns, cat_domain_dims =  preprocess_data(df_normal, df_anomalies, categorical_columns, real_value_columns)
save_dir = 'processed_sets'
create_sets(
    df = oh_master_df,
    real_value_columns = real_value_columns,
    save_dir = save_dir,
    test_ratio = 0.3
)   

# ===========================
# Write out the dimensionality of the columns into a csv file
# ============================
col_name_list = []
dimensionality = []
data =[]
for c,v in cat_domain_dims.items():
    col_name_list.append(c)
    dimensionality.append(v)
    data.append((c,v)) 
    
df_data_dimensions = pd.DataFrame(
    data = data,
    columns=['column','dimension']
)

# Save metadata
f_name = 'data_dimensions.csv'
f_path = os.path.join(save_dir, f_name )
df_data_dimensions.to_csv(f_path,index=False)

['protocol_type',
 'service',
 'flag',
 'land',
 'logged_in',
 'root_shell',
 'is_hot_login',
 'is_guest_login']

33

178810

NameError: name 'df_normal' is not defined

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [22]:
single_value_cols = []
target_columns = list(categorical_columns)
entity_count = {}

for i in tqdm(range(len(target_columns))):
    column = target_columns[i]
    valid_values = sorted(set(master_df[column]))
    val2id_dict = { 
        e[1]:e[0] for e in enumerate(valid_values,0)
    }
    print(' --> ', column, 'Number of valid values', len(val2id_dict))
    
    if len(val2id_dict) == 1 :
        single_value_cols.append(column)
        #categorical_columns.remove(column)
        continue
        
    entity_count[column] = len(val2id_dict)
        
    master_df = master_df.parallel_apply(
        replace_with_id,
        axis=1,
        args = (val2id_dict, column,)
    )
      

  0%|          | 0/8 [00:00<?, ?it/s]

 -->  protocol_type Number of valid values 3


 12%|█▎        | 1/8 [00:02<00:17,  2.53s/it]

 -->  service Number of valid values 23


 25%|██▌       | 2/8 [00:05<00:15,  2.59s/it]

 -->  flag Number of valid values 12


 38%|███▊      | 3/8 [00:07<00:12,  2.58s/it]

 -->  land Number of valid values 2


 50%|█████     | 4/8 [00:10<00:10,  2.53s/it]

 -->  logged_in Number of valid values 2


 62%|██████▎   | 5/8 [00:12<00:07,  2.52s/it]

 -->  root_shell Number of valid values 2


 75%|███████▌  | 6/8 [00:14<00:04,  2.45s/it]

 -->  is_hot_login Number of valid values 2


 88%|████████▊ | 7/8 [00:17<00:02,  2.39s/it]

 -->  is_guest_login Number of valid values 2


100%|██████████| 8/8 [00:19<00:00,  2.43s/it]


In [23]:
categorical_columns

['protocol_type',
 'service',
 'flag',
 'land',
 'logged_in',
 'root_shell',
 'is_hot_login',
 'is_guest_login']

In [24]:
# Normalize the values
def normalize_minmax(value, _max, _min):
    return (value - _min)/(_max -_min)

for column in real_value_columns:
    _min = min(master_df.loc[master_df['label']==0][column])
    _max = max(master_df.loc[master_df['label']==0][column])
    if _max == _min: 
        continue
    master_df[column] = master_df[column].parallel_apply(normalize_minmax, args= (_max,_min, ))
    

master_df = master_df.dropna()
for s in single_value_cols:
    del master_df[s]
    try:
        categorical_columns.remove(s)
    except:
        pass
print(categorical_columns)

['protocol_type', 'service', 'flag', 'land', 'logged_in', 'root_shell', 'is_hot_login', 'is_guest_login']


In [25]:
master_df.columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_hot_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_error_rate', 'label'],
      dtype='object')

In [26]:
# ===========================================
# Keep 2 versions
# 1. one hot encoded
# 2. not one hot enocoded

def create_10_version( df, cat_columns):
    global real_value_columns
    label_Col = 'label'
    df1 = df.copy() 
    print(df1.columns)
    for cc in cat_columns:
        print(cc)
        if entity_count[cc] == 2 :
            _drop_first = True
        else:
            _drop_first = False
        df1 = pd.get_dummies(df1, columns = [cc],drop_first = _drop_first)
    
    all_columns=list(df1.columns)
    disc_columns = [ c for c in all_columns if c != 'label' and c not in real_value_columns]
    ord_cols = disc_columns + real_value_columns + ['label']
    return df1[ord_cols]

In [27]:
master_df_1 = create_10_version( master_df, categorical_columns)

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_hot_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_error_rate', 'label'],
      dtype='object')
protocol_type
service
flag
land
logged_in
root_shell
is_hot_login
is_guest_login


In [28]:
ordered_columns = categorical_columns + real_value_columns + ['label']
master_df = master_df[ordered_columns]

In [29]:
master_df.columns

Index(['protocol_type', 'service', 'flag', 'land', 'logged_in', 'root_shell',
       'is_hot_login', 'is_guest_login', 'duration', 'src_bytes', 'dst_bytes',
       'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
       'num_compromised', 'su_attempted', 'num_root', 'num_file_creations',
       'num_shells', 'num_access_files', 'num_outbound_cmds', 'count',
       'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
       'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
       'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
       'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
       'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
       'dst_host_serror_rate', 'dst_host_srv_serror_rate',
       'dst_host_rerror_rate', 'dst_host_srv_error_rate', 'label'],
      dtype='object')

In [30]:
# Save the files
from pathlib import Path
save_dir = 'processed'
path_obj = Path(save_dir)
path_obj.mkdir(exist_ok=True, parents=True)

# ===========================
# Write out the dimensionality of the columns into a text file
# ============================

col_name_list = []
dimensionality = []
data =[]
for c in categorical_columns:
    col_name_list.append(c)
    v = len(set(master_df[c]))
    dimensionality.append(v)
    data.append((c,v)) 
df_data_dimensions = pd.DataFrame(
    data = data,
    columns=['column','dimension']
)

df_data_dimensions


# Save metadata
f_name = 'data_dimensions.csv'
f_path = os.path.join(save_dir, f_name )
df_data_dimensions.to_csv(f_path,index=False)


utils.save_csv(master_df_1, os.path.join(save_dir,'data_onehot.csv'))
utils.save_csv(master_df, os.path.join(save_dir,'data.csv'))

Size {:.3f} 29.286664962768555  MB 
Size {:.3f} 22.385940551757812  MB 


22.385940551757812

In [31]:
len(master_df_1.columns)

77

In [32]:
ordered_columns

['protocol_type',
 'service',
 'flag',
 'land',
 'logged_in',
 'root_shell',
 'is_hot_login',
 'is_guest_login',
 'duration',
 'src_bytes',
 'dst_bytes',
 'wrong_fragment',
 'urgent',
 'hot',
 'num_failed_logins',
 'num_compromised',
 'su_attempted',
 'num_root',
 'num_file_creations',
 'num_shells',
 'num_access_files',
 'num_outbound_cmds',
 'count',
 'srv_count',
 'serror_rate',
 'srv_serror_rate',
 'rerror_rate',
 'srv_rerror_rate',
 'same_srv_rate',
 'diff_srv_rate',
 'srv_diff_host_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_error_rate',
 'label']