In [1]:
import pandas as pd 
import os
import numpy as np 
import sys
sys.path.append('./../..')
sys.path.append('./..')
from tqdm import tqdm
import multiprocessing
from sklearn.preprocessing import StandardScaler
from itertools import combinations
from collections import Counter
from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:

from sklearn.model_selection import train_test_split
from common_utils import utils 

#----------------- 
# KDD data 
# ----------------

with open('kddcup.names','r') as fh:
    lines = fh.readlines()

column_type_dict = {}
for line in lines:
    if ':' in line:
        k = line.split(':')[0]
        v = line.split(':')[1].strip()
        v = v.strip('\n')
        v = v.strip('.')
        column_type_dict[k] = v

column_names = list(column_type_dict.keys())
column_names.append('label')


from sklearn.model_selection import train_test_split

df = pd.read_csv(
    'kddcup.data_10_percent_corrected',
    index_col=None,
    low_memory=False,
    header=None,
    names = column_names
)

df.head(10)

Counter(df['label'])

Counter({'normal.': 97278,
         'buffer_overflow.': 30,
         'loadmodule.': 9,
         'perl.': 3,
         'neptune.': 107201,
         'smurf.': 280790,
         'guess_passwd.': 53,
         'pod.': 264,
         'teardrop.': 979,
         'portsweep.': 1040,
         'ipsweep.': 1247,
         'land.': 21,
         'ftp_write.': 8,
         'back.': 2203,
         'imap.': 12,
         'satan.': 1589,
         'phf.': 4,
         'nmap.': 231,
         'multihop.': 7,
         'warezmaster.': 20,
         'warezclient.': 1020,
         'spy.': 2,
         'rootkit.': 10})

In [3]:
categorical_columns = [ _ for _, v in column_type_dict.items() if v == 'symbolic']
real_value_columns = [ _ for _, v in column_type_dict.items() if v == 'continuous']

real_value_columns,categorical_columns

(['duration',
  'src_bytes',
  'dst_bytes',
  'wrong_fragment',
  'urgent',
  'hot',
  'num_failed_logins',
  'num_compromised',
  'root_shell',
  'su_attempted',
  'num_root',
  'num_file_creations',
  'num_shells',
  'num_access_files',
  'num_outbound_cmds',
  'count',
  'srv_count',
  'serror_rate',
  'srv_serror_rate',
  'rerror_rate',
  'srv_rerror_rate',
  'same_srv_rate',
  'diff_srv_rate',
  'srv_diff_host_rate',
  'dst_host_count',
  'dst_host_srv_count',
  'dst_host_same_srv_rate',
  'dst_host_diff_srv_rate',
  'dst_host_same_src_port_rate',
  'dst_host_srv_diff_host_rate',
  'dst_host_serror_rate',
  'dst_host_srv_serror_rate',
  'dst_host_rerror_rate',
  'dst_host_srv_rerror_rate'],
 ['protocol_type',
  'service',
  'flag',
  'land',
  'logged_in',
  'is_host_login',
  'is_guest_login'])

In [4]:
normal_class = ['neptune.']
anomaly_class= ['normal.']

df_normal =  df.loc[df['label'].isin(normal_class)]
df_anomalies = df.loc[df['label'].isin(anomaly_class)]

len(df_normal),len(df_anomalies)

(107201, 97278)

In [5]:
                    
df_anomalies['label'] = 1
df_normal['label'] = 0

master_df = df_normal.append(df_anomalies,ignore_index=True)
master_df = master_df.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [6]:
def replace_with_id( row , ref_dict, col):
    value =  row[col]
    if value not in ref_dict.keys():
        row[col] = None
    else:
        row[col] = ref_dict[value]
    return row


single_value_cols = []
target_columns = list(categorical_columns)
entity_count ={}
for i in tqdm(range(len(target_columns))):
    
    column = target_columns[i]
    valid_values = sorted(set(master_df[column]))
    val2id_dict = { 
        e[1]:e[0] for e in enumerate(valid_values,0)
    }
    print(' --> ', column, 'Number of valid values', len(val2id_dict))
    
    if len(val2id_dict) == 1 :
        single_value_cols.append(column)
        #categorical_columns.remove(column)
        continue
    entity_count[column] = len(val2id_dict)
    if len(val2id_dict) == 2 : 
        continue
    master_df = master_df.parallel_apply(
        replace_with_id,
        axis=1,
        args = (val2id_dict, column,)
    )
        

  0%|          | 0/7 [00:00<?, ?it/s]

 -->  protocol_type Number of valid values 3


 14%|█▍        | 1/7 [00:03<00:18,  3.10s/it]

 -->  service Number of valid values 65


 29%|██▊       | 2/7 [00:06<00:15,  3.08s/it]

 -->  flag Number of valid values 9


100%|██████████| 7/7 [00:09<00:00,  1.32s/it]

 -->  land Number of valid values 2
 -->  logged_in Number of valid values 2
 -->  is_host_login Number of valid values 1
 -->  is_guest_login Number of valid values 2





In [7]:
single_value_cols

['is_host_login']

In [8]:
# Normalize the values
def normalize_minmax(value, _max, _min):
    return (value - _min)/(_max -_min)

for column in real_value_columns:
    _min = min(master_df.loc[master_df['label']==0][column])
    _max = max(master_df.loc[master_df['label']==0][column])
    if _max == _min: 
        continue
    master_df[column] = master_df[column].parallel_apply(normalize_minmax, args= (_max,_min, ))
    

master_df = master_df.dropna()
for s in single_value_cols:
    del master_df[s]
    try:
        categorical_columns.remove(s)
    except:
        pass
print(categorical_columns)

['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_guest_login']


In [9]:
master_df.columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_guest_login', 'count',
       'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
       'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
       'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
       'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
       'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
       'dst_host_serror_rate', 'dst_host_srv_serror_rate',
       'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'],
      dtype='object')

In [10]:
ordered_columns = categorical_columns + real_value_columns + ['label']


In [11]:
master_df = master_df[ordered_columns]

In [12]:
# ===========================================
# Keep 2 versions
# 1. one hot encoded
# 2. not one hot enocoded

def create_10_version( df, cat_columns):
    global real_value_columns
    label_Col = 'label'
    df1 = df.copy() 
    for cc in cat_columns:
        if entity_count[cc] == 2 :
            _drop_first = True
        else:
            _drop_first = False
        df1 = pd.get_dummies(df1, columns = [cc],drop_first = _drop_first)
        
    all_columns=list(df1.columns)
    disc_columns = [ c for c in all_columns if c != 'label' and c not in real_value_columns]
    ord_cols = disc_columns + real_value_columns + ['label']
    return df1[ord_cols]


master_df_1 = create_10_version( master_df, categorical_columns)

In [13]:
ordered_columns = categorical_columns + real_value_columns + ['label']
master_df = master_df[ordered_columns]

In [14]:
# Save the files
from pathlib import Path
save_dir = 'processed'
path_obj = Path(save_dir)
path_obj.mkdir(exist_ok=True, parents=True)

# ===========================
# Write out the dimensionality of the columns into a text file
# ============================

col_name_list = []
dimensionality = []
data =[]
for c in categorical_columns:
    col_name_list.append(c)
    v = len(set(master_df[c]))
    dimensionality.append(v)
    data.append((c,v)) 
df_data_dimensions = pd.DataFrame(
    data = data,
    columns=['column','dimension']
)

df_data_dimensions


# Save metadata
f_name = 'data_dimensions.csv'
f_path = os.path.join(save_dir, f_name )
df_data_dimensions.to_csv(f_path,index=False)



utils.save_csv(master_df_1, os.path.join(save_dir,'data_onehot.csv'))
utils.save_csv(master_df, os.path.join(save_dir,'data.csv'))

Size {:.3f} 62.61043643951416  MB 
Size {:.3f} 33.94267463684082  MB 


33.94267463684082

In [15]:
import pandas as pd
tmp = pd.read_csv('./processed/data.csv')
l = len(tmp.loc[tmp['label']==0])
print(l)
print(0.7*l, 0.3*l)

107201
75040.7 32160.3


In [16]:
len(tmp.columns)

41