In [1]:
import pandas as pd 
import os
import numpy as np 
import sys
sys.path.append('./../..')
sys.path.append('./..')
from tqdm import tqdm
import multiprocessing
from sklearn.preprocessing import StandardScaler
from itertools import combinations
from collections import Counter
from pandarallel import pandarallel
pandarallel.initialize()
from pathlib import Path

INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
from sklearn.model_selection import train_test_split
from common_utils import utils 

# ---------------- 
# KDD data 
# ----------------

with open('kddcup.names','r') as fh:
    lines = fh.readlines()

column_type_dict = {}
for line in lines:
    if ':' in line:
        k = line.split(':')[0]
        v = line.split(':')[1].strip()
        v = v.strip('\n')
        v = v.strip('.')
        column_type_dict[k] = v

column_names = list(column_type_dict.keys())
column_names.append('label')


from sklearn.model_selection import train_test_split

df = pd.read_csv(
    'kddcup.data_10_percent_corrected',
    index_col=None,
    low_memory=False,
    header=None,
    names = column_names
)

df.head(10)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.
5,0,tcp,http,SF,217,2032,0,0,0,0,...,59,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.
6,0,tcp,http,SF,212,1940,0,0,0,0,...,69,1.0,0.0,1.0,0.04,0.0,0.0,0.0,0.0,normal.
7,0,tcp,http,SF,159,4087,0,0,0,0,...,79,1.0,0.0,0.09,0.04,0.0,0.0,0.0,0.0,normal.
8,0,tcp,http,SF,210,151,0,0,0,0,...,89,1.0,0.0,0.12,0.04,0.0,0.0,0.0,0.0,normal.
9,0,tcp,http,SF,212,786,0,0,0,1,...,99,1.0,0.0,0.12,0.05,0.0,0.0,0.0,0.0,normal.


In [3]:
Counter(df['label'])

Counter({'normal.': 97278,
         'buffer_overflow.': 30,
         'loadmodule.': 9,
         'perl.': 3,
         'neptune.': 107201,
         'smurf.': 280790,
         'guess_passwd.': 53,
         'pod.': 264,
         'teardrop.': 979,
         'portsweep.': 1040,
         'ipsweep.': 1247,
         'land.': 21,
         'ftp_write.': 8,
         'back.': 2203,
         'imap.': 12,
         'satan.': 1589,
         'phf.': 4,
         'nmap.': 231,
         'multihop.': 7,
         'warezmaster.': 20,
         'warezclient.': 1020,
         'spy.': 2,
         'rootkit.': 10})

In [4]:
categorical_columns = [ _ for _, v in column_type_dict.items() if v == 'symbolic']
real_value_columns = [ _ for _, v in column_type_dict.items() if v == 'continuous']

real_value_columns,categorical_columns

(['duration',
  'src_bytes',
  'dst_bytes',
  'wrong_fragment',
  'urgent',
  'hot',
  'num_failed_logins',
  'num_compromised',
  'root_shell',
  'su_attempted',
  'num_root',
  'num_file_creations',
  'num_shells',
  'num_access_files',
  'num_outbound_cmds',
  'count',
  'srv_count',
  'serror_rate',
  'srv_serror_rate',
  'rerror_rate',
  'srv_rerror_rate',
  'same_srv_rate',
  'diff_srv_rate',
  'srv_diff_host_rate',
  'dst_host_count',
  'dst_host_srv_count',
  'dst_host_same_srv_rate',
  'dst_host_diff_srv_rate',
  'dst_host_same_src_port_rate',
  'dst_host_srv_diff_host_rate',
  'dst_host_serror_rate',
  'dst_host_srv_serror_rate',
  'dst_host_rerror_rate',
  'dst_host_srv_rerror_rate'],
 ['protocol_type',
  'service',
  'flag',
  'land',
  'logged_in',
  'is_host_login',
  'is_guest_login'])

In [5]:
nonattack_class = ['normal.']
df_normal =  df.loc[df['label'].isin(nonattack_class)]
df_anomalies = df.loc[~df['label'].isin(nonattack_class)]

len(df_normal)

97278

In [6]:
                    
df_anomalies['label'] = 1
df_normal['label'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
df_anomalies = df_anomalies.sample (n=int(len(df_normal)))
master_df = df_normal.append(df_anomalies,ignore_index=True)
master_df = master_df.dropna()

def replace_with_id( row , ref_dict, col):
    value =  row[col]
    if value not in ref_dict.keys():
        row[col] = None
    else:
        row[col] = ref_dict[value]
    return row

In [8]:
single_value_cols = []
target_columns = list(categorical_columns)
entity_count = {}

In [9]:
for i in tqdm(range(len(target_columns))):
    column = target_columns[i]
    valid_values = sorted(set(master_df[column]))
    val2id_dict = { 
        e[1]:e[0] for e in enumerate(valid_values,0)
    }
    print(' --> ', column, 'Number of valid values', len(val2id_dict))
    
    if len(val2id_dict) == 1 :
        single_value_cols.append(column)
        #categorical_columns.remove(column)
        continue
        
    entity_count[column] = len(val2id_dict)
        
    master_df = master_df.parallel_apply(
        replace_with_id,
        axis=1,
        args = (val2id_dict, column,)
    )
        

  0%|          | 0/7 [00:00<?, ?it/s]

 -->  protocol_type Number of valid values 3


 14%|█▍        | 1/7 [00:03<00:19,  3.19s/it]

 -->  service Number of valid values 66


 29%|██▊       | 2/7 [00:06<00:15,  3.13s/it]

 -->  flag Number of valid values 11


 43%|████▎     | 3/7 [00:09<00:12,  3.10s/it]

 -->  land Number of valid values 2


 57%|█████▋    | 4/7 [00:11<00:08,  2.79s/it]

 -->  logged_in Number of valid values 2


 71%|███████▏  | 5/7 [00:13<00:05,  2.58s/it]

 -->  is_host_login Number of valid values 1
 -->  is_guest_login Number of valid values 2


100%|██████████| 7/7 [00:15<00:00,  2.21s/it]


In [10]:
master_df = master_df.dropna()
for s in single_value_cols:
    del master_df[s]
    try:
        categorical_columns.remove(s)
    except:
        pass
print(categorical_columns)

['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_guest_login']


In [11]:
# Normalize the values
def normalize_minmax(value, _max, _min):
    return (value - _min)/(_max -_min)

In [12]:
def create_10_version( df, cat_columns):
    global real_value_columns
    label_Col = 'label'
    df1 = df.copy() 
    for cc in cat_columns:
        if entity_count[cc] == 2:
            _drop_first = True
        else:
            _drop_first = False
        df1 = pd.get_dummies(df1, columns = [cc],drop_first = _drop_first)
    all_columns=list(df1.columns)
    disc_columns = [ c for c in all_columns if c != 'label' and c not in real_value_columns]
    ord_cols = disc_columns + real_value_columns + ['label']
    return df1[ord_cols]

# ============================================= #
Set aside a portion of anomalies as corruptions

For a percent of corruption, y is count of corruption samples;
 
y = a /(100-a)x 
Where x is length of training data 
Assume that 70% of data is used in training
# ============================================= #




In [15]:
def create_set(master_df, real_value_columns, categorical_columns, perc = 1):
    train_len = len(master_df.loc[master_df['label']==0]) * 0.7
    y = int(float(perc) /(100-perc) * train_len)
    
    normal_data = master_df.loc[master_df['label']==0]
    anom_data =  master_df.loc[master_df['label']==1]
    y1 = len(anom_data) - y
    anom_data = anom_data.sample(frac=1.0)
    # Take from the head 
    corruption_data = anom_data.head(y)
    corruption_data['label'] = 2
    # Take the tail 
    test_anom_data = anom_data.tail(y1)
    
    for column in real_value_columns:
        _min1 = min(normal_data[column])
        _max1 = max(normal_data[column])
        _min2 = min(corruption_data[column])
        _max2 = max(corruption_data[column])
        _min = min(_min1,_min2)
        _max = max(_max1,_max2)
        
        if _max == _min: 
            continue
        normal_data[column] = normal_data[column].parallel_apply(normalize_minmax, args= (_max,_min,))
        corruption_data[column] = corruption_data[column].parallel_apply(normalize_minmax, args= (_max,_min,))
        test_anom_data[column] = test_anom_data[column].parallel_apply(normalize_minmax, args= (_max,_min,))
    
    print(' >> ', len(normal_data), len(corruption_data), len(test_anom_data))
    
    new_df = normal_data.append(corruption_data,ignore_index=True)
    new_df = new_df.append(test_anom_data,ignore_index=True)
    
    
    # Perform 1-0 encoding
    new_df_1 = create_10_version( new_df, categorical_columns)
    
    ordered_columns = categorical_columns + real_value_columns + ['label']
    new_df = new_df[ordered_columns]
    
    save_dir = 'processed_mixed_'+str(perc)
    path_obj = Path(save_dir)
    path_obj.mkdir(exist_ok=True, parents=True)
    col_name_list = []
    dimensionality = []
    data = []
    
    for c in categorical_columns:
        col_name_list.append(c)
        v = len(set(master_df[c]))
        dimensionality.append(v)
        data.append((c,v)) 
    df_data_dimensions = pd.DataFrame(
        data = data,
        columns=['column','dimension']
    )

    # Save metadata
    f_name = 'data_dimensions.csv'
    f_path = os.path.join(save_dir, f_name )
    df_data_dimensions.to_csv(f_path,index=False)

    utils.save_csv(new_df_1, os.path.join(save_dir,'data_onehot.csv'))
    utils.save_csv(new_df, os.path.join(save_dir,'data.csv'))
    return 



In [16]:
create_set(master_df, real_value_columns, categorical_columns, perc = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docume

 >>  97278 687 96591
Size {:.3f} 67.97450923919678  MB 
Size {:.3f} 41.81156826019287  MB 


In [17]:
create_set(master_df, real_value_columns, categorical_columns, perc = 2)
create_set(master_df, real_value_columns, categorical_columns, perc = 3)
create_set(master_df, real_value_columns, categorical_columns, perc = 4)
create_set(master_df, real_value_columns, categorical_columns, perc = 5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docume

 >>  97278 1389 95889
Size {:.3f} 67.97450923919678  MB 
Size {:.3f} 41.81156826019287  MB 
 >>  97278 2106 95172
Size {:.3f} 67.97676086425781  MB 
Size {:.3f} 41.813819885253906  MB 
 >>  97278 2837 94441
Size {:.3f} 67.95990943908691  MB 
Size {:.3f} 41.79696846008301  MB 
 >>  97278 3583 93695
Size {:.3f} 67.95508193969727  MB 
Size {:.3f} 41.79214096069336  MB 


['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_guest_login']


In [9]:
# Save the files



# ===========================
# Write out the dimensionality of the columns into a text file
# ============================



Size {:.3f} 68.23318290710449  MB 
Size {:.3f} 42.441330909729004  MB 


42.441330909729004