# KDDCup1999

## Import libraries and configure

In [78]:
import numpy as np
import pandas as pd
import klib

import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics import RocCurveDisplay, confusion_matrix, ConfusionMatrixDisplay, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [30]:
# Column names applied to the data file, in file order; the final column,
# 'label', is the one the later cells use as the target.
labels = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
    'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'label',
]

# index_col=False keeps pandas from using the first data column as the index.
df = pd.read_csv(
    './data/kddcup/kddcup.data.corrected',
    names=labels,
    index_col=False,
)

## Inspect and clean data

In [31]:
# Preview the first rows of the freshly loaded frame (head() defaults to 5 rows).
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


In [4]:
# Print the index range, column list and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898431 entries, 0 to 4898430
Data columns (total 42 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   duration                     int64  
 1   protocol_type                object 
 2   service                      object 
 3   flag                         object 
 4   src_bytes                    int64  
 5   dst_bytes                    int64  
 6   land                         int64  
 7   wrong_fragment               int64  
 8   urgent                       int64  
 9   hot                          int64  
 10  num_failed_logins            int64  
 11  logged_in                    int64  
 12  num_compromised              int64  
 13  root_shell                   int64  
 14  su_attempted                 int64  
 15  num_root                     int64  
 16  num_file_creations           int64  
 17  num_shells                   int64  
 18  num_access_files             int64  
 19  

In [5]:
# List the distinct values of the 'label' column; the values carry a trailing
# '.' (e.g. 'normal.'), which later comparisons must include.
df['label'].unique()

array(['normal.', 'buffer_overflow.', 'loadmodule.', 'perl.', 'neptune.',
       'smurf.', 'guess_passwd.', 'pod.', 'teardrop.', 'portsweep.',
       'ipsweep.', 'land.', 'ftp_write.', 'back.', 'imap.', 'satan.',
       'phf.', 'nmap.', 'multihop.', 'warezmaster.', 'warezclient.',
       'spy.', 'rootkit.'], dtype=object)

In [32]:
def clean_df(df, inplace=True):
    """Clean a DataFrame and report every step on stdout.

    The steps run in this order:

    1. Negative values in numeric columns are replaced by 0.
    2. Columns holding a single distinct value are dropped.
    3. +inf / -inf are turned into NaN and every row containing a NaN is
       dropped.
    4. Duplicate rows are dropped (the index is not reset).
    5. For each pair of columns whose contents compare equal with
       ``Series.equals``, the second column of the pair is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.
    inplace : bool, default True
        When True (the historical behaviour) ``df`` itself is modified and
        returned. When False a copy is cleaned and ``df`` is left untouched.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame; the very same object as ``df`` when ``inplace`` is
        True.
    """
    if not inplace:
        df = df.copy()

    # These features should only have values >= 0.
    # Write back through df[col] instead of assigning into the frame returned
    # by the private _get_numeric_data(): that assignment only reaches ``df``
    # when the two share memory, which is not the case when pandas
    # Copy-on-Write is active.
    for col in df.select_dtypes(include='number').columns:
        if (df[col] < 0).any():
            df[col] = df[col].clip(lower=0)

    # nunique(dropna=False) counts NaN as a value, like len(unique()) does.
    zero_variance_cols = [
        col for col in df.columns if df[col].nunique(dropna=False) == 1
    ]
    df.drop(zero_variance_cols, axis = 1, inplace = True)
    print('\nzero variance columns', zero_variance_cols, 'dropped')
    print('shape after removing zero variance columns:', df.shape)

    df.replace([np.inf, -np.inf], np.nan, inplace = True)
    print()
    # Count the affected rows before they are removed so the message is accurate.
    print(df.isna().any(axis = 1).sum(), 'rows dropped')
    df.dropna(inplace = True)
    print('\nshape after removing nan:', df.shape)

    # Drop duplicate rows
    df.drop_duplicates(inplace = True)
    print('\nshape after dropping duplicates:', df.shape)

    column_pairs = [(i, j) for i, j in combinations(df, 2) if df[i].equals(df[j])]
    # A column can be the second member of several pairs (a == b == c gives
    # (a, c) and (b, c)); keep each name once, in first-seen order.
    ide_cols = list(dict.fromkeys(second for _, second in column_pairs))
    df.drop(ide_cols, axis = 1, inplace = True)
    print('\ncolumns which have identical values', column_pairs, 'dropped')
    print('shape after removing identical value columns:', df.shape)
    return df
# clean_df modifies the frame it is given and returns that same frame, so after
# this call `df_cleaned` and `df` refer to one object and the raw data is no
# longer available under `df`.
df_cleaned = clean_df(df)


zero variance columns ['num_outbound_cmds'] dropped
shape after removing zero variance columns: (4898431, 41)

0 rows dropped

shape after removing nan: (4898431, 41)

shape after dropping duplicates: (1074992, 41)

columns which have identical values [] dropped
shape after removing identical value columns: (1074992, 41)


In [33]:
# Pass the frame through klib's cleaning routine and rebind df_cleaned to the
# result; the call prints its own summary (dropped rows/columns, memory saved).
# NOTE(review): the cell overwrites its own input, so re-running it feeds the
# already-processed frame back in.
df_cleaned = klib.data_cleaning(df_cleaned)

Shape of cleaned data: (1074992, 41) - Remaining NAs: 0


Dropped rows: 0
     of which 0 duplicates. (Rows (first 150 shown): [])

Dropped columns: 0
     of which 0 single valued.     Columns: []
Dropped missing values: 0
Reduced memory by at least: 273.16 MB (-72.51%)



In [18]:
# Inspect index, non-null counts and dtypes of the cleaned frame.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1074992 entries, 0 to 1074991
Data columns (total 41 columns):
 #   Column                       Non-Null Count    Dtype   
---  ------                       --------------    -----   
 0   duration                     1074992 non-null  int32   
 1   protocol_type                1074992 non-null  category
 2   service                      1074992 non-null  category
 3   flag                         1074992 non-null  category
 4   src_bytes                    1074992 non-null  int32   
 5   dst_bytes                    1074992 non-null  int32   
 6   land                         1074992 non-null  int8    
 7   wrong_fragment               1074992 non-null  int8    
 8   urgent                       1074992 non-null  int8    
 9   hot                          1074992 non-null  int8    
 10  num_failed_logins            1074992 non-null  int8    
 11  logged_in                    1074992 non-null  int8    
 12  num_compromised             

In [34]:
# Distinct label values remaining after cleaning.
df_cleaned['label'].unique()

['normal.', 'buffer_overflow.', 'loadmodule.', 'perl.', 'neptune.', ..., 'multihop.', 'warezmaster.', 'warezclient.', 'spy.', 'rootkit.']
Length: 23
Categories (23, object): ['back.', 'buffer_overflow.', 'ftp_write.', 'guess_passwd.', ..., 'spy.', 'teardrop.', 'warezclient.', 'warezmaster.']

In [36]:
# Work on a copy from here on so that df_cleaned keeps the original
# multi-class labels while df_cleaned2 is modified by the cells below.
df_cleaned2 = df_cleaned.copy()

In [37]:
# Collapse the labels into a binary target: 0 for 'normal.', 1 for any other
# value. The comparison is done on the whole column at once.
df_cleaned2['label'] = (
    df_cleaned2['label']
    .ne('normal.')
    .astype('int64')
)

In [39]:
# Class balance of the binary target (0 = 'normal.', 1 = everything else).
df_cleaned2['label'].value_counts()

label
0    812814
1    262178
Name: count, dtype: int64

In [41]:
# Re-run klib's cleaning now that the label is binary and rebind df_cleaned2
# to the result; the call prints a summary of what it dropped.
# NOTE(review): rows that differed only in their original label can become
# exact duplicates after the binarisation above — check the printed summary.
df_cleaned2 = klib.data_cleaning(df_cleaned2)

Shape of cleaned data: (1074983, 41) - Remaining NAs: 0


Dropped rows: 9
     of which 9 duplicates. (Rows (first 150 shown): [434927, 531587, 551535, 693519, 698381, 717520, 724707, 724710, 747443])

Dropped columns: 0
     of which 0 single valued.     Columns: []
Dropped missing values: 0
Reduced memory by at least: 7.17 MB (-6.48%)



In [55]:
# Split the columns of df_cleaned2 into a numeric and a categorical group.
# select_dtypes is public pandas API (the private _get_numeric_data() is not);
# 'bool' is listed next to 'number' so that boolean columns, if any, are kept
# out of the categorical list.
num_cols = df_cleaned2.select_dtypes(include=['number', 'bool']).columns

# Walk the frame's own columns so the result keeps column order; a set
# difference would return the names in an arbitrary, run-dependent order.
cate_cols = [col for col in df_cleaned2.columns if col not in num_cols]

cate_cols

['service', 'flag', 'protocol_type']

In [51]:
# Preview the last rows of the processed frame (tail() defaults to 5 rows).
df_cleaned2.tail()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
1074978,0,tcp,http,SF,212,2288,0,0,0,0,...,255,1.0,0.0,0.33,0.05,0.0,0.01,0.0,0.0,0
1074979,0,tcp,http,SF,219,236,0,0,0,0,...,255,1.0,0.0,0.25,0.05,0.0,0.01,0.0,0.0,0
1074980,0,tcp,http,SF,218,3610,0,0,0,0,...,255,1.0,0.0,0.2,0.05,0.0,0.01,0.0,0.0,0
1074981,0,tcp,http,SF,219,1234,0,0,0,0,...,255,1.0,0.0,0.17,0.05,0.0,0.01,0.0,0.0,0
1074982,0,tcp,http,SF,219,1098,0,0,0,0,...,255,1.0,0.0,0.14,0.05,0.0,0.01,0.0,0.0,0


In [77]:
# Visualization
def bar_graph(feature, data=None):
    """Draw a bar chart of how often each value of ``feature`` occurs.

    Parameters
    ----------
    feature : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df_cleaned2`` is used; it is looked up at call time.

    Returns
    -------
    None
        The chart is drawn on the current matplotlib figure.
    """
    frame = df_cleaned2 if data is None else data
    ax = frame[feature].value_counts().plot(kind="bar")
    # Label the axes so the figure can be read on its own.
    ax.set(xlabel=feature, ylabel="count", title=f"Value counts of {feature}")

In [81]:
# Integer-encode the categorical feature columns.
# A separate LabelEncoder is fitted for each column and kept in `encoders`, so
# every column's code -> category mapping stays available afterwards (for
# example through encoders['service'].inverse_transform(...)). Refitting one
# shared encoder would keep only the mapping of the last column.
# `le` is left bound to the encoder fitted last ('flag').
# NOTE: running this cell again on the already-encoded columns refits the
# encoders on the integer codes, which loses the original category names.
encoders = {}
for col in ['protocol_type', 'service', 'flag']:
    le = LabelEncoder()
    df_cleaned2[col] = le.fit_transform(df_cleaned2[col])
    encoders[col] = le

In [85]:
# Final klib cleaning pass on the label-encoded frame; rebinds df_cleaned2.
# NOTE(review): the summary recorded for this call reports nothing dropped and
# no memory saved, so this pass may be redundant — confirm before removing.
df_cleaned2 = klib.data_cleaning(df_cleaned2)

Shape of cleaned data: (1074983, 41) - Remaining NAs: 0


Dropped rows: 0
     of which 0 duplicates. (Rows (first 150 shown): [])

Dropped columns: 0
     of which 0 single valued.     Columns: []
Dropped missing values: 0
Reduced memory by at least: 0.0 MB (-0.0%)



In [89]:
# Persist the processed frame as CSV in the current working directory.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default. Confirm how downstream code reads this file (e.g. index_col=0)
# before switching to index=False.
df_cleaned2.to_csv('kddcup_processed.csv')