In [15]:
import pandas as pd
import os

'''
Check that the test and training CSV files have the same column names,
keep only the columns present in both, and write the filtered data back
to the files it was read from.
'''

test_file = 'UNSW_NB15_test-set.csv'
train_file = 'UNSW_NB15_training-set.csv'

# Build each path once so that every read and write targets the same file.
test_path = os.path.join('data', test_file)
train_path = os.path.join('data', train_file)

test_df = pd.read_csv(test_path)
train_df = pd.read_csv(train_path)
print('Unfiltered ' + test_file + ' dataframe shape: ' + str(test_df.shape))
print('Unfiltered ' + train_file + ' dataframe shape: ' + str(train_df.shape))

# Find common columns between test and training datasets
test_columns = set(test_df.columns)
train_columns = set(train_df.columns)
if test_columns == train_columns:
    print('\n Columns in both files are the same! \n')
else:
    # Show exactly which columns are about to be dropped from each file.
    print('\n Columns only in ' + test_file + ': ' + str(sorted(test_columns - train_columns)))
    print(' Columns only in ' + train_file + ': ' + str(sorted(train_columns - test_columns)) + '\n')

common_columns = test_columns.intersection(train_columns)
# A set has no defined order, so select the shared columns in the order they
# appear in the test file; this keeps the column layout stable between runs
# and identical across both output files.
ordered_common_columns = [column for column in test_df.columns if column in common_columns]
filtered_test_df = test_df[ordered_common_columns]
filtered_train_df = train_df[ordered_common_columns]

# Write the filtered dataframes back to the CSV files they were loaded from
# (inside the data directory), so the reload below sees the filtered data.
filtered_test_df.to_csv(test_path, index=False)
filtered_train_df.to_csv(train_path, index=False)

test_df = pd.read_csv(test_path)
train_df = pd.read_csv(train_path)
print('Filtered ' + test_file + ' dataframe shape: ' + str(test_df.shape))
print('Filtered ' + train_file + ' dataframe shape: ' + str(train_df.shape))

Unfiltered UNSW_NB15_test-set.csv dataframe shape: (82332, 45)
Unfiltered UNSW_NB15_training-set.csv dataframe shape: (175341, 45)

 Columns in both files are the same! 

Filtered UNSW_NB15_test-set.csv dataframe shape: (82332, 45)
Filtered UNSW_NB15_training-set.csv dataframe shape: (175341, 45)


In [24]:
'''
Compare the categorical values found in the training and test data.

For each categorical column, report the values that appear in only one of
the two datasets and store the values common to both in categorical_columns.
'''

proto_values = None
service_values = None
state_values = None
attack_cat = None
categorical_columns = {'proto': proto_values, 
                       'service': service_values, 
                       'state': state_values, 
                       'attack_cat': attack_cat}

for key in categorical_columns.keys():
    test_values = set(test_df[key])
    train_values = set(train_df[key])

    # Each difference is computed right next to the label it is reported
    # under, so the two directions cannot get mixed up.
    print('Values only in training data:')
    only_in_train = train_values - test_values
    if only_in_train:
        print(only_in_train)
    else:
        print('None found!')

    print('Values only in testing data:')
    only_in_test = test_values - train_values
    if only_in_test:
        print(only_in_test)
    else:
        print('None found!')

    # Values present in both datasets are kept per column for later use.
    common_values = test_values.intersection(train_values)
    categorical_columns[key] = common_values
    print(key + ':')
    print(common_values)
    print('')

Values only in training data:
None found!
Values only in testing data:
None found!
proto:
{'gre', 'netblt', 'sps', 'sep', 'pim', 'ggp', 'sprite-rpc', 'leaf-2', 'bbn-rcc', 'fc', 'l2tp', 'ospf', 'a/n', 'sun-nd', 'mobile', 'pri-enc', 'ipv6-no', 'sdrp', 'rvd', 'igmp', 'xtp', 'ipnip', 'bna', 'emcon', 'larp', 'pipe', 'mhrp', 'rsvp', 'compaq-peer', 'argus', 'dcn', 'uti', 'encap', 'gmtp', 'irtp', 'aris', 'secure-vmtp', 'isis', 'merit-inp', 'iso-ip', 'cftp', 'tcp', 'br-sat-mon', 'ipv6-route', 'wb-expak', 'stp', 'ipv6', 'iplt', 'skip', 'rdp', 'cphb', 'st2', 'ipv6-opts', 'unas', 'ipcv', 'ipv6-frag', 'trunk-2', 'mtp', 'hmp', 'iatp', 'ddx', 'ipx-n-ip', 'idpr', 'udp', 'vines', 'leaf-1', 'idrp', 'i-nlsp', 'crudp', 'cbt', 'chaos', 'igp', 'aes-sp3-d', 'pup', 'sctp', 'arp', 'mux', 'scps', 'pnni', 'micp', 'eigrp', 'kryptolan', 'pvp', 'any', 'ptp', 'fire', 'sccopmce', 'narp', 'wsn', 'srp', 'dgp', 'nsfnet-igp', 'smp', 'sat-mon', 'egp', '3pc', 'ip', 'prm', 'vrrp', 'tlsp', 'nvp', 'ifmp', 'sat-expak', 'wb-mon

In [None]:
'''
Encode categorical features and normalize numeric features.
'''

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    """One-hot encode the column `name` of `df` in place.

    For every distinct value in df[name], a new indicator column called
    "<name>-<value>" is appended to `df`; the original column is then
    dropped. The dataframe is modified in place and nothing is returned.
    """
    one_hot = pd.get_dummies(df[name])
    # items() yields (category label, indicator Series) pairs in column order.
    for category, indicator in one_hot.items():
        df["{}-{}".format(name, category)] = indicator
    df.drop(columns=name, inplace=True)