In [24]:
import pandas as pd
import numpy as np
import os

'''
Checking for same column names.
'''

test_file = 'UNSW_NB15_test-set.csv'
train_file = 'UNSW_NB15_training-set.csv'

test_df = pd.read_csv(os.path.join('data', test_file))
train_df = pd.read_csv(os.path.join('data', train_file))

print('Unfiltered ' + test_file + ' dataframe shape: ' + str(test_df.shape))
print('Unfiltered ' + train_file + ' dataframe shape: ' + str(train_df.shape))

# Find common columns between test and training datasets
test_columns = set(test_df.columns)
train_columns = set(train_df.columns)
if test_columns == train_columns:
    print('\n Columns in both files are the same! \n')

common_columns = test_columns.intersection(train_columns)
filtered_test_df = test_df[list(common_columns)]
filtered_train_df = train_df[list(common_columns)]

filtered_test_df.drop('is_sm_ips_ports', axis=1, inplace=True)
filtered_train_df.drop('is_sm_ips_ports', axis=1, inplace=True)

if 'is_sm_ips_ports' in filtered_test_df.columns:
    print('LOOK RIGHT HERE')

# Write the filtered dataframes back to CSV files
filtered_test_df.to_csv(test_file, index=False)
filtered_train_df.to_csv(train_file, index=False)

test_df = pd.read_csv(os.path.join('data', test_file))
train_df = pd.read_csv(os.path.join('data', train_file))

print('Filtered ' + test_file + ' dataframe shape: ' + str(test_df.shape))
print('Filtered ' + train_file + ' dataframe shape: ' + str(train_df.shape))



Unfiltered UNSW_NB15_test-set.csv dataframe shape: (82332, 45)
Unfiltered UNSW_NB15_training-set.csv dataframe shape: (175341, 45)

 Columns in both files are the same! 



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_test_df.drop('is_sm_ips_ports', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_train_df.drop('is_sm_ips_ports', axis=1, inplace=True)


Filtered UNSW_NB15_test-set.csv dataframe shape: (82332, 45)
Filtered UNSW_NB15_training-set.csv dataframe shape: (175341, 45)


In [13]:
'''
Drop all rows with missing values.
'''

print('Unfiltered ' + test_file + ' dataframe shape: ' + str(test_df.shape))
print('Unfiltered ' + train_file + ' dataframe shape: ' + str(train_df.shape))

# Replace '-' with NaN
test_df.replace('-', np.nan, inplace=True)
train_df.replace('-', np.nan, inplace=True)

# Remove rows containing NaN values
test_df.dropna(axis=0, how='any', inplace=True)
train_df.dropna(axis=0, how='any', inplace=True)

print('Filtered ' + test_file + ' dataframe shape: ' + str(test_df.shape))
print('Filtered ' + train_file + ' dataframe shape: ' + str(train_df.shape))

# Write the filtered dataframes back to CSV files
test_df.to_csv(test_file, index=False)
train_df.to_csv(train_file, index=False)

train_df



Unfiltered UNSW_NB15_test-set.csv dataframe shape: (82332, 45)
Unfiltered UNSW_NB15_training-set.csv dataframe shape: (175341, 45)
Filtered UNSW_NB15_test-set.csv dataframe shape: (35179, 45)
Filtered UNSW_NB15_training-set.csv dataframe shape: (81173, 45)


Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
11,12,2.093085,tcp,smtp,FIN,62,28,56329,2212,42.520967,...,1,2,0,0,0,1,1,0,Normal,0
15,16,0.000002,udp,snmp,INT,2,0,138,0,500000.001300,...,1,4,0,0,0,2,1,0,Normal,0
17,18,0.393556,tcp,http,FIN,10,8,860,1096,43.195886,...,1,2,0,0,1,1,3,0,Normal,0
21,22,0.338017,tcp,http,FIN,10,6,998,268,44.376468,...,1,1,0,0,1,2,3,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175335,175336,0.000006,udp,dns,INT,2,0,114,0,166666.660800,...,17,45,0,0,0,33,45,0,Generic,1
175336,175337,0.000009,udp,dns,INT,2,0,114,0,111111.107200,...,13,24,0,0,0,24,24,0,Generic,1
175338,175339,0.000009,udp,dns,INT,2,0,114,0,111111.107200,...,3,13,0,0,0,3,12,0,Generic,1
175339,175340,0.000009,udp,dns,INT,2,0,114,0,111111.107200,...,14,30,0,0,0,30,30,0,Generic,1


In [14]:
'''
Detect records with categorical values that only appear in training and test data.
'''

proto_values = None
service_values = None
state_values = None
attack_cat = None
categorical_columns = {'proto': proto_values, 
                       'service': service_values, 
                       'state': state_values, 
                       'attack_cat': attack_cat}

for key in categorical_columns.keys():
    test_values = set(test_df[key])
    train_values = set(train_df[key])
    if test_values == train_values:
        print('The values are the same. Dont trip dawg')
    else:
        print('Values only in training data:')
        deleted_values = test_values - train_values
        if not deleted_values:
            print('None found!')
        else:
            print(deleted_values)
        print('Values only in testing data:')
        deleted_values = train_values - test_values
        if not deleted_values:
            print('None found!')
        else:
            print(deleted_values)
    common_values = test_values.intersection(train_values)
    categorical_columns[key] = common_values
    print('Common categorical values for ' + key + ' column:')
    print(common_values)
    print('')

The values are the same. Dont trip dawg
Common categorical values for proto column:
{'tcp', 'udp'}

The values are the same. Dont trip dawg
Common categorical values for service column:
{'smtp', 'ssh', 'radius', 'snmp', 'irc', 'ftp', 'ssl', 'dns', 'dhcp', 'ftp-data', 'pop3', 'http'}

Values only in training data:
{'ACC'}
Values only in testing data:
{'RST'}
Common categorical values for state column:
{'FIN', 'REQ', 'INT', 'CON'}

Values only in training data:
None found!
Values only in testing data:
{'Analysis'}
Common categorical values for attack_cat column:
{'Exploits', 'Worms', 'Generic', 'Reconnaissance', 'Normal', 'Backdoor', 'Fuzzers', 'DoS'}



In [15]:
'''
Remove all the records with categorical values that only appear in training and test data.
'''

print('Unfiltered ' + test_file + ' dataframe shape: ' + str(test_df.shape))
print('Unfiltered ' + train_file + ' dataframe shape: ' + str(train_df.shape))

for key in categorical_columns.keys():
    # Find unique values in the test and train datasets for the current column
    test_values = set(test_df[key])
    train_values = set(train_df[key])

    # Find values only in the training data and values only in the testing data
    deleted_values_test = test_values - train_values
    deleted_values_train = train_values - test_values

    # Combine deleted values from both datasets
    deleted_values = deleted_values_test.union(deleted_values_train)

    # Check if there are any deleted values
    if deleted_values:
        # Create a boolean mask to identify records containing deleted values
        mask_test = test_df[key].isin(deleted_values)
        mask_train = train_df[key].isin(deleted_values)
        
        # Delete records containing deleted values from both datasets
        test_df = test_df[~mask_test]
        train_df = train_df[~mask_train]

print('Filtered ' + test_file + ' dataframe shape: ' + str(test_df.shape))
print('Filtered ' + train_file + ' dataframe shape: ' + str(train_df.shape))

Unfiltered UNSW_NB15_test-set.csv dataframe shape: (35179, 45)
Unfiltered UNSW_NB15_training-set.csv dataframe shape: (81173, 45)
Filtered UNSW_NB15_test-set.csv dataframe shape: (35178, 45)
Filtered UNSW_NB15_training-set.csv dataframe shape: (80595, 45)


In [16]:
'''
Encode categorical features and hot encode 'attack_cat' column.
'''

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)

def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

print(categorical_columns.keys())

for key in categorical_columns.keys():
    if key != 'attack_cat':
        print(key)
        encode_text_dummy(test_df, key)
        encode_text_dummy(train_df, key)

# Write the filtered dataframes back to CSV files
test_df.to_csv(test_file, index=False)
train_df.to_csv(train_file, index=False)

print(train_df.columns)
train_df



dict_keys(['proto', 'service', 'state', 'attack_cat'])
proto
service
state
Index(['id', 'dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl',
       'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit',
       'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat',
       'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src',
       'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd',
       'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label',
       'proto-tcp', 'proto-udp', 'service-dhcp', 'service-dns', 'service-ftp',
       'service-ftp-data', 'service-http', 'service-irc', 'service-pop3',
       'service-radius', 'service-smtp', 'service-snmp', 'service-ssh',
       'service-ssl', 'state-CON', 'state-FIN', 'state-INT', 'state-REQ'],
      dtype='object')


Unnamed: 0,id,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,...,service-pop3,service-radius,service-smtp,service-snmp,service-ssh,service-ssl,state-CON,state-FIN,state-INT,state-REQ
3,4,1.681642,12,12,628,770,13.677108,62,252,2.740179e+03,...,False,False,False,False,False,False,False,True,False,False
11,12,2.093085,62,28,56329,2212,42.520967,62,252,2.118251e+05,...,False,False,True,False,False,False,False,True,False,False
15,16,0.000002,2,0,138,0,500000.001300,254,0,2.760000e+08,...,False,False,False,True,False,False,False,False,True,False
17,18,0.393556,10,8,860,1096,43.195886,62,252,1.573347e+04,...,False,False,False,False,False,False,False,True,False,False
21,22,0.338017,10,6,998,268,44.376468,254,252,2.127704e+04,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175335,175336,0.000006,2,0,114,0,166666.660800,254,0,7.600000e+07,...,False,False,False,False,False,False,False,False,True,False
175336,175337,0.000009,2,0,114,0,111111.107200,254,0,5.066666e+07,...,False,False,False,False,False,False,False,False,True,False
175338,175339,0.000009,2,0,114,0,111111.107200,254,0,5.066666e+07,...,False,False,False,False,False,False,False,False,True,False
175339,175340,0.000009,2,0,114,0,111111.107200,254,0,5.066666e+07,...,False,False,False,False,False,False,False,False,True,False


In [17]:
''' 
Normalize numeric features.
'''

# Select numerical columns
print(set(train_df.dtypes))
numerical_cols = train_df.select_dtypes(include=['float64', 'int64']).columns

# Normalize numerical columns
train_df[numerical_cols] = (train_df[numerical_cols] - train_df[numerical_cols].min()) / (train_df[numerical_cols].max() - train_df[numerical_cols].min())
# Normalize numerical columns
test_df[numerical_cols] = (test_df[numerical_cols] - test_df[numerical_cols].min()) / (test_df[numerical_cols].max() - test_df[numerical_cols].min())

print(train_df.shape)
# Unnecessary column for training/testing purposes
test_df.drop('id', axis=1, inplace=True)
train_df.drop('id', axis=1, inplace=True)
print(train_df.shape)

# Write the filtered dataframes back to CSV files
test_df.to_csv(test_file, index=False)
train_df.to_csv(train_file, index=False)

test_df

{dtype('O'), dtype('bool'), dtype('int64'), dtype('float64')}
(80595, 60)
(80595, 59)


Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,...,service-pop3,service-radius,service-smtp,service-snmp,service-ssh,service-ssl,state-CON,state-FIN,state-INT,state-REQ
35,0.016398,0.000845,0.000726,0.000052,0.000080,0.000017,0.138393,0.996047,0.000003,0.000401,...,False,False,False,False,False,False,False,True,False,False
40,0.025588,0.000845,0.000908,0.000053,0.000086,0.000012,0.138393,0.996047,0.000002,0.000285,...,False,False,False,False,False,False,False,True,False,False
45,0.017656,0.000845,0.000726,0.000053,0.000077,0.000016,0.138393,0.996047,0.000002,0.000360,...,False,False,False,False,False,False,False,True,False,False
49,0.016509,0.000845,0.000908,0.000051,0.000096,0.000019,0.138393,0.996047,0.000003,0.000494,...,False,False,False,False,False,False,False,True,False,False
72,0.021725,0.001033,0.000726,0.000058,0.000076,0.000015,0.138393,0.996047,0.000002,0.000289,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81462,0.013638,0.005543,0.001271,0.004746,0.000042,0.000089,0.995536,0.996047,0.000285,0.000267,...,False,False,False,False,False,False,False,True,False,False
81466,0.013753,0.005543,0.001271,0.004746,0.000042,0.000088,0.995536,0.996047,0.000282,0.000265,...,False,False,False,False,False,False,False,True,False,False
81518,0.014582,0.005543,0.001271,0.004746,0.000042,0.000083,0.995536,0.996047,0.000266,0.000250,...,False,False,False,False,False,False,False,True,False,False
81540,0.012337,0.005543,0.001271,0.004746,0.000042,0.000099,0.995536,0.996047,0.000315,0.000295,...,False,False,False,False,False,False,False,True,False,False


In [18]:
'''
Hot encode 'attack_cat' column for labeling.
'''

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Function to encode text values to integers
def encode_text_integer(df, name):
    encoder = LabelEncoder()
    df[name] = encoder.fit_transform(df[name])

# Encode 'attack_cat' column to integers
encode_text_integer(train_df, 'attack_cat')
encode_text_integer(test_df, 'attack_cat')
train_df['attack_cat'] = train_df['attack_cat'].astype('int32')
test_df['attack_cat'] = test_df['attack_cat'].astype('int32')

# Assuming df is your DataFrame
train_boolean_columns = train_df.select_dtypes(include='bool').columns
test_boolean_columns = test_df.select_dtypes(include='bool').columns
train_df[train_boolean_columns] = train_df[train_boolean_columns].astype('int32')
test_df[test_boolean_columns] = test_df[test_boolean_columns].astype('int32')

# Write the filtered dataframes back to CSV files
test_df.to_csv(test_file, index=False)
train_df.to_csv(train_file, index=False)

# Print 'attack_cat' column
print(train_df['attack_cat'])

3         5
11        5
15        5
17        5
21        5
         ..
175335    4
175336    4
175338    4
175339    4
175340    4
Name: attack_cat, Length: 80595, dtype: int32


In [19]:
import tensorflow as tf
print('Currently using Tensorflow version: ' + tf.__version__)

Currently using Tensorflow version: 2.16.1


In [20]:
'''
Extract testing and training data from dataframes.
'''

X_train = train_df.drop('attack_cat', axis=1)  # Exclude 'attack_cat' column
y_train = train_df['attack_cat']
X_test = test_df.drop('attack_cat', axis=1) # Exclude 'attack_cat' column
y_test = test_df['attack_cat']
X_train

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,...,service-pop3,service-radius,service-smtp,service-snmp,service-ssh,service-ssl,state-CON,state-FIN,state-INT,state-REQ
3,2.802780e-02,0.001144,0.001093,0.000041,0.000053,0.000014,0.138393,0.992126,0.000001,0.000150,...,0,0,0,0,0,0,0,1,0,0
11,3.488529e-02,0.006344,0.002551,0.004337,0.000151,0.000043,0.138393,0.992126,0.000092,0.000364,...,0,0,1,0,0,0,0,1,0,0
15,1.666693e-08,0.000104,0.000000,0.000003,0.000000,0.500000,0.995536,0.000000,0.119792,0.000000,...,0,0,0,1,0,0,0,0,1,0
17,6.559354e-03,0.000936,0.000729,0.000059,0.000075,0.000043,0.138393,0.992126,0.000007,0.000869,...,0,0,0,0,0,0,0,1,0,0
21,5.633690e-03,0.000936,0.000547,0.000069,0.000018,0.000044,0.995536,0.992126,0.000009,0.000236,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175335,8.333466e-08,0.000104,0.000000,0.000001,0.000000,0.166667,0.995536,0.000000,0.032986,0.000000,...,0,0,0,0,0,0,0,0,1,0
175336,1.333355e-07,0.000104,0.000000,0.000001,0.000000,0.111111,0.995536,0.000000,0.021991,0.000000,...,0,0,0,0,0,0,0,0,1,0
175338,1.333355e-07,0.000104,0.000000,0.000001,0.000000,0.111111,0.995536,0.000000,0.021991,0.000000,...,0,0,0,0,0,0,0,0,1,0
175339,1.333355e-07,0.000104,0.000000,0.000001,0.000000,0.111111,0.995536,0.000000,0.021991,0.000000,...,0,0,0,0,0,0,0,0,1,0


In [21]:
# Display information about the DataFrame
print(train_df.info())
print(test_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 80595 entries, 3 to 175340
Data columns (total 59 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   dur                80595 non-null  float64
 1   spkts              80595 non-null  float64
 2   dpkts              80595 non-null  float64
 3   sbytes             80595 non-null  float64
 4   dbytes             80595 non-null  float64
 5   rate               80595 non-null  float64
 6   sttl               80595 non-null  float64
 7   dttl               80595 non-null  float64
 8   sload              80595 non-null  float64
 9   dload              80595 non-null  float64
 10  sloss              80595 non-null  float64
 11  dloss              80595 non-null  float64
 12  sinpkt             80595 non-null  float64
 13  dinpkt             80595 non-null  float64
 14  sjit               80595 non-null  float64
 15  djit               80595 non-null  float64
 16  swin               80595 n

In [22]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

# Reshape data for CNN (assuming single channel)
X_train_reshaped = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_reshaped = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Define the CNN model
def create_cnn(input_shape, num_classes):
    model = models.Sequential([
        layers.Conv1D(32, 3, activation='relu', input_shape=input_shape),
        layers.MaxPooling1D(2),
        layers.Conv1D(64, 3, activation='relu'),
        layers.MaxPooling1D(2),
        layers.Conv1D(64, 3, activation='relu'),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(num_classes, activation='softmax')
    ])
    return model

# Define input shape
input_shape = X_train_reshaped.shape[1:]

# Define number of classes
num_classes = y_train.shape[1]

# Create the CNN model
model = create_cnn(input_shape, num_classes)

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train_reshaped, y_train, epochs=10, batch_size=32, validation_data=(X_test_reshaped, y_test))

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test_reshaped, y_test)

print('\nTest accuracy:', test_acc)


AttributeError: 'DataFrame' object has no attribute 'reshape'