In [11]:
import pandas as pd
import numpy as np
import os

'''
Checking for same column names.
'''

test_file = 'UNSW_NB15_test-set.csv'
train_file = 'UNSW_NB15_training-set.csv'

test_df = pd.read_csv(os.path.join('data', test_file))
train_df = pd.read_csv(os.path.join('data', train_file))

print('Unfiltered ' + test_file + ' dataframe shape: ' + str(test_df.shape))
print('Unfiltered ' + train_file + ' dataframe shape: ' + str(train_df.shape))

# Find common columns between test and training datasets
test_columns = set(test_df.columns)
train_columns = set(train_df.columns)
if test_columns == train_columns:
    print('\n Columns in both files are the same! \n')

common_columns = test_columns.intersection(train_columns)
filtered_test_df = test_df[list(common_columns)]
filtered_train_df = train_df[list(common_columns)]

filtered_test_df.drop('is_sm_ips_ports', axis=1, inplace=True)
filtered_train_df.drop('is_sm_ips_ports', axis=1, inplace=True)

# Write the filtered dataframes back to CSV files
filtered_test_df.to_csv(test_file, index=False)
filtered_train_df.to_csv(train_file, index=False)

test_df = pd.read_csv(os.path.join('data', test_file))
train_df = pd.read_csv(os.path.join('data', train_file))
print('Filtered ' + test_file + ' dataframe shape: ' + str(test_df.shape))
print('Filtered ' + train_file + ' dataframe shape: ' + str(train_df.shape))



Unfiltered UNSW_NB15_test-set.csv dataframe shape: (82332, 45)
Unfiltered UNSW_NB15_training-set.csv dataframe shape: (175341, 45)

 Columns in both files are the same! 

Filtered UNSW_NB15_test-set.csv dataframe shape: (82332, 45)
Filtered UNSW_NB15_training-set.csv dataframe shape: (175341, 45)


In [9]:
'''
Drop all rows with missing values.
'''

print('Unfiltered ' + test_file + ' dataframe shape: ' + str(test_df.shape))
print('Unfiltered ' + train_file + ' dataframe shape: ' + str(train_df.shape))

# Replace '-' with NaN
test_df.replace('-', np.nan, inplace=True)
train_df.replace('-', np.nan, inplace=True)

# Remove rows containing NaN values
test_df.dropna(axis=0, how='any', inplace=True)
train_df.dropna(axis=0, how='any', inplace=True)

print('Filtered ' + test_file + ' dataframe shape: ' + str(test_df.shape))
print('Filtered ' + train_file + ' dataframe shape: ' + str(train_df.shape))

# Write the filtered dataframes back to CSV files
test_df.to_csv(test_file, index=False)
train_df.to_csv(train_file, index=False)

train_df



Unfiltered UNSW_NB15_test-set.csv dataframe shape: (0, 66)
Unfiltered UNSW_NB15_training-set.csv dataframe shape: (0, 66)
Filtered UNSW_NB15_test-set.csv dataframe shape: (0, 66)
Filtered UNSW_NB15_training-set.csv dataframe shape: (0, 66)


Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,...,state-INT,state-REQ,attack_cat-Backdoor,attack_cat-DoS,attack_cat-Exploits,attack_cat-Fuzzers,attack_cat-Generic,attack_cat-Normal,attack_cat-Reconnaissance,attack_cat-Worms


In [4]:
'''
Detect records with categorical values that only appear in training and test data.
'''

proto_values = None
service_values = None
state_values = None
attack_cat = None
categorical_columns = {'proto': proto_values, 
                       'service': service_values, 
                       'state': state_values, 
                       'attack_cat': attack_cat}

for key in categorical_columns.keys():
    test_values = set(test_df[key])
    train_values = set(train_df[key])
    if test_values == train_values:
        print('The values are the same. Dont trip dawg')
    else:
        print('Values only in training data:')
        deleted_values = test_values - train_values
        if not deleted_values:
            print('None found!')
        else:
            print(deleted_values)
        print('Values only in testing data:')
        deleted_values = train_values - test_values
        if not deleted_values:
            print('None found!')
        else:
            print(deleted_values)
    common_values = test_values.intersection(train_values)
    categorical_columns[key] = common_values
    print('Common categorical values for ' + key + ' column:')
    print(common_values)
    print('')

The values are the same. Dont trip dawg
Common categorical values for proto column:
{'udp', 'tcp'}

The values are the same. Dont trip dawg
Common categorical values for service column:
{'ftp-data', 'ftp', 'dhcp', 'smtp', 'pop3', 'http', 'snmp', 'dns', 'irc', 'radius', 'ssl', 'ssh'}

Values only in training data:
{'ACC'}
Values only in testing data:
{'RST'}
Common categorical values for state column:
{'REQ', 'INT', 'CON', 'FIN'}

Values only in training data:
None found!
Values only in testing data:
{'Analysis'}
Common categorical values for attack_cat column:
{'Normal', 'Fuzzers', 'DoS', 'Worms', 'Exploits', 'Generic', 'Reconnaissance', 'Backdoor'}



In [5]:
'''
Remove all the records with categorical values that only appear in training and test data.
'''

print('Unfiltered ' + test_file + ' dataframe shape: ' + str(test_df.shape))
print('Unfiltered ' + train_file + ' dataframe shape: ' + str(train_df.shape))

for key in categorical_columns.keys():
    # Find unique values in the test and train datasets for the current column
    test_values = set(test_df[key])
    train_values = set(train_df[key])

    # Find values only in the training data and values only in the testing data
    deleted_values_test = test_values - train_values
    deleted_values_train = train_values - test_values

    # Combine deleted values from both datasets
    deleted_values = deleted_values_test.union(deleted_values_train)

    # Check if there are any deleted values
    if deleted_values:
        # Create a boolean mask to identify records containing deleted values
        mask_test = test_df[key].isin(deleted_values)
        mask_train = train_df[key].isin(deleted_values)
        
        # Delete records containing deleted values from both datasets
        test_df = test_df[~mask_test]
        train_df = train_df[~mask_train]

print('Filtered ' + test_file + ' dataframe shape: ' + str(test_df.shape))
print('Filtered ' + train_file + ' dataframe shape: ' + str(train_df.shape))

Unfiltered UNSW_NB15_test-set.csv dataframe shape: (35179, 45)
Unfiltered UNSW_NB15_training-set.csv dataframe shape: (81173, 45)
Filtered UNSW_NB15_test-set.csv dataframe shape: (35178, 45)
Filtered UNSW_NB15_training-set.csv dataframe shape: (80595, 45)


In [6]:
'''
Encode categorical features.
'''

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)

def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

print(categorical_columns.keys())

for key in categorical_columns.keys():
    encode_text_dummy(test_df, key)

for key in categorical_columns.keys():
    encode_text_dummy(train_df, key)

# Write the filtered dataframes back to CSV files
test_df.to_csv(test_file, index=False)
train_df.to_csv(train_file, index=False)

train_df



dict_keys(['proto', 'service', 'state', 'attack_cat'])


Unnamed: 0,id,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,...,state-INT,state-REQ,attack_cat-Backdoor,attack_cat-DoS,attack_cat-Exploits,attack_cat-Fuzzers,attack_cat-Generic,attack_cat-Normal,attack_cat-Reconnaissance,attack_cat-Worms
3,4,1.681642,12,12,628,770,13.677108,62,252,2.740179e+03,...,False,False,False,False,False,False,False,True,False,False
11,12,2.093085,62,28,56329,2212,42.520967,62,252,2.118251e+05,...,False,False,False,False,False,False,False,True,False,False
15,16,0.000002,2,0,138,0,500000.001300,254,0,2.760000e+08,...,True,False,False,False,False,False,False,True,False,False
17,18,0.393556,10,8,860,1096,43.195886,62,252,1.573347e+04,...,False,False,False,False,False,False,False,True,False,False
21,22,0.338017,10,6,998,268,44.376468,254,252,2.127704e+04,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175335,175336,0.000006,2,0,114,0,166666.660800,254,0,7.600000e+07,...,True,False,False,False,False,False,True,False,False,False
175336,175337,0.000009,2,0,114,0,111111.107200,254,0,5.066666e+07,...,True,False,False,False,False,False,True,False,False,False
175338,175339,0.000009,2,0,114,0,111111.107200,254,0,5.066666e+07,...,True,False,False,False,False,False,True,False,False,False
175339,175340,0.000009,2,0,114,0,111111.107200,254,0,5.066666e+07,...,True,False,False,False,False,False,True,False,False,False


In [7]:
''' 
Normalize numeric features.
'''


# Select numerical columns
print(set(train_df.dtypes))
numerical_cols = train_df.select_dtypes(include=['float64', 'int64']).columns

# Normalize numerical columns
train_df[numerical_cols] = (train_df[numerical_cols] - train_df[numerical_cols].min()) / (train_df[numerical_cols].max() - train_df[numerical_cols].min())
# Normalize numerical columns
test_df[numerical_cols] = (test_df[numerical_cols] - test_df[numerical_cols].min()) / (test_df[numerical_cols].max() - test_df[numerical_cols].min())

print(train_df.shape)
# Unnecessary column for training/testing purposes
test_df.drop('id', axis=1, inplace=True)
train_df.drop('id', axis=1, inplace=True)
print(train_df.shape)

# Write the filtered dataframes back to CSV files
test_df.to_csv(test_file, index=False)
train_df.to_csv(train_file, index=False)

test_df

{dtype('bool'), dtype('float64'), dtype('int64')}
(80595, 67)
(80595, 66)


Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,...,state-INT,state-REQ,attack_cat-Backdoor,attack_cat-DoS,attack_cat-Exploits,attack_cat-Fuzzers,attack_cat-Generic,attack_cat-Normal,attack_cat-Reconnaissance,attack_cat-Worms
35,0.016398,0.000845,0.000726,0.000052,0.000080,0.000017,0.138393,0.996047,0.000003,0.000401,...,False,False,False,False,False,False,False,True,False,False
40,0.025588,0.000845,0.000908,0.000053,0.000086,0.000012,0.138393,0.996047,0.000002,0.000285,...,False,False,False,False,False,False,False,True,False,False
45,0.017656,0.000845,0.000726,0.000053,0.000077,0.000016,0.138393,0.996047,0.000002,0.000360,...,False,False,False,False,False,False,False,True,False,False
49,0.016509,0.000845,0.000908,0.000051,0.000096,0.000019,0.138393,0.996047,0.000003,0.000494,...,False,False,False,False,False,False,False,True,False,False
72,0.021725,0.001033,0.000726,0.000058,0.000076,0.000015,0.138393,0.996047,0.000002,0.000289,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81462,0.013638,0.005543,0.001271,0.004746,0.000042,0.000089,0.995536,0.996047,0.000285,0.000267,...,False,False,False,False,False,False,False,True,False,False
81466,0.013753,0.005543,0.001271,0.004746,0.000042,0.000088,0.995536,0.996047,0.000282,0.000265,...,False,False,False,False,False,False,False,True,False,False
81518,0.014582,0.005543,0.001271,0.004746,0.000042,0.000083,0.995536,0.996047,0.000266,0.000250,...,False,False,False,False,False,False,False,True,False,False
81540,0.012337,0.005543,0.001271,0.004746,0.000042,0.000099,0.995536,0.996047,0.000315,0.000295,...,False,False,False,False,False,False,False,True,False,False


In [10]:
''' 
Fully Connected Neural network
'''
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define the model architecture
model = Sequential([
    Dense(64, activation='relu', input_shape=(65,)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'Precision', 'Recall'])

# Train the model
history = model.fit(train_df.drop(columns=['label']), train_df['label'], 
                    epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on test data
test_loss, test_accuracy, test_precision, test_recall = model.evaluate(test_df.drop(columns=['label']), 
                                                                       test_df['label'])

print('Test Loss:', test_loss)
print('Test Accuracy:', test_accuracy)
print('Test Precision:', test_precision)
print('Test Recall:', test_recall)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


ValueError: Training data contains 0 samples, which is not sufficient to split it into a validation and training set as specified by `validation_split=0.2`. Either provide more data, or a different value for the `validation_split` argument.

In [1]:
''' 
Confusion matrix and ROC curve for fully connected neural network
'''
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_curve, auc

# Predict labels for the test data
predictions = model.predict(test_df.drop(columns=['label']))
predicted_labels = (predictions > 0.5).astype(int)

# Confusion Matrix
cm = confusion_matrix(test_df['label'], predicted_labels)
print("Confusion Matrix:")
print(cm)

# Plot Confusion Matrix
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.xticks([0, 1], ['Negative', 'Positive'])
plt.yticks([0, 1], ['Negative', 'Positive'])
plt.show()

ModuleNotFoundError: No module named 'matplotlib'