In [None]:
# module imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import random
import time
import tensorflow as tf

# model imports
from sklearn.ensemble import RandomForestClassifier ,RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import plot_importance
from tensorflow.keras import regularizers

# processing imports
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense
#print('Welcome!')

In [None]:
# locations of the UNSW-NB15 training and testing CSV files (absolute paths under /content)
file_path_full_training_set = '/content/UNSW_NB15_training-set.csv'
file_path_test = '/content/UNSW_NB15_testing-set.csv'

# read both files with default parsing: `df` is the training frame, `test_df` the testing frame
df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (file_path_full_training_set, file_path_test)
)

In [None]:
# show the training frame; the notebook repr elides the middle rows and columns
df

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.000011,udp,-,INT,2,0,496,0,90909.090200,...,1,2,0,0,0,1,2,0,Normal,0
1,2,0.000008,udp,-,INT,2,0,1762,0,125000.000300,...,1,2,0,0,0,1,2,0,Normal,0
2,3,0.000005,udp,-,INT,2,0,1068,0,200000.005100,...,1,3,0,0,0,1,3,0,Normal,0
3,4,0.000006,udp,-,INT,2,0,900,0,166666.660800,...,1,3,0,0,0,2,3,0,Normal,0
4,5,0.000010,udp,-,INT,2,0,2126,0,100000.002500,...,1,3,0,0,0,2,3,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82327,82328,0.000005,udp,-,INT,2,0,104,0,200000.005100,...,1,2,0,0,0,2,1,0,Normal,0
82328,82329,1.106101,tcp,-,FIN,20,8,18062,354,24.410067,...,1,1,0,0,0,3,2,0,Normal,0
82329,82330,0.000000,arp,-,INT,1,0,46,0,0.000000,...,1,1,0,0,0,1,1,1,Normal,0
82330,82331,0.000000,arp,-,INT,1,0,46,0,0.000000,...,1,1,0,0,0,1,1,1,Normal,0


In [None]:
# distinct raw attack categories present in the training frame
set(df.attack_cat)

{'Analysis',
 'Backdoor',
 'DoS',
 'Exploits',
 'Fuzzers',
 'Generic',
 'Normal',
 'Reconnaissance',
 'Shellcode',
 'Worms'}

In [None]:
# attack families: each list names the raw attack_cat values that are folded into it

Reconnaissance = ['Analysis', 'Reconnaissance']
Access = ['Backdoor', 'Exploits', 'Shellcode']
Disruption = ['DoS', 'Worms']
Testing = ['Fuzzers']
General = ['Generic']
# display names for the families; position i is the name of numeric code i from map_attack
attack_labels1 = ['Normal', 'Reconnaissance', 'Access', 'Disruption', 'Testing', 'General']


# helper function to pass to data frame mapping
def map_attack(attack_cat):
    """Translate a raw ``attack_cat`` value into its numeric family code.

    Codes: 1 = Reconnaissance, 2 = Access, 3 = Disruption, 4 = Testing,
    5 = General. Any value not listed in one of the family lists
    (including 'Normal') maps to 0.

    Parameters
    ----------
    attack_cat : value compared against the family lists with ``in``.

    Returns
    -------
    int
        The family code in the range 0-5.
    """
    # the family lists are read at call time, in the same order as their codes
    families = (Reconnaissance, Access, Disruption, Testing, General)
    for family_code, members in enumerate(families, start=1):
        if attack_cat in members:
            return family_code

    # not a member of any attack family
    return 0

# translate every raw category into its numeric family code ...
attack_map = df['attack_cat'].apply(map_attack)
# ... and attach the codes to the training frame as a new column
df['attack_map'] = attack_map

# NOTE(review): test_df is not mapped here. The disabled draft of that step applied
# map_attack to df.attack_cat rather than test_df.attack_cat.

# view the result
df.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label,attack_map
0,1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,...,2,0,0,0,1,2,0,Normal,0,0
1,2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,...,2,0,0,0,1,2,0,Normal,0,0
2,3,5e-06,udp,-,INT,2,0,1068,0,200000.0051,...,3,0,0,0,1,3,0,Normal,0,0
3,4,6e-06,udp,-,INT,2,0,900,0,166666.6608,...,3,0,0,0,2,3,0,Normal,0,0
4,5,1e-05,udp,-,INT,2,0,2126,0,100000.0025,...,3,0,0,0,2,3,0,Normal,0,0


In [None]:
# distinct numeric family codes produced by the mapping
set(df.attack_map)

{0, 1, 2, 3, 4, 5}

In [None]:
# get the initial set of categorical features and one-hot encode them for both splits
features_to_encode = ['proto', 'service', 'state']
encoded = pd.get_dummies(df[features_to_encode])
# encode the testing file itself; encoding `df` here would make the test frame
# a copy of the training features and the column alignment below a no-op
test_encoded_base = pd.get_dummies(test_df[features_to_encode])

# not all of the training categories appear in the test set, so we need to account for diffs
# (test_df comes from read_csv with the default index, so positions 0..n-1 match its index)
test_index = np.arange(len(test_df.index))
column_diffs = list(set(encoded.columns.values)-set(test_encoded_base.columns.values))

# all-zero placeholder columns for the dummies that only exist in the training encoding
diff_df = pd.DataFrame(0, index=test_index, columns=column_diffs)

# we'll also need to reorder the columns to match, so let's get those
column_order = encoded.columns.to_list()

# append the new columns
test_encoded_temp = test_encoded_base.join(diff_df)

# reorder to the training layout; selecting by column_order also drops any dummy
# columns that exist only in the test encoding
test_final = test_encoded_temp[column_order].fillna(0)

# get numeric features, we won't worry about encoding these at this point
numeric_features = ['dur', 'sbytes', 'dbytes']

# model to fit/test: each split is joined with its own numeric columns
to_fit = encoded.join(df[numeric_features])
test_set = test_final.join(test_df[numeric_features])

In [None]:
# create our target classifications
multi_y = df['attack_map']

# NOTE(review): no target is built for the testing file here; test_df would need its
# own 'attack_map' column before a test target could be derived from it.

# build the training / validation sets (80/20) from the training file.
# random_state pins the shuffle so reruns produce the same split and the
# model scores below stay comparable between runs.
multi_train_X, multi_val_X, multi_train_y, multi_val_y = train_test_split(
    to_fit, multi_y, test_size=0.2, random_state=42
)

In [None]:
# random-forest baseline for the multi-class target
# fixed seed so the forest's random sampling is repeatable across runs
multi_model = RandomForestClassifier(random_state=42)
multi_model.fit(multi_train_X, multi_train_y)
multi_predictions = multi_model.predict(multi_val_X)

# validation accuracy; scikit-learn's signature is accuracy_score(y_true, y_pred)
accuracy_score(multi_val_y, multi_predictions)

0.8398615412643469

In [None]:

# decision-tree baseline for the multi-class target
# fixed seed so tie-breaking between equally good splits is repeatable across runs
multi_model = DecisionTreeClassifier(random_state=42)
multi_model.fit(multi_train_X, multi_train_y)
multi_predictions = multi_model.predict(multi_val_X)

# validation accuracy; scikit-learn's signature is accuracy_score(y_true, y_pred)
accuracy_score(multi_val_y, multi_predictions)

0.8244974798081011

In [None]:
# k-nearest-neighbours baseline for the multi-class target (default hyper-parameters);
# fit() returns the estimator itself, so construction and fitting are chained
multi_model = KNeighborsClassifier().fit(multi_train_X, multi_train_y)
multi_predictions = multi_model.predict(multi_val_X)

# fraction of validation rows where the prediction equals the label
# (accuracy is symmetric in its two arguments, so naming them does not change the value)
accuracy_score(y_true=multi_val_y, y_pred=multi_predictions)

0.821036011416773

In [None]:
# logistic-regression baseline for the multi-class target.
# Fitted on the raw features, the solver stopped at its default iteration limit
# (see the ConvergenceWarning this cell used to emit), so the features are
# standardized inside a pipeline and the iteration budget is raised.
# Keeping the scaler in the pipeline lets multi_model accept unscaled frames,
# like the other baselines.
multi_model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
multi_model.fit(multi_train_X, multi_train_y)
multi_predictions = multi_model.predict(multi_val_X)

# validation accuracy; scikit-learn's signature is accuracy_score(y_true, y_pred)
accuracy_score(multi_val_y, multi_predictions)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.38592336187526566

In [None]:
# 1. Standardize the features: fit the scaler on the training split only,
#    then apply the same transform to the validation split
scaler = StandardScaler()
X_train = scaler.fit_transform(multi_train_X)
X_test = scaler.transform(multi_val_X)

# Sparse integer labels must lie in [0, num_classes). Counting the distinct labels of
# the training split under-sizes the output layer if any class is absent from that
# split, so size it from the largest label seen in either split instead.
num_classes = int(max(multi_train_y.max(), multi_val_y.max())) + 1

# 2. Define the neural network architecture
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),  # Input layer
    Dense(32, activation='relu'),  # Hidden layer
    Dense(num_classes, activation='softmax')  # Output layer: one unit per class (multi-class)
])

# 3. Compile the model (sparse loss because the targets are integer class codes)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# 4. Train the model
model.fit(X_train, multi_train_y, epochs=100, batch_size=32, validation_data=(X_test, multi_val_y))  # Incorporate validation

# 5. Evaluate performance
# NOTE: X_test / multi_val_y are the validation split that was also monitored during
# training, so the figure printed below is a validation accuracy.
test_loss, test_acc = model.evaluate(X_test, multi_val_y)
print('Test accuracy:', test_acc)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78