In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier

## Importing and preprocessing the data, including converting categorical features to numerical codes.




The data is stored in parquet format it will be imported directly and analyzed for use

In [2]:
df_test = pd.read_parquet('./UNSW_NB15_training-set.parquet')
df_train = pd.read_parquet('./UNSW_NB15_testing-set.parquet')

print(df_train.shape, df_test.shape)
print(f"Distinct values: {df_test['label'].unique()}")

(175341, 36) (82332, 36)
Distinct values: [0 1]


Modify to make train/test data match the 80/20 rule and reduce the size for efficiency purposes

In [3]:
# Select the first 8000/2000 rows

df_train = df_train.sample(n=8000, random_state=42)
df_test = df_test.sample(n=2000, random_state=42)

print(df_train.shape)
print(df_test.shape)  # Should output (2000, 36)


(8000, 36)
(2000, 36)


Use the `.head()` function to visualize the data.

In [6]:
df_train.columns
df_train.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
15482,2.736664,tcp,-,FIN,232,438,13350,548216,244.458206,38858.99,...,0,0,2,1,0,0,0,0,Normal,0
133349,9e-06,udp,dns,INT,2,0,114,0,111111.109375,50666660.0,...,0,0,10,10,0,0,0,0,Generic,1
80485,5.788526,tcp,-,FIN,36,34,6102,3892,11.920133,8199.669,...,0,0,1,1,0,0,0,0,Exploits,1
29972,3.849634,tcp,-,FIN,448,858,25160,1094788,338.993286,52169.12,...,0,0,2,1,0,0,0,0,Normal,0
18339,0.001052,udp,dns,CON,2,2,130,162,2851.711182,494296.6,...,0,0,1,1,0,0,0,0,Normal,0


In this analysis, we'll focus on the attack/normal classification so we can drop the cat columns and set the label as target

In [7]:
columns_to_drop = ['attack_cat']
df_train = df_train.drop(columns=columns_to_drop)
df_test = df_test.drop(columns=columns_to_drop)

In [8]:
target = 'label'
print(f"Distinct values: {df_train['label'].unique()}")

Distinct values: [0 1]


Understanding categories in the dataset to move them to numerical ones

In [9]:
categorical_cols = list(df_train.select_dtypes(include='category').columns.values)
categorical_cols

['proto', 'service', 'state']

In [10]:
for col in categorical_cols:
    print(f"Column: '{col}'")
    print(f"Distinct values: {df_train[col].unique()}")
    print(f"Length: {len(df_train[col].unique())}")
    print(f"Categories ({len(df_train[col].cat.categories)}, {df_train[col].dtype}): {df_train[col].cat.categories}")
    
    # Convert categorical column to integer codes
    df_train[f"{col}"] = df_train[col].cat.codes
    df_train.columns 

Column: 'proto'
Distinct values: ['tcp', 'udp', 'st2', 'unas', 'irtp', ..., 'compaq-peer', 'stp', 'igp', 'mfe-nsp', 'bna']
Length: 130
Categories (133, object): ['3pc', 'a/n', 'aes-sp3-d', 'any', ..., 'xnet', 'xns-idp', 'xtp', 'zero']
Length: 130
Categories (133, category): Index(['3pc', 'a/n', 'aes-sp3-d', 'any', 'argus', 'aris', 'arp', 'ax.25',
       'bbn-rcc', 'bna',
       ...
       'visa', 'vmtp', 'vrrp', 'wb-expak', 'wb-mon', 'wsn', 'xnet', 'xns-idp',
       'xtp', 'zero'],
      dtype='object', length=133)
Column: 'service'
Distinct values: ['-', 'dns', 'smtp', 'http', 'pop3', ..., 'ssh', 'ftp-data', 'ssl', 'dhcp', 'radius']
Length: 12
Categories (13, object): ['-', 'dhcp', 'dns', 'ftp', ..., 'smtp', 'snmp', 'ssh', 'ssl']
Length: 12
Categories (13, category): Index(['-', 'dhcp', 'dns', 'ftp', 'ftp-data', 'http', 'irc', 'pop3', 'radius',
       'smtp', 'snmp', 'ssh', 'ssl'],
      dtype='object')
Column: 'state'
Distinct values: ['FIN', 'INT', 'CON', 'REQ', 'ECO', 'RST']
Catego

In [11]:
for col in categorical_cols:
    print(f"Column: '{col}'")
    print(f"Distinct values: {df_test[col].unique()}")
    print(f"Length: {len(df_test[col].unique())}")
    print(f"Categories ({len(df_test[col].cat.categories)}, {df_test[col].dtype}): {df_test[col].cat.categories}")
    
    # Convert categorical column to integer codes
    df_test[f"{col}"] = df_test[col].cat.codes
    df_test.columns 

Column: 'proto'
Distinct values: ['tcp', 'udp', 'arp', 'unas', 'pnni', ..., 'pgm', 'ib', 'l2tp', 'dgp', 'ipip']
Length: 75
Categories (131, object): ['3pc', 'a/n', 'aes-sp3-d', 'any', ..., 'xnet', 'xns-idp', 'xtp', 'zero']
Length: 75
Categories (131, category): Index(['3pc', 'a/n', 'aes-sp3-d', 'any', 'argus', 'aris', 'arp', 'ax.25',
       'bbn-rcc', 'bna',
       ...
       'visa', 'vmtp', 'vrrp', 'wb-expak', 'wb-mon', 'wsn', 'xnet', 'xns-idp',
       'xtp', 'zero'],
      dtype='object', length=131)
Column: 'service'
Distinct values: ['ftp', 'dns', '-', 'pop3', 'http', 'ftp-data', 'ssh', 'smtp', 'irc', 'dhcp']
Categories (13, object): ['-', 'dhcp', 'dns', 'ftp', ..., 'smtp', 'snmp', 'ssh', 'ssl']
Length: 10
Categories (13, category): Index(['-', 'dhcp', 'dns', 'ftp', 'ftp-data', 'http', 'irc', 'pop3', 'radius',
       'smtp', 'snmp', 'ssh', 'ssl'],
      dtype='object')
Column: 'state'
Distinct values: ['FIN', 'INT', 'REQ', 'CON']
Categories (7, object): ['ACC', 'CLO', 'CON', 'FIN',

In [12]:
df_test.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,dmean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,label
38666,1.292678,111,3,3,52,54,2934,3740,81.226723,17811.09,...,69,0,0,1,1,1,1,0,0,0
56460,7e-06,117,2,4,2,0,114,0,142857.140625,65142860.0,...,0,0,0,5,4,0,0,0,0,1
16266,1e-05,117,2,4,2,0,114,0,100000.0,45600000.0,...,0,0,0,23,13,0,0,0,0,1
75603,0.0,6,0,4,1,0,46,0,0.0,0.0,...,0,0,0,1,1,0,0,0,1,0
11200,0.189998,111,0,3,10,6,1460,268,78.948196,55326.9,...,45,0,0,1,1,0,0,0,0,1


In [13]:
categorical_cols = list(df_train.select_dtypes(include='category').columns.values)
categorical_cols

[]

Moving the data to Xy for model fitting

In [14]:
X_train = df_train.drop('label', axis=1)  # Features
y_train = df_train['label']  # Target variable
X_test = df_test.drop('label', axis=1)  # Features
y_test = df_test['label']  # Target variable

In [15]:
print(y_train.shape)
print(y_test.shape) 

(8000,)
(2000,)


## Fitting a basic Random Forest Classifier and evaluating its performance on a test set.

For this problem we'll use a random forest classifier and will used GridSearch for Hyperparameter tuning. Using accuracy, precision and recall as the relevant metrics 


In [16]:
rfe = RandomForestClassifier()

rfe.fit(X_train, y_train)

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
y_pred = rfe.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

precision = precision_score(y_test, y_pred)

recall = recall_score(y_test, y_pred)

f1 = f1_score(y_test, y_pred)

auc_roc = roc_auc_score(y_test, y_pred)


print(f"Accuracy: {accuracy}")

print(f"Precision: {precision}")

print(f"Recall: {recall}")

print(f"F1-score: {f1}")

print(f"AUC-ROC: {auc_roc}")

Accuracy: 0.856
Precision: 0.8061607813673929
Recall: 0.9728014505893019
F1-score: 0.8816762530813476
AUC-ROC: 0.8425880162645506


## Performing hyperparameter tuning using GridSearchCV to optimize the model's F1-score.


In [18]:
# Hyperparameter tuning

param_grid = {

    'n_estimators': [100, 200, 500],

    'max_depth': [5, 10, None],

    'max_features': ['sqrt', 'log2', None]

}


rfe = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rfe, param_grid=param_grid, cv=5, scoring='f1')

grid_search.fit(X_train, y_train)




## Analyzing the feature importances to identify the most influential variables in the classification.

In [19]:
# Results from tuning
print("Best Parameters: ", grid_search.best_params_)

# Print the best score
print("Best Score: ", grid_search.best_score_)

# Get the best estimator
best_estimator = grid_search.best_estimator_

# Print the feature importances
feature_importances = pd.DataFrame(best_estimator.feature_importances_, 
                                   index=X_train.columns, 
                                   columns=['Importance'])
feature_importances = feature_importances.sort_values('Importance', ascending=False)
print("\nFeature Importances:")
print(feature_importances)

# Print the selected features
selected_features = feature_importances[feature_importances['Importance'] > 0].index
print("\nSelected Features:", list(selected_features))

Best Parameters:  {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 100}
Best Score:  0.9538505915910273

Feature Importances:
                   Importance
dload                0.111077
rate                 0.095564
tcprtt               0.085253
sload                0.070052
dmean                0.063803
synack               0.063792
ackdat               0.058989
dur                  0.055118
sinpkt               0.040660
dbytes               0.040397
dinpkt               0.039696
sbytes               0.035729
smean                0.031361
djit                 0.024107
state                0.022732
sjit                 0.021823
dpkts                0.020023
dloss                0.017689
spkts                0.015190
sloss                0.014298
ct_src_dport_ltm     0.012724
service              0.011631
proto                0.010156
ct_dst_sport_ltm     0.008468
swin                 0.008275
is_sm_ips_ports      0.007433
dtcpb                0.004510
stcpb                0.00

In [20]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Get the best estimator from GridSearchCV
best_estimator = grid_search.best_estimator_

# Rerun the classifier with the best parameters on the test set
y_pred = best_estimator.predict(X_test)

# Calculate evaluation metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nAccuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.63      0.76       897
           1       0.76      0.99      0.86      1103

    accuracy                           0.83      2000
   macro avg       0.87      0.81      0.81      2000
weighted avg       0.86      0.83      0.82      2000


Confusion Matrix:
[[ 561  336]
 [  10 1093]]

Accuracy Score: 0.827
