## Firewall Response Prediction
Model Developed by Faseni Fakoya  - 19th August 2024

In [1]:
#Packages 
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
#Data Loading
data = pd.read_csv("./Firewall_log/log2.csv")

In [3]:
data.columns

Index(['Source Port', 'Destination Port', 'NAT Source Port',
       'NAT Destination Port', 'Action', 'Bytes', 'Bytes Sent',
       'Bytes Received', 'Packets', 'Elapsed Time (sec)', 'pkts_sent',
       'pkts_received'],
      dtype='object')

In [4]:
data.head()

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Action,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received
0,57222,53,54587,53,allow,177,94,83,2,30,1,1
1,56258,3389,56258,3389,allow,4768,1600,3168,19,17,10,9
2,6881,50321,43265,50321,allow,238,118,120,2,1199,1,1
3,50553,3389,50553,3389,allow,3327,1438,1889,15,17,8,7
4,50002,443,45848,443,allow,25358,6778,18580,31,16,13,18


In [5]:
unique_counts = data['Action'].value_counts()
print(unique_counts)

Action
allow         37640
deny          14984
drop          12846
reset-both       54
Name: count, dtype: int64


In [6]:
data.head(200)

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Action,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received
0,57222,53,54587,53,allow,177,94,83,2,30,1,1
1,56258,3389,56258,3389,allow,4768,1600,3168,19,17,10,9
2,6881,50321,43265,50321,allow,238,118,120,2,1199,1,1
3,50553,3389,50553,3389,allow,3327,1438,1889,15,17,8,7
4,50002,443,45848,443,allow,25358,6778,18580,31,16,13,18
5,51465,443,39975,443,allow,3961,1595,2366,21,16,12,9
6,60513,47094,45469,47094,allow,320,140,180,6,7,3,3
7,50049,443,21285,443,allow,7912,3269,4643,23,96,12,11
8,52244,58774,2211,58774,allow,70,70,0,1,5,1,0
9,50627,443,16215,443,allow,8256,1674,6582,31,75,15,16


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65524 entries, 0 to 65523
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Source Port           65524 non-null  int64 
 1   Destination Port      65524 non-null  int64 
 2   NAT Source Port       65524 non-null  int64 
 3   NAT Destination Port  65524 non-null  int64 
 4   Action                65524 non-null  object
 5   Bytes                 65524 non-null  int64 
 6   Bytes Sent            65524 non-null  int64 
 7   Bytes Received        65524 non-null  int64 
 8   Packets               65524 non-null  int64 
 9   Elapsed Time (sec)    65524 non-null  int64 
 10  pkts_sent             65524 non-null  int64 
 11  pkts_received         65524 non-null  int64 
dtypes: int64(11), object(1)
memory usage: 6.0+ MB


In [9]:
#identification of categorical columns. This is necessary to be converted to numerical values before it can be used
categorical_cols = data.select_dtypes(include=['object', 'category']).columns

In [10]:
#Conversion of categorical values into numerical values using Label Encoders. - https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

In [11]:
data.head(200)

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Action,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received
0,57222,53,54587,53,0,177,94,83,2,30,1,1
1,56258,3389,56258,3389,0,4768,1600,3168,19,17,10,9
2,6881,50321,43265,50321,0,238,118,120,2,1199,1,1
3,50553,3389,50553,3389,0,3327,1438,1889,15,17,8,7
4,50002,443,45848,443,0,25358,6778,18580,31,16,13,18
5,51465,443,39975,443,0,3961,1595,2366,21,16,12,9
6,60513,47094,45469,47094,0,320,140,180,6,7,3,3
7,50049,443,21285,443,0,7912,3269,4643,23,96,12,11
8,52244,58774,2211,58774,0,70,70,0,1,5,1,0
9,50627,443,16215,443,0,8256,1674,6582,31,75,15,16


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65524 entries, 0 to 65523
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Source Port           65524 non-null  int64
 1   Destination Port      65524 non-null  int64
 2   NAT Source Port       65524 non-null  int64
 3   NAT Destination Port  65524 non-null  int64
 4   Action                65524 non-null  int32
 5   Bytes                 65524 non-null  int64
 6   Bytes Sent            65524 non-null  int64
 7   Bytes Received        65524 non-null  int64
 8   Packets               65524 non-null  int64
 9   Elapsed Time (sec)    65524 non-null  int64
 10  pkts_sent             65524 non-null  int64
 11  pkts_received         65524 non-null  int64
dtypes: int32(1), int64(11)
memory usage: 5.7 MB


In [13]:
# Spliting the dataset into Features and Target. The feature values will be used predict the target values.
X = data.drop('Action', axis=1) 
y = data['Action']

In [14]:
#Spliting the dataset into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
clf.fit(X_train, y_train)

In [17]:
#Model Evaluation
y_pred = clf.predict(X_test)

In [18]:
#Model Accuracy Calculation - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 1.00


In [19]:
#Classification Report - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7538
           1       0.99      1.00      1.00      2960
           2       1.00      1.00      1.00      2594
           3       1.00      0.23      0.38        13

    accuracy                           1.00     13105
   macro avg       1.00      0.81      0.84     13105
weighted avg       1.00      1.00      1.00     13105



In [20]:
# Confusion Matrix - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
# This helps in evaluating the accuracy of a classification.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Confusion Matrix:
[[7536    2    0    0]
 [   1 2956    3    0]
 [   0    7 2587    0]
 [   0   10    0    3]]


### Test with external data
In order to simuate the performance of the model on external data. from a customer. I tested the model on a new dataset I extracted from the original dataset I downloaded from kaggle

In [21]:
external_df = pd.read_csv("./Firewall_log/log2.csv")

In [22]:
categorical_cols = external_df.select_dtypes(include=['object', 'category']).columns

In [23]:
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    external_df[col] = label_encoders[col].fit_transform(external_df[col])

In [24]:
# Ensure the external data has the same columns as the training data
external_X = external_df[X.columns]

In [25]:
# Predict with the trained model
external_predictions = clf.predict(external_X)

In [26]:
# Output the predictions
print("External Data Predictions:")
print(external_predictions)

External Data Predictions:
[2 2 2 2]
