In [1]:
# Importing packages
import pandas as pd
import numpy as np
import operator

In [2]:
# Importing preprocessed data sets
train = pd.read_csv('../preprocess_data/train.csv')
test = pd.read_csv('../preprocess_data/test.csv')

# Dropping the index column
train = train.drop(columns = 'Unnamed: 0')
test = test.drop(columns = 'Unnamed: 0')

# Splitting in X and y 
X_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1]
X_test = test.iloc[:, :-1]
y_test = test.iloc[:, -1]

# Replacing inf values with the maximum float number
X_train.replace([np.inf, -np.inf], np.finfo(np.float32).max, inplace = True)
X_train.fillna(X_train.mean(), inplace = True)
X_test.replace([np.inf, -np.inf], np.finfo(np.float32).max, inplace = True)
X_test.fillna(X_train.mean(), inplace = True) # using the mean of the training data to prevent data leakage 

# Feature Selection Using Information Gain

In [3]:
# Defining functions to calculate information gain
def entropy(labels):
    """Compute the entropy of a list of labels."""
    n_labels = len(labels)
    if n_labels <= 1:
        return 0
    value_counts = labels.value_counts() / n_labels
    entropy = -np.sum(value_counts * np.log2(value_counts))
    return entropy

def information_gain(data, labels, attribute):
    """Compute the information gain for a given attribute."""
    # Calculate total entropy before splitting
    total_entropy = entropy(labels)
    
    # Calculate the weighted entropy after splitting by the given attribute
    weighted_entropy = 0
    for value in attribute.unique():
        subset_labels = labels[attribute == value]
        weighted_entropy += (len(subset_labels) / len(labels)) * entropy(subset_labels)
    
    # Information gain is the difference between total entropy and weighted entropy
    info_gain = total_entropy - weighted_entropy
    return info_gain

In [4]:
# Computing information gain for each attribute
info_gain_dict = {}
for column in X_train:
    info_gain_dict[column] = information_gain(X_train, y_train, X_train[column])

In [5]:
# Printing the information gains in descending order
sorted(info_gain_dict.items(), key = operator.itemgetter(1), reverse = True)

[('Flow Bytes/s', 0.7498814412155748),
 (' Average Packet Size', 0.7213538835256111),
 (' Packet Length Std', 0.7116909331054498),
 (' Flow Packets/s', 0.7094301317451537),
 (' Packet Length Variance', 0.7089655874631503),
 ('Fwd Packets/s', 0.7081283054392367),
 (' Packet Length Mean', 0.6816263462323534),
 (' Flow Duration', 0.6802507170926715),
 (' Flow IAT Mean', 0.67627780578498),
 (' Bwd Packets/s', 0.6518162523049276),
 (' Flow IAT Max', 0.632708755195615),
 (' Destination Port', 0.6264352915685752),
 (' Total Length of Bwd Packets', 0.6130724669754792),
 (' Subflow Bwd Bytes', 0.6130724669754792),
 (' Bwd Packet Length Mean', 0.5991204518025013),
 (' Avg Bwd Segment Size', 0.5991204518025013),
 ('Bwd Packet Length Max', 0.5592241210723775),
 ('Total Length of Fwd Packets', 0.5588915667948694),
 (' Subflow Fwd Bytes', 0.5588915667948694),
 (' Fwd IAT Mean', 0.5585074120901432),
 (' Init_Win_bytes_backward', 0.5547067518899622),
 (' Fwd IAT Max', 0.5457801816700032),
 ('Init_Win_

# Creating Feature Groups Based on Information Gain

In [6]:
# Creating feature groups using the same thresholds as the research paper
# Note: if a feature is greater than 0.6 then it will also be included in the greater 0.5, 0.4, 0.3, 0.2, and 0.1 groups
info_gain_gt06 = []
info_gain_gt05 = []
info_gain_gt04 = []
info_gain_gt03 = []
info_gain_gt02 = []
info_gain_gt01 = []
all_features = list(X_train.columns)

for key, val in info_gain_dict.items():
    if val > 0.6:
        info_gain_gt06.append(key)
    if val > 0.5: 
        info_gain_gt05.append(key)
    if val > 0.4:
        info_gain_gt04.append(key)
    if val > 0.3:
        info_gain_gt03.append(key)
    if val > 0.2:
        info_gain_gt02.append(key)
    if val > 0.1:
        info_gain_gt01.append(key)

In [7]:
# Printing the number of features in each Feature Group
print('Feature Groups:')
print(f'Information Gain > 0.6: {len(info_gain_gt06)}')
print(f'Information Gain > 0.5: {len(info_gain_gt05)}')
print(f'Information Gain > 0.4: {len(info_gain_gt04)}')
print(f'Information Gain > 0.3: {len(info_gain_gt03)}')
print(f'Information Gain > 0.2: {len(info_gain_gt02)}')
print(f'Information Gain > 0.1: {len(info_gain_gt01)}')
print(f'All Features: {len(all_features)}')

Feature Groups:
Information Gain > 0.6: 14
Information Gain > 0.5: 26
Information Gain > 0.4: 35
Information Gain > 0.3: 41
Information Gain > 0.2: 55
Information Gain > 0.1: 57
All Features: 77


# Building Classification Models for Each Feature Group

## Feature Group 1: Information Gain > 0.6

#### Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings

# Data Preprocessing for Random Forest
warnings.filterwarnings('ignore')
## Converting categorical label column into numeric 
# Define a mapping dictionary for category to numeric value
category_mapping = {'Normal': 0, 'Bot': 1, 'Brute Force': 2, 'DoS/DdoS': 3, 'Infiltration': 4, 'Portscan': 5, 'Web Attack': 6}
# Map the categorical values to numeric values using the mapping dictionary
y_train_encoded = y_train.map(category_mapping)
y_test_encoded = y_test.map(category_mapping)

In [9]:
## Random Forest classifier
rf_model_fg1 = RandomForestClassifier()
rf_model_fg1.fit(X_train[info_gain_gt06], y_train_encoded.values.ravel())

# Making predictions
rf_pred_fg1 = rf_model_fg1.predict(X_test[info_gain_gt06])

rf_accuracy_fg1 = accuracy_score(y_test_encoded, rf_pred_fg1)
print('Random Forest Accuracy:', rf_accuracy_fg1)

Random Forest Accuracy: 0.9961553180841355


#### Random Trees

In [10]:
## Random Tree classifier
rt_model_fg1 = RandomForestClassifier(n_estimators = 1) # setting n_estimator to 1 is only using a single tree instead of a forest of trees
rt_model_fg1.fit(X_train[info_gain_gt06], y_train_encoded.values.ravel())

# Making predictions
rt_pred_fg1 = rt_model_fg1.predict(X_test[info_gain_gt06])

rt_accuracy_fg1 = accuracy_score(y_test_encoded, rt_pred_fg1)
print('Random Tree Accuracy:', rt_accuracy_fg1)

Random Tree Accuracy: 0.9948070299390621


#### Naive Bayes

In [11]:
from sklearn.naive_bayes import GaussianNB
# Instantiate the Gaussian Naive Bayes classifier
nb_model_fg1 = GaussianNB()

# Train the classifier
nb_model_fg1.fit(X_train[info_gain_gt06], y_train)

# Make predictions
nb_pred_fg1 = nb_model_fg1.predict(X_test[info_gain_gt06])

# Evaluate the model
nb_accuracy_fg1 = accuracy_score(y_test, nb_pred_fg1)
print('Naive Bayes Accuracy', nb_accuracy_fg1)

Naive Bayes Accuracy 0.004604197945185316


## Feature Group 2: Information Gain > 0.5

#### Random Forest

In [12]:
## Random Forest classifier
rf_model_fg2 = RandomForestClassifier()
rf_model_fg2.fit(X_train[info_gain_gt05], y_train_encoded.values.ravel())

# Making predictions
y_pred_fg2 = rf_model_fg2.predict(X_test[info_gain_gt05])

accuracy_fg2 = accuracy_score(y_test_encoded, y_pred_fg2)
print('Random Forest Accuracy:', accuracy_fg2)

Random Forest Accuracy: 0.9981159292295917


#### Random Tree

In [13]:
## Random Tree classifier
rt_model_fg2 = RandomForestClassifier(n_estimators = 1) # setting n_estimator to 1 is only using a single tree instead of a forest of trees
rt_model_fg2.fit(X_train[info_gain_gt05], y_train_encoded.values.ravel())

# Making predictions
rt_pred_fg2 = rt_model_fg2.predict(X_test[info_gain_gt05])

rt_accuracy_fg2 = accuracy_score(y_test_encoded, rt_pred_fg2)
print('Random Tree Accuracy:', rt_accuracy_fg2)

Random Tree Accuracy: 0.9967440901998882


#### Naive Bayes

In [14]:
# Instantiate the Gaussian Naive Bayes classifier
nb_model_fg2 = GaussianNB()

# Train the classifier
nb_model_fg2.fit(X_train[info_gain_gt05], y_train)

# Make predictions
nb_pred_fg2 = nb_model_fg2.predict(X_test[info_gain_gt05])

# Evaluate the model
nb_accuracy_fg2 = accuracy_score(y_test, nb_pred_fg2)
print('Naive Bayes Accuracy', nb_accuracy_fg2)

Naive Bayes Accuracy 0.004604197945185316


## Feature Group 3: Information Gain > 0.4

#### Random Forest

In [15]:
## Random Forest classifier
rf_model_fg3 = RandomForestClassifier()
rf_model_fg3.fit(X_train[info_gain_gt04], y_train_encoded.values.ravel())

# Making predictions
y_pred_fg3 = rf_model_fg3.predict(X_test[info_gain_gt04])

accuracy_fg3 = accuracy_score(y_test_encoded, y_pred_fg3)
print('Random Forest Accuracy:', accuracy_fg3)

Random Forest Accuracy: 0.9982748977008449


#### Random Tree

In [16]:
## Random Tree classifier
rt_model_fg3 = RandomForestClassifier(n_estimators = 1) # setting n_estimator to 1 is only using a single tree instead of a forest of trees
rt_model_fg3.fit(X_train[info_gain_gt04], y_train_encoded.values.ravel())

# Making predictions
rt_pred_fg3 = rt_model_fg3.predict(X_test[info_gain_gt04])

rt_accuracy_fg3 = accuracy_score(y_test_encoded, rt_pred_fg3)
print('Random Tree Accuracy:', rt_accuracy_fg3)

Random Tree Accuracy: 0.9970973534693397


#### Naive Bayes

In [17]:
# Instantiate the Gaussian Naive Bayes classifier
nb_model_fg3 = GaussianNB()

# Train the classifier
nb_model_fg3.fit(X_train[info_gain_gt04], y_train)

# Make predictions
nb_pred_fg3 = nb_model_fg3.predict(X_test[info_gain_gt04])

# Evaluate the model
nb_accuracy_fg3 = accuracy_score(y_test, nb_pred_fg3)
print('Naive Bayes Accuracy', nb_accuracy_fg3)

Naive Bayes Accuracy 0.004604197945185316


## Feature Group 4: Information Gain > 0.3

#### Random Forest

In [18]:
## Random Forest classifier
rf_model_fg4 = RandomForestClassifier()
rf_model_fg4.fit(X_train[info_gain_gt03], y_train_encoded.values.ravel())

# Making predictions
y_pred_fg4 = rf_model_fg4.predict(X_test[info_gain_gt03])

accuracy_fg4 = accuracy_score(y_test_encoded, y_pred_fg4)
print('Random Forest Accuracy:', accuracy_fg4)

Random Forest Accuracy: 0.9982513468162147


#### Random Tree

In [19]:
## Random Tree classifier
rt_model_fg4 = RandomForestClassifier(n_estimators = 1) # setting n_estimator to 1 is only using a single tree instead of a forest of trees
rt_model_fg4.fit(X_train[info_gain_gt03], y_train_encoded.values.ravel())

# Making predictions
rt_pred_fg4 = rt_model_fg4.predict(X_test[info_gain_gt03])

rt_accuracy_fg4 = accuracy_score(y_test_encoded, rt_pred_fg4)
print('Random Tree Accuracy:', rt_accuracy_fg4)

Random Tree Accuracy: 0.9974447290176337


#### Naive Bayes

In [20]:
# Instantiate the Gaussian Naive Bayes classifier
nb_model_fg4 = GaussianNB()

# Train the classifier
nb_model_fg4.fit(X_train[info_gain_gt03], y_train)

# Make predictions
nb_pred_fg4 = nb_model_fg4.predict(X_test[info_gain_gt03])

# Evaluate the model
nb_accuracy_fg4 = accuracy_score(y_test, nb_pred_fg4)
print('Naive Bayes Accuracy', nb_accuracy_fg4)

Naive Bayes Accuracy 0.004604197945185316


## Feature Group 5: Information Gain > 0.2

#### Random Forest

In [21]:
## Random Forest classifier
rf_model_fg5 = RandomForestClassifier()
rf_model_fg5.fit(X_train[info_gain_gt02], y_train_encoded.values.ravel())

# Making predictions
y_pred_fg5 = rf_model_fg5.predict(X_test[info_gain_gt02])

accuracy_fg5 = accuracy_score(y_test_encoded, y_pred_fg5)
print('Random Forest Accuracy:', accuracy_fg5)

Random Forest Accuracy: 0.9983219994701051


#### Random Tree

In [22]:
## Random Tree classifier
rt_model_fg5 = RandomForestClassifier(n_estimators = 1) # setting n_estimator to 1 is only using a single tree instead of a forest of trees
rt_model_fg5.fit(X_train[info_gain_gt02], y_train_encoded.values.ravel())

# Making predictions
rt_pred_fg5 = rt_model_fg5.predict(X_test[info_gain_gt02])

rt_accuracy_fg5 = accuracy_score(y_test_encoded, rt_pred_fg5)
print('Random Tree Accuracy:', rt_accuracy_fg5)

Random Tree Accuracy: 0.9972092201713327


#### Naive Bayes

In [23]:
# Instantiate the Gaussian Naive Bayes classifier
nb_model_fg5 = GaussianNB()

# Train the classifier
nb_model_fg5.fit(X_train[info_gain_gt02], y_train)

# Make predictions
nb_pred_fg5 = nb_model_fg5.predict(X_test[info_gain_gt02])

# Evaluate the model
nb_accuracy_fg5 = accuracy_score(y_test, nb_pred_fg5)
print('Naive Bayes Accuracy', nb_accuracy_fg5)

Naive Bayes Accuracy 0.004604197945185316


## Feature Group 6: Information Gain > 0.1

#### Random Forest

In [24]:
## Random Forest classifier
rf_model_fg6 = RandomForestClassifier()
rf_model_fg6.fit(X_train[info_gain_gt01], y_train_encoded.values.ravel())

# Making predictions
y_pred_fg6 = rf_model_fg6.predict(X_test[info_gain_gt01])

accuracy_fg6 = accuracy_score(y_test_encoded, y_pred_fg6)
print('Random Forest Accuracy:', accuracy_fg6)

Random Forest Accuracy: 0.9983278871912626


#### Random Tree

In [25]:
## Random Tree classifier
rt_model_fg6 = RandomForestClassifier(n_estimators = 1) # setting n_estimator to 1 is only using a single tree instead of a forest of trees
rt_model_fg6.fit(X_train[info_gain_gt01], y_train_encoded.values.ravel())

# Making predictions
rt_pred_fg6 = rt_model_fg6.predict(X_test[info_gain_gt01])

rt_accuracy_fg6 = accuracy_score(y_test_encoded, rt_pred_fg6)
print('Random Tree Accuracy:', rt_accuracy_fg6)

Random Tree Accuracy: 0.9968500691807236


#### Naive Bayes

In [26]:
# Instantiate the Gaussian Naive Bayes classifier
nb_model_fg6 = GaussianNB()

# Train the classifier
nb_model_fg6.fit(X_train[info_gain_gt01], y_train)

# Make predictions
nb_pred_fg6 = nb_model_fg6.predict(X_test[info_gain_gt01])

# Evaluate the model
nb_accuracy_fg6 = accuracy_score(y_test, nb_pred_fg6)
print('Naive Bayes Accuracy', nb_accuracy_fg6)

Naive Bayes Accuracy 0.004604197945185316


## Feature Group 7: All Features

#### Random Forest

In [27]:
## Random Forest classifier
rf_model_fg7 = RandomForestClassifier()
rf_model_fg7.fit(X_train, y_train_encoded.values.ravel())

# Making predictions
y_pred_fg7 = rf_model_fg7.predict(X_test)

accuracy_fg7 = accuracy_score(y_test_encoded, y_pred_fg7)
print('Random Forest Accuracy', accuracy_fg7)

Random Forest Accuracy 0.9982866731431599


#### Random Tree

In [28]:
## Random Tree classifier
rt_model_fg7 = RandomForestClassifier(n_estimators = 1) # setting n_estimator to 1 is only using a single tree instead of a forest of trees
rt_model_fg7.fit(X_train, y_train_encoded.values.ravel())

# Making predictions
rt_pred_fg7 = rt_model_fg7.predict(X_test)

rt_accuracy_fg7 = accuracy_score(y_test_encoded, rt_pred_fg7)
print('Random Tree Accuracy:', rt_accuracy_fg7)

Random Tree Accuracy: 0.9973681886425859


#### Naive Bayes

In [29]:
# Instantiate the Gaussian Naive Bayes classifier
nb_model_fg7 = GaussianNB()

# Train the classifier
nb_model_fg7.fit(X_train, y_train)

# Make predictions
nb_pred_fg7 = nb_model_fg7.predict(X_test)

# Evaluate the model
nb_accuracy_fg7 = accuracy_score(y_test, nb_pred_fg7)
print('Naive Bayes Accuracy', nb_accuracy_fg7)

Naive Bayes Accuracy 0.004604197945185316
