In [1]:
# Importing packages
import pandas as pd
import numpy as np
import operator

In [2]:
# Load the preprocessed train/test sets and discard the saved index column
train = pd.read_csv('../preprocess_data/train.csv').drop(columns = ['Unnamed: 0'])
test = pd.read_csv('../preprocess_data/test.csv').drop(columns = ['Unnamed: 0'])

# Separate the features (every column but the last) from the target (last column)
X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

# Feature Selection Using Information Gain

In [3]:
# Defining functions to calculate information gain
def entropy(labels):
    """Compute the Shannon entropy (in bits) of a collection of class labels.

    Parameters
    ----------
    labels : pandas.Series or array-like
        Class labels. Plain lists, tuples and NumPy arrays are accepted as
        well as Series; they are wrapped in a Series before counting.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct label, or ``0.0`` when there are fewer than two labels.
    """
    n_labels = len(labels)
    # Zero or one observation carries no uncertainty.
    if n_labels <= 1:
        return 0.0
    # Wrapping in a Series lets non-pandas sequences use value_counts too.
    probabilities = pd.Series(labels).value_counts() / n_labels
    # Every counted label has p > 0, so log2 never sees zero here.
    return -np.sum(probabilities * np.log2(probabilities))

def information_gain(data, labels, attribute):
    """Compute the information gain from splitting ``labels`` on ``attribute``.

    Parameters
    ----------
    data : object
        Not used by the computation; kept so existing calls keep working.
    labels : pandas.Series
        Class label of each row.
    attribute : pandas.Series
        Attribute value of each row; rows sharing a value form one partition.

    Returns
    -------
    float
        Entropy of ``labels`` minus the size-weighted sum of the entropies of
        the partitions induced by ``attribute``.
    """
    # Entropy of the labels before any split
    total_entropy = entropy(labels)
    n_labels = len(labels)

    # Partition the labels in a single grouped pass instead of building one
    # boolean mask per unique value, which rescans the whole column each time
    # and becomes very slow for continuous attributes.
    # sort=False keeps first-appearance order, matching attribute.unique().
    # Rows whose attribute is NaN fall into no partition, exactly as with the
    # equality mask (NaN == NaN is False).
    weighted_entropy = 0
    for _, subset_labels in labels.groupby(attribute, sort=False):
        weighted_entropy += (len(subset_labels) / n_labels) * entropy(subset_labels)

    # Information gain is the reduction in entropy achieved by the split
    return total_entropy - weighted_entropy

In [4]:
# Map every feature column to its information gain with respect to the training labels
info_gain_dict = {
    feature: information_gain(X_train, y_train, X_train[feature])
    for feature in X_train.columns
}

In [5]:
# Rank the (feature, information gain) pairs from highest to lowest gain
sorted(info_gain_dict.items(), key = lambda item: item[1], reverse = True)

[('Flow Bytes/s', 0.7502532415832261),
 (' Average Packet Size', 0.7213538835256111),
 (' Packet Length Std', 0.7116909331054498),
 (' Flow Packets/s', 0.7094301317451537),
 (' Packet Length Variance', 0.7089655874631503),
 ('Fwd Packets/s', 0.7081283054392367),
 (' Packet Length Mean', 0.6816263462323534),
 (' Flow Duration', 0.6802507170926715),
 (' Flow IAT Mean', 0.67627780578498),
 (' Bwd Packets/s', 0.6518162523049276),
 (' Flow IAT Max', 0.632708755195615),
 (' Destination Port', 0.6264352915685752),
 (' Total Length of Bwd Packets', 0.6130724669754792),
 (' Subflow Bwd Bytes', 0.6130724669754792),
 (' Bwd Packet Length Mean', 0.5991204518025013),
 (' Avg Bwd Segment Size', 0.5991204518025013),
 ('Bwd Packet Length Max', 0.5592241210723775),
 ('Total Length of Fwd Packets', 0.5588915667948694),
 (' Subflow Fwd Bytes', 0.5588915667948694),
 (' Fwd IAT Mean', 0.5585074120901432),
 (' Init_Win_bytes_backward', 0.5547067518899622),
 (' Fwd IAT Max', 0.5457801816700032),
 ('Init_Win_

# Creating Feature Groups Based on Information Gain

In [6]:
# Building cumulative feature groups using the same thresholds as the research paper.
# Each group holds every feature whose information gain exceeds its threshold, so a
# feature above 0.6 also appears in the 0.5, 0.4, 0.3, 0.2, and 0.1 groups.
all_features = list(X_train.columns)

info_gain_gt06 = [feature for feature, gain in info_gain_dict.items() if gain > 0.6]
info_gain_gt05 = [feature for feature, gain in info_gain_dict.items() if gain > 0.5]
info_gain_gt04 = [feature for feature, gain in info_gain_dict.items() if gain > 0.4]
info_gain_gt03 = [feature for feature, gain in info_gain_dict.items() if gain > 0.3]
info_gain_gt02 = [feature for feature, gain in info_gain_dict.items() if gain > 0.2]
info_gain_gt01 = [feature for feature, gain in info_gain_dict.items() if gain > 0.1]

In [7]:
# Report how many features each feature group contains
group_sizes = {
    'Information Gain > 0.6': len(info_gain_gt06),
    'Information Gain > 0.5': len(info_gain_gt05),
    'Information Gain > 0.4': len(info_gain_gt04),
    'Information Gain > 0.3': len(info_gain_gt03),
    'Information Gain > 0.2': len(info_gain_gt02),
    'Information Gain > 0.1': len(info_gain_gt01),
    'All Features': len(all_features),
}

print('Feature Groups:')
for group_label, n_features in group_sizes.items():
    print(f'{group_label}: {n_features}')

Feature Groups:
Information Gain > 0.6: 14
Information Gain > 0.5: 26
Information Gain > 0.4: 35
Information Gain > 0.3: 41
Information Gain > 0.2: 55
Information Gain > 0.1: 57
All Features: 77
