In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import accuracy_score, classification_report
import socket, struct
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
import warnings
warnings.filterwarnings('ignore')

In [16]:
available_features = [
    # Flow identification
    'Source_IP', 
    'Destination_IP',
    'Source_port',
    'Destination_port',
    'Protocol',
    'Timestamp',
    
    # Packet counts
    'fwd_packets_amount',
    'bwd_packets_amount',
    'fwd_packets_length',
    'bwd_packets_length',
    
    # Packet sizes
    'max_fwd_packet ',
    'min_fwd_packet',
    'max_bwd_packet',
    'min_bwd_packet',
    
    # Flag counts
    'FIN_count',
    'SYN_count',
    'RST_count',
    'PSH_count',
    'silence_windows',
    
    # Inter-arrival times
    'min_fwd_inter_arrival_time',
    'max_fwd_inter_arrival_time',
    'mean_fwd_inter_arrival_time',
    'min_bwd_inter_arrival_time',
    'max_bwd_inter_arrival_time',
    'mean_bwd_inter_arrival_time',
    'min_bib_inter_arrival_time',
    'max_bib_inter_arrival_time',
    'mean_bib_inter_arrival_time',
    
    # First packet sizes 
    *[f'first_packet_sizes_{i}' for i in range(30)],
    
    # Statistical features
    'min_packet_size',
    'max_packet_size',
    'mean_packet_size',
    'STD_packet_size',
    'mean_delta_byte',
    'STD_delta_byte',
    
    # Bandwidth features
    *[f'bandwidth_{i}' for i in range(20)],
    
    # Bytes per packet
    *[f'bpp_{i}' for i in range(3)],
    
    # Beaconing features
    *[f'beaconning_{i}' for i in range(20)],
    
    # Packets per second
    'pps_fwd',
    'pps_bwd',
    
    # Additional features
    'count_big_requests',
    'ACK_count'
]

# Print the resulting DataFrames to verify
# print("X_train shape:", X_train.shape)
# print("X_test shape:", X_test.shape)

In [17]:
# Function that changes IP addresses into numbers

def ip2int(ip):
    packedIP = socket.inet_aton(ip)
    return struct.unpack("!L", packedIP)[0]

In [18]:
# Load train, test, and validation datasets
train_path = '../APP-1/train.csv'
test_path = '../APP-1/test.csv'
val_path = '../APP-1/val_without_labels.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
validation_data = pd.read_csv(val_path)

In [19]:
# Prepare features and target
y_train = train_df['label']
y_test = test_df['label']

In [20]:
# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [21]:
X_train = X_train = train_df.drop(columns='label')
X_test = test_df.drop(columns='label')

X_train.shape

(640, 111)

In [22]:
# Converting IP addresses into numbers
X_train['Source_IP'] = X_train['Source_IP'].apply(ip2int)
X_train['Destination_IP'] = X_train['Destination_IP'].apply(ip2int)

# Converting IP addresses into numbers
X_test['Source_IP'] = X_test['Source_IP'].apply(ip2int)
X_test['Destination_IP'] = X_test['Destination_IP'].apply(ip2int)

In [45]:
# X_train

Unnamed: 0,Source_IP,Destination_IP,Source_port,Destination_port,Protocol,Timestamp,fwd_packets_amount,bwd_packets_amount,fwd_packets_length,bwd_packets_length,...,beaconning_14,beaconning_15,beaconning_16,beaconning_17,beaconning_18,beaconning_19,pps_fwd,pps_bwd,count_big_requests,ACK_count
0,3232237322,65598122,52264,443,tcp,1.728158e+09,870,2465,648,1629,...,0,0,0,0,0,0,16.183418,40.683315,25,2276
1,3232237322,65598122,52286,443,tcp,1.728158e+09,871,2466,38,236,...,0,0,0,0,0,0,2.082513,12.933502,17,273
2,174351892,2889913550,55125,443,udp,1.727977e+09,2281,2730,85,119,...,0,0,0,0,0,0,2.543558,3.560982,13,169
3,3232237322,65598122,52287,443,tcp,1.728158e+09,872,2467,44,189,...,0,0,0,0,0,0,2.410077,10.352376,19,232
4,3232237322,916118573,51187,443,tcp,1.728158e+09,12,23,13,23,...,0,0,0,0,0,0,10.436018,18.463725,6,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,3232237322,601104833,56423,443,tcp,1.728162e+09,111,176,121,187,...,0,0,0,0,0,0,73.222658,113.162290,6,307
636,3232237322,601104833,55393,443,tcp,1.728161e+09,107,158,118,168,...,0,0,0,0,0,0,54.077641,76.991896,7,285
637,3232237322,601104833,55394,443,tcp,1.728161e+09,19,20,9,10,...,0,0,0,0,0,0,16.580206,18.422451,3,18
638,3232237322,601104833,55447,443,tcp,1.728161e+09,174,420,186,430,...,0,0,0,0,0,0,51.714185,119.554299,6,615


In [35]:
X_train_sel = train_df[available_features]
X_test_sel = train_df[available_features]

In [37]:
len(available_features)

111

In [36]:
X_train_sel.shape
X_test_sel.shape

(640, 111)

In [44]:
# X_train_sel

Unnamed: 0,Source_IP,Destination_IP,Source_port,Destination_port,Protocol,Timestamp,fwd_packets_amount,bwd_packets_amount,fwd_packets_length,bwd_packets_length,...,beaconning_14,beaconning_15,beaconning_16,beaconning_17,beaconning_18,beaconning_19,pps_fwd,pps_bwd,count_big_requests,ACK_count
0,6,219,52264,443,0,1.728158e+09,870,2465,648,1629,...,0,0,0,0,0,0,16.183418,40.683315,25,2276
1,6,219,52286,443,0,1.728158e+09,871,2466,38,236,...,0,0,0,0,0,0,2.082513,12.933502,17,273
2,3,121,55125,443,1,1.727977e+09,2281,2730,85,119,...,0,0,0,0,0,0,2.543558,3.560982,13,169
3,6,219,52287,443,0,1.728158e+09,872,2467,44,189,...,0,0,0,0,0,0,2.410077,10.352376,19,232
4,6,266,51187,443,0,1.728158e+09,12,23,13,23,...,0,0,0,0,0,0,10.436018,18.463725,6,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,6,245,56423,443,0,1.728162e+09,111,176,121,187,...,0,0,0,0,0,0,73.222658,113.162290,6,307
636,6,245,55393,443,0,1.728161e+09,107,158,118,168,...,0,0,0,0,0,0,54.077641,76.991896,7,285
637,6,245,55394,443,0,1.728161e+09,19,20,9,10,...,0,0,0,0,0,0,16.580206,18.422451,3,18
638,6,245,55447,443,0,1.728161e+09,174,420,186,430,...,0,0,0,0,0,0,51.714185,119.554299,6,615


In [38]:
# Identify columns with string data
string_columns = X_train_sel.select_dtypes(include=['object', 'string']).columns

print(string_columns)

# Initialize a dictionary to store LabelEncoders
label_encoders = {}

# Apply LabelEncoder to each string column in both X_train and X_test
for column in string_columns:
    
    # Fit on X_train and transform both X_train and X_test
    X_train_sel[column] = label_encoder.fit_transform(X_train_sel[column])
    X_test_sel[column] = label_encoder.transform(X_test_sel[column])
    
    # Store the encoder for future reference
    label_encoders[column] = label_encoder



Index(['Source_IP', 'Destination_IP', 'Protocol'], dtype='object')


In [39]:
# Find columns with non-numeric data in X_test
# non_numeric_columns = X_test_drop.select_dtypes(include=['object', 'string']).columns
non_numeric_columns = X_test_sel.select_dtypes(include=['object', 'string']).columns

# Print the non-numeric columns
print("Non-numeric columns in X_test:")
print(non_numeric_columns)

Non-numeric columns in X_test:
Index([], dtype='object')


In [40]:
for column in non_numeric_columns:
    if column in label_encoders:  # Check if this column was encoded in X_train
        le = label_encoders[column]
        X_test_sel[column] = le.transform(X_test_sel[column])
    else:
        print(f"Warning: {column} was not encoded in X_train!")

In [41]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_sel)
X_test_scaled = scaler.transform(X_test_sel)

In [46]:
# BaggingClassifier with DecisionTree as base estimator
bagging_clf = BaggingClassifier(random_state=28, n_estimators=150,max_samples=0.75, max_features=0.55)
bagging_clf.fit(X_train_sel, y_train_encoded)
y_pred_bagging = bagging_clf.predict(X_test_sel)
accuracy_bagging = accuracy_score(y_test_encoded, y_pred_bagging)
print(f"BaggingClassifier Accuracy on Test Dataset: {accuracy_bagging:.4f}")
print("Classification Report (BaggingClassifier):\n", classification_report(y_test_encoded, y_pred_bagging))

BaggingClassifier Accuracy on Test Dataset: 1.0000
Classification Report (BaggingClassifier):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5
           6       1.00      1.00      1.00         5
           7       1.00      1.00      1.00         5
           8       1.00      1.00      1.00         5
           9       1.00      1.00      1.00         5
          10       1.00      1.00      1.00         5
          11       1.00      1.00      1.00         5
          12       1.00      1.00      1.00         5
          13       1.00      1.00      1.00         5
          14       1.00      1.00      1.00         5
          15       1.00      1.00      1

In [47]:
important_features = [
        'Destination.IP', 'Destination.Port', 'Source.IP', 'Init_Win_bytes_forward',
        'min_seg_size_forward', 'Fwd.Packet.Length.Max', 'Init_Win_bytes_backward',
        'Flow.IAT.Max', 'Source.Port', 'Flow.Duration', 'Fwd.Packet.Length.Std',
        'Bwd.IAT.Total', 'Avg.Fwd.Segment.Size', 'Fwd.Packets.s', 'Fwd.IAT.Total',
        'Fwd.IAT.Max', 'Fwd.Packet.Length.Mean', 'Subflow.Fwd.Bytes', 'Flow.Bytes.s',
        'Min.Packet.Length', 'Total.Length.of.Fwd.Packets', 'Bwd.IAT.Max',
        'Packet.Length.Variance', 'Bwd.Packets.s', 'Flow.IAT.Mean', 'Fwd.Header.Length',
        'act_data_pkt_fwd', 'Max.Packet.Length', 'Flow.Packets.s', 'Flow.IAT.Std',
        'Packet.Length.Std', 'Idle.Max', 'Fwd.Header.Length.1', 'Bwd.Packet.Length.Mean',
        'Bwd.IAT.Std', 'Fwd.Packet.Length.Min', 'Bwd.Packet.Length.Std',
        'Avg.Bwd.Segment.Size', 'Average.Packet.Size', 'Total.Length.of.Bwd.Packets',
        'Packet.Length.Mean', 'Fwd.IAT.Mean', 'Fwd.IAT.Std', 'Flow.IAT.Min',
        'Bwd.IAT.Mean', 'Bwd.Packet.Length.Max', 'Subflow.Fwd.Packets',
        'Total.Fwd.Packets', 'Total.Backward.Packets', 'Bwd.Header.Length',
        'Subflow.Bwd.Bytes', 'Subflow.Bwd.Packets', 'Idle.Mean', 'Fwd.IAT.Min',
        'Down.Up.Ratio', 'Idle.Min'
    ]

In [48]:
len(important_features)

56

In [49]:
rf = RandomForestClassifier(random_state=267, n_jobs=-1,n_estimators = 100)
rf.fit(X_train_scaled, y_train_encoded)
y_pred_rf = rf.predict(X_test_scaled)
accuracy_rf = accuracy_score(y_test_encoded, y_pred_rf)
print(f"RadomForest Accuracy on Test Dataset: {accuracy_rf:.4f}")
print("Classification Report (RadomForest):\n", classification_report(y_test_encoded, y_pred_rf))


RadomForest Accuracy on Test Dataset: 1.0000
Classification Report (RadomForest):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         5
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         5
           6       1.00      1.00      1.00         5
           7       1.00      1.00      1.00         5
           8       1.00      1.00      1.00         5
           9       1.00      1.00      1.00         5
          10       1.00      1.00      1.00         5
          11       1.00      1.00      1.00         5
          12       1.00      1.00      1.00         5
          13       1.00      1.00      1.00         5
          14       1.00      1.00      1.00         5
          15       1.00      1.00      1.00         