In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
import joblib  # for saving the model

warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)


In [2]:
def select_features(X, y, correlation_matrix, correlation_threshold=0.95):
    # Remove highly correlated features
    highly_correlated = np.where(np.abs(correlation_matrix) > correlation_threshold)
    highly_correlated = [(correlation_matrix.index[x], correlation_matrix.columns[y]) 
                        for x, y in zip(*highly_correlated) if x != y and x < y]
    
    features_to_drop = set()
    for feat1, feat2 in highly_correlated:
        if feat1 not in features_to_drop:
            features_to_drop.add(feat2)
    
    X = X.drop(columns=list(features_to_drop))
    
    # Select best features using mutual information
    # selector = SelectKBest(score_func=mutual_info_classif, k='all')
    selector = SelectKBest(score_func=lambda X, y: mutual_info_classif(X, y, random_state=42), k='all')
    selector.fit(X, y)
    
    feature_scores = pd.DataFrame({
        'Feature': X.columns,
        'Score': selector.scores_
    })
    
    print("\nTop 20 features by mutual information:")
    print(feature_scores.sort_values('Score', ascending=False).head(30))
    
    # Select top features
    k = 30  # Number of features to select
    best_features = feature_scores.nlargest(k, 'Score')['Feature'].tolist()
    
    return X[best_features]



In [3]:
# import pandas as pd
# import warnings
# warnings.filterwarnings('ignore', category=UserWarning)
# warnings.filterwarnings('ignore', category=FutureWarning)
# warnings.filterwarnings('ignore', category=RuntimeWarning)

# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.preprocessing import LabelEncoder, StandardScaler
# from sklearn.metrics import accuracy_score
# from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
# from imblearn.over_sampling import SMOTE
# from sklearn.feature_selection import SelectKBest, f_classif



# ===========================
# STEP 1: DATA LOADING
# ===========================
# In this step, we load the training and testing datasets from CSV files.
# Ensure 'train.csv' and 'test.csv' are in the working directory.

train_df = pd.read_csv('../APP-1/train.csv')
test_df = pd.read_csv('../APP-1/test.csv')

# ===========================
# STEP 2: PREPROCESS FEATURES
# (But do NOT encode the 'label' here)
# ===========================
def preprocess_features(df):
    # Encode only feature columns that are objects, not the target
    non_numeric_cols = df.select_dtypes(include=['object']).columns
    for col in non_numeric_cols:
        if col != 'label':  # Important: skip the target column
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
    return df

train_df = preprocess_features(train_df)
test_df = preprocess_features(test_df)



In [4]:
train_df

Unnamed: 0,Source_IP,Source_port,Destination_IP,Destination_port,Protocol,Timestamp,fwd_packets_amount,bwd_packets_amount,fwd_packets_length,bwd_packets_length,...,beaconning_15,beaconning_16,beaconning_17,beaconning_18,beaconning_19,pps_fwd,pps_bwd,count_big_requests,ACK_count,label
0,6,52264,219,443,0,1.728158e+09,870,2465,648,1629,...,0,0,0,0,0,16.183418,40.683315,25,2276,1inch.io
1,6,52286,219,443,0,1.728158e+09,871,2466,38,236,...,0,0,0,0,0,2.082513,12.933502,17,273,1inch.io
2,3,55125,121,443,1,1.727977e+09,2281,2730,85,119,...,0,0,0,0,0,2.543558,3.560982,13,169,1inch.io
3,6,52287,219,443,0,1.728158e+09,872,2467,44,189,...,0,0,0,0,0,2.410077,10.352376,19,232,1inch.io
4,6,51187,266,443,0,1.728158e+09,12,23,13,23,...,0,0,0,0,0,10.436018,18.463725,6,35,1inch.io
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,6,56423,245,443,0,1.728162e+09,111,176,121,187,...,0,0,0,0,0,73.222658,113.162290,6,307,youradchoices
636,6,55393,245,443,0,1.728161e+09,107,158,118,168,...,0,0,0,0,0,54.077641,76.991896,7,285,youradchoices
637,6,55394,245,443,0,1.728161e+09,19,20,9,10,...,0,0,0,0,0,16.580206,18.422451,3,18,youradchoices
638,6,55447,245,443,0,1.728161e+09,174,420,186,430,...,0,0,0,0,0,51.714185,119.554299,6,615,youradchoices


In [5]:
# ===========================
# STEP 2B: ENCODE THE LABEL (TARGET) SEPARATELY
# ===========================
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['label'].astype(str))
test_df['label'] = label_encoder.transform(test_df['label'].astype(str))


In [6]:
# Split data into features (X) and target (y)
X_train = train_df.drop('label', axis=1)
y_train = train_df['label']
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']

In [7]:
# ===========================
# STEP 3: SMOTE
# ===========================
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)


In [8]:
# ===========================
# STEP 4: FEATURE SELECTION
# ===========================
# Taking One minute
print("\nSelecting features...")
X_train_selected = select_features(X_train_res, y_train_res, X_train_res.corr())
X_test_selected = X_test[X_train_selected.columns]


Selecting features...

Top 20 features by mutual information:
                       Feature     Score
3             Destination_port  4.481646
11              min_bwd_packet  4.294100
9               min_fwd_packet  3.974498
0                    Source_IP  3.966535
25        first_packet_sizes_0  3.630354
10              max_bwd_packet  3.490943
8              max_fwd_packet   3.442794
55             min_packet_size  3.419087
26        first_packet_sizes_1  3.337249
13                   SYN_count  3.255645
5                    Timestamp  3.131398
27        first_packet_sizes_2  2.981446
2               Destination_IP  2.652934
28        first_packet_sizes_3  2.547910
62                 bandwidth_1  2.518420
30        first_packet_sizes_5  2.331197
56             max_packet_size  2.252157
32        first_packet_sizes_7  1.350319
29        first_packet_sizes_4  1.331066
31        first_packet_sizes_6  1.307520
33        first_packet_sizes_8  1.097890
1                  Source_port  0.8

In [9]:
X_train_selected

Unnamed: 0,Destination_port,min_bwd_packet,min_fwd_packet,Source_IP,first_packet_sizes_0,max_bwd_packet,max_fwd_packet,min_packet_size,first_packet_sizes_1,SYN_count,...,first_packet_sizes_8,Source_port,bwd_packets_amount,fwd_packets_amount,count_big_requests,first_packet_sizes_9,first_packet_sizes_10,FIN_count,max_bwd_inter_arrival_time,first_packet_sizes_11
0,443,60,60,6,66,1454,1454,60,-66,2,...,60,52264,2465,870,25,118,737,0,15.212418,-60
1,443,60,60,6,66,1454,1454,60,-66,2,...,118,52286,2466,871,17,-60,1454,0,15.212418,183
2,443,65,85,3,1292,1242,1292,65,1292,0,...,1292,55125,2730,2281,13,94,127,0,6.619506,128
3,443,60,60,6,66,1454,1454,60,-66,2,...,60,52287,2467,872,19,118,1454,0,15.212418,197
4,443,60,60,6,66,1454,774,60,-66,2,...,-1270,51187,23,12,6,60,180,0,0.683254,211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,443,60,60,6,66,1454,1454,60,-66,2,...,-1454,56423,176,111,6,-1454,-979,0,0.185992,128
636,443,60,60,6,66,1454,1454,60,-66,2,...,-1454,55393,158,107,7,-1454,-979,0,0.803693,128
637,443,60,60,6,66,1454,1454,60,-66,2,...,-1454,55394,20,19,3,-1454,-979,2,0.136054,60
638,443,60,60,6,66,1454,1454,60,-66,2,...,-1454,55447,420,174,6,-1454,-979,0,2.046324,60


In [10]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

In [11]:
# ===========================
# STEP 5: FINAL MODEL
# ===========================

# Best Seed: 90
# Highest Accuracy: 0.8046875

# Best Seed: 119
# Highest Accuracy: 0.8015625

# Best Seed: 173, 252
# Highest Accuracy: 0.8

bagging_clf_final = BaggingClassifier( 
    n_estimators=800,
    max_samples=1.0,
    max_features=0.5,
    random_state=90,
    n_jobs=-1
)

bagging_clf_final.fit(X_train_scaled, y_train_res)
acc_final = accuracy_score(y_test, bagging_clf_final.predict(X_test_scaled))
print("Final Bagging Accuracy:", acc_final)


Final Bagging Accuracy: 0.8046875


In [23]:
from sklearn.ensemble import RandomForestClassifier

# Configure RandomForestClassifier
rf_clf_final = RandomForestClassifier(
    n_estimators=800,
    max_features=0.5,
    random_state=90,
    n_jobs=-1
)

# Fit the classifier
rf_clf_final.fit(X_train_scaled, y_train_res)

# Evaluate accuracy
acc_final_rf = accuracy_score(y_test, rf_clf_final.predict(X_test_scaled))
print("Final Random Forest Accuracy:", acc_final_rf)

# Get feature importances and select the top 15
feature_importances = rf_clf_final.feature_importances_
feature_names = X_train_selected.columns  # Assuming X_train_scaled is a DataFrame
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})

# Sort features by importance
top_features = importance_df.sort_values(by='Importance', ascending=False).head(15)

# Extract top 15 feature names into a list
top_15_features = top_features['Feature'].tolist()
print("Top 15 Features:", top_15_features)

# Optionally, inspect the importance DataFrame
print(top_features)

Final Random Forest Accuracy: 0.7546875
Top 15 Features: ['Timestamp', 'Destination_IP', 'Source_port', 'fwd_packets_amount', 'bwd_packets_amount', 'max_bwd_inter_arrival_time', 'bandwidth_1', 'first_packet_sizes_9', 'first_packet_sizes_10', 'first_packet_sizes_4', 'first_packet_sizes_11', 'first_packet_sizes_8', 'first_packet_sizes_7', 'count_big_requests', 'first_packet_sizes_6']
                       Feature  Importance
10                   Timestamp    0.199530
12              Destination_IP    0.122325
21                 Source_port    0.080117
23          fwd_packets_amount    0.050639
22          bwd_packets_amount    0.046202
28  max_bwd_inter_arrival_time    0.039927
14                 bandwidth_1    0.036230
25        first_packet_sizes_9    0.035328
26       first_packet_sizes_10    0.034560
18        first_packet_sizes_4    0.034242
29       first_packet_sizes_11    0.033200
20        first_packet_sizes_8    0.032533
17        first_packet_sizes_7    0.029889
24          c

In [26]:
# Filter the training and testing data to include only the top 15 features
X_train_top15 = X_train_selected[top_15_features]
X_test_top15 = X_test_selected[top_15_features]

X_train_top = scaler.fit_transform(X_train_top15)
X_test_top = scaler.transform(X_test_top15)

# Configure BaggingClassifier
bagging_clf_top15 = BaggingClassifier(
    n_estimators=800,
    max_samples=1.0,
    max_features=0.5,
    random_state=90,
    n_jobs=-1
)

# Train the classifier on the top 15 features
bagging_clf_top15.fit(X_train_top, y_train_res)

# Evaluate accuracy on the test set
acc_top15 = accuracy_score(y_test, bagging_clf_top15.predict(X_test_top))
print("Bagging Accuracy with Top 15 Features:", acc_top15)

Bagging Accuracy with Top 15 Features: 0.7734375
