In [13]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN, BorderlineSMOTE

In [3]:
# Load the processed dataset from disk.
data = pd.read_csv('../data/processed/data.csv')

# Columns kept out of the feature matrix: the target ('new_label') plus the
# other columns this notebook chooses not to train on.
excluded_columns = [
    'label', 'session_id', 'new_label', 'entropy',
    'acceleration_pos_neg_ratio', 'acceleration_std',
    'clicks_count', 'durations', 'hover_frequency', 'speed_cv',
]

# errors='ignore' mirrors the original filter, which silently skipped any
# excluded name that is not present in the file.
X = data.drop(columns=excluded_columns, errors='ignore')

# Select the target as a 1-D Series. A one-column DataFrame (data[['new_label']])
# is a 2-D column vector and makes scikit-learn emit DataConversionWarning
# ("column_or_1d(y, warn=True)") on every fit call.
y = data['new_label']


In [7]:
# Stratified hold-out split: 40% of the rows go to the test set, with the
# class proportions of y preserved in both parts. Seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
    stratify=y,
)


In [8]:
# Best decision-tree pipeline: ADASYN oversampling followed by the tree.
# The sampler is a pipeline step (imblearn Pipeline), so it is applied while
# fitting and not when predicting.
adasyn_sampler = ADASYN(sampling_strategy='minority', random_state=42)
balanced_tree = DecisionTreeClassifier(random_state=42, class_weight='balanced')

tree_pipe_adasyn = Pipeline(steps=[
    ('adasyn', adasyn_sampler),
    ('tree', balanced_tree),
])

In [10]:
# Fit the standalone decision-tree pipeline on the whole training split.
tree_pipe_adasyn.fit(X=X_train, y=y_train)

In [14]:
# Best KNN pipeline: standardise features, oversample the minority class with
# Borderline-SMOTE, then classify with k-nearest neighbours.
best_k = 5  # number of neighbours used by the KNN step

knn_steps = [
    ('scaler', StandardScaler()),
    ('borderline_smote', BorderlineSMOTE(sampling_strategy='minority', random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=best_k)),
]
knn_pipe_borderline = Pipeline(steps=knn_steps)

In [15]:
# Fit the standalone KNN pipeline on the whole training split.
knn_pipe_borderline.fit(X=X_train, y=y_train)

  return self._fit(X, y)


In [16]:
# Final voting ensemble made of the two tuned pipelines.
voting_members = [
    ('decision_tree_adasyn', tree_pipe_adasyn),
    ('knn_borderline', knn_pipe_borderline),
]

# Soft voting combines the members' predicted class probabilities rather
# than their hard class votes.
ensemble_model = VotingClassifier(estimators=voting_members, voting='soft')

In [17]:
# Fit the voting ensemble on the training split.
ensemble_model.fit(X=X_train, y=y_train)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [18]:
# Predict and evaluate the ensemble on the held-out test split.
y_pred_ensemble = ensemble_model.predict(X_test)

# Column 1 of predict_proba is the probability of ensemble_model.classes_[1].
# NOTE(review): roc_auc_score is expected to treat that same (greater) label
# as the positive class for a binary string target -- confirm if labels change.
y_prob_ensemble = ensemble_model.predict_proba(X_test)[:, 1]

# Summary metrics. The F1 score is weighted by class support.
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
ensemble_f1 = f1_score(y_test, y_pred_ensemble, average='weighted')
ensemble_roc_auc = roc_auc_score(y_test, y_prob_ensemble)

print(f"Ensemble Test Accuracy: {ensemble_accuracy:.4f}")
print(f"Ensemble F1-Score (Weighted): {ensemble_f1:.4f}")
print(f"Ensemble ROC AUC: {ensemble_roc_auc:.4f}")

# Per-class precision / recall / F1. The recorded cell output includes this
# report, so print it here to keep the code and its output in sync.
print("\nClassification Report (Ensemble):")
print(classification_report(y_test, y_pred_ensemble))


Ensemble Test Accuracy: 0.9889
Ensemble F1-Score (Weighted): 0.9888
Ensemble ROC AUC: 0.9998

Classification Report (Ensemble):
              precision    recall  f1-score   support

         bot       0.99      1.00      0.99       136
       human       1.00      0.95      0.98        44

    accuracy                           0.99       180
   macro avg       0.99      0.98      0.98       180
weighted avg       0.99      0.99      0.99       180

