In [188]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold, cross_val_score


#### Testing oversampling techniques

#### 1. SMOTE

In [189]:
# load resampled dataset 

resampled_smote = pd.read_csv('../data/processed/oversampled_smote.csv')
resampled_smote = resampled_smote.drop(columns='Unnamed: 0')

In [190]:
resampled_smote

Unnamed: 0,entropy,clicks_count,durations,click_frequency,speed_cv,hover_frequency,avg_hover_duration,acceleration_mean,acceleration_std,acceleration_skewness,...,day_Thursday,day_Tuesday,day_Wednesday,time_afternoon,time_evening,time_morning,time_night,time_wee_hours,hashed_url,label
0,3.121928,10,1606000878926,6.226647e-12,1.292750,3.265045,10044.058824,-97.987654,4076.554706,0.137302,...,0,897,0,0,0,897,0,0,47567188162,human
1,3.221097,19,677103,2.806072e-05,0.509696,0.965173,1951.275000,-3.079521,1211.355072,-0.234733,...,0,0,0,0,401,0,0,0,579555515964,advanced_bot
2,2.895424,17,1606000279969,1.058530e-11,0.517826,0.218432,966.923077,-94.009643,2813.920800,-0.017841,...,0,0,316,0,0,0,316,0,557692383271,advanced_bot
3,2.446439,10,141246,7.079847e-05,0.232604,3.927866,866.327044,-2.978588,616.098440,0.029983,...,0,0,109,109,0,0,0,0,664568366327,moderate_bot
4,3.452820,16,460995,3.470753e-05,0.508165,0.895245,2000.883117,-0.205002,1173.942929,-0.054982,...,0,0,0,0,283,0,0,0,1052243837978,advanced_bot
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,2.714056,18,1606000130361,1.154720e-11,0.309763,3.946631,409.879393,-20.385138,578.497009,0.754046,...,0,0,171,0,0,171,0,0,683800219037,moderate_bot
1496,2.744738,17,1606000183263,1.088278e-11,0.292088,3.968565,416.354278,-23.385499,535.726443,0.298905,...,0,0,223,0,0,223,0,0,683800219037,moderate_bot
1497,2.527200,11,114193,9.941933e-05,0.220264,4.006635,851.725138,-4.193722,559.007045,-0.125300,...,0,0,110,110,0,0,0,0,421559864556,moderate_bot
1498,2.647395,18,1606000221860,1.142866e-11,0.250707,3.916775,420.966705,-26.999719,542.643959,-0.040685,...,0,0,263,0,0,263,0,0,586627911530,moderate_bot


Get target column

In [191]:
X_smote = resampled_smote[[column for column in list(resampled_smote.columns) if column != 'label']]
y_smote = resampled_smote[['label']]

In [192]:
# Step 1: Split into train (60%) and temp (40%)
X_train_smote, X_temp_smote, y_train_smote, y_temp_smote = train_test_split(
    X_smote, y_smote, test_size=0.4, random_state=42)

# Step 2: Split temp into validation (20%) and test (20%)
X_val_smote, X_test_smote, y_val_smote, y_test_smote = train_test_split(
    X_temp_smote, y_temp_smote, test_size=0.5, random_state=42)

# Step 3: Standardize using only training data
scaler_smote = StandardScaler()
X_train_smote = scaler_smote.fit_transform(X_train_smote)
X_val_smote = scaler_smote.transform(X_val_smote)
X_test_smote = scaler_smote.transform(X_test_smote)

# Initialize KNN classifier with k=3
knn_smote = KNeighborsClassifier(n_neighbors=3)

# Train the model
knn_smote.fit(X_train_smote, y_train_smote)


y_val_pred_smote = knn_smote.predict(X_val_smote)
val_accuracy_smote = accuracy_score(y_val_smote, y_val_pred_smote)
print(f'Validation Accuracy: {val_accuracy_smote:.2f}')


y_test_pred_smote= knn_smote.predict(X_test_smote)
test_accuracy_smote = accuracy_score(y_test_smote, y_test_pred_smote)
print(f'Test Accuracy: {test_accuracy_smote:.2f}')

Validation Accuracy: 1.00
Test Accuracy: 1.00


  return self._fit(X, y)


#### With CV

In [193]:
# Define KNN model with preprocessing pipeline
# StandardScaler is usually helpful for KNN
knn_smote_pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))

# K-Fold setup
kfold_smote = KFold(n_splits=5, shuffle=True, random_state=42)

cv_scores_smote = cross_val_score(knn_smote_pipe, X_train_smote, y_train_smote, cv=kfold_smote, scoring='accuracy')

print(f"CV Fold Accuracies (on training): {cv_scores_smote}")
print(f"Mean CV Accuracy: {np.mean(cv_scores_smote):.4f} ± {np.std(cv_scores_smote):.4f}")

# Step 5: Fit model on full training set
knn_smote_pipe.fit(X_train_smote, y_train_smote)

# Step 6: Evaluate on validation set
val_accuracy_smote = knn_smote_pipe.score(X_val_smote, y_val_smote)
print(f"Validation Accuracy: {val_accuracy_smote:.4f}")

# Step 7: Final evaluation on test set (optional, only after model selection)
test_accuracy_smote = knn_smote.score(X_test_smote, y_test_smote)
print(f"Test Accuracy: {test_accuracy_smote:.4f}")

CV Fold Accuracies (on training): [1. 1. 1. 1. 1.]
Mean CV Accuracy: 1.0000 ± 0.0000
Validation Accuracy: 1.0000
Test Accuracy: 1.0000


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


The dataset is not particularly unbalanced, so there is no need for balancing techniques.

In [194]:
print(len(resampled_smote[resampled_smote['label']== 'human']))
print(len(resampled_smote[resampled_smote['label']== 'moderate_bot']))
print(len(resampled_smote[resampled_smote['label']== 'advanced_bot']))

500
500
500


#### 2. Borderline SMOTE

In [195]:
# load resampled dataset 

resampled_bsmote = pd.read_csv('../data/processed/oversampled_borderlinesmote.csv')
resampled_bsmote = resampled_bsmote.drop(columns='Unnamed: 0')

In [196]:
resampled_bsmote

Unnamed: 0,entropy,clicks_count,durations,click_frequency,speed_cv,hover_frequency,avg_hover_duration,acceleration_mean,acceleration_std,acceleration_skewness,...,day_Thursday,day_Tuesday,day_Wednesday,time_afternoon,time_evening,time_morning,time_night,time_wee_hours,hashed_url,label
0,3.121928,10,1606000878926,6.226647e-12,1.292750,3.265045,10044.058824,-97.987654,4076.554706,0.137302,...,0,897,0,0,0,897,0,0,47567188162,human
1,3.221097,19,677103,2.806072e-05,0.509696,0.965173,1951.275000,-3.079521,1211.355072,-0.234733,...,0,0,0,0,401,0,0,0,579555515964,advanced_bot
2,2.895424,17,1606000279969,1.058530e-11,0.517826,0.218432,966.923077,-94.009643,2813.920800,-0.017841,...,0,0,316,0,0,0,316,0,557692383271,advanced_bot
3,2.446439,10,141246,7.079847e-05,0.232604,3.927866,866.327044,-2.978588,616.098440,0.029983,...,0,0,109,109,0,0,0,0,664568366327,moderate_bot
4,3.452820,16,460995,3.470753e-05,0.508165,0.895245,2000.883117,-0.205002,1173.942929,-0.054982,...,0,0,0,0,283,0,0,0,1052243837978,advanced_bot
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,2.874756,15,1606000194188,9.438873e-12,0.222493,3.918420,424.137514,-19.998258,539.740959,0.449709,...,0,0,235,0,0,235,0,0,683800219037,moderate_bot
1496,2.723549,10,161714,6.191525e-05,0.227783,4.065737,732.065938,-0.495459,643.002866,0.202609,...,0,0,128,128,0,0,0,0,1052243837978,moderate_bot
1497,2.595587,13,1606000053744,8.094645e-12,0.472454,4.005811,413.749600,-28.293885,741.233515,1.713755,...,0,0,86,0,0,86,0,0,683800219037,moderate_bot
1498,2.815075,18,1606000134701,1.130401e-11,0.288829,3.971796,406.622554,-20.893619,670.887626,0.376628,...,0,0,176,0,0,176,0,0,1076285962431,moderate_bot


Get target column

In [197]:
X_bsmote = resampled_bsmote[[column for column in list(resampled_bsmote.columns) if column != 'label']]
y_bsmote = resampled_bsmote[['label']]

In [198]:
# Step 1: Split into train (60%) and temp (40%)
X_train_bsmote, X_temp_bsmote, y_train_bsmote, y_temp_bsmote = train_test_split(
    X_bsmote, y_bsmote, test_size=0.4, random_state=42)

# Step 2: Split temp into validation (20%) and test (20%)
X_val_bsmote, X_test_bsmote, y_val_bsmote, y_test_bsmote = train_test_split(
    X_temp_bsmote, y_temp_bsmote, test_size=0.5, random_state=42)

# Step 3: Standardize using only training data
scaler_bsmote = StandardScaler()
X_train_bsmote = scaler_bsmote.fit_transform(X_train_bsmote)
X_val_bsmote = scaler_bsmote.transform(X_val_bsmote)
X_test_bsmote = scaler_bsmote.transform(X_test_bsmote)

# Initialize KNN classifier with k=3
knn_bsmote = KNeighborsClassifier(n_neighbors=3)

# Train the model
knn_bsmote.fit(X_train_bsmote, y_train_bsmote)


y_val_pred_bsmote = knn_bsmote.predict(X_val_bsmote)
val_accuracy_bsmote = accuracy_score(y_val_bsmote, y_val_pred_bsmote)
print(f'Validation Accuracy: {val_accuracy_bsmote:.2f}')


y_test_pred_bsmote= knn_bsmote.predict(X_test_bsmote)
test_accuracy_bsmote = accuracy_score(y_test_bsmote, y_test_pred_bsmote)
print(f'Test Accuracy: {test_accuracy_bsmote:.2f}')

Validation Accuracy: 1.00
Test Accuracy: 1.00


  return self._fit(X, y)


#### With CV

In [199]:
# Define KNN model with preprocessing pipeline
# StandardScaler is usually helpful for KNN
knn_bsmote_pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))

# K-Fold setup
kfold_bsmote = KFold(n_splits=5, shuffle=True, random_state=42)

cv_scores_bsmote = cross_val_score(knn_bsmote_pipe, X_train_bsmote, y_train_bsmote, cv=kfold_bsmote, scoring='accuracy')

print(f"CV Fold Accuracies (on training): {cv_scores_bsmote}")
print(f"Mean CV Accuracy: {np.mean(cv_scores_bsmote):.4f} ± {np.std(cv_scores_bsmote):.4f}")

# Step 5: Fit model on full training set
knn_bsmote_pipe.fit(X_train_bsmote, y_train_bsmote)

# Step 6: Evaluate on validation set
val_accuracy_bsmote = knn_bsmote_pipe.score(X_val_bsmote, y_val_bsmote)
print(f"Validation Accuracy: {val_accuracy_bsmote:.4f}")

# Step 7: Final evaluation on test set (optional, only after model selection)
test_accuracy_bsmote = knn_bsmote.score(X_test_bsmote, y_test_bsmote)
print(f"Test Accuracy: {test_accuracy_bsmote:.4f}")

  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


CV Fold Accuracies (on training): [0.98888889 0.99444444 1.         1.         0.99444444]
Mean CV Accuracy: 0.9956 ± 0.0042
Validation Accuracy: 1.0000
Test Accuracy: 0.9967


The dataset is not particularly unbalanced, so there is no need for balancing techniques.

In [200]:
print(len(resampled_bsmote[resampled_bsmote['label']== 'human']))
print(len(resampled_bsmote[resampled_bsmote['label']== 'moderate_bot']))
print(len(resampled_bsmote[resampled_bsmote['label']== 'advanced_bot']))

500
500
500


#### 3. ADASYN

In [201]:
# load resampled dataset 

resampled_ada = pd.read_csv('../data/processed/oversampled_adasyn.csv')
resampled_ada = resampled_ada.drop(columns='Unnamed: 0')

In [202]:
resampled_ada

Unnamed: 0,entropy,clicks_count,durations,click_frequency,speed_cv,hover_frequency,avg_hover_duration,acceleration_mean,acceleration_std,acceleration_skewness,...,day_Thursday,day_Tuesday,day_Wednesday,time_afternoon,time_evening,time_morning,time_night,time_wee_hours,hashed_url,label
0,3.121928,10,1606000878926,6.226647e-12,1.292750,3.265045,10044.058824,-97.987654,4076.554706,0.137302,...,0,897,0,0,0,897,0,0,47567188162,human
1,3.221097,19,677103,2.806072e-05,0.509696,0.965173,1951.275000,-3.079521,1211.355072,-0.234733,...,0,0,0,0,401,0,0,0,579555515964,advanced_bot
2,2.895424,17,1606000279969,1.058530e-11,0.517826,0.218432,966.923077,-94.009643,2813.920800,-0.017841,...,0,0,316,0,0,0,316,0,557692383271,advanced_bot
3,2.446439,10,141246,7.079847e-05,0.232604,3.927866,866.327044,-2.978588,616.098440,0.029983,...,0,0,109,109,0,0,0,0,664568366327,moderate_bot
4,3.452820,16,460995,3.470753e-05,0.508165,0.895245,2000.883117,-0.205002,1173.942929,-0.054982,...,0,0,0,0,283,0,0,0,1052243837978,advanced_bot
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1483,2.346064,16,1606000052907,1.008221e-11,0.350372,3.597354,428.121482,-19.421949,619.620984,0.859266,...,0,0,93,0,0,93,0,0,51475110751,moderate_bot
1484,2.713247,18,1606000127245,1.161153e-11,0.305007,3.844336,410.805781,-24.127927,612.163808,0.722124,...,0,0,168,0,0,168,0,0,683800219037,moderate_bot
1485,3.129985,19,1606000134344,1.243061e-11,0.318116,4.047347,396.654471,-14.560980,612.391732,1.473343,...,0,0,176,0,0,176,0,0,683800219037,moderate_bot
1486,2.591303,14,1606000082051,8.915743e-12,0.391006,3.971701,403.469148,-20.380032,662.031042,0.822899,...,0,0,115,0,0,115,0,0,427006947181,moderate_bot


Get target column

In [203]:
X_ada = resampled_ada[[column for column in list(resampled_ada.columns) if column != 'label']]
y_ada = resampled_ada[['label']]

In [204]:
# Step 1: Split into train (60%) and temp (40%)
X_train_ada, X_temp_ada, y_train_ada, y_temp_ada = train_test_split(
    X_ada, y_ada, test_size=0.4, random_state=42)

# Step 2: Split temp into validation (20%) and test (20%)
X_val_ada, X_test_ada, y_val_ada, y_test_ada = train_test_split(
    X_temp_ada, y_temp_ada, test_size=0.5, random_state=42)

# Step 3: Standardize using only training data
scaler_ada = StandardScaler()
X_train_ada = scaler_ada.fit_transform(X_train_ada)
X_val_ada = scaler_ada.transform(X_val_ada)
X_test_ada = scaler_ada.transform(X_test_ada)

# Initialize KNN classifier with k=3
knn_ada = KNeighborsClassifier(n_neighbors=3)

# Train the model
knn_ada.fit(X_train_ada, y_train_ada)


y_val_pred_ada = knn_ada.predict(X_val_ada)
val_accuracy_ada = accuracy_score(y_val_ada, y_val_pred_ada)
print(f'Validation Accuracy: {val_accuracy_ada:.2f}')


y_test_pred_ada = knn_ada.predict(X_test_ada)
test_accuracy_ada = accuracy_score(y_test_ada, y_test_pred_ada)
print(f'Test Accuracy: {test_accuracy_ada:.2f}')

Validation Accuracy: 1.00
Test Accuracy: 1.00


  return self._fit(X, y)


#### With CV

In [205]:
# Define KNN model with preprocessing pipeline
# StandardScaler is usually helpful for KNN
knn_ada_pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))

# K-Fold setup
kfold_ada = KFold(n_splits=5, shuffle=True, random_state=42)

cv_scores_ada = cross_val_score(knn_ada_pipe, X_train_ada, y_train_ada, cv=kfold_ada, scoring='accuracy')

print(f"CV Fold Accuracies (on training): {cv_scores_ada}")
print(f"Mean CV Accuracy: {np.mean(cv_scores_ada):.4f} ± {np.std(cv_scores_ada):.4f}")

# Step 5: Fit model on full training set
knn_ada_pipe.fit(X_train_ada, y_train_ada)

# Step 6: Evaluate on validation set
val_accuracy_ada = knn_ada_pipe.score(X_val_ada, y_val_ada)
print(f"Validation Accuracy: {val_accuracy_ada:.4f}")

# Step 7: Final evaluation on test set (optional, only after model selection)
test_accuracy_ada = knn_ada_pipe.score(X_test_ada, y_test_ada)
print(f"Test Accuracy: {test_accuracy_ada:.4f}")

CV Fold Accuracies (on training): [0.99441341 1.         0.99438202 1.         0.99438202]
Mean CV Accuracy: 0.9966 ± 0.0027
Validation Accuracy: 0.9966
Test Accuracy: 1.0000


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


The dataset is not particularly unbalanced, so there is no need for balancing techniques.

In [206]:
print(len(resampled_smote[resampled_smote['label']== 'human']))
print(len(resampled_smote[resampled_smote['label']== 'moderate_bot']))
print(len(resampled_smote[resampled_smote['label']== 'advanced_bot']))

500
500
500
