In [91]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# from sklearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold, cross_val_score
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import cross_validate


In [92]:
data = pd.read_csv('../data/processed/data.csv')
data = data.drop(columns='session_id')
# data = data.drop(columns=['Unnamed: 0', 'url', 'referrer', 'session_id'])

In [93]:
X = data[[column for column in list(data.columns) if column != 'label' and column != 'new_label' and column not in ['acceleration_pos_neg_ratio','acceleration_std', 'time_morning','durations','hover_frequency','speed_cv']]]
y = data[['label']]

#### Testing oversampling techniques

#### 1. SMOTE

Get target column

In [94]:
# Step 1: Split into train (60%) and temp (40%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y)

# Step 2: Split temp into validation (20%) and test (20%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Step 3: Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Step 3: Standardize using only training data
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_val = scaler.transform(X_val)
X_test= scaler.transform(X_test)

# Initialize KNN classifier with k=3
knn = KNeighborsClassifier(n_neighbors=3)

# Train the model
knn.fit(X_train_resampled, y_train_resampled)


y_val_pred = knn.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.2f}')


y_test_pred = knn.predict(X_test)
test_accuracy= accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.2f}')

Validation Accuracy: 0.96
Test Accuracy: 0.98


  return self._fit(X, y)


#### With CV

In [95]:
# Define KNN model with preprocessing pipeline
# StandardScaler is usually helpful for KNN
knn_pipe = Pipeline([('scaler', StandardScaler()), ('smote', SMOTE(random_state=42)), ('knn', KNeighborsClassifier())])

# K-Fold setup
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# cv_scores = cross_val_score(knn_pipe, X_train, y_train, cv=kfold, scoring='accuracy')

# print(f"CV Fold Accuracies (on training): {cv_scores}")
# print(f"Mean CV Accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

# Step 5: Fit model on full training set
# knn_pipe.fit(X_train_resampled, y_train_resampled)

# Step 6: Evaluate on validation set
# scores = cross_val_score(knn_pipe, X, y, cv=kfold, scoring='accuracy')
scores = cross_validate(knn_pipe, X, y, cv=kfold, scoring='accuracy', return_train_score=True)
print(f"CV Fold Accuracies (on training): {scores['train_score']}")
print(f"Mean CV Accuracy: {np.mean(scores['train_score']):.4f} ± {np.std(scores['train_score']):.4f}")
print(f"CV Fold Accuracies (on test): {scores['test_score']}")
print(f"Mean CV Accuracy: {np.mean(scores['test_score']):.4f} ± {np.std(scores['test_score']):.4f}")


# Step 7: Final evaluation on test set (optional, only after model selection)
# test_accuracy_smote = knn_smote_pipe.score(X_test, y_test)
# print(f"Test Accuracy: {test_accuracy_smote:.4f}")

CV Fold Accuracies (on training): [0.99442897 0.99164345 0.99164345 0.99442897 0.98888889]
Mean CV Accuracy: 0.9922 ± 0.0021
CV Fold Accuracies (on test): [0.97777778 0.98888889 0.96666667 1.         0.94382022]
Mean CV Accuracy: 0.9754 ± 0.0193


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


#### 2. Borderline SMOTE

In [96]:
# Step 1: Split into train (60%) and temp (40%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y)

# Step 2: Split temp into validation (20%) and test (20%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Step 3: Apply SMOTE only on the training set
bsmote = BorderlineSMOTE(random_state=42)
X_train_resampled, y_train_resampled = bsmote.fit_resample(X_train, y_train)

# Step 3: Standardize using only training data
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_val = scaler.transform(X_val)
X_test= scaler.transform(X_test)

# Initialize KNN classifier with k=3
knn = KNeighborsClassifier(n_neighbors=3)

# Train the model
knn.fit(X_train_resampled, y_train_resampled)


y_val_pred = knn.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.2f}')


y_test_pred = knn.predict(X_test)
test_accuracy= accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.2f}')

Validation Accuracy: 0.94
Test Accuracy: 0.99


  return self._fit(X, y)


#### With CV

In [97]:
# Define KNN model with preprocessing pipeline
# StandardScaler is usually helpful for KNN
knn_pipe = Pipeline([('scaler', StandardScaler()), ('borderline_smote', BorderlineSMOTE(random_state=42)), ('knn', KNeighborsClassifier())])

# K-Fold setup
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# cv_scores = cross_val_score(knn_pipe, X_train, y_train, cv=kfold, scoring='accuracy')

# print(f"CV Fold Accuracies (on training): {cv_scores}")
# print(f"Mean CV Accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

# Step 5: Fit model on full training set
# knn_pipe.fit(X_train_resampled, y_train_resampled)

# Step 6: Evaluate on validation set
# scores = cross_val_score(knn_pipe, X, y, cv=kfold, scoring='accuracy')
scores = cross_validate(knn_pipe, X, y, cv=kfold, scoring='accuracy', return_train_score=True)
print(f"CV Fold Accuracies (on training): {scores['train_score']}")
print(f"Mean CV Accuracy: {np.mean(scores['train_score']):.4f} ± {np.std(scores['train_score']):.4f}")
print(f"CV Fold Accuracies (on test): {scores['test_score']}")
print(f"Mean CV Accuracy: {np.mean(scores['test_score']):.4f} ± {np.std(scores['test_score']):.4f}")


# Step 7: Final evaluation on test set (optional, only after model selection)
# test_accuracy_smote = knn_smote_pipe.score(X_test, y_test)
# print(f"Test Accuracy: {test_accuracy_smote:.4f}")

  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


CV Fold Accuracies (on training): [0.98885794 0.99721448 0.99164345 0.99442897 0.99444444]
Mean CV Accuracy: 0.9933 ± 0.0028
CV Fold Accuracies (on test): [0.97777778 0.98888889 0.98888889 1.         0.95505618]
Mean CV Accuracy: 0.9821 ± 0.0152


  return self._fit(X, y)


#### 3. ADASYN

In [98]:
# Step 1: Split into train (60%) and temp (40%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y)

# Step 2: Split temp into validation (20%) and test (20%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Step 3: Apply ADASYN only on the training set
# adasyn = ADASYN(sampling_strategy={'human':500,'advanced_bot':500,'moderate_bot':500}, random_state=42)
adasyn = ADASYN(sampling_strategy='minority',random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

# Step 3: Standardize using only training data
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_val = scaler.transform(X_val)
X_test= scaler.transform(X_test)

# Initialize KNN classifier with k=3
knn = KNeighborsClassifier(n_neighbors=3)

# Train the model
knn.fit(X_train_resampled, y_train_resampled)


y_val_pred = knn.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.2f}')


y_test_pred = knn.predict(X_test)
test_accuracy= accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.2f}')

Validation Accuracy: 0.96
Test Accuracy: 0.98


  return self._fit(X, y)


#### With CV

In [99]:
# Define KNN model with preprocessing pipeline
# StandardScaler is usually helpful for KNN
knn_pipe = Pipeline([('scaler', StandardScaler()), ('adasyn', ADASYN(sampling_strategy='minority',random_state=42)), ('knn', KNeighborsClassifier())])

# K-Fold setup
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# cv_scores = cross_val_score(knn_pipe, X_train, y_train, cv=kfold, scoring='accuracy')

# print(f"CV Fold Accuracies (on training): {cv_scores}")
# print(f"Mean CV Accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

# Step 5: Fit model on full training set
# knn_pipe.fit(X_train_resampled, y_train_resampled)

# Step 6: Evaluate on validation set
# scores = cross_val_score(knn_pipe, X, y, cv=kfold, scoring='accuracy')
scores = cross_validate(knn_pipe, X, y, cv=kfold, scoring='accuracy', return_train_score=True)
print(f"CV Fold Accuracies (on training): {scores['train_score']}")
print(f"Mean CV Accuracy: {np.mean(scores['train_score']):.4f} ± {np.std(scores['train_score']):.4f}")
print(f"CV Fold Accuracies (on test): {scores['test_score']}")
print(f"Mean CV Accuracy: {np.mean(scores['test_score']):.4f} ± {np.std(scores['test_score']):.4f}")

# Step 7: Final evaluation on test set (optional, only after model selection)
# test_accuracy_smote = knn_smote_pipe.score(X_test, y_test)
# print(f"Test Accuracy: {test_accuracy_smote:.4f}")

  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


CV Fold Accuracies (on training): [0.99752475 0.99752475 0.99752475 0.99752475 0.99752475 0.99752475
 0.9950495  0.99752475 0.99752475 0.99506173]
Mean CV Accuracy: 0.9970 ± 0.0010
CV Fold Accuracies (on test): [1.         0.97777778 0.97777778 1.         1.         1.
 1.         1.         0.97777778 1.        ]
Mean CV Accuracy: 0.9933 ± 0.0102


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
