In [240]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold, cross_val_score
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import ADASYN


In [241]:
# Read the processed dataset and, in the same expression, drop the four
# columns that are not kept for modelling ('Unnamed: 0' is presumably a
# leftover index column from an earlier to_csv call - TODO confirm).
data = (
    pd.read_csv('../data/processed/data.csv')
    .drop(columns=['Unnamed: 0', 'url', 'referrer', 'session_id'])
)

In [242]:
# Feature matrix: every column of `data` except the target column 'label'.
X = data.drop(columns=['label'])

# Target as a 1-D Series. Selecting with data[['label']] would give a
# one-column DataFrame, which makes scikit-learn estimators emit a
# "column-vector y was passed when a 1d array was expected" warning on
# every fit.
y = data['label']

#### Testing oversampling techniques

#### 1. SMOTE

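Split the data into train/validation/test sets, oversample only the training split with SMOTE, then fit and evaluate a k-NN classifier.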

In [243]:
# Step 1: Hold out 40% of the data; the remaining 60% is the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42)

# Step 2: Halve the hold-out into validation (20%) and test (20%).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42)

# Step 3: Oversample with SMOTE on the training split only; the validation
# and test splits are left untouched.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Step 4: Standardize with statistics fitted on the resampled training data
# only. The scaled copies get their own names so that X_train_resampled,
# X_val and X_test stay unscaled: a later cell feeds them to a pipeline that
# has its own StandardScaler, and overwriting them here would make that
# pipeline standardize already standardized data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Step 5: Fit k-NN with k=3. np.ravel flattens the target to 1-D, which
# avoids the column-vector warning when y is a one-column DataFrame and is a
# no-op when y is already 1-D.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled, np.ravel(y_train_resampled))

# Step 6: Accuracy on the validation split.
y_val_pred = knn.predict(X_val_scaled)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.2f}')

# Step 7: Accuracy on the test split.
y_test_pred = knn.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.2f}')

Validation Accuracy: 0.97
Test Accuracy: 1.00


  return self._fit(X, y)


#### With CV

In [244]:
# k-NN (k=3) behind a StandardScaler, so the scaler is re-fitted on the
# training part of every CV fold.
knn_pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))

# 5-fold CV with shuffling; the fixed seed keeps the folds reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Flatten the target to 1-D so neither cross_val_score nor fit emits the
# column-vector warning (no-op when the target is already 1-D).
y_train_resampled_1d = np.ravel(y_train_resampled)

# NOTE: X_train_resampled was oversampled before the folds are cut, so
# synthetic points can end up in a different fold than the real points they
# were interpolated from; treat the CV accuracy as potentially optimistic.
cv_scores = cross_val_score(
    knn_pipe, X_train_resampled, y_train_resampled_1d,
    cv=kfold, scoring='accuracy')

print(f"CV Fold Accuracies (on training): {cv_scores}")
print(f"Mean CV Accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

# Refit the pipeline on the full resampled training set.
knn_pipe.fit(X_train_resampled, y_train_resampled_1d)

# Accuracy on the validation split produced by the preceding cell.
val_accuracy = knn_pipe.score(X_val, y_val)
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Final evaluation on the test set (optional, only after model selection).
# Uses knn_pipe, the pipeline fitted above.
# test_accuracy_smote = knn_pipe.score(X_test, y_test)
# print(f"Test Accuracy: {test_accuracy_smote:.4f}")

CV Fold Accuracies (on training): [0.98461538 1.         1.         0.98461538 1.        ]
Mean CV Accuracy: 0.9938 ± 0.0075
Validation Accuracy: 0.9667


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


#### 2. Borderline SMOTE

In [245]:
# Step 1: Hold out 40% of the data; the remaining 60% is the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42)

# Step 2: Halve the hold-out into validation (20%) and test (20%).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42)

# Step 3: Oversample with Borderline-SMOTE on the training split only; the
# validation and test splits are left untouched.
bsmote = BorderlineSMOTE(random_state=42)
X_train_resampled, y_train_resampled = bsmote.fit_resample(X_train, y_train)

# Step 4: Standardize with statistics fitted on the resampled training data
# only. The scaled copies get their own names so that X_train_resampled,
# X_val and X_test stay unscaled: a later cell feeds them to a pipeline that
# has its own StandardScaler, and overwriting them here would make that
# pipeline standardize already standardized data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Step 5: Fit k-NN with k=3. np.ravel flattens the target to 1-D, which
# avoids the column-vector warning when y is a one-column DataFrame and is a
# no-op when y is already 1-D.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled, np.ravel(y_train_resampled))

# Step 6: Accuracy on the validation split.
y_val_pred = knn.predict(X_val_scaled)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.2f}')

# Step 7: Accuracy on the test split.
y_test_pred = knn.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.2f}')

Validation Accuracy: 0.98
Test Accuracy: 1.00


  return self._fit(X, y)


#### With CV

In [246]:
# k-NN (k=3) behind a StandardScaler, so the scaler is re-fitted on the
# training part of every CV fold.
knn_pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))

# 5-fold CV with shuffling; the fixed seed keeps the folds reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Flatten the target to 1-D so neither cross_val_score nor fit emits the
# column-vector warning (no-op when the target is already 1-D).
y_train_resampled_1d = np.ravel(y_train_resampled)

# NOTE: X_train_resampled was oversampled before the folds are cut, so
# synthetic points can end up in a different fold than the real points they
# were interpolated from; treat the CV accuracy as potentially optimistic.
cv_scores = cross_val_score(
    knn_pipe, X_train_resampled, y_train_resampled_1d,
    cv=kfold, scoring='accuracy')

print(f"CV Fold Accuracies (on training): {cv_scores}")
print(f"Mean CV Accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

# Refit the pipeline on the full resampled training set.
knn_pipe.fit(X_train_resampled, y_train_resampled_1d)

# Accuracy on the validation split produced by the preceding cell.
val_accuracy = knn_pipe.score(X_val, y_val)
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Final evaluation on the test set (optional, only after model selection).
# Uses knn_pipe, the pipeline fitted above.
# test_accuracy_bsmote = knn_pipe.score(X_test, y_test)
# print(f"Test Accuracy: {test_accuracy_bsmote:.4f}")

CV Fold Accuracies (on training): [0.98461538 1.         0.96923077 0.98461538 1.        ]
Mean CV Accuracy: 0.9877 ± 0.0115
Validation Accuracy: 0.9778


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


#### 3. ADASYN

In [247]:
# Step 1: Hold out 40% of the data; the remaining 60% is the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42)

# Step 2: Halve the hold-out into validation (20%) and test (20%).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42)

# Step 3: Oversample with ADASYN on the training split only; the validation
# and test splits are left untouched. The sampling_strategy dict requests
# 500 samples for each of the three listed classes.
adasyn = ADASYN(sampling_strategy={'human':500,'advanced_bot':500,'moderate_bot':500}, random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

# Step 4: Standardize with statistics fitted on the resampled training data
# only. The scaled copies get their own names so that X_train_resampled,
# X_val and X_test stay unscaled: a later cell feeds them to a pipeline that
# has its own StandardScaler, and overwriting them here would make that
# pipeline standardize already standardized data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Step 5: Fit k-NN with k=3. np.ravel flattens the target to 1-D, which
# avoids the column-vector warning when y is a one-column DataFrame and is a
# no-op when y is already 1-D.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled, np.ravel(y_train_resampled))

# Step 6: Accuracy on the validation split.
y_val_pred = knn.predict(X_val_scaled)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.2f}')

# Step 7: Accuracy on the test split.
y_test_pred = knn.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.2f}')

Validation Accuracy: 0.98
Test Accuracy: 1.00


  return self._fit(X, y)


#### With CV

In [248]:
# k-NN (k=3) behind a StandardScaler, so the scaler is re-fitted on the
# training part of every CV fold.
knn_pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))

# 5-fold CV with shuffling; the fixed seed keeps the folds reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Flatten the target to 1-D so neither cross_val_score nor fit emits the
# column-vector warning (no-op when the target is already 1-D).
y_train_resampled_1d = np.ravel(y_train_resampled)

# NOTE: X_train_resampled was oversampled before the folds are cut, so
# synthetic points can end up in a different fold than the real points they
# were generated from; treat the CV accuracy as potentially optimistic.
cv_scores = cross_val_score(
    knn_pipe, X_train_resampled, y_train_resampled_1d,
    cv=kfold, scoring='accuracy')

print(f"CV Fold Accuracies (on training): {cv_scores}")
print(f"Mean CV Accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

# Refit the pipeline on the full resampled training set.
knn_pipe.fit(X_train_resampled, y_train_resampled_1d)

# Accuracy on the validation split produced by the preceding cell.
val_accuracy = knn_pipe.score(X_val, y_val)
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Final evaluation on the test set (optional, only after model selection).
# Uses knn_pipe, the pipeline fitted above.
# test_accuracy_adasyn = knn_pipe.score(X_test, y_test)
# print(f"Test Accuracy: {test_accuracy_adasyn:.4f}")

  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


CV Fold Accuracies (on training): [1.         0.99668874 1.         0.99668874 0.99668874]
Mean CV Accuracy: 0.9980 ± 0.0016
Validation Accuracy: 0.9778


  return self._fit(X, y)
