In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, roc_auc_score, recall_score, accuracy_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Load oversampled and undersampled data
oversampled_data = pd.read_csv('/content/drive/MyDrive/oversampled_data.csv')
undersampled_data = pd.read_csv('/content/drive/MyDrive/undersampled_data.csv')

# Features and target for oversampled data
X_smote = oversampled_data.drop('fraud', axis=1)
y_smote = oversampled_data['fraud']

# Features and target for undersampled data
X_rus = undersampled_data.drop('fraud', axis=1)
y_rus = undersampled_data['fraud']

# Train-test split for oversampled data
X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_smote, y_smote, test_size=0.3, random_state=42, stratify=y_smote)

# Train-test split for undersampled data
X_train_rus, X_test_rus, y_train_rus, y_test_rus = train_test_split(X_rus, y_rus, test_size=0.3, random_state=42, stratify=y_rus)

# Apply Standard Scaling to the features
scaler = StandardScaler()
X_train_smote = scaler.fit_transform(X_train_smote)
X_test_smote = scaler.transform(X_test_smote)
X_train_rus = scaler.fit_transform(X_train_rus)
X_test_rus = scaler.transform(X_test_rus)

# Initialize models with early stopping where applicable
models = {
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42, n_estimators=100, validation_fraction=0.1, n_iter_no_change=10),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Neural Network': MLPClassifier(random_state=42, max_iter=1000, early_stopping=True, validation_fraction=0.1),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

# Function to train and evaluate models with progress tracking
def train_and_evaluate(model_name, model, X_train, y_train, X_test, y_test):
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print(f"{model_name} - F1 Score: {f1:.4f}, ROC AUC: {roc_auc:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}")
    return {model_name: {'F1 Score': f1, 'ROC AUC': roc_auc, 'Recall': recall, 'Accuracy': accuracy, 'Precision': precision}}

# Train and evaluate on oversampled data
print("Results on Oversampled Data (SMOTE):")
results_smote = {}
for model_name, model in tqdm(models.items()):
    result = train_and_evaluate(model_name, model, X_train_smote, y_train_smote, X_test_smote, y_test_smote)
    results_smote.update(result)

# Train and evaluate on undersampled data
print("\nResults on Undersampled Data (RUS):")
results_rus = {}
for model_name, model in tqdm(models.items()):
    result = train_and_evaluate(model_name, model, X_train_rus, y_train_rus, X_test_rus, y_test_rus)
    results_rus.update(result)

# Display the results
print("\nSMOTE (Oversampling) Results")
print(pd.DataFrame(results_smote).transpose())

print("\nRandom UnderSampler (Undersampling) Results")
print(pd.DataFrame(results_rus).transpose())


Results on Oversampled Data (SMOTE):


  0%|          | 0/6 [00:00<?, ?it/s]

Training Random Forest...


 17%|█▋        | 1/6 [01:37<08:08, 97.64s/it]

Random Forest - F1 Score: 0.9973, ROC AUC: 0.9973, Recall: 0.9980, Accuracy: 0.9973, Precision: 0.9966
Training Gradient Boosting...


 33%|███▎      | 2/6 [07:01<15:23, 230.77s/it]

Gradient Boosting - F1 Score: 0.9891, ROC AUC: 0.9890, Recall: 0.9913, Accuracy: 0.9890, Precision: 0.9869
Training Logistic Regression...


 50%|█████     | 3/6 [08:14<07:56, 158.88s/it]

Logistic Regression - F1 Score: 0.9956, ROC AUC: 0.9956, Recall: 0.9945, Accuracy: 0.9956, Precision: 0.9967
Training Neural Network...


 67%|██████▋   | 4/6 [10:55<05:18, 159.49s/it]

Neural Network - F1 Score: 0.9963, ROC AUC: 0.9963, Recall: 0.9964, Accuracy: 0.9963, Precision: 0.9963
Training Naive Bayes...


 83%|████████▎ | 5/6 [10:57<01:42, 102.70s/it]

Naive Bayes - F1 Score: 0.9754, ROC AUC: 0.9747, Recall: 1.0000, Accuracy: 0.9747, Precision: 0.9519
Training Decision Tree...


100%|██████████| 6/6 [11:13<00:00, 112.18s/it]


Decision Tree - F1 Score: 0.9949, ROC AUC: 0.9949, Recall: 0.9956, Accuracy: 0.9949, Precision: 0.9942

Results on Undersampled Data (RUS):


  0%|          | 0/6 [00:00<?, ?it/s]

Training Random Forest...


 17%|█▋        | 1/6 [00:00<00:03,  1.34it/s]

Random Forest - F1 Score: 0.9775, ROC AUC: 0.9773, Recall: 0.9875, Accuracy: 0.9773, Precision: 0.9678
Training Gradient Boosting...


 33%|███▎      | 2/6 [00:02<00:05,  1.31s/it]

Gradient Boosting - F1 Score: 0.9788, ROC AUC: 0.9785, Recall: 0.9921, Accuracy: 0.9785, Precision: 0.9658
Training Logistic Regression...


 50%|█████     | 3/6 [00:02<00:02,  1.22it/s]

Logistic Regression - F1 Score: 0.9792, ROC AUC: 0.9789, Recall: 0.9894, Accuracy: 0.9789, Precision: 0.9692
Training Neural Network...


100%|██████████| 6/6 [00:04<00:00,  1.45it/s]

Neural Network - F1 Score: 0.9768, ROC AUC: 0.9764, Recall: 0.9954, Accuracy: 0.9764, Precision: 0.9590
Training Naive Bayes...
Naive Bayes - F1 Score: 0.9760, ROC AUC: 0.9755, Recall: 0.9995, Accuracy: 0.9755, Precision: 0.9536
Training Decision Tree...
Decision Tree - F1 Score: 0.9720, ROC AUC: 0.9720, Recall: 0.9722, Accuracy: 0.9720, Precision: 0.9718

SMOTE (Oversampling) Results
                     F1 Score   ROC AUC    Recall  Accuracy  Precision
Random Forest        0.997270  0.997268  0.997986  0.997268   0.996555
Gradient Boosting    0.989062  0.989037  0.991273  0.989037   0.986860
Logistic Regression  0.995558  0.995563  0.994462  0.995563   0.996656
Neural Network       0.996326  0.996326  0.996374  0.996326   0.996278
Naive Bayes          0.975369  0.974747  1.000000  0.974747   0.951921
Decision Tree        0.994900  0.994896  0.995608  0.994896   0.994192

Random UnderSampler (Undersampling) Results
                     F1 Score   ROC AUC    Recall  Accuracy  Precision


