In [1]:
import sys
import os
import pandas as pd
import mlflow
import mlflow.sklearn

sys.path.append(os.path.abspath(os.path.join('..')))

from src.preprocess import load_data, clean_data, feature_engineering, merge_geolocation

# Modeling Imports
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, average_precision_score, roc_auc_score

# Imbalance Handling
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline 

# Setup Experiment Tracking (Optional but Professional)
mlflow.set_experiment("Fraud_Detection_Comparison")

2025/12/26 07:43:44 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/26 07:43:44 INFO mlflow.store.db.utils: Updating database tables
2025/12/26 07:43:44 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/26 07:43:44 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/26 07:43:45 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/26 07:43:45 INFO alembic.runtime.migration: Will assume non-transactional DDL.


<Experiment: artifact_location=('file:C:/Users/Eyu '
 'Birhanu/projects/kifiya-AI/fraud-detection/notebooks/mlruns/1'), creation_time=1766721672805, experiment_id='1', last_update_time=1766721672805, lifecycle_stage='active', name='Fraud_Detection_Comparison', tags={}>

In [2]:
# Load & Process
fraud_df = load_data('../data/raw/Fraud_Data.csv')
ip_df = load_data('../data/raw/IpAddress_to_Country.csv')
fraud_df = clean_data(fraud_df)
fraud_df = feature_engineering(fraud_df)
fraud_df = merge_geolocation(fraud_df, ip_df)

# Define X and y
X = fraud_df.drop(['class', 'user_id', 'signup_time', 'purchase_time', 'device_id', 
                   'ip_address', 'ip_address_int', 'lower_bound_ip_address', 
                   'upper_bound_ip_address'], axis=1)
y = fraud_df['class']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Preprocessing Pipeline
numerical_cols = ['purchase_value', 'age', 'time_since_signup', 'hour_of_day']
categorical_cols = ['source', 'browser', 'sex', 'country']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

2025-12-26 07:43:46,525 - INFO - Data loaded successfully from ../data/raw/Fraud_Data.csv. Shape: (151112, 11)
2025-12-26 07:43:46,682 - INFO - Data loaded successfully from ../data/raw/IpAddress_to_Country.csv. Shape: (138846, 3)
2025-12-26 07:43:47,731 - INFO - Starting Geolocation Merge...
2025-12-26 07:43:48,044 - INFO - Geolocation Merge Completed.


In [3]:
# Create Pipeline
lr_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

# Train
print("Training Baseline (Logistic Regression)...")
lr_pipeline.fit(X_train, y_train)

# Evaluate
y_pred_lr = lr_pipeline.predict(X_test)
print("--- Baseline Classification Report ---")
print(classification_report(y_test, y_pred_lr))

Training Baseline (Logistic Regression)...
--- Baseline Classification Report ---
              precision    recall  f1-score   support

           0       0.95      0.65      0.78     27393
           1       0.17      0.70      0.28      2830

    accuracy                           0.66     30223
   macro avg       0.56      0.68      0.53     30223
weighted avg       0.88      0.66      0.73     30223



In [4]:
# Define Base Pipeline
rf_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42, n_jobs=-1))
])

# Define Hyperparameters grid
param_grid = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5]
}

# Grid Search with Stratified K-Fold
print("Starting Grid Search for Random Forest...")
grid_search = GridSearchCV(rf_pipeline, param_grid, cv=3, scoring='f1', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
best_rf_model = grid_search.best_estimator_

Starting Grid Search for Random Forest...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best Parameters: {'classifier__max_depth': 10, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50}


In [5]:
xgb_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', XGBClassifier(eval_metric='logloss', random_state=42))
])

print("Training XGBoost...")
xgb_pipeline.fit(X_train, y_train)

Training XGBoost...


0,1,2
,steps,"[('preprocessor', ...), ('smote', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [6]:
results = []

models = {
    'Logistic Regression': lr_pipeline,
    'Random Forest (Tuned)': best_rf_model,
    'XGBoost': xgb_pipeline
}

for name, model in models.items():
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    
    res = {
        'Model': name,
        'Precision': average_precision_score(y_test, y_pred), # Simplification for dataframe
        'Recall': classification_report(y_test, y_pred, output_dict=True)['1']['recall'],
        'F1-Score': classification_report(y_test, y_pred, output_dict=True)['1']['f1-score'],
        'AUPRC': average_precision_score(y_test, y_prob)
    }
    results.append(res)

comparison_df = pd.DataFrame(results)
print(comparison_df)

# Save best model logic
best_model_name = comparison_df.sort_values(by='F1-Score', ascending=False).iloc[0]['Model']
print(f"\nBased on F1-Score and AUPRC, the best model is: {best_model_name}")

                   Model  Precision    Recall  F1-Score     AUPRC
0    Logistic Regression   0.148799  0.697527  0.276878  0.397142
1  Random Forest (Tuned)   0.555794  0.525088  0.682277  0.622066
2                XGBoost   0.547552  0.525088  0.678384  0.606181

Based on F1-Score and AUPRC, the best model is: Random Forest (Tuned)
