In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..')))

from src.preprocess import load_data, clean_data, feature_engineering, merge_geolocation

# Modeling Imports
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, average_precision_score

# Imbalance Handling
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline 

print("Libraries Imported")

Libraries Imported


In [2]:
# Re-load and process if variables aren't in memory
fraud_df = load_data('../data/raw/Fraud_Data.csv')
ip_df = load_data('../data/raw/IpAddress_to_Country.csv')
fraud_df = clean_data(fraud_df)
fraud_df = feature_engineering(fraud_df)
fraud_df = merge_geolocation(fraud_df, ip_df)

# Define Features
X = fraud_df.drop(['class', 'user_id', 'signup_time', 'purchase_time', 'device_id', 
                   'ip_address', 'ip_address_int', 'lower_bound_ip_address', 
                   'upper_bound_ip_address'], axis=1)
y = fraud_df['class']

# Define Categorical vs Numerical
categorical_cols = ['source', 'browser', 'sex', 'country']
numerical_cols = ['purchase_value', 'age', 'time_since_signup', 'hour_of_day']

print(f"Features shape: {X.shape}")

2025-12-26 06:19:05,150 - INFO - Data loaded successfully from ../data/raw/Fraud_Data.csv. Shape: (151112, 11)
2025-12-26 06:19:05,365 - INFO - Data loaded successfully from ../data/raw/IpAddress_to_Country.csv. Shape: (138846, 3)
2025-12-26 06:19:06,897 - INFO - Starting Geolocation Merge...
2025-12-26 06:19:07,127 - INFO - Geolocation Merge Completed.


Features shape: (151112, 9)


In [3]:
# 1. Split Data (Stratified because of imbalance)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 2. Create Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# 3. Create Model Pipeline with SMOTE
# SMOTE is used inside the pipeline so it only upsamples the Training Data, not Test Data!
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1))
])

# 4. Train
print("Training Random Forest...")
pipeline.fit(X_train, y_train)
print("Training Complete.")

Training Random Forest...
Training Complete.


In [4]:
# Predict
y_pred = pipeline.predict(X_test)

# Report
print("--- Classification Report ---")
print(classification_report(y_test, y_pred))

# AUPRC (Better for imbalance than Accuracy)
# Note: Need to encode y_test/pred if using simple sklearn metric, 
# but for binary 0/1 it works directly.
from sklearn.metrics import precision_recall_curve, auc
precision, recall, _ = precision_recall_curve(y_test, y_pred)
auprc = auc(recall, precision)
print(f"Area Under Precision-Recall Curve (AUPRC): {auprc:.4f}")

--- Classification Report ---
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     27393
           1       0.82      0.53      0.64      2830

    accuracy                           0.95     30223
   macro avg       0.89      0.76      0.81     30223
weighted avg       0.94      0.95      0.94     30223

Area Under Precision-Recall Curve (AUPRC): 0.6967
