In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [2]:
df = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')

In [3]:
%pip install mlflow
%pip install dagshub

Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.0->mlflow)
  Downloading databricks_sdk-0.50.0-py3-none-any.whl.metadata (38 kB)
Collecting fastapi<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Collecting graphql-relay<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc, roc_curve, roc_auc_score
from sklearn.feature_selection import RFE
import xgboost as xgb
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import dagshub
import warnings
import time
from scipy import stats
import category_encoders as ce

In [5]:
fraud = df['isFraud']
x = df.drop('isFraud', axis=1)

In [6]:
x_temp, x_test, y_temp, y_test = train_test_split(
    x, fraud, test_size=0.2, random_state=42, stratify=fraud
)

x_train, x_val, y_train, y_val = train_test_split(
    x_temp, y_temp, test_size=0.125, random_state=42, stratify=y_temp
)

In [7]:
class XGBoost_PrepData(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fill_with_mode(self, tmp, columns):
        for col in columns:
            mode_val = tmp[col].mode()[0]
            tmp[col] = tmp[col].fillna(mode_val)
        return tmp  

    def fill_with_median(self, tmp, columns):
        for col in columns:
            median_val = tmp[col].median()
            tmp[col] = tmp[col].fillna(median_val)
        return tmp  

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        if 'TransactionID' in X.columns:
            X = X.drop('TransactionID', axis=1)

        nullFraction = (X.isnull().sum() / len(X))
        removeCols = nullFraction[nullFraction >= 0.7].index
        X = X.drop(removeCols, axis=1)

        cat_col = X.select_dtypes(include=['object']).columns
        num_col = X.select_dtypes(include=['float64', 'int64']).columns

        X = self.fill_with_mode(X, cat_col)
        X = self.fill_with_median(X, num_col)

        return X

In [8]:
class XGBoost_FeatureEngineering(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.woe_enc = None
        self.cat_col = None

    def fit(self, X, y):
        self.cat_col = X.select_dtypes(include=['object']).columns
        self.woe_enc = ce.WOEEncoder(cols=self.cat_col)
        self.woe_enc.fit(X, y)
        return self

    def transform(self, X):
        return self.woe_enc.transform(X)

In [9]:
class XGBoost_CorrelationSelector(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.05):
        self.threshold = threshold
        self.selected_features = None

    def fit(self, X, y):
        correlation_with_target = pd.DataFrame()
        for col in X.columns:
            correlation = np.abs(X[col].corr(y))
            correlation_with_target = pd.concat([
                correlation_with_target,
                pd.DataFrame({'Feature': [col], 'Correlation': [correlation]})
            ], ignore_index=True)
        
        correlation_with_target = correlation_with_target.sort_values('Correlation', ascending=False)
        self.selected_features = correlation_with_target[correlation_with_target['Correlation'] > self.threshold]['Feature'].tolist()
        return self

    def transform(self, X):
        return X[self.selected_features]

In [13]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="auc"
)

pipeline_xgb = Pipeline([
    ('prep', XGBoost_PrepData()),
    ('feature_eng', XGBoost_FeatureEngineering()),
    ('feature_select', XGBoost_CorrelationSelector()),
    ('classifier', xgb_model)
])

pipeline_xgb.fit(x_train, y_train)

y_train_pred_proba = pipeline_xgb.predict_proba(x_train)[:, 1]
y_val_pred_proba = pipeline_xgb.predict_proba(x_val)[:, 1]

train_auc = roc_auc_score(y_train, y_train_pred_proba)
val_auc = roc_auc_score(y_val, y_val_pred_proba)

print(f"Train AUC: {train_auc:.4f}")
print(f"Validation AUC: {val_auc:.4f}")


Train AUC: 0.8700
Validation AUC: 0.8633


In [12]:
y_pred_proba = pipeline_xgb.predict_proba(x_test)[:, 1]

test_auc = roc_auc_score(y_test, y_pred_proba)

print(f"Test ROC AUC without GridSearch: {test_auc:.4f}")

Test ROC AUC without GridSearch: 0.8579


In [11]:
import dagshub
import mlflow
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

dagshub.init(repo_owner='goguaD', repo_name='Fraud-Detection', mlflow=True)
mlflow.set_experiment("XGBoost_Training")

xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="auc"
)

pipeline_xgb = Pipeline([
    ('prep', XGBoost_PrepData()),
    ('feature_eng', XGBoost_FeatureEngineering()),
    ('feature_select', XGBoost_CorrelationSelector()),
    ('classifier', xgb_model)
])

with mlflow.start_run(run_name="XGBoost_Feature_Selection"):
    pipeline_xgb.fit(x_train, y_train)

    y_test_pred_proba = pipeline_xgb.predict_proba(x_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_test_pred_proba)

    mlflow.log_param("model_type", "XGBoost")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("max_depth", 6)
    mlflow.log_param("learning_rate", 0.1)

    mlflow.log_metric("Test ROC AUC", test_auc)

    mlflow.sklearn.log_model(pipeline_xgb, artifact_path="pipeline_model")

print(f"Logged Test AUC: {test_auc:.4f}")


Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=4b3e6e2e-fd00-4412-ac16-315277ff3de2&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=2fd4b27314a62cdf4e3047bf05cf26c498ed894a065aaa6616fea6b5f260cf5f




2025/04/27 11:43:04 INFO mlflow.tracking.fluent: Experiment with name 'XGBoost_Training' does not exist. Creating a new experiment.


🏃 View run XGBoost_Feature_Selection at: https://dagshub.com/goguaD/Fraud-Detection.mlflow/#/experiments/0/runs/7d5ca64b738e432ba01d8523d70f4b6a
🧪 View experiment at: https://dagshub.com/goguaD/Fraud-Detection.mlflow/#/experiments/0
Logged Test AUC: 0.8579
