In [21]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
import dagshub
import mlflow
import mlflow.sklearn

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [22]:
# Read both training tables, then attach the identity columns to the
# transactions. The left join keeps every transaction row, including those
# with no matching identity record.
train_transaction, train_identity = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ieee-fraud-detection/train_transaction.csv',
        '/kaggle/input/ieee-fraud-detection/train_identity.csv',
    )
)
df = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')

In [23]:
# Initialise the DagsHub integration for this repository with MLflow enabled,
# so the MLflow calls in later cells are tracked against it.
dagshub.init(
    repo_owner='gioeba',
    repo_name='IEEE-CIS-Fraud-Detection',
    mlflow=True,
)

# Cleaning

In [24]:
# Target vector.
y = df['isFraud']

# Feature matrix: drop the target plus the TransactionID and TransactionDT
# columns, then keep only the columns whose share of missing values is
# below 50%. The callable indexer filters the intermediate frame without
# binding a second full-size copy to a name.
X = (
    df
    .drop(columns=['isFraud', 'TransactionID', 'TransactionDT'])
    .loc[:, lambda frame: frame.isnull().mean() < 0.5]
)

# Column names grouped by dtype; these lists drive the preprocessing branches.
cat_cols = list(X.select_dtypes(include='object').columns)
num_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

In [25]:
# Categorical branch: fill gaps with the most frequent value, then map each
# category to an integer code. Categories not seen during fit become -1.
cat_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Numeric branch: median imputation followed by standard scaling.
num_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Route each column group through its branch (numeric first, then categorical).
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transform, num_cols),
    ('cat', cat_transform, cat_cols),
])

# Full model: preprocessing followed by a depth-limited, class-weighted tree.
clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(
        class_weight='balanced',
        max_depth=8,
        random_state=42,
    )),
])

In [26]:
mlflow.set_experiment("ieee-decision-tree")

# Read the depth from the pipeline itself so the run name and the logged
# parameter cannot drift from the model that is actually trained.
tree_depth = clf_pipeline.named_steps['classifier'].max_depth

with mlflow.start_run(run_name=f"tree-depth-{tree_depth}"):
    # Log the parameters first so they are recorded even if a fold fails.
    mlflow.log_param("model", "DecisionTree")
    mlflow.log_param("max_depth", tree_depth)

    # 5-fold stratified cross-validation, scored by ROC AUC on each
    # held-out fold.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # fit() refits every step from scratch, so folds do not share state.
        clf_pipeline.fit(X_train, y_train)
        y_prob = clf_pipeline.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_prob)
        auc_scores.append(auc)

        print(f"Fold {fold} AUC: {auc:.4f}")
        mlflow.log_metric(f"fold_{fold}_auc", auc)

    avg_auc = np.mean(auc_scores)
    mlflow.log_metric("avg_roc_auc", avg_auc)

    # After the loop, clf_pipeline has only seen the last fold's training
    # split. Fit a fresh clone on all rows for the logged artifact, and leave
    # clf_pipeline exactly as the loop left it for any later cells.
    final_model = clone(clf_pipeline).fit(X, y)
    mlflow.sklearn.log_model(final_model, "model")

    print(f"\nAverage AUC across folds: {avg_auc:.4f}")

Fold 1 AUC: 0.8458
Fold 2 AUC: 0.8484
Fold 3 AUC: 0.8417
Fold 4 AUC: 0.8439
Fold 5 AUC: 0.8452





Average AUC across folds: 0.8450
🏃 View run tree-depth-6 at: https://dagshub.com/gioeba/IEEE-CIS-Fraud-Detection.mlflow/#/experiments/0/runs/1fad5eea5a1243b2bd1315881258638e
🧪 View experiment at: https://dagshub.com/gioeba/IEEE-CIS-Fraud-Detection.mlflow/#/experiments/0


# Training