In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import dagshub
import mlflow
import mlflow.sklearn

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [3]:
# Read the two training CSVs (transaction table and identity table) into
# DataFrames; they are joined on TransactionID in a later cell.
transaction_csv = '/kaggle/input/ieee-fraud-detection/train_transaction.csv'
identity_csv = '/kaggle/input/ieee-fraud-detection/train_identity.csv'

train_transaction = pd.read_csv(transaction_csv)
train_identity = pd.read_csv(identity_csv)

In [4]:
# Connect this session to the DagsHub repository. With mlflow=True this
# presumably routes MLflow tracking to that repo -- TODO confirm against the
# dagshub client docs. The call may prompt for browser-based authorization.
dagshub.init(
    repo_name='IEEE-CIS-Fraud-Detection',
    repo_owner='gioeba',
    mlflow=True,
)



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=1be951bc-2359-4601-b441-9587cca9b200&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=c71513b74e773c41f0f82ddce8bd25c30e16876abcd8262932f0ad4748ebb14c




Output()

# Cleaning

In [5]:
# Left-join identity data onto transactions: every transaction row is kept and
# identity columns are missing (NaN) where no matching TransactionID exists.
df = pd.merge(
    train_transaction,
    train_identity,
    how='left',
    on='TransactionID',
)

# Target vector, and feature matrix without the target / row identifier.
non_feature_columns = ['isFraud', 'TransactionID']
y = df['isFraud']
X = df.drop(columns=non_feature_columns).copy()

# Feature Engineering

In [6]:
# URI of the feature-engineering model logged under an earlier MLflow run;
# it is loaded as a scikit-learn object and used as the first pipeline step.
logged_fe = 'runs:/2251e67e9c084b99b4e492e146e576a0/fe'
feature_engineering_model = mlflow.sklearn.load_model(model_uri=logged_fe)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

# Feature Selection

In [7]:
# URI of the preprocessor logged under an earlier MLflow run; it is loaded as
# a scikit-learn object and used as the second pipeline step.
logged_preprocessor = 'runs:/de686b00b14a4c7aac22431740c0ebb5/pre-processor'
dynamic_preprocessor = mlflow.sklearn.load_model(model_uri=logged_preprocessor)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [8]:
# Final estimator: gradient-boosted trees with a fixed seed for repeatability.
gb_classifier = GradientBoostingClassifier(
    random_state=42,
    subsample=0.8,
    max_depth=8,
    learning_rate=0.5,
    n_estimators=50,
)

# End-to-end pipeline: loaded feature engineering -> loaded preprocessing ->
# classifier. Step names are referenced by name elsewhere, so keep them stable.
clf_pipeline = Pipeline(steps=[
    ('feature_engineering', feature_engineering_model),
    ('preprocessing', dynamic_preprocessor),
    ('classifier', gb_classifier),
])

# Training

In [None]:
# 5-fold stratified cross-validation of `clf_pipeline`, tracked as one MLflow
# run: hyperparameters, per-fold ROC AUC, and the mean ROC AUC are logged.
mlflow.set_experiment("ieee-gradient-boosting")
with mlflow.start_run(run_name="gb-depth-8-n50"):
    # Log hyperparameters BEFORE the long cross-validation loop so that a run
    # which fails or is interrupted mid-training still records its
    # configuration. The values are read from the pipeline's classifier step
    # rather than re-typed, so they cannot drift from what is actually fitted.
    classifier_params = clf_pipeline.named_steps["classifier"].get_params()
    mlflow.log_params({
        param_name: classifier_params[param_name]
        for param_name in ("n_estimators", "learning_rate", "max_depth", "subsample")
    })

    # Stratification keeps the isFraud class ratio similar in every fold.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Pipeline.fit refits every step on this fold's training split only.
        clf_pipeline.fit(X_train, y_train)

        # Column 1 of predict_proba is the probability of the positive class.
        y_prob = clf_pipeline.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_prob)
        auc_scores.append(auc)

        print(f"Fold {fold} AUC: {auc:.4f}")
        mlflow.log_metric(f"fold_{fold}_auc", auc)

    # Cast to a plain Python float before handing the value to MLflow.
    avg_auc = float(np.mean(auc_scores))
    print(f"\nAverage AUC: {avg_auc:.4f}")
    mlflow.log_metric("avg_auc", avg_auc)

Traceback (most recent call last):
  File "/tmp/ipykernel_31/2101976195.py", line 10, in <cell line: 0>
    clf_pipeline.fit(X_train, y_train)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/ensemble/_gb.py", line 538, in fit
    n_stages = self._fit_stages(
               ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/ensemble/_gb.py", line 615, in _fit_stages
    raw_predictions = self._fit_stage(
                      ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/ensemble/_gb.py", line 257, in _fit_stage
    tree.fit(X, residual, sample_weight=sample_weight, check_input=False)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/tree/_classes.py", line 1247, in fit
    super().fit(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/tree/_classes.py", line 379, in fit
 