In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import xgboost as xgb

In [None]:
# Read the transaction and identity training tables from the competition input folder.
df_train_tr = pd.read_csv("/kaggle/input/ieee-fraud-detection/train_transaction.csv")
df_train_id = pd.read_csv("/kaggle/input/ieee-fraud-detection/train_identity.csv")

# Left join on TransactionID: every transaction row is kept, and the identity
# columns are left missing where no identity row matches.
df_train = df_train_tr.merge(df_train_id, how='left', on='TransactionID')

**Load data and merge**

In [None]:
# Replace every missing value with the sentinel -999; the filled frame stays
# available under the same name.
df_train = df_train.fillna(-999)

# Target vector.
y = df_train['isFraud']

# Feature matrix: all columns except the target and the TransactionID key.
X = df_train.drop(columns=['isFraud', 'TransactionID'])

In [None]:
# Partition the feature names by dtype: object-typed columns are handled as
# categorical, all other columns as numerical.
categorical_cols = list(X.select_dtypes(include='object').columns)
numerical_cols = list(X.select_dtypes(exclude='object').columns)

# Integer-encode categories; a category that was not seen during fit is encoded as -1.
categorical_transformer = OrdinalEncoder(
    unknown_value=-1,
    handle_unknown='use_encoded_value',
)

In [None]:
# Apply the ordinal encoder to the categorical columns only; every other
# column is passed through to the model untouched.
preprocessor = ColumnTransformer(
    [('cat', categorical_transformer, categorical_cols)],
    remainder='passthrough',
)

In [None]:
# Hold out 20% of the rows for validation. Stratifying on the target keeps the
# class proportions equal in both parts; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [None]:
# Cast every categorical column to str in both splits so each of those columns
# holds a single value type (any numeric fill value becomes its string form).
categorical_as_str = dict.fromkeys(categorical_cols, str)
X_train = X_train.astype(categorical_as_str)
X_val = X_val.astype(categorical_as_str)

In [None]:
!pip install dagshub mlflow

import dagshub
import mlflow

In [None]:
import dagshub
# Initialise DagsHub for the eghib22/Fraud_Detection repository with its
# MLflow integration switched on.
dagshub.init(
    repo_owner='eghib22',
    repo_name='Fraud_Detection',
    mlflow=True,
)


In [None]:
# --- Model definition -------------------------------------------------------
model_name = "AdaBoost"
model = AdaBoostClassifier(random_state=42, n_estimators=50)
print(f"\nTraining {model_name}...")

# --- Fit --------------------------------------------------------------------
# Preprocessing and the classifier are chained so they are fitted (and later
# logged) as one object.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model),
]).fit(X_train, y_train)

# --- Validate ---------------------------------------------------------------
# Column 1 of predict_proba is used as the score passed to ROC AUC.
y_val_probs = clf.predict_proba(X_val)[:, 1]
roc_auc = roc_auc_score(y_val, y_val_probs)
print(f"{model_name} Validation AUC-ROC:", roc_auc)

# --- Track ------------------------------------------------------------------
# One MLflow run per model: tag it with the model name, then record the
# classifier's hyper-parameters, the validation score and the fitted pipeline.
with mlflow.start_run(run_name=model_name):
    mlflow.set_tag("model", model_name)
    mlflow.log_params(model.get_params())
    mlflow.log_metric('val_roc_auc', roc_auc)
    mlflow.sklearn.log_model(clf, model_name)