In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import mlflow
import mlflow.sklearn

In [2]:
import os
import sys

In [3]:
sys.path.append(os.path.abspath(os.path.join('..')))

In [4]:
df = pd.read_csv('../data/processed/model_ready.csv')


In [7]:
df.columns.tolist()

['TransactionId',
 'BatchId',
 'CurrencyCode',
 'CountryCode',
 'Amount',
 'Value',
 'TransactionStartTime',
 'PricingStrategy',
 'year',
 'month',
 'day',
 'hour',
 'total_transaction_amount',
 'average_transaction_amount',
 'transaction_count',
 'std_transaction_amount',
 'AccountId_AccountId_1',
 'AccountId_AccountId_10',
 'AccountId_AccountId_100',
 'AccountId_AccountId_1000',
 'AccountId_AccountId_1002',
 'AccountId_AccountId_1004',
 'AccountId_AccountId_1005',
 'AccountId_AccountId_1006',
 'AccountId_AccountId_1007',
 'AccountId_AccountId_1008',
 'AccountId_AccountId_1009',
 'AccountId_AccountId_101',
 'AccountId_AccountId_1010',
 'AccountId_AccountId_1011',
 'AccountId_AccountId_1012',
 'AccountId_AccountId_1014',
 'AccountId_AccountId_1015',
 'AccountId_AccountId_1016',
 'AccountId_AccountId_1017',
 'AccountId_AccountId_1018',
 'AccountId_AccountId_102',
 'AccountId_AccountId_1020',
 'AccountId_AccountId_1021',
 'AccountId_AccountId_1022',
 'AccountId_AccountId_1023',
 'Account

In [5]:
# Load processed data
# df = pd.read_csv('../data/processed/model_ready.csv')

# # Drop columns with very high cardinality (e.g., >100 unique values)
# high_card_cols = [col for col in df.columns if df[col].nunique() > 100 and col not in ['is_high_risk', 'CustomerId']]
# df = df.drop(columns=high_card_cols)

# X = df.drop(['is_high_risk', 'CustomerId'], axis=1)
# y = df['is_high_risk']

# # Split data
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y
)

SyntaxError: unmatched ')' (2678184706.py, line 14)

In [None]:
df.columns.to_list()

NameError: name 'df' is not defined

#### Models

In [None]:
models = {
    "logreg": (
        LogisticRegression(max_iter=1000, random_state=42),
        {"C": [0.01, 0.1, 1, 10]}
    ),
    "decision_tree": (
        DecisionTreeClassifier(random_state=42),
        {"max_depth": [3, 5, 10, None], "min_samples_split": [2, 5, 10]}
    ),
    "random_forest": (
        RandomForestClassifier(random_state=42),
        {"n_estimators": [50, 100], "max_depth": [5, 10, None]}
    )
}

#### MLflow Experiment Tracking and Model Training

In [None]:
mlflow.set_experiment("credit-risk-model")

best_score = 0
best_model = None
best_name = ""

for name, (model, params) in models.items():
    with mlflow.start_run(run_name=name):
        clf = GridSearchCV(model, params, cv=3, scoring='roc_auc', n_jobs=-1)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        y_proba = clf.predict_proba(X_test)[:, 1]

        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_proba)

        mlflow.log_params(clf.best_params_)
        mlflow.log_metrics({
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "f1_score": f1,
            "roc_auc": roc_auc
        })
        mlflow.sklearn.log_model(clf.best_estimator_, "model")

        print(f"{name} ROC-AUC: {roc_auc:.4f}")

        if roc_auc > best_score:
            best_score = roc_auc
            best_model = clf.best_estimator_
            best_name = name

#### Register the Best Model

In [None]:
with mlflow.start_run(run_name="register_best_model"):
    mlflow.sklearn.log_model(best_model, "best_model")
    mlflow.register_model(
        model_uri=f"runs:/{mlflow.active_run().info.run_id}/best_model",
        name="CreditRiskBestModel"
    )
print(f"Best model: {best_name} with ROC-AUC: {best_score:.4f}")