In [27]:
# Data handling and ML imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, make_scorer, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# MLflow and DagsHub imports
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import mlflow.lightgbm
import dagshub

import time

### Load Processed Data

# MLflow and DagsHub Setup

In [32]:
# Initialize DagsHub connection and MLflow tracking
dagshub.init(repo_owner="fasnis", repo_name="fiap-ds-mlops-10dtsr-creditscoring-grupo7", mlflow=True)

# Enable MLflow autologging
mlflow.autolog()

print("MLflow tracking initialized successfully")

2025/08/03 00:59:46 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2025/08/03 00:59:46 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/08/03 00:59:46 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2025/08/03 00:59:46 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2025/08/03 00:59:46 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/08/03 00:59:46 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2025/08/03 00:59:46 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2025/08/03 00:59:46 INFO mlflow.pyspark.ml: No SparkSession detected. Autologging will log pyspark.ml models contained in the default allowlist. To specify a custom allowlist, initialize a SparkSession prior to calling mlflow.pyspark.ml.autolog() and specify the path to your allowlist file via the spark.mlflow.pysparkml.autolog.logM

MLflow tracking initialized successfully


In [33]:
# Read the processed dataset from local file
df_processed = pd.read_csv('../data/processed/creditscore_data_processed.csv')
print("Dataset shape:", df_processed.shape)
print("\nDataset info:")
df_processed.info()

Dataset shape: (100000, 44)

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 44 columns):
 #   Column                                              Non-Null Count   Dtype  
---  ------                                              --------------   -----  
 0   ID                                                  100000 non-null  object 
 1   Customer_ID                                         100000 non-null  object 
 2   Age                                                 97224 non-null   float64
 3   Annual_Income                                       100000 non-null  float64
 4   Monthly_Inhand_Salary                               100000 non-null  float64
 5   Num_Bank_Accounts                                   100000 non-null  int64  
 6   Num_Credit_Card                                     100000 non-null  int64  
 7   Interest_Rate                                       100000 non-null  int64  
 8   Num_of_Loan           

# Machine Learning


Agora, após fazer uma análise exploratória dos dados e transformá-los conforme necessidade, podemos ir para os modelos de classificação.
Vamos trabalhar com três: RandomForest, XGBoost, LightGBM

In [35]:
# Prepare the data
X = df_processed.drop(columns=["Credit_Score"])

# Convert hexadecimal strings to decimal numbers if any exist
for col in X.columns:
    if X[col].dtype == 'object':
        try:
            # Try to convert hex strings to decimal numbers
            X[col] = X[col].apply(lambda x: float(int(str(x), 16)) if isinstance(x, str) and '0x' in str(x).lower() else x)
        except:
            pass  # If conversion fails, leave as is
            
# Convert remaining strings to numeric, coercing errors to NaN
X = X.apply(pd.to_numeric, errors='coerce')

# Fill NaN values with 0
X = X.fillna(0)

# Prepare target variable
y = df_processed["Credit_Score"]

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define models with their parameters
models = {
    "RandomForest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [None, 10],
            "min_samples_split": [2, 5]
        }
    },
    "XGBoost": {
        "model": XGBClassifier(eval_metric='mlogloss', random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [3, 6],
            "learning_rate": [0.1, 0.3]
        }
    },
    "LightGBM": {
        "model": LGBMClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "num_leaves": [31, 50],
            "learning_rate": [0.1, 0.3]
        }
    }
}

# Train and evaluate models
for name, config in models.items():
    print(f"\nTraining {name}...")
    
    with mlflow.start_run(run_name=name):
        # Log search parameters
        search_params = {}
        for param_name, param_values in config["params"].items():
            # Convert None to "None" string and handle other special cases
            param_values_str = [str(val) if val is not None else "None" for val in param_values]
            search_params[f"search_{param_name}"] = str(param_values_str)
        mlflow.log_params(search_params)
        
        # Start timer
        start_time = time.time()
        
        # Create and train GridSearchCV
        grid_search = GridSearchCV(
            config["model"], 
            config["params"], 
            cv=3, 
            scoring=make_scorer(accuracy_score),
            n_jobs=-1
        )
        grid_search.fit(X_train_scaled, y_train)
        
        # Calculate training time
        duration = time.time() - start_time
        mlflow.log_metric("training_time", duration)
        
        # Log best parameters
        best_params = {}
        for param_name, param_value in grid_search.best_params_.items():
            # Convert None to "None" string and handle other special cases
            best_params[f"best_{param_name}"] = str(param_value) if param_value is not None else "None"
        mlflow.log_params(best_params)
        
        # Evaluate and log the model
        evaluate_and_log_model(
            name,
            grid_search.best_estimator_,
            X_test_scaled,
            y_test,
            label_encoder
        )
        
        print(f"Training time: {duration:.2f} seconds")
        print(f"Best parameters: {grid_search.best_params_}")


Training RandomForest...




🏃 View run RandomForest at: https://dagshub.com/fasnis/fiap-ds-mlops-10dtsr-creditscoring-grupo7.mlflow/#/experiments/0/runs/a13ee505b22c465692bf84015c11e31a
🧪 View experiment at: https://dagshub.com/fasnis/fiap-ds-mlops-10dtsr-creditscoring-grupo7.mlflow/#/experiments/0


RestException: INTERNAL_ERROR: Response: {'error': 'unsupported endpoint, please contact support@dagshub.com'}

In [None]:
def evaluate_and_log_model(model_name, model, X_test, y_test, label_encoder):
    """
    Evaluate the model and log metrics to MLflow (metrics only, no model logging)
    """
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, target_names=label_encoder.classes_, output_dict=True)
    try:
        for label in label_encoder.classes_:
            mlflow.log_metric(f"{label}_precision", report[label]['precision'])
            mlflow.log_metric(f"{label}_recall", report[label]['recall'])
            mlflow.log_metric(f"{label}_f1", report[label]['f1-score'])
        mlflow.log_metric("accuracy", report['accuracy'])
        mlflow.log_metric("weighted_avg_f1", report['weighted avg']['f1-score'])
    except Exception as e:
        print(f"Warning: Error during metric logging: {str(e)}")
    print(f"\nClassification Report for {model_name}:")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Ao fazer uma análise do uso de cada modelo, temos que:

Random Forest
*   Accuracy: 0.79
*   Melhor performance geral (acurácia)

XGBoost:
*   Accuracy: 0.75

LightGBM:
*   Accuracy: 0.77

Se formos olhar para o recall, temos os seguintes resultados para cada um:

Random Forest:
*   Good: 0.69
*   Poor: 0.80
*   Standard: 0.82

XGBoost:
*   Good: 0.63
*   Poor: 0.73
*   Standard: 0.81

LightGBM:
*   Good: 0.65
*   Poor: 0.75
*   Standard: 0.81

Random Forest foi o mais equilibrado entre as três classes, seguido por LightGBM e XGBoost.

O tempo de processamento foi bem diferente, sendo o de random forest quase 6x o dos outros dois (9 minutos x 1 minuto e meio).

Um recall mais baixo para "Good" já era esperado, visto que temos a base desbalanceada, com:

 Credit_Score:
 *   Standard: 53174 (53.2%)
 *   Poor: 28998 (29.0%)
 *   Good: 17828 (17.8%)

Podemos como próximo passo aplicar um Random Forest forçando o algoritmo a compensar o desbalanceamento.

In [None]:
# 1. Create a copy of the dataframes
df_pd = df_processed.copy()

# 2. Trata strings que viram NaN no Pandas
df_pd.replace(["N/A", "NM", "na", "NaN", "-", ""], pd.NA, inplace=True)

# 3. Converte colunas para números (menos ID e target)
for col in df_pd.columns:
    if col not in ["ID", "Customer_ID", "Month", "Credit_Score"]:
        df_pd[col] = pd.to_numeric(df_pd[col], errors='coerce')

# 4. Remove nulos no target e preenche o resto com zero
df_pd = df_pd.dropna(subset=["Credit_Score"])
# Fill numerical columns with 0, leave 'Credit_Score' as is for encoding
numerical_cols_to_fill = df_pd.select_dtypes(include=["number"]).columns.tolist()
df_pd[numerical_cols_to_fill] = df_pd[numerical_cols_to_fill].fillna(0)

# 5. Seleciona colunas numéricas válidas e o target
X = df_pd.drop(columns=["Credit_Score", "ID", "Customer_ID"])
X = X.select_dtypes(include=["number"])
y = df_pd["Credit_Score"] # Keep target as is for now

# --- Start of added code ---
# Encode the target variable
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
# --- End of added code ---

# 6. Split e escalonamento (using encoded y)
# Use y_encoded for the split
X_train, X_val, y_train_encoded, y_val_encoded = train_test_split(X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 7. Modelos e seus grids (no change needed here)
models = {
    "RandomForest": {
        "model": RandomForestClassifier(class_weight='balanced', random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [None, 10],
            "min_samples_split": [2, 5]
        }
    }
}

# 8. GridSearch, avaliação e predição
for name, config in models.items():
    print(f"\nTreinando {name}...")
    # Cronômetro inicia
    start_time = time.time()

    gs = GridSearchCV(config["model"], config["params"], cv=3, scoring="accuracy", n_jobs=-1)
    # Fit using the encoded training labels
    gs.fit(X_train_scaled, y_train_encoded)

    end_time = time.time()
    duration = end_time - start_time
    print(f"Tempo de treinamento para {name}: {duration:.2f} segundos")

    print("Melhores parâmetros:", gs.best_params_)

    # Predict and evaluate using the encoded validation labels
    y_val_pred_encoded = gs.best_estimator_.predict(X_val_scaled)
    # For classification_report, you can use the encoded labels and target_names
    target_names = label_encoder.classes_ # Get original class names
    print("Relatório de classificação (validação):")
    print(classification_report(y_val_encoded, y_val_pred_encoded, target_names=target_names))


Treinando RandomForest...


2025/08/03 00:36:23 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a48d7884e8c3465cb38e9842e9c2d35a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run sedate-crow-212 at: https://dagshub.com/fasnis/fiap-ds-mlops-10dtsr-creditscoring-grupo7.mlflow/#/experiments/0/runs/a48d7884e8c3465cb38e9842e9c2d35a
🧪 View experiment at: https://dagshub.com/fasnis/fiap-ds-mlops-10dtsr-creditscoring-grupo7.mlflow/#/experiments/0




Tempo de treinamento para RandomForest: 127.23 segundos
Melhores parâmetros: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Relatório de classificação (validação):
              precision    recall  f1-score   support

        Good       0.75      0.74      0.75      3566
        Poor       0.78      0.83      0.81      5799
    Standard       0.82      0.80      0.81     10635

    accuracy                           0.80     20000
   macro avg       0.79      0.79      0.79     20000
weighted avg       0.80      0.80      0.80     20000

Relatório de classificação (validação):
              precision    recall  f1-score   support

        Good       0.75      0.74      0.75      3566
        Poor       0.78      0.83      0.81      5799
    Standard       0.82      0.80      0.81     10635

    accuracy                           0.80     20000
   macro avg       0.79      0.79      0.79     20000
weighted avg       0.80      0.80      0.80     20000



**Aplicação prática**:

O modelo pode ser utilizado pela QuantumFinance para:

*   Avaliar risco de crédito de novos clientes com base em histórico e comportamento financeiro
*   Segmentar clientes por perfil de risco (bom, padrão, ruim)
*   Apoiar decisões de concessão de crédito, limite de cartão ou taxas de juros personalizadas
*   Antecipar inadimplência, otimizando ações preventivas de cobrança

# Register Best Model
After evaluating all models, we'll register the best performing one in MLflow Model Registry.

In [19]:
# Get all completed runs from the current experiment
import time

try:
    # Get the current experiment
    current_experiment = mlflow.get_experiment_by_name("Default")
    if current_experiment is None:
        current_experiment = mlflow.get_experiment(0)  # Get the default experiment
    
    # Search only recent runs (last hour) to speed up the search
    current_time = int(time.time() * 1000)  # current time in milliseconds
    one_hour_ago = current_time - (60 * 60 * 1000)  # one hour ago in milliseconds
    
    filter_string = f"metrics.accuracy > 0 AND attributes.start_time > {one_hour_ago}"
    runs = mlflow.search_runs(
        experiment_ids=[current_experiment.experiment_id],
        filter_string=filter_string,
        order_by=["metrics.accuracy DESC"],
        max_results=10  # Limit to recent runs
    )
    
    if len(runs) > 0:
        # Get the run with highest accuracy
        best_run = runs.iloc[0]
        best_run_id = best_run.run_id
        best_accuracy = best_run["metrics.accuracy"]
        
        # Register the model
        model_name = "credit-score-classification-model"
        model_version = mlflow.register_model(f"runs:/{best_run_id}/model", model_name)
        
        print(f"Best model found with accuracy: {best_accuracy:.4f}")
        print(f"Model registered with name: {model_name}")
        print(f"Model version: {model_version.version}")
    else:
        print("No completed runs found with accuracy metrics. Please run the model training first.")
except Exception as e:
    print(f"Error during model registration: {str(e)}")
    print("Please ensure MLflow tracking server is accessible and try again.")

No completed runs found with accuracy metrics. Please run the model training first.
