In [21]:
# Data handling and ML imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, make_scorer, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# MLflow and DagsHub imports
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import dagshub

import time

# MLflow and DagsHub Setup

In [22]:
# Connect to the DagsHub repository and let it configure MLflow tracking
DAGSHUB_REPO_OWNER = "fasnis"
DAGSHUB_REPO_NAME = "fiap-ds-mlops-10dtsr-creditscoring-grupo7"

dagshub.init(
    repo_owner=DAGSHUB_REPO_OWNER,
    repo_name=DAGSHUB_REPO_NAME,
    mlflow=True,
)

# Switch on MLflow autologging for the supported ML libraries
mlflow.autolog()

print("MLflow tracking initialized successfully")

2025/08/03 18:57:19 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2025/08/03 18:57:19 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/08/03 18:57:19 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


MLflow tracking initialized successfully


In [23]:
# Load the already-processed credit score dataset from the local data folder
PROCESSED_DATA_PATH = '../data/processed/creditscore_data_processed.csv'
df_processed = pd.read_csv(PROCESSED_DATA_PATH)

# Sanity check: overall dimensions first, then per-column dtypes and non-null counts
print("Dataset shape:", df_processed.shape)
print("\nDataset info:")
df_processed.info()

Dataset shape: (100000, 44)

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 44 columns):
 #   Column                                              Non-Null Count   Dtype  
---  ------                                              --------------   -----  
 0   ID                                                  100000 non-null  object 
 1   Customer_ID                                         100000 non-null  object 
 2   Age                                                 97224 non-null   float64
 3   Annual_Income                                       100000 non-null  float64
 4   Monthly_Inhand_Salary                               100000 non-null  float64
 5   Num_Bank_Accounts                                   100000 non-null  int64  
 6   Num_Credit_Card                                     100000 non-null  int64  
 7   Interest_Rate                                       100000 non-null  int64  
 8   Num_of_Loan           

# Machine Learning


In [24]:
def evaluate_and_log_model(model_name, model, X_test, y_test, label_encoder):
    """
    Evaluate a fitted classifier on the test set and log its metrics to MLflow.

    Only metrics are logged by this function; it does not log the model itself.

    Parameters
    ----------
    model_name : str
        Name shown in the header of the printed classification report.
    model : object
        Fitted estimator exposing ``predict``.
    X_test : array-like
        Test features passed to ``model.predict``.
    y_test : array-like
        True (encoded) labels compared against the predictions.
    label_encoder : object
        Fitted encoder whose ``classes_`` provide the class names used as
        report rows and as metric-name prefixes.

    Returns
    -------
    None
        Metrics go to the active MLflow run and the text report is printed.

    Notes
    -----
    Logged metrics: ``<class>_precision``, ``<class>_recall`` and ``<class>_f1``
    for every class, plus ``accuracy`` and ``weighted_avg_f1``. Logging is
    best-effort: any failure is printed as a warning and does not interrupt
    the caller.
    """
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, target_names=label_encoder.classes_, output_dict=True)
    try:
        # Collect everything first and send it in ONE call: a single request to
        # the tracking server instead of one per metric, and no half-logged
        # metric set if a request fails midway.
        metrics = {}
        for label in label_encoder.classes_:
            metrics[f"{label}_precision"] = report[label]['precision']
            metrics[f"{label}_recall"] = report[label]['recall']
            metrics[f"{label}_f1"] = report[label]['f1-score']
        metrics["accuracy"] = report['accuracy']
        metrics["weighted_avg_f1"] = report['weighted avg']['f1-score']
        mlflow.log_metrics(metrics)
    except Exception as e:
        # Deliberately broad: a tracking problem must not abort the training loop.
        print(f"Warning: Error during metric logging: {str(e)}")
    print(f"\nClassification Report for {model_name}:")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

In [25]:
# Prepare the data: every column except the target is used as a feature
X = df_processed.drop(columns=["Credit_Score"])

# NOTE(review): the identifier columns listed by df_processed.info() (ID,
# Customer_ID) are still part of X at this point and therefore become model
# inputs. Confirm this is intended; identifiers usually carry no generalisable
# signal and can leak information between train and test rows.

# Convert hexadecimal strings to decimal numbers if any exist.
# The conversion is all-or-nothing per column: if a single value containing
# '0x' is not a valid hex literal, the whole column is left unchanged.
for col in X.columns:
    if X[col].dtype == 'object':
        try:
            X[col] = X[col].apply(
                lambda x: float(int(x, 16)) if isinstance(x, str) and '0x' in x.lower() else x
            )
        except (ValueError, OverflowError):
            # Only the expected parsing failures are tolerated; anything else
            # (including KeyboardInterrupt) now propagates instead of being hidden.
            pass

# Convert remaining strings to numeric, coercing errors to NaN
X_numeric = X.apply(pd.to_numeric, errors='coerce')

# Make the coercion visible: these values are about to be replaced by 0,
# which would otherwise silently turn a text column into a constant feature.
coerced_counts = X_numeric.isna().sum() - X.isna().sum()
coerced_counts = coerced_counts[coerced_counts > 0]
if not coerced_counts.empty:
    print("Warning: non-numeric values were coerced to NaN and will be filled with 0:")
    print(coerced_counts.to_string())

# Fill NaN values with 0
X = X_numeric.fillna(0)

# Prepare target variable
y = df_processed["Credit_Score"]

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data (stratified so class proportions match in both parts)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)

# Scale the features (statistics are learned on the training part only)
# NOTE(review): the fitted scaler is not explicitly saved or logged in this
# cell, while the models are trained on scaled features. Confirm that whatever
# serves the registered model applies the same scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define models with their hyper-parameter grids
models = {
    "RandomForest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [None, 10],
            "min_samples_split": [2, 5]
        }
    },
    "XGBoost": {
        "model": XGBClassifier(eval_metric='mlogloss', random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [3, 6],
            "learning_rate": [0.1, 0.3]
        }
    }
}

# Train and evaluate models, one MLflow run per model family
for name, config in models.items():
    print(f"\nTraining {name}...")

    with mlflow.start_run(run_name=name):
        # Log the search space; str() already renders None as "None"
        search_params = {
            f"search_{param_name}": str([str(val) for val in param_values])
            for param_name, param_values in config["params"].items()
        }
        mlflow.log_params(search_params)

        # Start timer (perf_counter is monotonic, so the duration cannot be
        # distorted by system clock adjustments)
        start_time = time.perf_counter()

        # Create and train GridSearchCV
        grid_search = GridSearchCV(
            config["model"],
            config["params"],
            cv=3,
            scoring="accuracy",  # built-in scorer, equivalent to make_scorer(accuracy_score)
            n_jobs=-1
        )
        grid_search.fit(X_train_scaled, y_train)

        # Calculate training time (covers the whole grid search, in seconds)
        duration = time.perf_counter() - start_time
        mlflow.log_metric("training_time", duration)

        # Log best parameters as strings
        best_params = {
            f"best_{param_name}": str(param_value)
            for param_name, param_value in grid_search.best_params_.items()
        }
        mlflow.log_params(best_params)

        # Evaluate the refitted best estimator on the held-out test set
        evaluate_and_log_model(
            name,
            grid_search.best_estimator_,
            X_test_scaled,
            y_test,
            label_encoder
        )

        print(f"Training time: {duration:.2f} seconds")
        print(f"Best parameters: {grid_search.best_params_}")


Training RandomForest...


Python(81975) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(82000) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
2025/08/03 19:01:01 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.



Classification Report for RandomForest:
              precision    recall  f1-score   support

        Good       0.75      0.73      0.74      3566
        Poor       0.78      0.80      0.79      5799
    Standard       0.82      0.81      0.81     10635

    accuracy                           0.79     20000
   macro avg       0.78      0.78      0.78     20000
weighted avg       0.79      0.79      0.79     20000

Training time: 222.10 seconds
Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
🏃 View run RandomForest at: https://dagshub.com/fasnis/fiap-ds-mlops-10dtsr-creditscoring-grupo7.mlflow/#/experiments/0/runs/39bfd1efd65a48a2ae53688f5a70527c
🧪 View experiment at: https://dagshub.com/fasnis/fiap-ds-mlops-10dtsr-creditscoring-grupo7.mlflow/#/experiments/0

Training XGBoost...


Python(82065) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(82070) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
2025/08/03 19:01:59 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.



Classification Report for XGBoost:
              precision    recall  f1-score   support

        Good       0.73      0.74      0.73      3566
        Poor       0.78      0.78      0.78      5799
    Standard       0.81      0.81      0.81     10635

    accuracy                           0.79     20000
   macro avg       0.77      0.77      0.77     20000
weighted avg       0.79      0.79      0.79     20000

Training time: 44.37 seconds
Best parameters: {'learning_rate': 0.3, 'max_depth': 6, 'n_estimators': 200}
🏃 View run XGBoost at: https://dagshub.com/fasnis/fiap-ds-mlops-10dtsr-creditscoring-grupo7.mlflow/#/experiments/0/runs/66bee58e67ee46e6b43c533b8168f130
🧪 View experiment at: https://dagshub.com/fasnis/fiap-ds-mlops-10dtsr-creditscoring-grupo7.mlflow/#/experiments/0


# Register Best Model
After evaluating all models, we register the best-performing one (highest test accuracy) in the MLflow Model Registry.

In [26]:
# Find the best finished run from the last hour and register its model.
# `time` and `mlflow` come from the import cell at the top of the notebook.

try:
    # Get the current experiment
    current_experiment = mlflow.get_experiment_by_name("Default")
    if current_experiment is None:
        # Fall back to the default experiment; experiment IDs are strings in the MLflow API
        current_experiment = mlflow.get_experiment("0")

    # Search only recent runs (last hour) to speed up the search.
    # MLflow stores start_time as milliseconds since the epoch.
    current_time = int(time.time() * 1000)
    one_hour_ago = current_time - (60 * 60 * 1000)

    # Only runs that actually completed are candidates: a failed or still
    # running run may have logged an accuracy without a usable model artifact.
    filter_string = (
        "metrics.accuracy > 0 "
        "AND attributes.status = 'FINISHED' "
        f"AND attributes.start_time > {one_hour_ago}"
    )
    runs = mlflow.search_runs(
        experiment_ids=[current_experiment.experiment_id],
        filter_string=filter_string,
        order_by=["metrics.accuracy DESC"],
        max_results=10  # Limit to recent runs
    )

    if not runs.empty:
        # Results are sorted by accuracy (descending), so the first row is the best run
        best_run = runs.iloc[0]
        best_run_id = best_run["run_id"]
        best_accuracy = best_run["metrics.accuracy"]

        # Register the model stored under the run's "model" artifact path
        model_name = "credit-score-classification-model"
        model_version = mlflow.register_model(f"runs:/{best_run_id}/model", model_name)

        print(f"Best model found with accuracy: {best_accuracy:.4f}")
        print(f"Model registered with name: {model_name}")
        print(f"Model version: {model_version.version}")
    else:
        print("No completed runs found with accuracy metrics. Please run the model training first.")
except Exception as e:
    # Best-effort: report the problem instead of stopping the notebook.
    # NOTE(review): when this branch is hit no new version is registered, so a
    # later "download latest version" step would fetch an older model.
    print(f"Error during model registration: {str(e)}")
    print("Please ensure MLflow tracking server is accessible and try again.")

Registered model 'credit-score-classification-model' already exists. Creating a new version of this model...
2025/08/03 19:02:46 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: credit-score-classification-model, version 2


Best model found with accuracy: 0.7941
Model registered with name: credit-score-classification-model
Model version: 2


Created version '2' of model 'credit-score-classification-model'.


# Downloading the Latest Version of the Model

In [27]:
import json
import os
import shutil
from datetime import datetime, timezone
from mlflow.tracking import MlflowClient

# 1. Relative directory where the model is saved (../models/)
models_dir = "../models"
os.makedirs(models_dir, exist_ok=True)

# 2. Model name and MLflow client
model_name = "credit-score-classification-model"
client = MlflowClient()

# 3. Get the most recent registered version.
#    Versions are compared as integers so that, e.g., "10" ranks above "9".
latest = max(
    client.search_model_versions(f"name='{model_name}'"),
    key=lambda v: int(v.version),
    default=None,
)

if latest is None:
    raise ValueError(f"No registered versions found for model '{model_name}'")

# 4. Fixed path of the artifact inside the run
artifact_path = "model/model.pkl"

# 5. Download model.pkl directly
downloaded_file_path = client.download_artifacts(
    run_id=latest.run_id,
    path=artifact_path
)

# Copy in chunks instead of reading the whole file into memory
final_model_path = os.path.join(models_dir, "model.pkl")
shutil.copyfile(downloaded_file_path, final_model_path)

print(f"Model saved to {final_model_path}")

# 6. Save metadata describing which model version was downloaded
model_metadata = {
    "model_name": model_name,
    "version": latest.version,
    "run_id": latest.run_id,
    "source": latest.source,
    # Timezone-aware UTC timestamp, unambiguous regardless of the machine's locale
    "downloaded_at": datetime.now(timezone.utc).isoformat()
}

metadata_path = os.path.join(models_dir, "model_metadata.json")
with open(metadata_path, "w", encoding="utf-8") as f:
    json.dump(model_metadata, f, indent=2)

print(f"Metadata saved to {metadata_path}")


Downloading artifacts: 100%|██████████| 1/1 [00:09<00:00,  9.02s/it]


Model saved to ../models/model.pkl
Metadata saved to ../models/model_metadata.json
