# Desenvolvimento do modelo de predição

In [1]:
import pandas as pd
from dagshub.data_engine import datasources
import mlflow
import dagshub
from sklearn.model_selection import train_test_split
import mlflow.sklearn
import mlflow.catboost
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
import mlflow.models.signature
from mlflow.models import infer_signature
from catboost import CatBoostRegressor

# Obtendo o dataset

In [2]:
# Handle to the 'processed' datasource of the fabioebner/quantum-finance-mlops repository.
ds = datasources.get('fabioebner/quantum-finance-mlops', 'processed')


In [3]:
# Download the dataset from the 'processed' datasource.
# Only the datapoints' download URLs are needed here, so the datasource is not
# materialised as a DataFrame first.
res = ds.head()

# Keep the URL of the last datapoint returned by head().
dataset_url = None
for dp in res:
    dataset_url = dp.download_url

# Fail with a clear message instead of a NameError when nothing was returned.
if dataset_url is None:
    raise RuntimeError("The 'processed' datasource returned no datapoints.")

df = pd.read_csv(dataset_url)


Output()

Output()

In [4]:
# Summarise the loaded frame: row count, column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44343 entries, 0 to 44342
Data columns (total 33 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Age                          44343 non-null  float64
 1   Annual_Income                44343 non-null  float64
 2   Monthly_Inhand_Salary        44343 non-null  float64
 3   Num_Bank_Accounts            44343 non-null  float64
 4   Num_Credit_Card              44343 non-null  float64
 5   Num_of_Loan                  44343 non-null  float64
 6   Delay_from_due_date          44343 non-null  float64
 7   Num_of_Delayed_Payment       44343 non-null  float64
 8   Num_Credit_Inquiries         44343 non-null  float64
 9   Credit_Mix                   44343 non-null  float64
 10  Outstanding_Debt             44343 non-null  float64
 11  Credit_Utilization_Ratio     44343 non-null  float64
 12  Total_EMI_per_month          44343 non-null  float64
 13  Amount_invested_

In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Num_Credit_Inquiries,Credit_Mix,...,Occupation_Journalist,Occupation_Lawyer,Occupation_Manager,Occupation_Mechanic,Occupation_Media_Manager,Occupation_Musician,Occupation_Scientist,Occupation_Teacher,Occupation_Writer,Occupation________
0,0.001038,0.000501,0.101127,0.001669,0.002668,0.002694,0.044776,0.001592,0.001542,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.001038,0.000501,0.101127,0.001669,0.002668,0.002694,0.044776,0.001819,0.001542,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.001615,0.001151,0.182628,0.001112,0.002668,0.000673,0.044776,0.00091,0.000771,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.001615,0.001151,0.182628,0.001112,0.002668,0.000673,0.104478,0.000227,0.000771,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.001615,0.001151,0.182628,0.001112,0.923949,0.000673,0.044776,0.000227,0.000771,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [6]:
# Initialise DagsHub for this repository with its MLflow integration enabled.
dagshub.init(
    repo_owner='fabioebner',
    repo_name='quantum-finance-mlops',
    mlflow=True,
)

# Turn on MLflow autologging for the supported ML libraries.
mlflow.autolog()

2025/07/31 19:09:32 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2025/07/31 19:09:32 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/07/31 19:09:32 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


In [7]:
# Hold out 30% of the rows for testing; Credit_Score is the target and every
# other column is used as a feature. The fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Credit_Score']),
    df['Credit_Score'],
    test_size=0.3,
    random_state=42,
)

In [8]:
# Helper that evaluates a fitted model and tracks it with MLflow

def evaluate_and_log_model(kind, model_name, model, X_test, y_test):
    """Score a fitted model on the test set and log metrics and model to MLflow.

    Predictions on ``X_test`` are compared with ``y_test`` to compute MSE, MAE,
    R2 and MAPE, each of which is recorded with ``mlflow.log_metric``. The model
    is then logged under the artifact path ``"model"`` together with an inferred
    signature and the first five rows of ``X_test`` as the input example.

    Args:
        kind: Selects the MLflow flavour used to log the model. ``"catboost"``,
            ``"xgboost"`` and ``"lightgbm"`` use their dedicated flavours; any
            other value falls back to ``mlflow.sklearn``.
        model_name: Human-readable model name. Currently not used by this
            function; kept so existing call sites keep working.
        model: Fitted estimator exposing ``predict``.
        X_test: Test features passed to ``model.predict``.
        y_test: True target values for ``X_test``.

    Returns:
        None.
    """
    predictions = model.predict(X_test)

    # Regression metrics on the held-out data.
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    mape = mean_absolute_percentage_error(y_test, predictions)

    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("R2", r2)
    mlflow.log_metric("MAPE", mape)

    # Infer the model signature automatically from inputs and predictions.
    signature = infer_signature(X_test, predictions)

    # Pick the flavour module lazily so that only the one actually needed is
    # touched. CatBoost models go through mlflow.catboost (not mlflow.sklearn)
    # so they are stored with their native flavour.
    if kind == "catboost":
        flavor = mlflow.catboost
    elif kind == "xgboost":
        flavor = mlflow.xgboost
    elif kind == "lightgbm":
        flavor = mlflow.lightgbm
    else:
        flavor = mlflow.sklearn

    flavor.log_model(model, "model", signature=signature, input_example=X_test[:5])


### Experimento com Ridge Regression

In [34]:
# Tune Ridge's alpha with 5-fold cross-validation (scored by negated MAPE),
# then evaluate and log the best estimator inside a dedicated MLflow run.
with mlflow.start_run(run_name="Ridge Regression"):
    param_grid = {'alpha': [0.1, 1.0, 10.0, 100.0]}
    mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

    grid_search = GridSearchCV(
        estimator=Ridge(),
        param_grid=param_grid,
        scoring=mape_scorer,
        cv=5,
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    mlflow.log_param("Best_alpha", best_model.alpha)
    evaluate_and_log_model("sklearn", "Ridge Regression", best_model, X_test, y_test)

2025/07/31 18:41:07 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


🏃 View run Ridge Regression at: https://dagshub.com/fabioebner/quantum-finance-mlops.mlflow/#/experiments/0/runs/48d6ebbd98a44cab936eee64b135b71e
🧪 View experiment at: https://dagshub.com/fabioebner/quantum-finance-mlops.mlflow/#/experiments/0


### Decision Tree

In [None]:
# Grid-search a decision tree regressor over depth and minimum split size
# (5-fold CV, negated MAPE), then evaluate and log the best estimator in its
# own MLflow run.
with mlflow.start_run(run_name="Decision Tree Regression"):
    param_grid = {
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10],
    }
    mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

    grid_search = GridSearchCV(
        estimator=DecisionTreeRegressor(random_state=42),
        param_grid=param_grid,
        scoring=mape_scorer,
        cv=5,
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Record the winning hyperparameters under explicit names.
    best_params = {
        "Best_max_depth": best_model.max_depth,
        "Best_min_samples_split": best_model.min_samples_split,
    }
    for key, value in best_params.items():
        mlflow.log_param(key, value)

    evaluate_and_log_model("sklearn", "Decision Tree Regression", best_model, X_test, y_test)

2025/07/31 18:49:03 INFO mlflow.sklearn.utils: Logging the 5 best runs, 7 runs will be omitted.


🏃 View run resilient-ox-804 at: https://dagshub.com/fabioebner/quantum-finance-mlops.mlflow/#/experiments/0/runs/51abdae3b1ad47eca094792e3a220cae
🧪 View experiment at: https://dagshub.com/fabioebner/quantum-finance-mlops.mlflow/#/experiments/0
🏃 View run Decision Tree Regression  at: https://dagshub.com/fabioebner/quantum-finance-mlops.mlflow/#/experiments/0/runs/a341c6d1df5b4b17880807670390fd96
🧪 View experiment at: https://dagshub.com/fabioebner/quantum-finance-mlops.mlflow/#/experiments/0


### MLP Regression

In [None]:
# Grid-search an MLP regressor over layer sizes, activation and L2 penalty
# (5-fold CV, negated MAPE), then evaluate and log the best estimator in its
# own MLflow run. MLPRegressor comes from the notebook's import cell.
with mlflow.start_run(run_name="MLP Regression"):
    param_grid = {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001, 0.01, 0.1]
    }

    mlp = MLPRegressor(max_iter=500, random_state=42)
    grid_search = GridSearchCV(
        mlp,
        param_grid,
        scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False),
        cv=5,
    )
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    # Record the winning hyperparameters under explicit names.
    mlflow.log_param("Best_hidden_layer_sizes", best_model.hidden_layer_sizes)
    mlflow.log_param("Best_activation", best_model.activation)
    mlflow.log_param("Best_alpha", best_model.alpha)

    evaluate_and_log_model("sklearn", "MLP Regression", best_model, X_test, y_test)



### XGBoost Regressor

In [None]:
# Grid-search an XGBoost regressor over tree count, depth and learning rate
# (5-fold CV, negated MAPE), then evaluate and log the best estimator in its
# own MLflow run.
with mlflow.start_run(run_name="XGBoost Regressor"):
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7, 9],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
    }
    mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

    xgb_regressor = XGBRegressor(random_state=42, verbosity=0)
    grid_search = GridSearchCV(
        estimator=xgb_regressor,
        param_grid=param_grid,
        scoring=mape_scorer,
        cv=5,
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Record the winning hyperparameters under explicit names.
    best_params = {
        "Best_n_estimators": best_model.n_estimators,
        "Best_max_depth": best_model.max_depth,
        "Best_learning_rate": best_model.learning_rate,
    }
    for key, value in best_params.items():
        mlflow.log_param(key, value)

    evaluate_and_log_model("xgboost", "XGBoost Regressor", best_model, X_test, y_test)

2025/07/31 19:13:03 INFO mlflow.sklearn.utils: Logging the 5 best runs, 43 runs will be omitted.


🏃 View run bold-shad-251 at: https://dagshub.com/fabioebner/quantum-finance-mlops.mlflow/#/experiments/0/runs/78f7e629abc54d3f9baa34739ed5eb88
🧪 View experiment at: https://dagshub.com/fabioebner/quantum-finance-mlops.mlflow/#/experiments/0
🏃 View run adorable-steed-145 at: https://dagshub.com/fabioebner/quantum-finance-mlops.mlflow/#/experiments/0/runs/26a73df17ea34f08a91ee0008c52b2f4
🧪 View experiment at: https://dagshub.com/fabioebner/quantum-finance-mlops.mlflow/#/experiments/0
🏃 View run glamorous-mare-430 at: https://dagshub.com/fabioebner/quantum-finance-mlops.mlflow/#/experiments/0/runs/1191740f7b294214829241dc43e4266c
🧪 View experiment at: https://dagshub.com/fabioebner/quantum-finance-mlops.mlflow/#/experiments/0
🏃 View run unique-dolphin-288 at: https://dagshub.com/fabioebner/quantum-finance-mlops.mlflow/#/experiments/0/runs/84b525a6975a4a8cba3991b956b9e699
🧪 View experiment at: https://dagshub.com/fabioebner/quantum-finance-mlops.mlflow/#/experiments/0
🏃 View run invincible

### Registry do Modelo

In [10]:
# Register the model artifact logged under "model" by the chosen run.
# NOTE(review): the run id is hard-coded; confirm it is the run you intend to promote.
run_id = "292e91f4ee9e45b089b2daef23f4ac8f"
model_uri = f"runs:/{run_id}/model"

mlflow.register_model(model_uri=model_uri, name="CreditScoreModel")

Successfully registered model 'CreditScoreModel'.
2025/07/31 19:17:37 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: CreditScoreModel, version 1
Created version '1' of model 'CreditScoreModel'.


<ModelVersion: aliases=[], creation_timestamp=1754000257293, current_stage='None', description='', last_updated_timestamp=1754000257293, name='CreditScoreModel', run_id='292e91f4ee9e45b089b2daef23f4ac8f', run_link='', source='mlflow-artifacts:/f9b061f0135d46b3a8b4b22bcf840fe9/292e91f4ee9e45b089b2daef23f4ac8f/artifacts/model', status='READY', status_message=None, tags={}, user_id='', version='1'>