# Desenvolvimento do modelo de predição

In [1]:
import pandas as pd
from dagshub.data_engine import datasources
import mlflow
import dagshub
from sklearn.model_selection import train_test_split
import mlflow.sklearn
import mlflow.catboost
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
import mlflow.models.signature
from mlflow.models import infer_signature
from catboost import CatBoostRegressor

# Obtendo o dataset

In [3]:
ds = datasources.get('fabioebner/quantum-finance-mlops', 'processed')


In [None]:
#Baixando o dataset do processed
ds.all().dataframe 
res = ds.head()
for dp in res:
    dataset_url = dp.download_url

df =pd.read_csv(dataset_url)


In [7]:
df.head()

Unnamed: 0,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,...,Occupation_Journalist,Occupation_Lawyer,Occupation_Manager,Occupation_Mechanic,Occupation_Media_Manager,Occupation_Musician,Occupation_Scientist,Occupation_Teacher,Occupation_Writer,Occupation________
0,23.0,19114.12,1824.843333,3.0,4.0,4,3.0,7,11.27,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,23.0,19114.12,3093.745,3.0,4.0,4,-1.0,19,11.27,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,33.0,19114.12,3093.745,3.0,4.0,4,3.0,7,_,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,23.0,19114.12,3093.745,3.0,4.0,4,5.0,4,6.27,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,23.0,19114.12,1824.843333,3.0,4.0,4,6.0,19,11.27,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [12]:
dagshub.init(repo_owner='fabioebner',
             repo_name='quantum-finance-mlops',mlflow=True)

mlflow.autolog()

2025/07/31 14:45:25 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2025/07/31 14:45:26 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/07/31 14:45:26 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


In [13]:
features = list(df.columns)
features.remove('Credit_Score')
features

['Age',
 'Annual_Income',
 'Monthly_Inhand_Salary',
 'Num_Bank_Accounts',
 'Num_Credit_Card',
 'Num_of_Loan',
 'Delay_from_due_date',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Num_Credit_Inquiries',
 'Credit_Mix',
 'Outstanding_Debt',
 'Credit_Utilization_Ratio',
 'Total_EMI_per_month',
 'Amount_invested_monthly',
 'Monthly_Balance',
 'Credit_History_Total_Months',
 'Occupation_Accountant',
 'Occupation_Architect',
 'Occupation_Developer',
 'Occupation_Doctor',
 'Occupation_Engineer',
 'Occupation_Entrepreneur',
 'Occupation_Journalist',
 'Occupation_Lawyer',
 'Occupation_Manager',
 'Occupation_Mechanic',
 'Occupation_Media_Manager',
 'Occupation_Musician',
 'Occupation_Scientist',
 'Occupation_Teacher',
 'Occupation_Writer',
 'Occupation________']

In [14]:
X = df[features]
len(features)

33

In [None]:
# Confirmando se removel a coluna alvo ('Credit Score')
X

Unnamed: 0,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,...,Occupation_Journalist,Occupation_Lawyer,Occupation_Manager,Occupation_Mechanic,Occupation_Media_Manager,Occupation_Musician,Occupation_Scientist,Occupation_Teacher,Occupation_Writer,Occupation________
0,23.0,19114.12,1824.843333,3.0,4.0,4,3.0,7,11.27,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,23.0,19114.12,3093.745000,3.0,4.0,4,-1.0,19,11.27,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,33.0,19114.12,3093.745000,3.0,4.0,4,3.0,7,_,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,23.0,19114.12,3093.745000,3.0,4.0,4,5.0,4,6.27,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,23.0,19114.12,1824.843333,3.0,4.0,4,6.0,19,11.27,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,25.0,39628.99,3359.415833,4.0,6.0,2,23.0,7,11.5,3.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,25.0,39628.99,3359.415833,4.0,6.0,2,18.0,7,11.5,3.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,25.0,39628.99,3359.415833,4.0,6.0,2,27.0,6,11.5,3.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
99998,25.0,39628.99,3359.415833,4.0,6.0,2,20.0,19,11.5,3.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
X_train = df.drop("Credit_Score", axis=1) 
y_train = df["Credit_Score"]



In [None]:
test_df = pd.read_csv('../data/raw/test.csv')
X_test = test_df.drop("Credit_Score", axis=1)
y_test = test_df["Credit_Score"]

KeyError: "['Credit_Score'] not found in axis"

In [10]:
#Funcao para rastrear o modelo

def evaluate_and_log_model(kind, model_name, model, X_test, y_test):
   predictions = model.predict(X_test)
   mse = mean_squared_error(y_test, predictions)
   mae = mean_absolute_error(y_test, predictions)
   r2 = r2_score(y_test, predictions)
   mape = mean_absolute_percentage_error(y_test, predictions)

   mlflow.log_metric("MSE", mse)
   mlflow.log_metric("MAE", mae)
   mlflow.log_metric("R2", r2)
   mlflow.log_metric("MAPE", mape)

   # Inferir a assinatura automaticamente
   signature = infer_signature(X_test, predictions)

   if kind == "catboost":
      mlflow.sklearn.log_model(model, "model", signature=signature, input_example=X_test[:5])
   elif kind == "xgboost":
      mlflow.xgboost.log_model(model, "model", signature=signature, input_example=X_test[:5])
   elif kind == "lightgbm":
      mlflow.lightgbm.log_model(model, "model", signature=signature, input_example=X_test[:5])
   else:
      mlflow.sklearn.log_model(model, "model", signature=signature, input_example=X_test[:5])



with mlflow.start_run(run_name="DecisionTree_Regressor"):
    param_grid = {
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10]
    }
    tree = DecisionTreeRegressor(random_state=42)
    grid_search = GridSearchCV(tree, param_grid, scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False), cv=5)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    mlflow.log_param("best_max_depth", best_model.max_depth)
    mlflow.log_param("best_min_samples_split", best_model.min_samples_split)
    evaluate_and_log_model("sklearn", "Decision Tree Regressor", best_model, X_test, y_test)

NameError: name 'X_train' is not defined