# 0. Imports

In [1]:
import sys
import joblib
sys.path.insert(0, '../src/')

from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.linear_model import LogisticRegression

from utils.utils import load_config_file
from data.data_load import DataLoad
from data.data_validation import DataValidation
from data.data_transformation import DataTransformation
from data.data_preprocess import DataPreprocess
from train.train import TrainModels
from evaluation.classifier_eval import ModelEvaluation

# 1. Data Load

In [2]:
# Load the dataset identified by the name 'train_dataset_name' through the
# project's DataLoad helper.
# NOTE(review): how that name is resolved to an actual file/table is decided
# inside DataLoad (not visible in this notebook) — confirm in data/data_load.py.
dl = DataLoad()
df = dl.load_data('train_dataset_name')

[2m2025-05-19 21:00:52[0m [[32m[1minfo     [0m] [1mComeçando a carga dos dados com o nome: train_dataset_name[0m


# 2. Data Validation

In [3]:
# Run the project's validation routine on the loaded frame and keep its result.
# NOTE(review): `is_valid` is not inspected in any of the cells shown here; if
# run() returns a boolean, consider asserting on it before moving on — TODO confirm.
dv = DataValidation()
is_valid = dv.run(df)

[2m2025-05-19 21:00:52[0m [[32m[1minfo     [0m] [1mValidacao iniciou             [0m
[2m2025-05-19 21:00:52[0m [[32m[1minfo     [0m] [1mValidation columns passed...  [0m
[2m2025-05-19 21:00:52[0m [[32m[1minfo     [0m] [1mValidacao com sucesso.        [0m


# 3. Data Transformation

In [4]:
# Hand the loaded frame to DataTransformation and unpack the four objects
# returned by its split method.
# NOTE(review): the variable names suggest train/validation features and
# targets; split ratio and random seed are set inside DataTransformation —
# confirm there that the split is reproducible.
dt = DataTransformation(df)
X_train, X_valid, y_train, y_valid = dt.train_test_spliting()

# 4. Experimentation

In [5]:
import mlflow
from mlflow.tracking import MlflowClient

In [6]:
# Point the MLflow client at the tracking server on 127.0.0.1:5000 and make
# 'prob_loan' the active experiment for the MLflow calls that follow.
# NOTE(review): the URI is hardcoded, so this presumably requires a tracking
# server already running locally on port 5000.
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('prob_loan')

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1747133663728, experiment_id='1', last_update_time=1747133663728, lifecycle_stage='active', name='prob_loan', tags={}>

## 4.1. Select Best Model

In [7]:
# Look up the 'prob_loan' experiment by name and convert the returned object
# into a plain dict so its fields can be read by key.
current_experiment = dict(mlflow.get_experiment_by_name('prob_loan'))

In [8]:
# Keep the experiment's identifier on its own for use in later MLflow queries.
experiment_id = current_experiment['experiment_id']

In [9]:
# Fetch the runs of the 'prob_loan' experiment whose validation ROC AUC is
# strictly below 1, ordered from the highest score to the lowest.
# The experiment is passed explicitly (via the id looked up above) so the query
# does not silently depend on whichever experiment is active in the kernel.
# NOTE(review): the `< 1` filter presumably discards runs with a perfect
# (suspicious) score — confirm that this is the intent.
df_mlflow = (
    mlflow
    .search_runs(
        experiment_ids=[experiment_id],
        filter_string='metrics.valid_roc_auc < 1',
    )
    .sort_values('metrics.valid_roc_auc', ascending=False)
)

In [10]:
# Show which metric, parameter and tag columns the run search returned.
df_mlflow.columns

Index(['run_id', 'experiment_id', 'status', 'artifact_uri', 'start_time',
       'end_time', 'metrics.log_loss', 'metrics.false_positives',
       'metrics.true_negatives', 'metrics.train_roc_auc',
       'metrics.example_count', 'metrics.f1_score', 'metrics.accuracy_score',
       'metrics.roc_auc', 'metrics.false_negatives',
       'metrics.precision_recall_auc', 'metrics.true_positives',
       'metrics.score', 'metrics.valid_roc_auc', 'metrics.recall_score',
       'metrics.precision_score', 'params.class_weight', 'params.multi_class',
       'params.warm_start', 'params.discretizer', 'params.imputer',
       'params.solver', 'params.C', 'params.tol', 'params.max_iter',
       'params.fit_intercept', 'params.scaler', 'tags.mlflow.source.name',
       'tags.mlflow.source.type', 'tags.model_name', 'tags.mlflow.runName',
       'tags.mlflow.log-model.history', 'tags.mlflow.user',
       'tags.mlflow.datasets'],
      dtype='object')

In [21]:
# Pick the run with the highest validation ROC AUC and show its id.
best_row_label = df_mlflow['metrics.valid_roc_auc'].idxmax()
run_id = df_mlflow.loc[best_row_label, 'run_id']
run_id

'4c6de3ab4e4b4d62871573f77bedc8ba'