In [2]:
!pip install "numpy<2.0" "scikit-learn<1.4" "scipy<1.12" "imbalanced-learn<0.13"

Collecting imbalanced-learn<0.13
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.3/258.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.4


In [8]:
import os

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# --- 1. Configuration and data loading ---
print("Carregando dados estruturados do Postgres...")

# Fail fast with a clear message: with os.getenv() an unset variable would be
# interpolated as the literal text "None" and only surface later as an obscure
# connection error.
_missing_env = [name for name in ("DB_USER", "DB_HOST", "DB_NAME") if not os.getenv(name)]
if _missing_env:
    raise RuntimeError(
        f"Variáveis de ambiente obrigatórias ausentes: {', '.join(_missing_env)}"
    )

# URL.create receives every component separately, so credentials containing
# characters such as '@', ':' or '/' cannot corrupt the connection string the
# way they do when the URL is assembled with an f-string.
_db_port = os.getenv("DB_PORT")
db_url = URL.create(
    "postgresql",
    username=os.environ["DB_USER"],
    password=os.getenv("DB_PASSWORD"),  # None -> connect without a password
    host=os.environ["DB_HOST"],
    port=int(_db_port) if _db_port else None,  # None -> driver default port
    database=os.environ["DB_NAME"],
)
db_engine = create_engine(db_url)
df = pd.read_sql("SELECT * FROM dados_analise", db_engine)

# Normalise the target column name regardless of how the table spells it.
df = df.rename(columns={"DEATH_EVENT": "death_event"})

# Basic cleaning: drop rows without a target (e.g. inference-only records).
df = df.dropna(subset=['death_event'])

# NOTE(review): every remaining column is used as a feature; this assumes the
# table holds only numeric predictors (no id/text columns) -- confirm.
X = df.drop(columns=['death_event'])
y = df['death_event']

# Reproducible split. stratify=y keeps the class ratio identical in train and
# test, which matters because the target is imbalanced (the experiments below
# oversample the minority class).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# --- 2. Experiment definitions ---

# Oversampling techniques to benchmark, keyed by the name used in run titles.
# Note that ADASYN names its neighbour count `n_neighbors`, whereas the two
# SMOTE variants call it `k_neighbors`.
samplers = dict(
    SMOTE=SMOTE(sampling_strategy='minority', k_neighbors=3, random_state=42),
    BorderlineSMOTE=BorderlineSMOTE(sampling_strategy='minority', k_neighbors=3, random_state=42),
    ADASYN=ADASYN(sampling_strategy='minority', n_neighbors=3, random_state=42),
)

# Base learners (hyperparameters carried over from the original notebook).
knn = KNeighborsClassifier(n_neighbors=3)
dt = DecisionTreeClassifier(max_depth=3, random_state=42)
rf = RandomForestClassifier(max_depth=3, random_state=42)

# Classifiers to benchmark: the three base learners on their own, followed by
# a soft-voting ensemble that combines all three.
_voting_members = [('KNN', knn), ('DT', dt), ('RF', rf)]
models = dict(zip(("KNN", "DecisionTree", "RandomForest"), (knn, dt, rf)))
models["VotingEnsemble"] = VotingClassifier(estimators=_voting_members, voting='soft')

# --- 3. Run the experiment loop in MLflow ---

# Honour MLFLOW_TRACKING_URI when it is set so the notebook can target any
# reachable tracking server without code edits; an explicit hard-coded
# set_tracking_uri() would override that variable. The previous local default
# is kept as the fallback.
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI") or "http://localhost:5000")
experiment_name = "Heart Failure - Full Benchmark"
mlflow.set_experiment(experiment_name)

# Champion bookkeeping: a run must score a strictly positive F1 to be selected.
best_f1 = 0
best_run_id = None
best_model_name = ""

print(f"Iniciando experimentos em: {experiment_name}")

# One MLflow run per (sampler, classifier) combination.
for sampler_name, sampler in samplers.items():
    for model_name, model in models.items():

        run_name = f"{sampler_name} + {model_name}"
        print(f"Treinando: {run_name} ...")

        with mlflow.start_run(run_name=run_name) as run:

            # Log the parameters first so that a run which fails during
            # fit/predict is still identifiable in the tracking UI.
            mlflow.log_param("sampler", sampler_name)
            mlflow.log_param("model_type", model_name)

            # Pipeline: scaling -> resampling -> classifier.
            # ImbPipeline is used so that resampling happens ONLY while
            # fitting on the training data, never at prediction time.
            pipeline = ImbPipeline(steps=[
                ('scaler', StandardScaler()),
                ('sampler', sampler),
                ('classifier', model)
            ])

            # Train on the training split, predict on the held-out split.
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)

            # Evaluation metrics on the held-out split.
            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)

            mlflow.log_metric("accuracy", acc)
            mlflow.log_metric("f1_score", f1)
            mlflow.log_metric("recall", recall)
            mlflow.log_metric("precision", precision)

            # Log the full fitted pipeline as the run's model artifact.
            mlflow.sklearn.log_model(pipeline, "model")

            # Track the champion by F1-score.
            # NOTE(review): the champion is selected on the same split used to
            # report its metrics, so the reported F1 is optimistically biased;
            # consider cross-validation or a separate validation split.
            if f1 > best_f1:
                best_f1 = f1
                best_run_id = run.info.run_id
                best_model_name = run_name

print("-" * 30)
print(f"MELHOR MODELO DO EXPERIMENTO: {best_model_name}")
print(f"Melhor F1-Score: {best_f1:.4f}")
print("-" * 30)

# --- 4. Register the best model for production ---

if best_run_id is not None:
    model_uri = f"runs:/{best_run_id}/model"
    registered_model_name = "HeartFailureModel_Production"

    print(f"Registrando o melhor modelo ({best_model_name}) em '{registered_model_name}'...")

    # Create a new version of the registered model from the champion run.
    reg_model = mlflow.register_model(model_uri, registered_model_name)

    # Promote that version to the "Production" stage via the client API.
    # archive_existing_versions=True moves any version already in Production
    # to "Archived"; without it every re-run of this notebook would leave one
    # more version sitting in the Production stage.
    client = mlflow.tracking.MlflowClient()
    client.transition_model_version_stage(
        name=registered_model_name,
        version=reg_model.version,
        stage="Production",
        archive_existing_versions=True,
    )
    print("Modelo promovido para Produção com sucesso!")

else:
    # Reached when no run achieved an F1-score above zero.
    print("Nenhum modelo foi treinado com sucesso.")

Carregando dados estruturados do Postgres...


MlflowException: API request to http://localhost:5000/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPConnectionPool(host='localhost', port=5000): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=Heart+Failure+-+Full+Benchmark (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fae3d968d50>: Failed to establish a new connection: [Errno 111] Connection refused'))