In [10]:
import os
import boto3
import pandas as pd
import io
import mlflow

from mlflow.models import infer_signature

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Connection settings for MinIO (S3-compatible object storage).
# Each value is read from the environment and falls back to the literal
# default when the variable is not set.
# NOTE(review): the fallback access key / secret are hardcoded here — confirm
# they are only valid for a local development MinIO and never for a shared one.
S3_ENDPOINT_URL = os.environ.get("MLFLOW_S3_ENDPOINT_URL", "http://minio:9000")
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "admin")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "admin123")
BUCKET = "dados-analise"

# S3 client pointed at the MinIO endpoint instead of AWS.
_s3_client_kwargs = {
    "endpoint_url": S3_ENDPOINT_URL,
    "aws_access_key_id": AWS_ACCESS_KEY_ID,
    "aws_secret_access_key": AWS_SECRET_ACCESS_KEY,
}
s3 = boto3.client("s3", **_s3_client_kwargs)

# MLflow tracking server: environment override with a fallback URL.
_tracking_uri = os.environ.get("MLFLOW_TRACKING_URI", "http://mlflow:5000")
mlflow.set_tracking_uri(_tracking_uri)

# Only the configured endpoint is echoed; no request is issued in this cell.
print("Conectado no MinIO:", S3_ENDPOINT_URL)

Conectado no MinIO: http://minio:9000


In [11]:
# List the objects in the bucket and print each key with its size.
# `resp` stays at notebook scope: the cell that selects the newest file reads it.
# NOTE(review): list_objects_v2 returns a single page (up to 1,000 keys per the
# boto3 documentation); a larger bucket would need pagination — confirm bucket size.
resp = s3.list_objects_v2(Bucket=BUCKET)
for entry in resp.get("Contents", []):
    print(f'{entry["Key"]} - {entry["Size"]} bytes')

heart_failure_clinical_records_dataset.csv - 12239 bytes


In [12]:
# Keep only CSV objects from the bucket listing. Anything else stored in the
# bucket (folder markers, other file types) cannot be parsed by pd.read_csv
# below, so it must not be eligible as "the latest upload".
objects = [
    item
    for item in resp.get("Contents", [])
    if item["Key"].lower().endswith(".csv")
]
if not objects:
    raise RuntimeError("Nenhum dado encontrado")

# Among the uploaded CSVs, use the most recently modified one.
latest_obj = max(objects, key=lambda o: o["LastModified"])
latest_key = latest_obj["Key"]

print(latest_key)

# Download the whole object into memory and parse it as CSV.
obj = s3.get_object(Bucket=BUCKET, Key=latest_key)
conteudo = obj["Body"].read()

df = pd.read_csv(io.BytesIO(conteudo))
df.head()

heart_failure_clinical_records_dataset.csv


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [13]:
mlflow.set_experiment("dados-analise-exp")

# Column whose summary statistics are recorded for the loaded dataset.
STATS_COL = "creatinine_phosphokinase"

with mlflow.start_run(run_name="rf_base"):
    # Compute the statistics from the loaded data so the logged metrics
    # describe the dataset instead of being fixed placeholder numbers.
    # Local names avoid shadowing the builtins `max` / `min`.
    cpk = df[STATS_COL]
    cpk_stats = {
        "media_creatinine_phosphokinase": float(cpk.mean()),
        "max_creatinine_phosphokinase": float(cpk.max()),
        "min_creatinine_phosphokinase": float(cpk.min()),
    }

    # Send each value to MLflow as a metric of this run.
    for metric_name, metric_value in cpk_stats.items():
        mlflow.log_metric(metric_name, metric_value)

    # Show the value that was sent (the message previously ended at the colon).
    print("Média enviada pro MLflow:", cpk_stats["media_creatinine_phosphokinase"])

Média enviada pro MLflow:
🏃 View run rf_base at: http://mlflow:5000/#/experiments/1/runs/a3fd51711664463bbc93114a0232e99d
🧪 View experiment at: http://mlflow:5000/#/experiments/1


In [14]:
# 2. Split the data into features (X) and target (y).
# The target column is excluded from the features: using the same column as
# both a feature and the label leaks the answer to the model and makes the
# logged accuracy meaningless.
# NOTE(review): DEATH_EVENT is assumed to be the intended label for this
# dataset — confirm. The feature set defines the signature of the model
# version registered below, so consumers of "meu_modelo_trendz" must send
# these columns.
TARGET_COL = "DEATH_EVENT"
N_ESTIMATORS = 100
RANDOM_STATE = 42

if TARGET_COL not in df.columns:
    raise KeyError(f"Target column {TARGET_COL!r} not found in the loaded data")

X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)


# === 3. Train and log ===
with mlflow.start_run(run_name="rf_base") as run:
    model = RandomForestClassifier(
        n_estimators=N_ESTIMATORS,
        random_state=RANDOM_STATE,
    )
    model.fit(X_train, y_train)

    # Accuracy is measured on the held-out rows only.
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    # Build the model signature (input columns -> prediction) from the training data.
    y_pred_train = model.predict(X_train)
    signature = infer_signature(X_train, y_pred_train)

    # Log the hyperparameter and the evaluation metric; the parameter value
    # comes from the same constant used to build the model.
    mlflow.log_param("n_estimators", N_ESTIMATORS)
    mlflow.log_metric("accuracy", acc)

    # Log the model and register it under a fixed name, so each execution
    # adds a new version to the same registered model.
    info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        signature=signature,
        input_example=X_train.iloc[:5],
        registered_model_name="meu_modelo_trendz",  # fixed registry name
    )

    print("model_uri:", info.model_uri)
    print("registered_model:", info.registered_model_version)

Registered model 'meu_modelo_trendz' already exists. Creating a new version of this model...
2025/12/02 20:20:58 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: meu_modelo_trendz, version 4
Created version '4' of model 'meu_modelo_trendz'.


model_uri: models:/m-fe589e94960c4fa2867adc593e13029e
registered_model: 4
🏃 View run rf_base at: http://mlflow:5000/#/experiments/1/runs/f9edfadb364e4a00af11fc59d8c16268
🧪 View experiment at: http://mlflow:5000/#/experiments/1
