# 📜 Projeto Final - Capacitação IA (Ciclo 3)
# 🎓 Alunos: Filipe da Silva Rodrigues e Rodrigo Serafim Floriano da Silva

## 💻 Bibliotecas Necessárias

In [1]:
# Instalação de bibliotecas necessárias para execução do código
# !pip install numpy pandas scikit-learn mlflow xgboost lightgbm catboost tpot pytorch --quiet

In [2]:
# Tratamento de Dataset e Métricas

import numpy as np
import pandas as pd
import json
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline

# Modelos de Treinamento

# Classificadores básicos
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Classificadores avançados
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# AutoML
from tpot import TPOTClassifier

# Armazenamento e Análise de Modelos

import mlflow
import mlflow.sklearn
import mlflow.pyfunc
from mlflow import pyfunc

# Terminal

import warnings
from IPython.display import clear_output
warnings.filterwarnings("ignore")



---

👾 **Dataset de Classificação - Kaggle: Water Quality**

Esse dataframe é um conjunto de dados que contém informações sobre a qualidade da água e sua potabilidade. As variáveis são:

- `ph`: o valor do pH da água (0 a 14).
- `Hardness`: a capacidade da água de precipitar sabão em mg/L.
- `Solids`: sólidos totais dissolvidos em ppm.
- `Chloramines`: quantidade de cloraminas em ppm.
- `Sulfate`: quantidade de sulfatos dissolvidos em mg/L.
- `Conductivity`: condutividade elétrica da água em μS/cm.
- `Organic_carbon`: quantidade de carbono orgânico em ppm.
- `Trihalomethanes`: quantidade de trihalometanos em μg/L.
- `Turbidity`: medida da propriedade de emissão de luz da água em NTU.
- `Potability`: indica se a água é segura para consumo humano (1 = Potável, 0 = Não potável).

✅ **Objetivo:** Prever se a água é potável ou não com base nas características coletadas.

---


In [3]:
# Carregar o dataset
url = 'water_potability.csv'
dataset = pd.read_csv(url)

# Analisar o dataset
print('\nInformações do Dataset:\n')
display(dataset.info())

print('\nVerificar Valores Nulos:\n')
display(dataset.isnull().sum())

# Exibir o dataset original
print('\nDataset Original:\n')
display(dataset)

# Remover os registros com valores nulos
dataset = dataset.dropna()

# Criar uma cópia do dataset para efetuar os devidos tratamentos
df = dataset.copy()

# Normalizando os dados das features na escala (0..1)
columns_to_normalize = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']
df[columns_to_normalize] = MinMaxScaler().fit_transform(df[columns_to_normalize])

# Separar os dados para o tratamento de features
target = df['Potability'].copy()
features = df.drop('Potability', axis=1).copy()

# Combinando as features transformadas com o target
df = pd.concat(
    [features.reset_index(drop=True), target.reset_index(drop=True)], axis=1)

# Exibindo o DataFrame tratado com as colunas renomeadas
print('\nDataset Tratado para Treinamento:\n')
display(df)


# Separando os dados 
y = df['Potability']  # Coluna 'Potability'
x = df.drop('Potability', axis=1)  # Todas as outras colunas

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=None)



Informações do Dataset:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB


None


Verificar Valores Nulos:



ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64


Dataset Original:



Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.802160,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.578218,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,,402.883113,11.168946,77.488213,4.708658,1



Dataset Tratado para Treinamento:



Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,0.587349,0.577747,0.386298,0.568199,0.647347,0.292985,0.654522,0.795029,0.630115,0
1,0.643654,0.441300,0.314381,0.439304,0.514545,0.356685,0.377248,0.202914,0.520358,0
2,0.388934,0.470876,0.506122,0.524364,0.561537,0.142913,0.249922,0.401487,0.219973,0
3,0.725820,0.715942,0.506141,0.521683,0.751819,0.148683,0.467200,0.658678,0.242428,0
4,0.610517,0.532588,0.237701,0.270288,0.495155,0.494792,0.409721,0.469762,0.585049,0
...,...,...,...,...,...,...,...,...,...,...
2006,0.636224,0.580511,0.277748,0.418063,0.522486,0.342184,0.310364,0.402799,0.627156,1
2007,0.470143,0.548826,0.301347,0.538273,0.498565,0.231359,0.565061,0.175889,0.395061,1
2008,0.817826,0.087434,0.656389,0.670774,0.369089,0.431872,0.563265,0.285745,0.578674,1
2009,0.424187,0.464092,0.459656,0.541633,0.615572,0.388360,0.397780,0.449156,0.440004,1


## 🧪 Experimentos no MLFLOW

In [4]:
models = {
    "TPOT AutoML": [
        {"generations": 3, "population_size": 20, "verbosity": 2},      # Para validação inicial rápida
        {"generations": 5, "population_size": 50, "verbosity": 2},      # Configuração equilibrada
        {"generations": 7, "population_size": 75, "verbosity": 2},      # Configuração robusta para dataset pequeno
        {"generations": 10, "population_size": 100, "verbosity": 2},    # Explorando maior diversidade de pipelines
        {"generations": 15, "population_size": 150, "verbosity": 2},    # Ideal para modelos mais complexos
        {"generations": 20, "population_size": 200, "verbosity": 2},    # Configuração avançada
        {"generations": 25, "population_size": 300, "verbosity": 2}     # Para máxima exploração, dependendo do tempo
    ],
    "Stacking Classifier": [
        # Combinações simples e eficientes para validação rápida
        {
            "estimators": [
                ("rf", RandomForestClassifier(n_estimators=50, max_depth=8, random_state=None)),
                ("gb", GradientBoostingClassifier(n_estimators=30, learning_rate=0.1, max_depth=4, random_state=None)),
                ("dt", DecisionTreeClassifier(max_depth=5, random_state=None)) 
            ],
             "final_estimator": LogisticRegression(max_iter=500)
        },
        
        # Combinações intermediárias com modelos robustos
        {
            "estimators": [
                ("rf", RandomForestClassifier(n_estimators=100, max_depth=12, random_state=None)),
                ("cb", CatBoostClassifier(iterations=50, learning_rate=0.1, depth=5, verbose=0, logging_level='Silent', allow_writing_files=False)),
                ("dt", DecisionTreeClassifier(max_depth=7, random_state=None))  
            ],
             "final_estimator": LogisticRegression(max_iter=1000)
        },
        
        # Exploração avançada com XGBoost e LightGBM
        {
            "estimators": [
                ("xgb", XGBClassifier(n_estimators=75, learning_rate=0.1, max_depth=3, random_state=None)),
                ("lgbm", LGBMClassifier(n_estimators=50, learning_rate=0.05, max_depth=4, random_state=None)),
                ("dt", DecisionTreeClassifier(max_depth=6, random_state=None))  
            ],
             "final_estimator": LogisticRegression(max_iter=1000)
        },
        
        # Combinações avançadas com mais estimadores
        {
            "estimators": [
                ("rf", RandomForestClassifier(n_estimators=75, max_depth=10, random_state=None)),
                ("gb", GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=None)),
                ("xgb", XGBClassifier(n_estimators=50, learning_rate=0.05, max_depth=4, random_state=None)),
                ("dt", DecisionTreeClassifier(max_depth=8, random_state=None))  
            ],
             "final_estimator": LogisticRegression(max_iter=1000)
        },
        
        # Modelos focados em ensembles leves
        {
            "estimators": [
                ("xgb", XGBClassifier(n_estimators=50, learning_rate=0.1, max_depth=4, random_state=None)),
                ("cb", CatBoostClassifier(iterations=50, learning_rate=0.1, depth=6, verbose=0, logging_level='Silent', allow_writing_files=False)),
                ("dt", DecisionTreeClassifier(max_depth=4, random_state=None)) 
            ],
             "final_estimator": LogisticRegression(max_iter=1000)
        },
        
        # Combinações para maior diversidade
        {
            "estimators": [
                ("lgbm", LGBMClassifier(n_estimators=75, learning_rate=0.05, max_depth=5, random_state=None)),
                ("cb", CatBoostClassifier(iterations=50, learning_rate=0.1, depth=6, verbose=0, logging_level='Silent', allow_writing_files=False)),
                ("dt", DecisionTreeClassifier(max_depth=6, random_state=None))  
            ],
             "final_estimator": LogisticRegression(max_iter=1000)
        },
        
        # Ajuste final com modelos mais complexos e profundos
        {
            "estimators": [
                ("rf", RandomForestClassifier(n_estimators=100, max_depth=15, random_state=None)),
                ("gb", GradientBoostingClassifier(n_estimators=75, learning_rate=0.1, max_depth=5, random_state=None)),
                ("dt", DecisionTreeClassifier(max_depth=10, random_state=None))  
            ],
             "final_estimator": LogisticRegression(max_iter=1000)
        }
    ]
}

In [5]:
# Preparar o ambiente do MLFlow e iniciar o experimento

# Configurar o caminho relativo para os artefatos
mlflow.set_tracking_uri("file:./mlruns")

# lista para armazenar os resultados
results = []

# Iniciar o experimento
mlflow.set_experiment("exp_projeto_ciclo_3")

# Run para registrar modelos gerados pelo TPOTClassifier
with mlflow.start_run(run_name="Modelos TPOT Treinados") as main_run:  # Principal
    counter = 0  # Contador para os experimentos
    for params in models["TPOT AutoML"]:  # Itera sobre os parâmetros do TPOT AutoML
        counter += 1
        with mlflow.start_run(run_name=f"{counter}. TPOTClassifier", nested=True):  # Aninhada
            # Instanciar e treinar o TPOTClassifier
            model = TPOTClassifier(**params)
            model.fit(x_train, y_train)

            # Obter o pipeline otimizado
            best_pipeline = model.fitted_pipeline_

            # Avaliar as métricas usando cross_val_score para o pipeline otimizado
            accuracy = cross_val_score(best_pipeline, x_train, y_train, cv=10, scoring='accuracy').mean()
            precision = cross_val_score(best_pipeline, x_train, y_train, cv=10, scoring='precision_weighted').mean()
            recall = cross_val_score(best_pipeline, x_train, y_train, cv=10, scoring='recall_weighted').mean()
            f1 = cross_val_score(best_pipeline, x_train, y_train, cv=10, scoring='f1_weighted').mean()

            # Registrar os parâmetros, métricas e o pipeline otimizado
            for key, value in params.items():
                mlflow.log_param(key, str(value))
            mlflow.log_metric("Accuracy", accuracy)
            mlflow.log_metric("Precision", precision)
            mlflow.log_metric("Recall", recall)
            mlflow.log_metric("F1 Score", f1)
            mlflow.sklearn.log_model(best_pipeline, artifact_path="TPOT_Best_Pipeline",
                                                    registered_model_name="TPOT_Best_Pipeline", 
                                                    input_example=x_test.head(1))

            # Armazenar resultados
            results.append({
                "model": "TPOTClassifier",
                "params": params,
                "Accuracy": accuracy,
                "Precision": precision,
                "Recall": recall,
                "F1 Score": f1,
            })

# Run para registrar modelos gerados pelo StackingClassifier
with mlflow.start_run(run_name="Modelos Stacking Treinados") as main_run:  # Principal
    counter = 0  # Contador para os experimentos
    for params in models["Stacking Classifier"]:  # Itera sobre os parâmetros do Stacking Classifier
        counter += 1
        with mlflow.start_run(run_name=f"{counter}. StackingClassifier", nested=True):  # Aninhada
            # Instanciar e treinar o StackingClassifier
            model = StackingClassifier(**params)
            model.fit(x_train, y_train)

            # Avaliar as métricas usando cross_val_score para o modelo
            accuracy = cross_val_score(model, x_train, y_train, cv=10, scoring='accuracy').mean()
            precision = cross_val_score(model, x_train, y_train, cv=10, scoring='precision_weighted').mean()
            recall = cross_val_score(model, x_train, y_train, cv=10, scoring='recall_weighted').mean()
            f1 = cross_val_score(model, x_train, y_train, cv=10, scoring='f1_weighted').mean()

            # Registrar os parâmetros, métricas e o modelo treinado
            for key, value in params.items():
                mlflow.log_param(key, str(value))
            mlflow.log_metric("Accuracy", accuracy)
            mlflow.log_metric("Precision", precision)
            mlflow.log_metric("Recall", recall)
            mlflow.log_metric("F1 Score", f1)
            mlflow.sklearn.log_model(model, artifact_path="Stacking_Classifier",
                                            registered_model_name="Stacking_Classifier", 
                                            input_example=x_test.head(1))

            # Armazenar resultados
            results.append({
                "model": "StackingClassifier",
                "params": params,
                "Accuracy": accuracy,
                "Precision": precision,
                "Recall": recall,
                "F1 Score": f1,
            })

# Selecionar os 3 melhores modelos com base na métrica Accuracy
best_models = sorted(results, key=lambda x: x["Accuracy"], reverse=True)[:3]

print("\nMelhores Modelos:\n")
for model_info in best_models:
    print(model_info)
print("\n\n")



## 🦾 Armazenando Melhor Modelo Com Pipeline 

In [6]:
import os

# Configurar o tracking URI relativo para MLFlow
mlflow.set_tracking_uri("file:./mlruns")

# Nome do experimento
experiment_name = "exp_projeto_ciclo_3"
mlflow.set_experiment(experiment_name)

# Carregar o dataset
data = pd.read_csv("water_potability.csv").dropna()

# Dividir o dataset em treino e teste
x = data.drop("Potability", axis=1)
y = data["Potability"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=None)

# Função para criar e salvar o pipeline completo no MLFlow
def save_pipeline(ml_model, model_name):
    """
    Combina o modelo com um pipeline de pré-processamento e salva no MLFlow.
    """
    # Combinar pipeline de pré-processamento com o modelo carregado
    pipeline = Pipeline([
        ('scaler', MinMaxScaler()),  # Normalização dos dados
        ('model', ml_model)
    ])
    pipeline.fit(x_train, y_train)

    # Classe para encapsular o pipeline no MLFlow
    class PipelineWrapper(mlflow.pyfunc.PythonModel):
        def load_context(self, context):
            self.pipeline = pipeline
        
        def predict(self, context, model_input):
            return self.pipeline.predict(model_input)

    # Salvar o pipeline completo no MLFlow
    save_path = "best_model"
    os.makedirs(save_path, exist_ok=True)
    mlflow.pyfunc.save_model(path=save_path, python_model=PipelineWrapper())
    print(f"\n{model_name} salvo em: {save_path}")

# Obter todos os runs do experimento, ordenando pelo Accuracy em ordem decrescente
runs = mlflow.search_runs(experiment_names=[experiment_name], order_by=["metrics.`Accuracy` DESC"], max_results=1)

if not runs.empty:
    # Selecionar o melhor run
    best_run = runs.iloc[0]
    best_run_id = best_run["run_id"]
    
    # Verificar se há histórico de modelo registrado
    if "tags.mlflow.log-model.history" in best_run:
        log_model_history = json.loads(best_run["tags.mlflow.log-model.history"])
        artifact_path = log_model_history[0]["artifact_path"]

        # Carregar o modelo usando o URI relativo
        model_uri = f"runs:/{best_run_id}/{artifact_path}"
        loaded_model = mlflow.sklearn.load_model(model_uri)

        # Determinar o tipo do modelo e tratá-lo
        if isinstance(loaded_model, TPOTClassifier):
            print("\nModelo do TPOT AutoML carregado, combinando com o pré-processamento...")
            save_pipeline(loaded_model.fitted_pipeline_, "Pipeline do TPOT AutoML")
        else:
            print("\nModelo do Stacking Classifier carregado, combinando com o pré-processamento...")
            save_pipeline(loaded_model, "Pipeline do Stacking Classifier")
    else:
        print("\nNenhum histórico de modelo registrado encontrado para este run.")
else:
    print("\nNenhum run encontrado para o experimento especificado.")



Modelo do Stacking Classifier carregado, combinando com o pré-processamento...

Pipeline do Stacking Classifier salvo em: best_model


## 💾 Modelos Registrados no MLFLOW

In [10]:
import subprocess

# Definir o tracking URI do MLflow
mlflow_tracking_uri = 'file:./mlruns'  # Caminho relativo

mlflow.set_tracking_uri(mlflow_tracking_uri)

# Iniciar o MLflow UI em um subprocesso separado
mlflow_process = subprocess.Popen(["mlflow", "ui", "--host", "127.0.0.1", "--port", "5000"])

# Exibir a URL do MLflow UI
print("MLflow UI está rodando em http://127.0.0.1:5000")

MLflow UI está rodando em http://127.0.0.1:5000


In [8]:
# Parar o subprocesso do MLflow UI
mlflow_process.terminate()

# Confirmar que o MLflow UI foi parado
print("MLflow UI foi parado")

MLflow UI foi parado
