# IMPORTAÇÃO DE BIBLIOTECAS

In [1]:
# MANIPULAÇÃO DE DADOS
import pandas as pd
import numpy as np
from datetime import datetime

# VISUALIZAÇÃO DE DADOS
import matplotlib.pyplot as plt

# TRANSFORMAÇÕES
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from imblearn import over_sampling, under_sampling


# PREPARAÇÃO TREINO E AVALIAÇÃO
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# MODELOS UTILIZADOS
from mixed_naive_bayes import MixedNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# CONFIGURAÇÕES DE EXIBIÇÃO
import gc
import warnings

warnings.filterwarnings("ignore")

# pd.set_option('display.max_rows', None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)

In [2]:
inicio_processo = datetime.now()
print(f'PROCESSO DE MODELAGEM INICIADO EM: {inicio_processo}')

PROCESSO DE MODELAGEM INICIADO EM: 2024-11-23 05:39:50.755329


# IMPORTAÇÃO DE DADOS

In [3]:
weather_aus = pd.read_csv("../../DATA/weatherAUS.csv").dropna()

print(
    f"O dataset possui {weather_aus.shape[0]:,} instâncias (linhas) e {weather_aus.shape[1]:,} características (colunas)."
)

print(f"As características (colunas) do dataset são: {weather_aus.columns.to_list()}")

O dataset possui 56,420 instâncias (linhas) e 23 características (colunas).
As características (colunas) do dataset são: ['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RainTomorrow']


# FUNÇÕES PARA PRÉ-PROCESSAMENTO

## SEPARA TREINO E TESTE (80/20)

In [4]:
def split_data(df, target_column):
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Dividir a base em treino e teste, mantendo a proporção das classes
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    df_train = pd.concat([X_train, y_train], axis=1)
    df_test = pd.concat([X_test, y_test], axis=1)

    return df_train, df_test

## AJUSTA O FORMATO DO ALVO

In [5]:
def adjust_data_types(df):
    df["Date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")
    df["RainTomorrow"] = df.RainTomorrow.map({"Yes": 1, "No": 0})

    return df


## CRIAÇÃO DE VARIÁVEIS AUXILIARES

In [6]:
def create_auxiliar_columns(df):
    def get_season(month_number) -> str:
        quarter = month_number % 12 // 3 + 1
        if quarter == 1:
            return "summer"

        if quarter == 2:
            return "autumn"

        if quarter == 3:
            return "winter"

        if quarter == 4:
            return "spring"

    # COLUNAS SELECIONADAS COM BASE NAS AVALIAÇÕES ANTERIORES
    df = df[
        [
            "Date",
            "MinTemp",
            "MaxTemp",
            "Rainfall",
            "Evaporation",
            "Sunshine",
            "WindGustDir",
            "WindGustSpeed",
            "WindDir3pm",
            "WindSpeed9am",
            "WindSpeed3pm",
            "Humidity9am",
            "Humidity3pm",
            "Pressure3pm",
            "Cloud9am",
            "Cloud3pm",
            "RainTomorrow",
        ]
    ]

    df["RangeTemp"] = df["MaxTemp"] - df["MinTemp"]

    df["month"] = df.Date.dt.month
    df["season"] = df["month"].apply(lambda month_number: get_season(month_number))

    time_encoded = pd.get_dummies(
        df[["month", "season"]],
        columns=["month", "season"],
        drop_first=True,
        prefix=["month", "season"],
        dtype=int,
    )

    # SEPARA AS VARIÁVEIS CATEGÓRICAS E NUMÉRICAS
    # CATEGORIZA AS VARIÁVEIS NECESSÁRIAS
    categorical_columns = pd.get_dummies(
        df.select_dtypes(include=["object", "datetime64"]),
        columns=["WindGustDir", "WindDir3pm"],
        drop_first=True,
        prefix=["WindGustDir", "WindDir3pm"],
        dtype=int,
    )

    numerical_columns = df.select_dtypes("number")

    df = pd.concat([numerical_columns, categorical_columns, time_encoded], axis=1)

    return df.drop(columns=["month", "Date", "season"]), {
        "numerical_columns": numerical_columns.drop(
            columns=["RainTomorrow", "month"]
        ).columns.to_list(),
        "categorical_columns": categorical_columns.drop(
            columns=["Date", "season"]
        ).columns.to_list(),
        "time_encoded": time_encoded.columns.to_list(),
    }


## APLICA AS TRANSFORMAÇÕES

In [7]:
def instance_transformations(n_components=8):
    discretizer = KBinsDiscretizer(
        n_bins=5, encode="ordinal", strategy="kmeans", random_state=42
    )

    smote = over_sampling.SMOTE(random_state=42)
    under = under_sampling.RandomUnderSampler(random_state=42)

    scaler = StandardScaler()
    pca = PCA(n_components=n_components, random_state=42)

    return discretizer, scaler, pca, smote, under

In [8]:
def adjust_train_volume(df, target_column, smote, under):
    X = df.drop(columns=[target_column])
    y = df[target_column]

    X_smote, y_smote = smote.fit_resample(X, y)
    df_smote = pd.concat([X_smote, y_smote], axis=1)

    X_under, y_under = under.fit_resample(X, y)
    df_under = pd.concat([X_under, y_under], axis=1)

    return df_smote, df_under

In [9]:
def fit_transformmations(df, cols, discretizer, scaler, pca, discrete_col="Rainfall"):
    discretizer.fit(df[discrete_col].values.reshape(-1, 1))

    scaler.fit(df[cols])
    pca.fit(df[cols])

    return discretizer, scaler, pca


In [10]:
def transform_data(df, cols, discretizer, scaler, pca, discrete_col="Rainfall"):
    df[discrete_col + "_Discretized"] = discretizer.transform(
        df[discrete_col].values.reshape(-1, 1)
    )
    df[cols] = scaler.transform(df[cols])
    df = df.reset_index(drop=True)

    pca_result = pca.transform(df[cols])

    df_pca = pd.DataFrame(
        data=pca_result,
        columns=[f"PC{comp}" for comp in range(1, pca.n_components_ + 1)],
    )
    df_pca = df_pca.reset_index(drop=True)
    df_pca["RainTomorrow"] = df["RainTomorrow"]

    return df, df_pca

# INICIO DO PRÉ-PROCESSAMENTO

## SEPARA TREINO E TESTE (80/20)

In [11]:
df_train, df_test = split_data(weather_aus, "RainTomorrow")

## APLICAÇÕES SOBRE TREINO

### AJUSTA O TIPO DE DADO

In [12]:
df_train = adjust_data_types(df_train)

### CRIA VARIÁVEIS AUXILIARES

In [13]:
df_train, columns_names = create_auxiliar_columns(df_train)

### APLICA AS TRANSFORMAÇÕES

In [14]:
discretizer, scaler, pca, smote, under = instance_transformations()

#### AJUSTA VOLUME DA RESPOSTA

In [15]:
alvo = df_train.RainTomorrow.value_counts().to_frame("QTD").reset_index()
alvo["PERC"] = alvo.QTD / alvo.QTD.sum()
alvo

Unnamed: 0,RainTomorrow,QTD,PERC
0,0,35194,0.779732
1,1,9942,0.220268


In [16]:
df_train_smote, df_train_under = adjust_train_volume(
    df_train, "RainTomorrow", smote, under
)

In [17]:
alvo_smote = df_train_smote.RainTomorrow.value_counts().to_frame("QTD").reset_index()
alvo_smote["PERC"] = alvo_smote.QTD / alvo_smote.QTD.sum()
alvo_smote

Unnamed: 0,RainTomorrow,QTD,PERC
0,1,35194,0.5
1,0,35194,0.5


In [18]:
alvo_under = df_train_under.RainTomorrow.value_counts().to_frame("QTD").reset_index()
alvo_under["PERC"] = alvo_under.QTD / alvo_under.QTD.sum()
alvo_under

Unnamed: 0,RainTomorrow,QTD,PERC
0,0,9942,0.5
1,1,9942,0.5


#### TRANSFORMA OS DADOS

In [19]:
discretizer_full, scaler_full, pca_full = fit_transformmations(
    df_train, columns_names["numerical_columns"], discretizer, scaler, pca
)

In [20]:
discretizer_smote, scaler_smote, pca_smote = fit_transformmations(
    df_train_smote, columns_names["numerical_columns"], discretizer, scaler, pca
)

In [21]:
discretizer_under, scaler_under, pca_under = fit_transformmations(
    df_train_under, columns_names["numerical_columns"], discretizer, scaler, pca
)

In [22]:
df_train_full, df_train_full_pca = transform_data(
    df_train, columns_names["numerical_columns"], discretizer, scaler, pca
)

In [23]:
df_train_smote, df_train_smote_pca = transform_data(
    df_train_smote,
    columns_names["numerical_columns"],
    discretizer_smote,
    scaler_smote,
    pca_smote,
)

In [24]:
df_train_under, df_train_under_pca = transform_data(
    df_train_under,
    columns_names["numerical_columns"],
    discretizer_under,
    scaler_under,
    pca_under,
)

## APLICAÇÕES SOBRE TESTE

### AJUSTA O TIPO DE DADO

In [25]:
df_test = adjust_data_types(df_test)

### CRIA VARIÁVEIS AUXILIARES

In [26]:
df_test, columns_names = create_auxiliar_columns(df_test)

### TRANSFORMA OS DADOS

In [27]:
df_test_full, df_test_full_pca = transform_data(
    df_test, columns_names["numerical_columns"], discretizer, scaler, pca
)

In [28]:
df_test_smote, df_test_smote_pca = transform_data(
    df_test,
    columns_names["numerical_columns"],
    discretizer_smote,
    scaler_smote,
    pca_smote,
)

In [29]:
df_test_under, df_test_under_pca = transform_data(
    df_test,
    columns_names["numerical_columns"],
    discretizer_under,
    scaler_under,
    pca_under,
)

## LIBERA MEMÓRIA

In [30]:
del (
    df_train,
    df_test,
    weather_aus,
    discretizer_under,
    scaler_under,
    pca_under,
    discretizer_smote,
    scaler_smote,
    pca_smote,
    discretizer,
    scaler,
    pca,
    smote,
    under,
)
gc.collect()

87

# PROCESSO DE MODELAGEM

## FUNÇÕES DE CROSS VALIDATION

### NUMÉRICAS

In [31]:
def run_cross_validation(
    train_data, train_target, test_data, test_target, model, param_grid, k_folds=5
):
    skf = StratifiedKFold(n_splits=k_folds)
    scores_validation = {"f1": [], "accuracy": [], "precision": [], "recall": []}
    scores_test = {"f1": [], "accuracy": [], "precision": [], "recall": []}
    best_params = None
    best_model = None
    best_f1 = 0
    counter = 1
    print(f"====> INICIANDO PROCESSAMENTO: {datetime.now()} <====")

    for train_index, valid_index in skf.split(train_data, train_target):
        inicio = datetime.now()
        print(f"====> {inicio} | EXECUTANDO FOLD {counter} ")

        # Use .iloc to select rows based on indices
        X_train_fold, X_valid_fold = (
            train_data.iloc[train_index],
            train_data.iloc[valid_index],
        )
        y_train_fold, y_valid_fold = (
            train_target.iloc[train_index],
            train_target.iloc[valid_index],
        )

        # Etapa 2: Treinamento e ajuste de hiperparâmetros com validação cruzada interna
        grid_search = GridSearchCV(
            model,
            param_grid,
            scoring="f1",
            cv=StratifiedKFold(n_splits=3),
            n_jobs=-1,
            verbose=3,
        )
        grid_search.fit(X_train_fold, y_train_fold)
        model = grid_search.best_estimator_
        f1 = grid_search.best_score_

        if f1 > best_f1:
            best_model = model
            best_params = grid_search.best_params_
            best_f1 = f1

        # Avaliação nos dados de validação
        y_valid_pred = model.predict(X_valid_fold)

        # Cálculo das métricas no fold
        scores_validation["f1"].append(
            f1_score(y_valid_fold, y_valid_pred, average="macro")
        )
        scores_validation["accuracy"].append(accuracy_score(y_valid_fold, y_valid_pred))
        scores_validation["precision"].append(
            precision_score(y_valid_fold, y_valid_pred, average="macro")
        )
        scores_validation["recall"].append(
            recall_score(y_valid_fold, y_valid_pred, average="macro")
        )

        # Avaliar na base de teste com o melhor modelo
        y_test_pred = model.predict(test_data)
        f1_test = f1_score(test_target, y_test_pred, average="macro")
        accuracy_test = accuracy_score(test_target, y_test_pred)
        precision_test = precision_score(test_target, y_test_pred, average="macro")
        recall_test = recall_score(test_target, y_test_pred, average="macro")

        # Armazenar as métricas na base de teste
        scores_test["f1"].append(f1_test)
        scores_test["accuracy"].append(accuracy_test)
        scores_test["precision"].append(precision_test)
        scores_test["recall"].append(recall_test)

        counter += 1
        final = datetime.now()
        print(
            f"===> {final} | FINALIZADO FOLD {counter - 1} | TEMPO TOTAL {final - inicio}\n\n"
        )

    # Calcula as médias das métricas nos folds de validação
    avg_scores_validation = {
        metric: np.mean(values) for metric, values in scores_validation.items()
    }
    test_scores = {metric: np.mean(values) for metric, values in scores_test.items()}

    # Criar log dos resultados
    log = pd.DataFrame(scores_validation)
    log["dataset"] = "Validação Cruzada"

    test_log = pd.DataFrame(scores_test)
    test_log["dataset"] = "Teste"

    # Concatenar os logs da validação cruzada e do teste
    full_log = pd.concat([log, test_log])

    return avg_scores_validation, test_scores, best_params, full_log

### CATEGÓRICAS

In [32]:
def run_cross_mix(
    train_data,
    train_target,
    test_data,
    test_target,
    alphas,
    k_folds=5,
    categorical_features=None,
):
    skf = StratifiedKFold(n_splits=k_folds)
    scores_validation = {"f1": [], "accuracy": [], "precision": [], "recall": []}
    scores_test = {"f1": [], "accuracy": [], "precision": [], "recall": []}
    best_model = None
    best_alpha = None
    best_f1 = 0
    counter = 1
    print(f"====> INICIANDO PROCESSAMENTO: {datetime.now()} <====")

    # Validação cruzada estratificada
    for train_index, valid_index in skf.split(train_data, train_target):
        inicio = datetime.now()
        print(f"====> {inicio} | EXECUTANDO FOLD {counter} ")

        X_train_fold, X_valid_fold = (
            train_data.iloc[train_index],
            train_data.iloc[valid_index],
        )
        y_train_fold, y_valid_fold = (
            train_target.iloc[train_index],
            train_target.iloc[valid_index],
        )

        # Iterar pelos valores de alpha
        for alpha in alphas:
            print(f"========> Testando: alpha = {alpha}")
            # Treina o modelo com o valor de alpha atual
            model = MixedNB(categorical_features=categorical_features, alpha=alpha)
            model.fit(X_train_fold, y_train_fold)

            # Avaliação nos dados de validação
            y_valid_pred = model.predict(X_valid_fold)
            f1 = f1_score(y_valid_fold, y_valid_pred, average="macro")
            accuracy = accuracy_score(y_valid_fold, y_valid_pred)
            precision = precision_score(y_valid_fold, y_valid_pred, average="macro")
            recall = recall_score(y_valid_fold, y_valid_pred, average="macro")

            # Seleciona o melhor modelo baseado na métrica F1-score
            if f1 > best_f1:
                best_f1 = f1
                best_model = model
                best_alpha = alpha

        # Armazenar as métricas para o fold atual
        scores_validation["f1"].append(f1)
        scores_validation["accuracy"].append(accuracy)
        scores_validation["precision"].append(precision)
        scores_validation["recall"].append(recall)

        # Avaliar na base de teste com o melhor modelo
        y_test_pred = model.predict(test_data)
        f1_test = f1_score(test_target, y_test_pred, average="macro")
        accuracy_test = accuracy_score(test_target, y_test_pred)
        precision_test = precision_score(test_target, y_test_pred, average="macro")
        recall_test = recall_score(test_target, y_test_pred, average="macro")

        # Armazenar as métricas na base de teste
        scores_test["f1"].append(f1_test)
        scores_test["accuracy"].append(accuracy_test)
        scores_test["precision"].append(precision_test)
        scores_test["recall"].append(recall_test)

        counter += 1
        final = datetime.now()
        print(
            f"====> {final} | FINALIZADO FOLD {counter - 1} | TEMPO TOTAL {final - inicio}\n\n"
        )

    # Calcular as médias das métricas de validação cruzada
    avg_scores_validation = {
        metric: np.mean(values) for metric, values in scores_validation.items()
    }
    test_scores = {metric: np.mean(values) for metric, values in scores_test.items()}

    # Criar log dos resultados
    log = pd.DataFrame(scores_validation)
    log["dataset"] = "Validação Cruzada"

    test_log = pd.DataFrame(scores_test)
    test_log["dataset"] = "Teste"

    # Concatenar os logs da validação cruzada e do teste
    full_log = pd.concat([log, test_log])

    return avg_scores_validation, best_alpha, test_scores, full_log


## MODELOS

### NAIVE BAYES

#### TODAS AS CARACTERÍSTICAS

##### UNDERSAMPLING

In [33]:
# Parâmetros do modelo
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

# Executar para diferentes modelos e conjuntos de dados
results_validation, best_params, results_test, results_list_bayes_under = run_cross_mix(
    df_train_under.drop(columns=["RainTomorrow"]),
    df_train_under["RainTomorrow"],
    df_test_under.drop(columns=["RainTomorrow"]),
    df_test_under["RainTomorrow"],
    alphas,
    k_folds=5,
    categorical_features=[
        df_train_under.drop(columns="RainTomorrow").columns.get_loc(col)
        for col in columns_names["categorical_columns"] + columns_names["time_encoded"]
    ],
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:39:53.632821 <====
====> 2024-11-23 05:39:53.636381 | EXECUTANDO FOLD 1 
====> 2024-11-23 05:39:55.322745 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:01.686364


====> 2024-11-23 05:39:55.323761 | EXECUTANDO FOLD 2 
====> 2024-11-23 05:39:57.019780 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:01.696019


====> 2024-11-23 05:39:57.020784 | EXECUTANDO FOLD 3 
====> 2024-11-23 05:39:58.647049 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:01.626265


====> 2024-11-23 05:39:58.648048 | EXECUTANDO FOLD 4 
====> 2024-11-23 05:40:00.394007 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:01.745959


====> 2024-11-23 05:40:00.394532 | EXECUTANDO FOLD 5 
====> 2024-11-23 05:40:01.795682 | FINALIZADO FOLD 5 | TEMPO TOTAL 0:00:01.401150


Resultados na validação cruzada: {'f1': 0.7626343459024936, 'accuracy': 0.7629254152017966, 'precision': 0.7642264309677982, 'recall': 0.762925238712314}
Resultados na base de teste: {'f1': 0.4381317532241199, 'accuracy': 0.7797766749379652, 'pre

In [34]:
results_list_bayes_under.to_csv(
    "../../DATA/RESULTS/results_list_bayes_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [35]:
# Parâmetros do modelo
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

# Executar para diferentes modelos e conjuntos de dados
results_validation, best_params, results_test, results_list_bayes_full = run_cross_mix(
    df_train_full.drop(columns=["RainTomorrow"]),
    df_train_full["RainTomorrow"],
    df_test_full.drop(columns=["RainTomorrow"]),
    df_test_full["RainTomorrow"],
    alphas,
    k_folds=5,
    categorical_features=[
        df_train_full.drop(columns="RainTomorrow").columns.get_loc(col)
        for col in columns_names["categorical_columns"] + columns_names["time_encoded"]
    ],
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:02.205616 <====
====> 2024-11-23 05:40:02.215681 | EXECUTANDO FOLD 1 
====> 2024-11-23 05:40:05.595749 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:03.380068


====> 2024-11-23 05:40:05.595749 | EXECUTANDO FOLD 2 
====> 2024-11-23 05:40:08.831902 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:03.236153


====> 2024-11-23 05:40:08.833412 | EXECUTANDO FOLD 3 
====> 2024-11-23 05:40:12.290304 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:03.456892


====> 2024-11-23 05:40:12.290304 | EXECUTANDO FOLD 4 
====> 2024-11-23 05:40:15.833448 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:03.543144


====> 2024-11-23 05:40:15.834620 | EXECUTANDO FOLD 5 
====> 2024-11-23 05:40:19.300578 | FINALIZADO FOLD 5 | TEMPO TOTAL 0:00:03.465958


Resultados na validação cruzada: {'f1': 0.7316497551807923, 'accuracy': 0.8014222458406302, 'precision': 0.7191078286403904, 'recall': 0.7513100041244096}
Resultados na base de teste: {'f1': 0.7379562346605752, 'accuracy': 0.8068060971286778, 'pr

In [36]:
results_list_bayes_full.to_csv(
    "../../DATA/RESULTS/results_list_bayes_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [37]:
# Parâmetros do modelo
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

# Executar para diferentes modelos e conjuntos de dados
results_validation, best_params, results_test, results_list_bayes_smote = run_cross_mix(
    df_train_smote.drop(columns=["RainTomorrow"]),
    df_train_smote["RainTomorrow"],
    df_test_smote.drop(columns=["RainTomorrow"]),
    df_test_smote["RainTomorrow"],
    alphas,
    k_folds=5,
    categorical_features=[
        df_train_smote.drop(columns="RainTomorrow").columns.get_loc(col)
        for col in columns_names["categorical_columns"] + columns_names["time_encoded"]
    ],
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:19.845717 <====
====> 2024-11-23 05:40:19.850268 | EXECUTANDO FOLD 1 
====> 2024-11-23 05:40:24.818661 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:04.968393


====> 2024-11-23 05:40:24.819680 | EXECUTANDO FOLD 2 
====> 2024-11-23 05:40:29.840277 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:05.020597


====> 2024-11-23 05:40:29.840277 | EXECUTANDO FOLD 3 
====> 2024-11-23 05:40:34.385633 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:04.545356


====> 2024-11-23 05:40:34.390700 | EXECUTANDO FOLD 4 
====> 2024-11-23 05:40:39.040680 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:04.649980


====> 2024-11-23 05:40:39.040680 | EXECUTANDO FOLD 5 
====> 2024-11-23 05:40:43.635327 | FINALIZADO FOLD 5 | TEMPO TOTAL 0:00:04.594647


Resultados na validação cruzada: {'f1': 0.792563113894891, 'accuracy': 0.7935022446662893, 'precision': 0.7972085273933975, 'recall': 0.7935022089611482}
Resultados na base de teste: {'f1': 0.4381317532241199, 'accuracy': 0.7797766749379652, 'pre

In [38]:
results_list_bayes_smote.to_csv(
    "../../DATA/RESULTS/results_list_bayes_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [39]:
del results_list_bayes_smote, results_list_bayes_under, results_list_bayes_full
gc.collect()

0

#### PCA

##### UNDERSAMPLING

In [40]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_bayes_pca_under = (
    run_cross_validation(
        df_train_under_pca.drop(columns=["RainTomorrow"]),
        df_train_under_pca["RainTomorrow"],
        df_test_under_pca.drop(columns=["RainTomorrow"]),
        df_test_under_pca["RainTomorrow"],
        GaussianNB(),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:43.772562 <====
====> 2024-11-23 05:40:43.777125 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:47.720454 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:03.943329


====> 2024-11-23 05:40:47.720454 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:47.845336 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.124882


====> 2024-11-23 05:40:47.845336 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:47.994777 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.149441


====> 2024-11-23 05:40:47.995291 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:48.162993 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.167702


====> 2024-11-23 05:40:48.162993 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:48.335645 | 

In [41]:
results_list_bayes_pca_under.to_csv(
    "../../DATA/RESULTS/results_list_bayes_pca_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [42]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_bayes_pca_full = (
    run_cross_validation(
        df_train_full_pca.drop(columns=["RainTomorrow"]),
        df_train_full_pca["RainTomorrow"],
        df_test_full_pca.drop(columns=["RainTomorrow"]),
        df_test_full_pca["RainTomorrow"],
        GaussianNB(),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:48.368940 <====
====> 2024-11-23 05:40:48.377485 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:48.745293 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:00.367808


====> 2024-11-23 05:40:48.745293 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:49.115251 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.369958


====> 2024-11-23 05:40:49.115251 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:49.486835 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.371584


====> 2024-11-23 05:40:49.487792 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:49.800353 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.312561


====> 2024-11-23 05:40:49.805441 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:50.129464 | 

In [43]:
results_list_bayes_pca_full.to_csv(
    "../../DATA/RESULTS/results_list_bayes_pca_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [44]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_bayes_pca_smote = (
    run_cross_validation(
        df_train_smote_pca.drop(columns=["RainTomorrow"]),
        df_train_smote_pca["RainTomorrow"],
        df_test_smote_pca.drop(columns=["RainTomorrow"]),
        df_test_smote_pca["RainTomorrow"],
        GaussianNB(),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:50.170818 <====
====> 2024-11-23 05:40:50.186265 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:50.715358 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:00.529093


====> 2024-11-23 05:40:50.715358 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:51.151193 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.435835


====> 2024-11-23 05:40:51.151193 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:51.505577 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.354384


====> 2024-11-23 05:40:51.505577 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:51.888015 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.382438


====> 2024-11-23 05:40:51.889017 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:52.315381 | 

In [45]:
results_list_bayes_pca_smote.to_csv(
    "../../DATA/RESULTS/results_list_bayes_pca_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [46]:
del (
    results_list_bayes_pca_under,
    results_list_bayes_pca_smote,
    results_list_bayes_pca_full,
)
gc.collect()

182

#### CORRELAÇÃO

In [47]:
filtro_correlacao = ["Sunshine", "Humidity3pm", "Cloud9am", "Cloud3pm", "RangeTemp"]

##### UNDERSAMPLING

In [48]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_bayes_corr_under = (
    run_cross_validation(
        df_train_under[filtro_correlacao],
        df_train_under["RainTomorrow"],
        df_test_under[filtro_correlacao],
        df_test_under["RainTomorrow"],
        GaussianNB(),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:52.469609 <====
====> 2024-11-23 05:40:52.473122 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:52.643903 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:00.170781


====> 2024-11-23 05:40:52.643903 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:52.792831 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.148928


====> 2024-11-23 05:40:52.792831 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:52.955398 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.162567


====> 2024-11-23 05:40:52.955398 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:53.100066 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.144668


====> 2024-11-23 05:40:53.101069 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:53.227429 | 

In [49]:
results_list_bayes_corr_under.to_csv(
    "../../DATA/RESULTS/results_list_bayes_corr_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [50]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_bayes_corr_full = (
    run_cross_validation(
        df_train_full[filtro_correlacao],
        df_train_full["RainTomorrow"],
        df_test_full[filtro_correlacao],
        df_test_full["RainTomorrow"],
        GaussianNB(),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:53.245308 <====
====> 2024-11-23 05:40:53.261147 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:53.592198 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:00.331051


====> 2024-11-23 05:40:53.593197 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:53.888458 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.295261


====> 2024-11-23 05:40:53.895528 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:54.160400 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.264872


====> 2024-11-23 05:40:54.160400 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:54.420410 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.260010


====> 2024-11-23 05:40:54.423916 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:54.697298 | 

In [51]:
results_list_bayes_corr_full.to_csv(
    "../../DATA/RESULTS/results_list_bayes_corr_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [52]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_bayes_corr_smote = (
    run_cross_validation(
        df_train_smote[filtro_correlacao],
        df_train_smote["RainTomorrow"],
        df_test_smote[filtro_correlacao],
        df_test_smote["RainTomorrow"],
        GaussianNB(),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:54.725123 <====
====> 2024-11-23 05:40:54.749559 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:55.200714 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:00.451155


====> 2024-11-23 05:40:55.201776 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:55.675221 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.473445


====> 2024-11-23 05:40:55.675221 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:56.106203 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.430982


====> 2024-11-23 05:40:56.106203 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:56.506484 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.400281


====> 2024-11-23 05:40:56.506723 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:56.904587 | 

In [53]:
results_list_bayes_corr_smote.to_csv(
    "../../DATA/RESULTS/results_list_bayes_corr_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [54]:
del (
    results_list_bayes_corr_smote,
    results_list_bayes_corr_full,
    results_list_bayes_corr_under,
)
gc.collect()

182

#### RANDOM FOREST - FEATURE IMPORTANCE

In [55]:
filtro_random_forest = [
    "Humidity3pm",
    "Sunshine",
    "Pressure3pm",
    "Cloud3pm",
    "RangeTemp",
    "WindGustSpeed",
    "Humidity9am",
    "Rainfall",
    "MinTemp",
    "MaxTemp",
    "Evaporation",
    "WindSpeed3pm",
    "WindSpeed9am",
    "Cloud9am",
    "Rainfall_Discretized",
]

##### UNDERSAMPLING

In [56]:
# Parâmetros do modelo
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

# Executar para diferentes modelos e conjuntos de dados
results_validation, best_params, results_test, results_list_bayes_rf_under = (
    run_cross_mix(
        df_train_under[filtro_random_forest],
        df_train_under["RainTomorrow"],
        df_test_under[filtro_random_forest],
        df_test_under["RainTomorrow"],
        alphas,
        k_folds=5,
        categorical_features=[14],
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:57.065307 <====
====> 2024-11-23 05:40:57.072348 | EXECUTANDO FOLD 1 
====> 2024-11-23 05:40:57.260253 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:00.187905


====> 2024-11-23 05:40:57.260253 | EXECUTANDO FOLD 2 
====> 2024-11-23 05:40:57.445389 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.185136


====> 2024-11-23 05:40:57.445389 | EXECUTANDO FOLD 3 
====> 2024-11-23 05:40:57.643555 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.198166


====> 2024-11-23 05:40:57.643555 | EXECUTANDO FOLD 4 
====> 2024-11-23 05:40:57.832546 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.188991


====> 2024-11-23 05:40:57.832546 | EXECUTANDO FOLD 5 
====> 2024-11-23 05:40:58.002651 | FINALIZADO FOLD 5 | TEMPO TOTAL 0:00:00.170105


Resultados na validação cruzada: {'f1': 0.7640619875453153, 'accuracy': 0.764082205073539, 'precision': 0.7641743119100982, 'recall': 0.7640818263022073}
Resultados na base de teste: {'f1': 0.4381317532241199, 'accuracy': 0.7797766749379652, 'pre

In [57]:
results_list_bayes_rf_under.to_csv(
    "../../DATA/RESULTS/results_list_bayes_rf_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [58]:
# Parâmetros do modelo
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

# Executar para diferentes modelos e conjuntos de dados
results_validation, best_params, results_test, results_list_bayes_rf_full = (
    run_cross_mix(
        df_train_full[filtro_random_forest],
        df_train_full["RainTomorrow"],
        df_test_full[filtro_random_forest],
        df_test_full["RainTomorrow"],
        alphas,
        k_folds=5,
        categorical_features=[14],
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:58.037022 <====
====> 2024-11-23 05:40:58.045031 | EXECUTANDO FOLD 1 
====> 2024-11-23 05:40:58.397110 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:00.352079


====> 2024-11-23 05:40:58.397110 | EXECUTANDO FOLD 2 
====> 2024-11-23 05:40:58.795035 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.397925


====> 2024-11-23 05:40:58.795035 | EXECUTANDO FOLD 3 
====> 2024-11-23 05:40:59.146634 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.351599


====> 2024-11-23 05:40:59.146634 | EXECUTANDO FOLD 4 
====> 2024-11-23 05:40:59.455254 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.308620


====> 2024-11-23 05:40:59.455254 | EXECUTANDO FOLD 5 
====> 2024-11-23 05:40:59.745162 | FINALIZADO FOLD 5 | TEMPO TOTAL 0:00:00.289908


Resultados na validação cruzada: {'f1': 0.7285803915846831, 'accuracy': 0.793889490392604, 'precision': 0.7140290150484855, 'recall': 0.7561141410271192}
Resultados na base de teste: {'f1': 0.7352714027673639, 'accuracy': 0.7990960652250975, 'pre

In [59]:
results_list_bayes_rf_full.to_csv(
    "../../DATA/RESULTS/results_list_bayes_rf_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [60]:
# Parâmetros do modelo
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

# Executar para diferentes modelos e conjuntos de dados
results_validation, best_params, results_test, results_list_bayes_rf_smote = (
    run_cross_mix(
        df_train_smote[filtro_random_forest],
        df_train_smote["RainTomorrow"],
        df_test_smote[filtro_random_forest],
        df_test_smote["RainTomorrow"],
        alphas,
        k_folds=5,
        categorical_features=[14],
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:59.791003 <====
====> 2024-11-23 05:40:59.804349 | EXECUTANDO FOLD 1 
====> 2024-11-23 05:41:00.319675 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:00.515326


====> 2024-11-23 05:41:00.319675 | EXECUTANDO FOLD 2 
====> 2024-11-23 05:41:00.649501 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.329826


====> 2024-11-23 05:41:00.649501 | EXECUTANDO FOLD 3 
====> 2024-11-23 05:41:01.086389 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.436888


====> 2024-11-23 05:41:01.086389 | EXECUTANDO FOLD 4 
====> 2024-11-23 05:41:01.471825 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.385436


====> 2024-11-23 05:41:01.471825 | EXECUTANDO FOLD 5 
====> 2024-11-23 05:41:01.890311 | FINALIZADO FOLD 5 | TEMPO TOTAL 0:00:00.418486


Resultados na validação cruzada: {'f1': 0.7654347686025795, 'accuracy': 0.7654430062537438, 'precision': 0.7654807242887745, 'recall': 0.7654429805507341}
Resultados na base de teste: {'f1': 0.4381317532241199, 'accuracy': 0.7797766749379652, 'pr

In [61]:
results_list_bayes_rf_smote.to_csv(
    "../../DATA/RESULTS/results_list_bayes_rf_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [62]:
del results_list_bayes_rf_smote, results_list_bayes_rf_under, results_list_bayes_rf_full
gc.collect()

0

### RANDOM FOREST

#### TODAS AS CARACTERÍSTICAS

##### UNDERSAMPLING

In [None]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_under = (
    run_cross_validation(
        df_train_under.drop(columns=["RainTomorrow"]),
        df_train_under["RainTomorrow"],
        df_test_under.drop(columns=["RainTomorrow"]),
        df_test_under["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:41:02.043890 <====
====> 2024-11-23 05:41:02.048385 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [None]:
results_list_rf_under.to_csv(
    "../../DATA/RESULTS/results_list_rf_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [None]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_full = (
    run_cross_validation(
        df_train_full.drop(columns=["RainTomorrow"]),
        df_train_full["RainTomorrow"],
        df_test_full.drop(columns=["RainTomorrow"]),
        df_test_full["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_rf_full.to_csv(
    "../../DATA/RESULTS/results_list_rf_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [None]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_smote = (
    run_cross_validation(
        df_train_smote.drop(columns=["RainTomorrow"]),
        df_train_smote["RainTomorrow"],
        df_test_smote.drop(columns=["RainTomorrow"]),
        df_test_smote["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_rf_smote.to_csv(
    "../../DATA/RESULTS/results_list_rf_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [None]:
del results_list_rf_smote, results_list_rf_full, results_list_rf_under
gc.collect()

#### PCA

##### UNDERSAMPLING

In [None]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_pca_under = (
    run_cross_validation(
        df_train_under_pca.drop(columns=["RainTomorrow"]),
        df_train_under_pca["RainTomorrow"],
        df_test_under_pca.drop(columns=["RainTomorrow"]),
        df_test_under_pca["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_rf_pca_under.to_csv(
    "../../DATA/RESULTS/results_list_rf_pca_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [None]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_pca_full = (
    run_cross_validation(
        df_train_full_pca.drop(columns=["RainTomorrow"]),
        df_train_full_pca["RainTomorrow"],
        df_test_full_pca.drop(columns=["RainTomorrow"]),
        df_test_full_pca["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_rf_pca_full.to_csv(
    "../../DATA/RESULTS/results_list_rf_pca_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [None]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_pca_smote = (
    run_cross_validation(
        df_train_smote_pca.drop(columns=["RainTomorrow"]),
        df_train_smote_pca["RainTomorrow"],
        df_test_smote_pca.drop(columns=["RainTomorrow"]),
        df_test_smote_pca["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_rf_pca_smote.to_csv(
    "../../DATA/RESULTS/results_list_rf_pca_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [None]:
del results_list_rf_pca_smote, results_list_rf_pca_full, results_list_rf_pca_under
gc.collect()

#### CORRELAÇÃO

##### UNDERSAMPLING

In [None]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_corr_under = (
    run_cross_validation(
        df_train_under.drop(columns=["RainTomorrow"]),
        df_train_under["RainTomorrow"],
        df_test_under.drop(columns=["RainTomorrow"]),
        df_test_under["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_rf_corr_under.to_csv(
    "../../DATA/RESULTS/results_list_rf_corr_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [None]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_corr_full = (
    run_cross_validation(
        df_train_full.drop(columns=["RainTomorrow"]),
        df_train_full["RainTomorrow"],
        df_test_full.drop(columns=["RainTomorrow"]),
        df_test_full["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_rf_corr_full.to_csv(
    "../../DATA/RESULTS/results_list_rf_corr_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [None]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_corr_smote = (
    run_cross_validation(
        df_train_smote.drop(columns=["RainTomorrow"]),
        df_train_smote["RainTomorrow"],
        df_test_smote.drop(columns=["RainTomorrow"]),
        df_test_smote["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_rf_corr_smote.to_csv(
    "../../DATA/RESULTS/results_list_rf_corr_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [None]:
del results_list_rf_corr_smote, results_list_rf_corr_full, results_list_rf_corr_under
gc.collect()

#### RANDOM FOREST - FEATURE IMPORTANCE

##### UNDERSAMPLING

In [None]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_rf_under = (
    run_cross_validation(
        df_train_under[filtro_random_forest],
        df_train_under["RainTomorrow"],
        df_test_under[filtro_random_forest],
        df_test_under["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_rf_rf_under.to_csv(
    "../../DATA/RESULTS/results_list_rf_rf_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [None]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_rf_full = (
    run_cross_validation(
        df_train_full[filtro_random_forest],
        df_train_full["RainTomorrow"],
        df_test_full[filtro_random_forest],
        df_test_full["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_rf_rf_full.to_csv(
    "../../DATA/RESULTS/results_list_rf_rf_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [None]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_rf_smote = (
    run_cross_validation(
        df_train_smote[filtro_random_forest],
        df_train_smote["RainTomorrow"],
        df_test_smote[filtro_random_forest],
        df_test_smote["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_rf_rf_smote.to_csv(
    "../../DATA/RESULTS/results_list_rf_rf_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [None]:
del results_list_rf_rf_smote, results_list_rf_rf_full, results_list_rf_rf_under
gc.collect()

### SVM

#### TODAS AS CARACTERÍSTICAS

##### UNDERSAMPLING

In [None]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_under = (
    run_cross_validation(
        df_train_under.drop(columns=["RainTomorrow"]),
        df_train_under["RainTomorrow"],
        df_test_under.drop(columns=["RainTomorrow"]),
        df_test_under["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_svm_under.to_csv(
    "../../DATA/RESULTS/results_list_svm_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [None]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_full = (
    run_cross_validation(
        df_train_full.drop(columns=["RainTomorrow"]),
        df_train_full["RainTomorrow"],
        df_test_full.drop(columns=["RainTomorrow"]),
        df_test_full["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_svm_full.to_csv(
    "../../DATA/RESULTS/results_list_svm_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [None]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_smote = (
    run_cross_validation(
        df_train_smote.drop(columns=["RainTomorrow"]),
        df_train_smote["RainTomorrow"],
        df_test_smote.drop(columns=["RainTomorrow"]),
        df_test_smote["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_svm_smote.to_csv(
    "../../DATA/RESULTS/results_list_svm_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [None]:
del results_list_svm_smote, results_list_svm_under, results_list_svm_full
gc.collect()

#### PCA

##### UNDERSAMPLING

In [None]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_pca_under = (
    run_cross_validation(
        df_train_under_pca.drop(columns=["RainTomorrow"]),
        df_train_under_pca["RainTomorrow"],
        df_test_under_pca.drop(columns=["RainTomorrow"]),
        df_test_under_pca["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_svm_pca_under.to_csv(
    "../../DATA/RESULTS/results_list_svm_pca_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [None]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_pca_full = (
    run_cross_validation(
        df_train_full_pca.drop(columns=["RainTomorrow"]),
        df_train_full_pca["RainTomorrow"],
        df_test_full_pca.drop(columns=["RainTomorrow"]),
        df_test_full_pca["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_svm_pca_full.to_csv(
    "../../DATA/RESULTS/results_list_svm_pca_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [None]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_pca_smote = (
    run_cross_validation(
        df_train_smote_pca.drop(columns=["RainTomorrow"]),
        df_train_smote_pca["RainTomorrow"],
        df_test_smote_pca.drop(columns=["RainTomorrow"]),
        df_test_smote_pca["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_svm_pca_smote.to_csv(
    "../../DATA/RESULTS/results_list_svm_pca_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [None]:
del results_list_svm_pca_smote, results_list_svm_pca_under, results_list_svm_pca_full
gc.collect()

#### CORRELAÇÃO

##### UNDERSAMPLING

In [None]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_corr_under = (
    run_cross_validation(
        df_train_under[filtro_correlacao],
        df_train_under["RainTomorrow"],
        df_test_under[filtro_correlacao],
        df_test_under["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_svm_corr_under.to_csv(
    "../../DATA/RESULTS/results_list_svm_corr_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [None]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_corr_full = (
    run_cross_validation(
        df_train_full[filtro_correlacao],
        df_train_full["RainTomorrow"],
        df_test_full[filtro_correlacao],
        df_test_full["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_svm_corr_full.to_csv(
    "../../DATA/RESULTS/results_list_svm_corr_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [None]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_corr_smote = (
    run_cross_validation(
        df_train_smote[filtro_correlacao],
        df_train_smote["RainTomorrow"],
        df_test_smote[filtro_correlacao],
        df_test_smote["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_svm_corr_smote.to_csv(
    "../../DATA/RESULTS/results_list_svm_corr_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [None]:
del results_list_svm_corr_smote, results_list_svm_corr_full, results_list_svm_corr_under
gc.collect()

#### RANDOM FOREST - FEATURE IMPORTANCE

##### UNDERSAMPLING

In [None]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_rf_under = (
    run_cross_validation(
        df_train_under[filtro_random_forest],
        df_train_under["RainTomorrow"],
        df_test_under[filtro_random_forest],
        df_test_under["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_svm_rf_under.to_csv(
    "../../DATA/RESULTS/results_list_svm_rf_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [None]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_rf_full = (
    run_cross_validation(
        df_train_full[filtro_random_forest],
        df_train_full["RainTomorrow"],
        df_test_full[filtro_random_forest],
        df_test_full["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_svm_rf_full.to_csv(
    "../../DATA/RESULTS/results_list_svm_rf_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [None]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_rf_smote = (
    run_cross_validation(
        df_train_smote[filtro_random_forest],
        df_train_smote["RainTomorrow"],
        df_test_smote[filtro_random_forest],
        df_test_smote["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_svm_rf_smote.to_csv(
    "../../DATA/RESULTS/results_list_svm_rf_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [None]:
del results_list_svm_rf_smote, results_list_svm_rf_full, results_list_svm_rf_under
gc.collect()

### MLP

#### TODAS AS CARACTERÍSTICAS

##### UNDERSAMPLING

In [None]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_under = (
    run_cross_validation(
        df_train_under.drop(columns=["RainTomorrow"]),
        df_train_under["RainTomorrow"],
        df_test_under.drop(columns=["RainTomorrow"]),
        df_test_under["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=10,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_mlp_under.to_csv(
    "../../DATA/RESULTS/results_list_mlp_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [None]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_full = (
    run_cross_validation(
        df_train_full.drop(columns=["RainTomorrow"]),
        df_train_full["RainTomorrow"],
        df_test_full.drop(columns=["RainTomorrow"]),
        df_test_full["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=10,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_mlp_full.to_csv(
    "../../DATA/RESULTS/results_list_mlp_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [None]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_smote = (
    run_cross_validation(
        df_train_smote.drop(columns=["RainTomorrow"]),
        df_train_smote["RainTomorrow"],
        df_test_smote.drop(columns=["RainTomorrow"]),
        df_test_smote["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=10,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_mlp_smote.to_csv(
    "../../DATA/RESULTS/results_list_mlp_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [None]:
del results_list_mlp_smote, results_list_mlp_under, results_list_mlp_full
gc.collect()

#### PCA

##### UNDERSAMPLING

In [None]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_pca_under = (
    run_cross_validation(
        df_train_under_pca.drop(columns=["RainTomorrow"]),
        df_train_under_pca["RainTomorrow"],
        df_test_under_pca.drop(columns=["RainTomorrow"]),
        df_test_under_pca["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=10,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_mlp_pca_under.to_csv(
    "../../DATA/RESULTS/results_list_mlp_pca_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [None]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_pca_full = (
    run_cross_validation(
        df_train_full_pca.drop(columns=["RainTomorrow"]),
        df_train_full_pca["RainTomorrow"],
        df_test_full_pca.drop(columns=["RainTomorrow"]),
        df_test_full_pca["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=10,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_mlp_pca_full.to_csv(
    "../../DATA/RESULTS/results_list_mlp_pca_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [None]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_pca_smote = (
    run_cross_validation(
        df_train_smote_pca.drop(columns=["RainTomorrow"]),
        df_train_smote_pca["RainTomorrow"],
        df_test_smote_pca.drop(columns=["RainTomorrow"]),
        df_test_smote_pca["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=10,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_mlp_pca_smote.to_csv(
    "../../DATA/RESULTS/results_list_mlp_pca_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [None]:
del results_list_mlp_pca_smote, results_list_mlp_pca_under, results_list_mlp_pca_full
gc.collect()

#### CORRELAÇÃO

##### UNDERSAMPLING

In [None]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_corr_under = (
    run_cross_validation(
        df_train_under[filtro_correlacao],
        df_train_under["RainTomorrow"],
        df_test_under[filtro_correlacao],
        df_test_under["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=10,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_mlp_corr_under.to_csv(
    "../../DATA/RESULTS/results_list_mlp_corr_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [None]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_corr_full = (
    run_cross_validation(
        df_train_full[filtro_correlacao],
        df_train_full["RainTomorrow"],
        df_test_full[filtro_correlacao],
        df_test_full["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=10,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_mlp_corr_full.to_csv(
    "../../DATA/RESULTS/results_list_mlp_corr_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [None]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_corr_smote = (
    run_cross_validation(
        df_train_smote[filtro_correlacao],
        df_train_smote["RainTomorrow"],
        df_test_smote[filtro_correlacao],
        df_test_smote["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=10,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_mlp_corr_smote.to_csv(
    "../../DATA/RESULTS/results_list_mlp_corr_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [None]:
del results_list_mlp_corr_smote, results_list_mlp_corr_full, results_list_mlp_corr_under
gc.collect()

#### RANDOM FOREST - FEATURE IMPORTANCE

##### UNDERSAMPLING

In [None]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_rf_under = (
    run_cross_validation(
        df_train_under[filtro_random_forest],
        df_train_under["RainTomorrow"],
        df_test_under[filtro_random_forest],
        df_test_under["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=10,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_mlp_rf_under.to_csv(
    "../../DATA/RESULTS/results_list_mlp_rf_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [None]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_rf_full = (
    run_cross_validation(
        df_train_full[filtro_random_forest],
        df_train_full["RainTomorrow"],
        df_test_full[filtro_random_forest],
        df_test_full["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=10,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_mlp_rf_full.to_csv(
    "../../DATA/RESULTS/results_list_mlp_rf_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [None]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_rf_smote = (
    run_cross_validation(
        df_train_smote[filtro_random_forest],
        df_train_smote["RainTomorrow"],
        df_test_smote[filtro_random_forest],
        df_test_smote["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=10,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_mlp_rf_smote.to_csv(
    "../../DATA/RESULTS/results_list_mlp_rf_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [None]:
del results_list_mlp_rf_smote, results_list_mlp_rf_full, results_list_mlp_rf_under
gc.collect()

In [None]:
final_processo = datetime.now()
print(f'PROCESSO DE MODELAGEM FINALIZADO EM: {final_processo}\n\n => TOTAL UTILIZADO: {final_processo - inicio_processo}')