# IMPORTAÇÃO DE BIBLIOTECAS

In [1]:
# MANIPULAÇÃO DE DADOS
import pandas as pd
import numpy as np
from datetime import datetime

# VISUALIZAÇÃO DE DADOS
import matplotlib.pyplot as plt

# TRANSFORMAÇÕES
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from imblearn import over_sampling, under_sampling


# PREPARAÇÃO TREINO E AVALIAÇÃO
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# MODELOS UTILIZADOS
from mixed_naive_bayes import MixedNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# CONFIGURAÇÕES DE EXIBIÇÃO
import gc
import warnings

warnings.filterwarnings("ignore")

# pd.set_option('display.max_rows', None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)

In [None]:
inicio_processo = datetime.now()
print(f"PROCESSO DE MODELAGEM INICIADO EM: {inicio_processo}")

PROCESSO DE MODELAGEM INICIADO EM: 2024-11-25 18:16:20.336681


# IMPORTAÇÃO DE DADOS

In [3]:
weather_aus = pd.read_csv("../../DATA/weatherAUS.csv").dropna()

print(
    f"O dataset possui {weather_aus.shape[0]:,} instâncias (linhas) e {weather_aus.shape[1]:,} características (colunas)."
)

print(f"As características (colunas) do dataset são: {weather_aus.columns.to_list()}")

O dataset possui 56,420 instâncias (linhas) e 23 características (colunas).
As características (colunas) do dataset são: ['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RainTomorrow']


# FUNÇÕES PARA PRÉ-PROCESSAMENTO

## SEPARA TREINO E TESTE (80/20)

In [4]:
def split_data(df, target_column):
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Dividir a base em treino e teste, mantendo a proporção das classes
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    df_train = pd.concat([X_train, y_train], axis=1)
    df_test = pd.concat([X_test, y_test], axis=1)

    return df_train, df_test

## AJUSTA O FORMATO DO ALVO

In [5]:
def adjust_data_types(df):
    df["Date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")
    df["RainTomorrow"] = df.RainTomorrow.map({"Yes": 1, "No": 0})

    return df


## CRIAÇÃO DE VARIÁVEIS AUXILIARES

In [6]:
def create_auxiliar_columns(df):
    def get_season(month_number) -> str:
        quarter = month_number % 12 // 3 + 1
        if quarter == 1:
            return "summer"

        if quarter == 2:
            return "autumn"

        if quarter == 3:
            return "winter"

        if quarter == 4:
            return "spring"

    # COLUNAS SELECIONADAS COM BASE NAS AVALIAÇÕES ANTERIORES
    df = df[
        [
            "Date",
            "MinTemp",
            "MaxTemp",
            "Rainfall",
            "Evaporation",
            "Sunshine",
            "WindGustDir",
            "WindGustSpeed",
            "WindDir3pm",
            "WindSpeed9am",
            "WindSpeed3pm",
            "Humidity9am",
            "Humidity3pm",
            "Pressure3pm",
            "Cloud9am",
            "Cloud3pm",
            "RainTomorrow",
        ]
    ]

    df["RangeTemp"] = df["MaxTemp"] - df["MinTemp"]

    df["month"] = df.Date.dt.month
    df["season"] = df["month"].apply(lambda month_number: get_season(month_number))

    time_encoded = pd.get_dummies(
        df[["month", "season"]],
        columns=["month", "season"],
        drop_first=True,
        prefix=["month", "season"],
        dtype=int,
    )

    # SEPARA AS VARIÁVEIS CATEGÓRICAS E NUMÉRICAS
    # CATEGORIZA AS VARIÁVEIS NECESSÁRIAS
    categorical_columns = pd.get_dummies(
        df.select_dtypes(include=["object", "datetime64"]),
        columns=["WindGustDir", "WindDir3pm"],
        drop_first=True,
        prefix=["WindGustDir", "WindDir3pm"],
        dtype=int,
    )

    numerical_columns = df.select_dtypes("number")

    df = pd.concat([numerical_columns, categorical_columns, time_encoded], axis=1)

    return df.drop(columns=["month", "Date", "season"]), {
        "numerical_columns": numerical_columns.drop(
            columns=["RainTomorrow", "month"]
        ).columns.to_list(),
        "categorical_columns": categorical_columns.drop(
            columns=["Date", "season"]
        ).columns.to_list(),
        "time_encoded": time_encoded.columns.to_list(),
    }


## APLICA AS TRANSFORMAÇÕES

In [7]:
def instance_transformations(n_components=8):
    discretizer = KBinsDiscretizer(
        n_bins=5, encode="ordinal", strategy="kmeans", random_state=42
    )

    smote = over_sampling.SMOTE(random_state=42)
    under = under_sampling.RandomUnderSampler(random_state=42)

    scaler = StandardScaler()
    pca = PCA(n_components=n_components, random_state=42)

    return discretizer, scaler, pca, smote, under

In [8]:
def adjust_train_volume(df, target_column, smote, under):
    X = df.drop(columns=[target_column])
    y = df[target_column]

    X_smote, y_smote = smote.fit_resample(X, y)
    df_smote = pd.concat([X_smote, y_smote], axis=1)

    X_under, y_under = under.fit_resample(X, y)
    df_under = pd.concat([X_under, y_under], axis=1)

    return df_smote, df_under

In [9]:
def fit_transformmations(df, cols, discretizer, scaler, pca, discrete_col="Rainfall"):
    discretizer.fit(df[discrete_col].values.reshape(-1, 1))

    scaler.fit(df[cols])
    pca.fit(df[cols])

    return discretizer, scaler, pca


In [10]:
def transform_data(df, cols, discretizer, scaler, pca, discrete_col="Rainfall"):
    df[discrete_col + "_Discretized"] = discretizer.transform(
        df[discrete_col].values.reshape(-1, 1)
    )
    df[cols] = scaler.transform(df[cols])
    df = df.reset_index(drop=True)

    pca_result = pca.transform(df[cols])

    df_pca = pd.DataFrame(
        data=pca_result,
        columns=[f"PC{comp}" for comp in range(1, pca.n_components_ + 1)],
    )
    df_pca = df_pca.reset_index(drop=True)
    df_pca["RainTomorrow"] = df["RainTomorrow"]

    return df, df_pca

# INICIO DO PRÉ-PROCESSAMENTO

## SEPARA TREINO E TESTE (80/20)

In [11]:
df_train, df_test = split_data(weather_aus, "RainTomorrow")

## APLICAÇÕES SOBRE TREINO

### AJUSTA O TIPO DE DADO

In [12]:
df_train = adjust_data_types(df_train)

### CRIA VARIÁVEIS AUXILIARES

In [13]:
df_train, columns_names = create_auxiliar_columns(df_train)

### APLICA AS TRANSFORMAÇÕES

In [14]:
discretizer, scaler, pca, smote, under = instance_transformations()

#### AJUSTA VOLUME DA RESPOSTA

In [15]:
alvo = df_train.RainTomorrow.value_counts().to_frame("QTD").reset_index()
alvo["PERC"] = alvo.QTD / alvo.QTD.sum()
alvo

Unnamed: 0,RainTomorrow,QTD,PERC
0,0,35194,0.779732
1,1,9942,0.220268


In [16]:
df_train_smote, df_train_under = adjust_train_volume(
    df_train, "RainTomorrow", smote, under
)

In [17]:
alvo_smote = df_train_smote.RainTomorrow.value_counts().to_frame("QTD").reset_index()
alvo_smote["PERC"] = alvo_smote.QTD / alvo_smote.QTD.sum()
alvo_smote

Unnamed: 0,RainTomorrow,QTD,PERC
0,1,35194,0.5
1,0,35194,0.5


In [18]:
alvo_under = df_train_under.RainTomorrow.value_counts().to_frame("QTD").reset_index()
alvo_under["PERC"] = alvo_under.QTD / alvo_under.QTD.sum()
alvo_under

Unnamed: 0,RainTomorrow,QTD,PERC
0,0,9942,0.5
1,1,9942,0.5


#### TRANSFORMA OS DADOS

In [19]:
discretizer_full, scaler_full, pca_full = fit_transformmations(
    df_train, columns_names["numerical_columns"], discretizer, scaler, pca
)

In [20]:
discretizer_smote, scaler_smote, pca_smote = fit_transformmations(
    df_train_smote, columns_names["numerical_columns"], discretizer, scaler, pca
)

In [21]:
discretizer_under, scaler_under, pca_under = fit_transformmations(
    df_train_under, columns_names["numerical_columns"], discretizer, scaler, pca
)

In [22]:
df_train_full, df_train_full_pca = transform_data(
    df_train, columns_names["numerical_columns"], discretizer, scaler, pca
)

In [23]:
df_train_smote, df_train_smote_pca = transform_data(
    df_train_smote,
    columns_names["numerical_columns"],
    discretizer_smote,
    scaler_smote,
    pca_smote,
)

In [24]:
df_train_under, df_train_under_pca = transform_data(
    df_train_under,
    columns_names["numerical_columns"],
    discretizer_under,
    scaler_under,
    pca_under,
)

## APLICAÇÕES SOBRE TESTE

### AJUSTA O TIPO DE DADO

In [25]:
df_test = adjust_data_types(df_test)

### CRIA VARIÁVEIS AUXILIARES

In [26]:
df_test, columns_names = create_auxiliar_columns(df_test)

### TRANSFORMA OS DADOS

In [27]:
df_test_full, df_test_full_pca = transform_data(
    df_test, columns_names["numerical_columns"], discretizer, scaler, pca
)

In [28]:
df_test_smote, df_test_smote_pca = transform_data(
    df_test,
    columns_names["numerical_columns"],
    discretizer_smote,
    scaler_smote,
    pca_smote,
)

In [29]:
df_test_under, df_test_under_pca = transform_data(
    df_test,
    columns_names["numerical_columns"],
    discretizer_under,
    scaler_under,
    pca_under,
)

## LIBERA MEMÓRIA

In [30]:
del (
    df_train,
    df_test,
    weather_aus,
    discretizer_under,
    scaler_under,
    pca_under,
    discretizer_smote,
    scaler_smote,
    pca_smote,
    discretizer,
    scaler,
    pca,
    smote,
    under,
)
gc.collect()

87

# PROCESSO DE MODELAGEM

## FUNÇÕES DE CROSS VALIDATION

### NUMÉRICAS

In [31]:
def run_cross_validation(
    train_data, train_target, test_data, test_target, model, param_grid, k_folds=5
):
    skf = StratifiedKFold(n_splits=k_folds)
    scores_validation = {"f1": [], "accuracy": [], "precision": [], "recall": []}
    scores_test = {"f1": [], "accuracy": [], "precision": [], "recall": []}
    best_params = None
    best_model = None
    best_f1 = 0
    counter = 1
    print(f"====> INICIANDO PROCESSAMENTO: {datetime.now()} <====")

    for train_index, valid_index in skf.split(train_data, train_target):
        inicio = datetime.now()
        print(f"====> {inicio} | EXECUTANDO FOLD {counter} ")

        # Use .iloc to select rows based on indices
        X_train_fold, X_valid_fold = (
            train_data.iloc[train_index],
            train_data.iloc[valid_index],
        )
        y_train_fold, y_valid_fold = (
            train_target.iloc[train_index],
            train_target.iloc[valid_index],
        )

        # Etapa 2: Treinamento e ajuste de hiperparâmetros com validação cruzada interna
        grid_search = GridSearchCV(
            model,
            param_grid,
            scoring="f1",
            cv=StratifiedKFold(n_splits=3),
            n_jobs=-1,
            verbose=3,
        )
        grid_search.fit(X_train_fold, y_train_fold)
        model = grid_search.best_estimator_
        f1 = grid_search.best_score_

        if f1 > best_f1:
            best_model = model
            best_params = grid_search.best_params_
            best_f1 = f1

        # Avaliação nos dados de validação
        y_valid_pred = model.predict(X_valid_fold)

        # Cálculo das métricas no fold
        scores_validation["f1"].append(
            f1_score(y_valid_fold, y_valid_pred, average="macro")
        )
        scores_validation["accuracy"].append(accuracy_score(y_valid_fold, y_valid_pred))
        scores_validation["precision"].append(
            precision_score(y_valid_fold, y_valid_pred, average="macro")
        )
        scores_validation["recall"].append(
            recall_score(y_valid_fold, y_valid_pred, average="macro")
        )

        # Avaliar na base de teste com o melhor modelo
        y_test_pred = model.predict(test_data)
        f1_test = f1_score(test_target, y_test_pred, average="macro")
        accuracy_test = accuracy_score(test_target, y_test_pred)
        precision_test = precision_score(test_target, y_test_pred, average="macro")
        recall_test = recall_score(test_target, y_test_pred, average="macro")

        # Armazenar as métricas na base de teste
        scores_test["f1"].append(f1_test)
        scores_test["accuracy"].append(accuracy_test)
        scores_test["precision"].append(precision_test)
        scores_test["recall"].append(recall_test)

        counter += 1
        final = datetime.now()
        print(
            f"===> {final} | FINALIZADO FOLD {counter - 1} | TEMPO TOTAL {final - inicio}\n\n"
        )

    # Calcula as médias das métricas nos folds de validação
    avg_scores_validation = {
        metric: np.mean(values) for metric, values in scores_validation.items()
    }
    test_scores = {metric: np.mean(values) for metric, values in scores_test.items()}

    # Criar log dos resultados
    log = pd.DataFrame(scores_validation)
    log["dataset"] = "Validação Cruzada"

    test_log = pd.DataFrame(scores_test)
    test_log["dataset"] = "Teste"

    # Concatenar os logs da validação cruzada e do teste
    full_log = pd.concat([log, test_log])

    return avg_scores_validation, test_scores, best_params, full_log

### CATEGÓRICAS

In [32]:
def run_cross_mix(
    train_data,
    train_target,
    test_data,
    test_target,
    alphas,
    k_folds=5,
    categorical_features=None,
):
    skf = StratifiedKFold(n_splits=k_folds)
    scores_validation = {"f1": [], "accuracy": [], "precision": [], "recall": []}
    scores_test = {"f1": [], "accuracy": [], "precision": [], "recall": []}
    best_model = None
    best_alpha = None
    best_f1 = 0
    counter = 1
    print(f"====> INICIANDO PROCESSAMENTO: {datetime.now()} <====")

    # Validação cruzada estratificada
    for train_index, valid_index in skf.split(train_data, train_target):
        inicio = datetime.now()
        print(f"====> {inicio} | EXECUTANDO FOLD {counter} ")

        X_train_fold, X_valid_fold = (
            train_data.iloc[train_index],
            train_data.iloc[valid_index],
        )
        y_train_fold, y_valid_fold = (
            train_target.iloc[train_index],
            train_target.iloc[valid_index],
        )

        # Iterar pelos valores de alpha
        for alpha in alphas:
            print(f"========> Testando: alpha = {alpha}")
            # Treina o modelo com o valor de alpha atual
            model = MixedNB(categorical_features=categorical_features, alpha=alpha)
            model.fit(X_train_fold, y_train_fold)

            # Avaliação nos dados de validação
            y_valid_pred = model.predict(X_valid_fold)
            f1 = f1_score(y_valid_fold, y_valid_pred, average="macro")
            accuracy = accuracy_score(y_valid_fold, y_valid_pred)
            precision = precision_score(y_valid_fold, y_valid_pred, average="macro")
            recall = recall_score(y_valid_fold, y_valid_pred, average="macro")

            # Seleciona o melhor modelo baseado na métrica F1-score
            if f1 > best_f1:
                best_f1 = f1
                best_model = model
                best_alpha = alpha

        # Armazenar as métricas para o fold atual
        scores_validation["f1"].append(f1)
        scores_validation["accuracy"].append(accuracy)
        scores_validation["precision"].append(precision)
        scores_validation["recall"].append(recall)

        # Avaliar na base de teste com o melhor modelo
        y_test_pred = model.predict(test_data)
        f1_test = f1_score(test_target, y_test_pred, average="macro")
        accuracy_test = accuracy_score(test_target, y_test_pred)
        precision_test = precision_score(test_target, y_test_pred, average="macro")
        recall_test = recall_score(test_target, y_test_pred, average="macro")

        # Armazenar as métricas na base de teste
        scores_test["f1"].append(f1_test)
        scores_test["accuracy"].append(accuracy_test)
        scores_test["precision"].append(precision_test)
        scores_test["recall"].append(recall_test)

        counter += 1
        final = datetime.now()
        print(
            f"====> {final} | FINALIZADO FOLD {counter - 1} | TEMPO TOTAL {final - inicio}\n\n"
        )

    # Calcular as médias das métricas de validação cruzada
    avg_scores_validation = {
        metric: np.mean(values) for metric, values in scores_validation.items()
    }
    test_scores = {metric: np.mean(values) for metric, values in scores_test.items()}

    # Criar log dos resultados
    log = pd.DataFrame(scores_validation)
    log["dataset"] = "Validação Cruzada"

    test_log = pd.DataFrame(scores_test)
    test_log["dataset"] = "Teste"

    # Concatenar os logs da validação cruzada e do teste
    full_log = pd.concat([log, test_log])

    return avg_scores_validation, best_alpha, test_scores, full_log


## MODELOS

### NAIVE BAYES

#### TODAS AS CARACTERÍSTICAS

##### UNDERSAMPLING

In [33]:
# Parâmetros do modelo
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

# Executar para diferentes modelos e conjuntos de dados
results_validation, best_params, results_test, results_list_bayes_under = run_cross_mix(
    df_train_under.drop(columns=["RainTomorrow"]),
    df_train_under["RainTomorrow"],
    df_test_under.drop(columns=["RainTomorrow"]),
    df_test_under["RainTomorrow"],
    alphas,
    k_folds=5,
    categorical_features=[
        df_train_under.drop(columns="RainTomorrow").columns.get_loc(col)
        for col in columns_names["categorical_columns"] + columns_names["time_encoded"]
    ],
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:39:53.632821 <====
====> 2024-11-23 05:39:53.636381 | EXECUTANDO FOLD 1 
====> 2024-11-23 05:39:55.322745 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:01.686364


====> 2024-11-23 05:39:55.323761 | EXECUTANDO FOLD 2 
====> 2024-11-23 05:39:57.019780 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:01.696019


====> 2024-11-23 05:39:57.020784 | EXECUTANDO FOLD 3 
====> 2024-11-23 05:39:58.647049 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:01.626265


====> 2024-11-23 05:39:58.648048 | EXECUTANDO FOLD 4 
====> 2024-11-23 05:40:00.394007 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:01.745959


====> 2024-11-23 05:40:00.394532 | EXECUTANDO FOLD 5 
====> 2024-11-23 05:40:01.795682 | FINALIZADO FOLD 5 | TEMPO TOTAL 0:00:01.401150


Resultados na validação cruzada: {'f1': 0.7626343459024936, 'accuracy': 0.7629254152017966, 'precision': 0.7642264309677982, 'recall': 0.762925238712314}
Resultados na base de teste: {'f1': 0.4381317532241199, 'accuracy': 0.7797766749379652, 'pre

In [34]:
results_list_bayes_under.to_csv(
    "../../DATA/RESULTS/results_list_bayes_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [35]:
# Parâmetros do modelo
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

# Executar para diferentes modelos e conjuntos de dados
results_validation, best_params, results_test, results_list_bayes_full = run_cross_mix(
    df_train_full.drop(columns=["RainTomorrow"]),
    df_train_full["RainTomorrow"],
    df_test_full.drop(columns=["RainTomorrow"]),
    df_test_full["RainTomorrow"],
    alphas,
    k_folds=5,
    categorical_features=[
        df_train_full.drop(columns="RainTomorrow").columns.get_loc(col)
        for col in columns_names["categorical_columns"] + columns_names["time_encoded"]
    ],
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:02.205616 <====
====> 2024-11-23 05:40:02.215681 | EXECUTANDO FOLD 1 
====> 2024-11-23 05:40:05.595749 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:03.380068


====> 2024-11-23 05:40:05.595749 | EXECUTANDO FOLD 2 
====> 2024-11-23 05:40:08.831902 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:03.236153


====> 2024-11-23 05:40:08.833412 | EXECUTANDO FOLD 3 
====> 2024-11-23 05:40:12.290304 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:03.456892


====> 2024-11-23 05:40:12.290304 | EXECUTANDO FOLD 4 
====> 2024-11-23 05:40:15.833448 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:03.543144


====> 2024-11-23 05:40:15.834620 | EXECUTANDO FOLD 5 
====> 2024-11-23 05:40:19.300578 | FINALIZADO FOLD 5 | TEMPO TOTAL 0:00:03.465958


Resultados na validação cruzada: {'f1': 0.7316497551807923, 'accuracy': 0.8014222458406302, 'precision': 0.7191078286403904, 'recall': 0.7513100041244096}
Resultados na base de teste: {'f1': 0.7379562346605752, 'accuracy': 0.8068060971286778, 'pr

In [36]:
results_list_bayes_full.to_csv(
    "../../DATA/RESULTS/results_list_bayes_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [37]:
# Parâmetros do modelo
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

# Executar para diferentes modelos e conjuntos de dados
results_validation, best_params, results_test, results_list_bayes_smote = run_cross_mix(
    df_train_smote.drop(columns=["RainTomorrow"]),
    df_train_smote["RainTomorrow"],
    df_test_smote.drop(columns=["RainTomorrow"]),
    df_test_smote["RainTomorrow"],
    alphas,
    k_folds=5,
    categorical_features=[
        df_train_smote.drop(columns="RainTomorrow").columns.get_loc(col)
        for col in columns_names["categorical_columns"] + columns_names["time_encoded"]
    ],
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:19.845717 <====
====> 2024-11-23 05:40:19.850268 | EXECUTANDO FOLD 1 
====> 2024-11-23 05:40:24.818661 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:04.968393


====> 2024-11-23 05:40:24.819680 | EXECUTANDO FOLD 2 
====> 2024-11-23 05:40:29.840277 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:05.020597


====> 2024-11-23 05:40:29.840277 | EXECUTANDO FOLD 3 
====> 2024-11-23 05:40:34.385633 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:04.545356


====> 2024-11-23 05:40:34.390700 | EXECUTANDO FOLD 4 
====> 2024-11-23 05:40:39.040680 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:04.649980


====> 2024-11-23 05:40:39.040680 | EXECUTANDO FOLD 5 
====> 2024-11-23 05:40:43.635327 | FINALIZADO FOLD 5 | TEMPO TOTAL 0:00:04.594647


Resultados na validação cruzada: {'f1': 0.792563113894891, 'accuracy': 0.7935022446662893, 'precision': 0.7972085273933975, 'recall': 0.7935022089611482}
Resultados na base de teste: {'f1': 0.4381317532241199, 'accuracy': 0.7797766749379652, 'pre

In [38]:
results_list_bayes_smote.to_csv(
    "../../DATA/RESULTS/results_list_bayes_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [39]:
del results_list_bayes_smote, results_list_bayes_under, results_list_bayes_full
gc.collect()

0

#### PCA

##### UNDERSAMPLING

In [40]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_bayes_pca_under = (
    run_cross_validation(
        df_train_under_pca.drop(columns=["RainTomorrow"]),
        df_train_under_pca["RainTomorrow"],
        df_test_under_pca.drop(columns=["RainTomorrow"]),
        df_test_under_pca["RainTomorrow"],
        GaussianNB(),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:43.772562 <====
====> 2024-11-23 05:40:43.777125 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:47.720454 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:03.943329


====> 2024-11-23 05:40:47.720454 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:47.845336 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.124882


====> 2024-11-23 05:40:47.845336 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:47.994777 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.149441


====> 2024-11-23 05:40:47.995291 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:48.162993 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.167702


====> 2024-11-23 05:40:48.162993 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:48.335645 | 

In [41]:
results_list_bayes_pca_under.to_csv(
    "../../DATA/RESULTS/results_list_bayes_pca_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [42]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_bayes_pca_full = (
    run_cross_validation(
        df_train_full_pca.drop(columns=["RainTomorrow"]),
        df_train_full_pca["RainTomorrow"],
        df_test_full_pca.drop(columns=["RainTomorrow"]),
        df_test_full_pca["RainTomorrow"],
        GaussianNB(),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:48.368940 <====
====> 2024-11-23 05:40:48.377485 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:48.745293 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:00.367808


====> 2024-11-23 05:40:48.745293 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:49.115251 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.369958


====> 2024-11-23 05:40:49.115251 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:49.486835 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.371584


====> 2024-11-23 05:40:49.487792 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:49.800353 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.312561


====> 2024-11-23 05:40:49.805441 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:50.129464 | 

In [43]:
results_list_bayes_pca_full.to_csv(
    "../../DATA/RESULTS/results_list_bayes_pca_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [44]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_bayes_pca_smote = (
    run_cross_validation(
        df_train_smote_pca.drop(columns=["RainTomorrow"]),
        df_train_smote_pca["RainTomorrow"],
        df_test_smote_pca.drop(columns=["RainTomorrow"]),
        df_test_smote_pca["RainTomorrow"],
        GaussianNB(),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:50.170818 <====
====> 2024-11-23 05:40:50.186265 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:50.715358 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:00.529093


====> 2024-11-23 05:40:50.715358 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:51.151193 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.435835


====> 2024-11-23 05:40:51.151193 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:51.505577 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.354384


====> 2024-11-23 05:40:51.505577 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:51.888015 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.382438


====> 2024-11-23 05:40:51.889017 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:52.315381 | 

In [45]:
results_list_bayes_pca_smote.to_csv(
    "../../DATA/RESULTS/results_list_bayes_pca_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [46]:
del (
    results_list_bayes_pca_under,
    results_list_bayes_pca_smote,
    results_list_bayes_pca_full,
)
gc.collect()

182

#### CORRELAÇÃO

In [32]:
filtro_correlacao = ["Sunshine", "Humidity3pm", "Cloud9am", "Cloud3pm", "RangeTemp"]

##### UNDERSAMPLING

In [48]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_bayes_corr_under = (
    run_cross_validation(
        df_train_under[filtro_correlacao],
        df_train_under["RainTomorrow"],
        df_test_under[filtro_correlacao],
        df_test_under["RainTomorrow"],
        GaussianNB(),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:52.469609 <====
====> 2024-11-23 05:40:52.473122 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:52.643903 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:00.170781


====> 2024-11-23 05:40:52.643903 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:52.792831 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.148928


====> 2024-11-23 05:40:52.792831 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:52.955398 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.162567


====> 2024-11-23 05:40:52.955398 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:53.100066 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.144668


====> 2024-11-23 05:40:53.101069 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:53.227429 | 

In [49]:
results_list_bayes_corr_under.to_csv(
    "../../DATA/RESULTS/results_list_bayes_corr_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [50]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_bayes_corr_full = (
    run_cross_validation(
        df_train_full[filtro_correlacao],
        df_train_full["RainTomorrow"],
        df_test_full[filtro_correlacao],
        df_test_full["RainTomorrow"],
        GaussianNB(),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:53.245308 <====
====> 2024-11-23 05:40:53.261147 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:53.592198 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:00.331051


====> 2024-11-23 05:40:53.593197 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:53.888458 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.295261


====> 2024-11-23 05:40:53.895528 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:54.160400 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.264872


====> 2024-11-23 05:40:54.160400 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:54.420410 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.260010


====> 2024-11-23 05:40:54.423916 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:54.697298 | 

In [51]:
results_list_bayes_corr_full.to_csv(
    "../../DATA/RESULTS/results_list_bayes_corr_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [52]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_bayes_corr_smote = (
    run_cross_validation(
        df_train_smote[filtro_correlacao],
        df_train_smote["RainTomorrow"],
        df_test_smote[filtro_correlacao],
        df_test_smote["RainTomorrow"],
        GaussianNB(),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:54.725123 <====
====> 2024-11-23 05:40:54.749559 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:55.200714 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:00.451155


====> 2024-11-23 05:40:55.201776 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:55.675221 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.473445


====> 2024-11-23 05:40:55.675221 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:56.106203 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.430982


====> 2024-11-23 05:40:56.106203 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:56.506484 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.400281


====> 2024-11-23 05:40:56.506723 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-23 05:40:56.904587 | 

In [53]:
results_list_bayes_corr_smote.to_csv(
    "../../DATA/RESULTS/results_list_bayes_corr_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [54]:
del (
    results_list_bayes_corr_smote,
    results_list_bayes_corr_full,
    results_list_bayes_corr_under,
)
gc.collect()

182

#### RANDOM FOREST - FEATURE IMPORTANCE

In [33]:
filtro_random_forest = [
    "Humidity3pm",
    "Sunshine",
    "Pressure3pm",
    "Cloud3pm",
    "RangeTemp",
    "WindGustSpeed",
    "Humidity9am",
    "Rainfall",
    "MinTemp",
    "MaxTemp",
    "Evaporation",
    "WindSpeed3pm",
    "WindSpeed9am",
    "Cloud9am",
    "Rainfall_Discretized",
]

##### UNDERSAMPLING

In [56]:
# Parâmetros do modelo
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

# Executar para diferentes modelos e conjuntos de dados
results_validation, best_params, results_test, results_list_bayes_rf_under = (
    run_cross_mix(
        df_train_under[filtro_random_forest],
        df_train_under["RainTomorrow"],
        df_test_under[filtro_random_forest],
        df_test_under["RainTomorrow"],
        alphas,
        k_folds=5,
        categorical_features=[14],
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:57.065307 <====
====> 2024-11-23 05:40:57.072348 | EXECUTANDO FOLD 1 
====> 2024-11-23 05:40:57.260253 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:00.187905


====> 2024-11-23 05:40:57.260253 | EXECUTANDO FOLD 2 
====> 2024-11-23 05:40:57.445389 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.185136


====> 2024-11-23 05:40:57.445389 | EXECUTANDO FOLD 3 
====> 2024-11-23 05:40:57.643555 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.198166


====> 2024-11-23 05:40:57.643555 | EXECUTANDO FOLD 4 
====> 2024-11-23 05:40:57.832546 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.188991


====> 2024-11-23 05:40:57.832546 | EXECUTANDO FOLD 5 
====> 2024-11-23 05:40:58.002651 | FINALIZADO FOLD 5 | TEMPO TOTAL 0:00:00.170105


Resultados na validação cruzada: {'f1': 0.7640619875453153, 'accuracy': 0.764082205073539, 'precision': 0.7641743119100982, 'recall': 0.7640818263022073}
Resultados na base de teste: {'f1': 0.4381317532241199, 'accuracy': 0.7797766749379652, 'pre

In [57]:
results_list_bayes_rf_under.to_csv(
    "../../DATA/RESULTS/results_list_bayes_rf_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [58]:
# Parâmetros do modelo
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

# Executar para diferentes modelos e conjuntos de dados
results_validation, best_params, results_test, results_list_bayes_rf_full = (
    run_cross_mix(
        df_train_full[filtro_random_forest],
        df_train_full["RainTomorrow"],
        df_test_full[filtro_random_forest],
        df_test_full["RainTomorrow"],
        alphas,
        k_folds=5,
        categorical_features=[14],
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:58.037022 <====
====> 2024-11-23 05:40:58.045031 | EXECUTANDO FOLD 1 
====> 2024-11-23 05:40:58.397110 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:00.352079


====> 2024-11-23 05:40:58.397110 | EXECUTANDO FOLD 2 
====> 2024-11-23 05:40:58.795035 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.397925


====> 2024-11-23 05:40:58.795035 | EXECUTANDO FOLD 3 
====> 2024-11-23 05:40:59.146634 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.351599


====> 2024-11-23 05:40:59.146634 | EXECUTANDO FOLD 4 
====> 2024-11-23 05:40:59.455254 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.308620


====> 2024-11-23 05:40:59.455254 | EXECUTANDO FOLD 5 
====> 2024-11-23 05:40:59.745162 | FINALIZADO FOLD 5 | TEMPO TOTAL 0:00:00.289908


Resultados na validação cruzada: {'f1': 0.7285803915846831, 'accuracy': 0.793889490392604, 'precision': 0.7140290150484855, 'recall': 0.7561141410271192}
Resultados na base de teste: {'f1': 0.7352714027673639, 'accuracy': 0.7990960652250975, 'pre

In [59]:
results_list_bayes_rf_full.to_csv(
    "../../DATA/RESULTS/results_list_bayes_rf_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [60]:
# Parâmetros do modelo
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

# Executar para diferentes modelos e conjuntos de dados
results_validation, best_params, results_test, results_list_bayes_rf_smote = (
    run_cross_mix(
        df_train_smote[filtro_random_forest],
        df_train_smote["RainTomorrow"],
        df_test_smote[filtro_random_forest],
        df_test_smote["RainTomorrow"],
        alphas,
        k_folds=5,
        categorical_features=[14],
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:40:59.791003 <====
====> 2024-11-23 05:40:59.804349 | EXECUTANDO FOLD 1 
====> 2024-11-23 05:41:00.319675 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:00.515326


====> 2024-11-23 05:41:00.319675 | EXECUTANDO FOLD 2 
====> 2024-11-23 05:41:00.649501 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.329826


====> 2024-11-23 05:41:00.649501 | EXECUTANDO FOLD 3 
====> 2024-11-23 05:41:01.086389 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.436888


====> 2024-11-23 05:41:01.086389 | EXECUTANDO FOLD 4 
====> 2024-11-23 05:41:01.471825 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.385436


====> 2024-11-23 05:41:01.471825 | EXECUTANDO FOLD 5 
====> 2024-11-23 05:41:01.890311 | FINALIZADO FOLD 5 | TEMPO TOTAL 0:00:00.418486


Resultados na validação cruzada: {'f1': 0.7654347686025795, 'accuracy': 0.7654430062537438, 'precision': 0.7654807242887745, 'recall': 0.7654429805507341}
Resultados na base de teste: {'f1': 0.4381317532241199, 'accuracy': 0.7797766749379652, 'pr

In [61]:
results_list_bayes_rf_smote.to_csv(
    "../../DATA/RESULTS/results_list_bayes_rf_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [62]:
del results_list_bayes_rf_smote, results_list_bayes_rf_under, results_list_bayes_rf_full
gc.collect()

0

### RANDOM FOREST

#### TODAS AS CARACTERÍSTICAS

##### UNDERSAMPLING

In [63]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_under = (
    run_cross_validation(
        df_train_under.drop(columns=["RainTomorrow"]),
        df_train_under["RainTomorrow"],
        df_test_under.drop(columns=["RainTomorrow"]),
        df_test_under["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:41:02.043890 <====
====> 2024-11-23 05:41:02.048385 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 05:44:13.660687 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:03:11.612302


====> 2024-11-23 05:44:13.660687 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 05:47:14.372617 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:03:00.711930


====> 2024-11-23 05:47:14.372617 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 05:50:17.238648 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:03:02.866031


====> 2024-11-23 05:50:17.238648 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 05:53:28.354012 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:03:11.115364


====> 2024-11-23 05:53:28.355011 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 05:56:36.7495

In [64]:
results_list_rf_under.to_csv(
    "../../DATA/RESULTS/results_list_rf_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [65]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_full = (
    run_cross_validation(
        df_train_full.drop(columns=["RainTomorrow"]),
        df_train_full["RainTomorrow"],
        df_test_full.drop(columns=["RainTomorrow"]),
        df_test_full["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 05:56:36.907043 <====
====> 2024-11-23 05:56:36.912560 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 06:05:11.171152 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:08:34.258592


====> 2024-11-23 06:05:11.172155 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 06:13:49.348002 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:08:38.175847


====> 2024-11-23 06:13:49.349001 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 06:22:26.717874 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:08:37.368873


====> 2024-11-23 06:22:26.718918 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 06:31:08.615450 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:08:41.896532


====> 2024-11-23 06:31:08.616454 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 06:39:53.4644

In [66]:
results_list_rf_full.to_csv(
    "../../DATA/RESULTS/results_list_rf_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [67]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_smote = (
    run_cross_validation(
        df_train_smote.drop(columns=["RainTomorrow"]),
        df_train_smote["RainTomorrow"],
        df_test_smote.drop(columns=["RainTomorrow"]),
        df_test_smote["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 06:39:53.668422 <====
====> 2024-11-23 06:39:53.682592 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 06:59:07.653890 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:19:13.971298


====> 2024-11-23 06:59:07.654854 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 07:17:14.858874 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:18:07.204020


====> 2024-11-23 07:17:14.859873 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 07:34:44.202928 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:17:29.343055


====> 2024-11-23 07:34:44.204501 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 07:52:30.965056 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:17:46.760555


====> 2024-11-23 07:52:30.966139 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 08:10:35.2101

In [68]:
results_list_rf_smote.to_csv(
    "../../DATA/RESULTS/results_list_rf_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [69]:
del results_list_rf_smote, results_list_rf_full, results_list_rf_under
gc.collect()

446

#### PCA

##### UNDERSAMPLING

In [70]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_pca_under = (
    run_cross_validation(
        df_train_under_pca.drop(columns=["RainTomorrow"]),
        df_train_under_pca["RainTomorrow"],
        df_test_under_pca.drop(columns=["RainTomorrow"]),
        df_test_under_pca["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 08:10:35.760204 <====
====> 2024-11-23 08:10:35.764199 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 08:15:22.195459 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:04:46.431260


====> 2024-11-23 08:15:22.196427 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 08:20:05.358933 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:04:43.162506


====> 2024-11-23 08:20:05.360259 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 08:24:35.742354 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:04:30.382095


====> 2024-11-23 08:24:35.743442 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 08:29:16.850010 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:04:41.106568


====> 2024-11-23 08:29:16.850010 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 08:33:52.8542

In [71]:
results_list_rf_pca_under.to_csv(
    "../../DATA/RESULTS/results_list_rf_pca_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [72]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_pca_full = (
    run_cross_validation(
        df_train_full_pca.drop(columns=["RainTomorrow"]),
        df_train_full_pca["RainTomorrow"],
        df_test_full_pca.drop(columns=["RainTomorrow"]),
        df_test_full_pca["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 08:33:53.037362 <====
====> 2024-11-23 08:33:53.047771 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 08:47:28.946078 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:13:35.898307


====> 2024-11-23 08:47:28.947079 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 09:00:46.566061 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:13:17.618982


====> 2024-11-23 09:00:46.567066 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 09:14:06.583362 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:13:20.016296


====> 2024-11-23 09:14:06.584358 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 09:27:19.572186 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:13:12.987828


====> 2024-11-23 09:27:19.572186 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 09:39:48.8347

In [73]:
results_list_rf_pca_full.to_csv(
    "../../DATA/RESULTS/results_list_rf_pca_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [75]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_pca_smote = (
    run_cross_validation(
        df_train_smote_pca.drop(columns=["RainTomorrow"]),
        df_train_smote_pca["RainTomorrow"],
        df_test_smote_pca.drop(columns=["RainTomorrow"]),
        df_test_smote_pca["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 09:59:07.883760 <====
====> 2024-11-23 09:59:07.900776 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 10:19:40.348532 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:20:32.447756


====> 2024-11-23 10:19:40.349533 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 10:40:01.852913 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:20:21.503380


====> 2024-11-23 10:40:01.852913 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 11:00:28.277157 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:20:26.424244


====> 2024-11-23 11:00:28.278191 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 11:21:02.337235 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:20:34.059044


====> 2024-11-23 11:21:02.338196 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 11:41:39.6525

In [76]:
results_list_rf_pca_smote.to_csv(
    "../../DATA/RESULTS/results_list_rf_pca_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [77]:
del results_list_rf_pca_smote, results_list_rf_pca_full, results_list_rf_pca_under
gc.collect()

892

#### CORRELAÇÃO

##### UNDERSAMPLING

In [78]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_corr_under = (
    run_cross_validation(
        df_train_under[filtro_correlacao],
        df_train_under["RainTomorrow"],
        df_test_under[filtro_correlacao],
        df_test_under["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 11:41:39.988127 <====
====> 2024-11-23 11:41:39.992140 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 11:43:10.409791 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:01:30.417651


====> 2024-11-23 11:43:10.409791 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 11:44:37.291346 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:01:26.881555


====> 2024-11-23 11:44:37.291346 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 11:46:07.292285 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:01:30.000939


====> 2024-11-23 11:46:07.293285 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 11:47:40.037442 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:01:32.744157


====> 2024-11-23 11:47:40.037442 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 11:49:08.9033

In [79]:
results_list_rf_corr_under.to_csv(
    "../../DATA/RESULTS/results_list_rf_corr_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [80]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_corr_full = (
    run_cross_validation(
        df_train_full[filtro_correlacao],
        df_train_full["RainTomorrow"],
        df_test_full[filtro_correlacao],
        df_test_full["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 11:49:09.168346 <====
====> 2024-11-23 11:49:09.176857 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 11:52:22.237727 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:03:13.060870


====> 2024-11-23 11:52:22.237727 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 11:55:33.358176 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:03:11.120449


====> 2024-11-23 11:55:33.359179 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 11:58:46.888117 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:03:13.528938


====> 2024-11-23 11:58:46.889117 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 12:01:59.552944 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:03:12.663827


====> 2024-11-23 12:01:59.552944 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 12:05:11.4795

In [81]:
results_list_rf_corr_full.to_csv(
    "../../DATA/RESULTS/results_list_rf_corr_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [82]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_corr_smote = (
    run_cross_validation(
        df_train_smote[filtro_correlacao],
        df_train_smote["RainTomorrow"],
        df_test_smote[filtro_correlacao],
        df_test_smote["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 12:05:11.690624 <====
====> 2024-11-23 12:05:11.721732 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 12:12:23.969096 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:07:12.247364


====> 2024-11-23 12:12:23.970109 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 12:19:26.770069 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:07:02.799960


====> 2024-11-23 12:19:26.771029 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 12:26:21.739287 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:06:54.968258


====> 2024-11-23 12:26:21.740284 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 12:33:22.853197 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:07:01.112913


====> 2024-11-23 12:33:22.853197 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 12:40:26.3202

In [83]:
results_list_rf_corr_smote.to_csv(
    "../../DATA/RESULTS/results_list_rf_corr_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [84]:
del results_list_rf_corr_smote, results_list_rf_corr_full, results_list_rf_corr_under
gc.collect()

470

#### RANDOM FOREST - FEATURE IMPORTANCE

##### UNDERSAMPLING

In [85]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_rf_under = (
    run_cross_validation(
        df_train_under[filtro_random_forest],
        df_train_under["RainTomorrow"],
        df_test_under[filtro_random_forest],
        df_test_under["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 12:40:26.842727 <====
====> 2024-11-23 12:40:26.846129 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 12:43:28.364393 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:03:01.518264


====> 2024-11-23 12:43:28.364393 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 12:46:26.311119 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:02:57.946726


====> 2024-11-23 12:46:26.311119 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 12:49:22.401832 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:02:56.090713


====> 2024-11-23 12:49:22.401832 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 12:52:17.643063 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:02:55.241231


====> 2024-11-23 12:52:17.644024 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 12:55:16.3341

In [86]:
results_list_rf_rf_under.to_csv(
    "../../DATA/RESULTS/results_list_rf_rf_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [87]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_rf_full = (
    run_cross_validation(
        df_train_full[filtro_random_forest],
        df_train_full["RainTomorrow"],
        df_test_full[filtro_random_forest],
        df_test_full["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 12:55:16.589414 <====
====> 2024-11-23 12:55:16.602551 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 13:02:10.997247 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:06:54.394696


====> 2024-11-23 13:02:10.997247 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 13:08:58.993925 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:06:47.996678


====> 2024-11-23 13:08:58.994924 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 13:16:09.433056 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:07:10.438132


====> 2024-11-23 13:16:09.434057 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 13:23:30.952620 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:07:21.518563


====> 2024-11-23 13:23:30.954646 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 13:30:37.1233

In [None]:
results_list_rf_rf_fullresults_list_rf_rf_full.to_csv(
    "../../DATA/RESULTS/.csv", sep="|", index=False
)

##### OVERSAMPLING

In [89]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_rf_smote = (
    run_cross_validation(
        df_train_smote[filtro_random_forest],
        df_train_smote["RainTomorrow"],
        df_test_smote[filtro_random_forest],
        df_test_smote["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 13:30:37.375249 <====
====> 2024-11-23 13:30:37.385025 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 13:48:50.866876 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:18:13.481851


====> 2024-11-23 13:48:50.867873 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 14:05:50.879381 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:17:00.011508


====> 2024-11-23 14:05:50.879381 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 14:21:54.017192 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:16:03.137811


====> 2024-11-23 14:21:54.017731 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 14:38:10.478867 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:16:16.461136


====> 2024-11-23 14:38:10.479825 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-23 14:53:38.3633

In [90]:
results_list_rf_rf_smote.to_csv(
    "../../DATA/RESULTS/results_list_rf_rf_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [91]:
del results_list_rf_rf_smote, results_list_rf_rf_full, results_list_rf_rf_under
gc.collect()

446

### SVM

#### TODAS AS CARACTERÍSTICAS

##### UNDERSAMPLING

In [78]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_under = (
    run_cross_validation(
        df_train_under.drop(columns=["RainTomorrow"]),
        df_train_under["RainTomorrow"],
        df_test_under.drop(columns=["RainTomorrow"]),
        df_test_under["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 20:09:37.862286 <====
====> 2024-11-23 20:09:37.866295 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-23 20:14:47.296921 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:05:09.430626


====> 2024-11-23 20:14:47.296921 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-23 20:19:52.040552 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:05:04.743631


====> 2024-11-23 20:19:52.041553 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-23 20:25:02.820559 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:05:10.779006


====> 2024-11-23 20:25:02.821577 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-23 20:30:10.959380 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:05:08.137803


====> 2024-11-23 20:30:10.960380 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-23 20:35:23.226740 | 

In [79]:
results_list_svm_under.to_csv(
    "../../DATA/RESULTS/results_list_svm_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [80]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_full = (
    run_cross_validation(
        df_train_full.drop(columns=["RainTomorrow"]),
        df_train_full["RainTomorrow"],
        df_test_full.drop(columns=["RainTomorrow"]),
        df_test_full["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 20:35:23.263897 <====
====> 2024-11-23 20:35:23.273404 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-23 21:39:58.019291 | FINALIZADO FOLD 1 | TEMPO TOTAL 1:04:34.745887


====> 2024-11-23 21:39:58.019291 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-23 22:49:57.704445 | FINALIZADO FOLD 2 | TEMPO TOTAL 1:09:59.685154


====> 2024-11-23 22:49:57.704445 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-23 23:58:23.340977 | FINALIZADO FOLD 3 | TEMPO TOTAL 1:08:25.636532


====> 2024-11-23 23:58:23.340977 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 01:04:01.106793 | FINALIZADO FOLD 4 | TEMPO TOTAL 1:05:37.765816


====> 2024-11-24 01:04:01.106793 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 02:11:24.686125 | 

In [81]:
results_list_svm_full.to_csv(
    "../../DATA/RESULTS/results_list_svm_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [82]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_smote = (
    run_cross_validation(
        df_train_smote.drop(columns=["RainTomorrow"]),
        df_train_smote["RainTomorrow"],
        df_test_smote.drop(columns=["RainTomorrow"]),
        df_test_smote["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-24 02:11:24.792522 <====
====> 2024-11-24 02:11:24.803741 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 03:10:32.703776 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:59:07.900035


====> 2024-11-24 03:10:32.704757 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 05:13:23.647742 | FINALIZADO FOLD 2 | TEMPO TOTAL 2:02:50.942985


====> 2024-11-24 05:13:23.648741 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 07:44:03.132253 | FINALIZADO FOLD 3 | TEMPO TOTAL 2:30:39.483512


====> 2024-11-24 07:44:03.132253 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 10:27:24.627655 | FINALIZADO FOLD 4 | TEMPO TOTAL 2:43:21.495402


====> 2024-11-24 10:27:24.627655 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 13:00:46.406310 | 

In [83]:
results_list_svm_smote.to_csv(
    "../../DATA/RESULTS/results_list_svm_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [84]:
del results_list_svm_smote, results_list_svm_under, results_list_svm_full
gc.collect()

78

#### PCA

##### UNDERSAMPLING

In [85]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_pca_under = (
    run_cross_validation(
        df_train_under_pca.drop(columns=["RainTomorrow"]),
        df_train_under_pca["RainTomorrow"],
        df_test_under_pca.drop(columns=["RainTomorrow"]),
        df_test_under_pca["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-24 13:00:46.915264 <====
====> 2024-11-24 13:00:46.917460 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 13:01:45.933449 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:59.015989


====> 2024-11-24 13:01:45.933449 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 13:02:36.559061 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:50.625612


====> 2024-11-24 13:02:36.560673 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 13:03:27.085294 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:50.524621


====> 2024-11-24 13:03:27.085845 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 13:04:22.768547 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:55.682702


====> 2024-11-24 13:04:22.768547 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 13:05:20.172025 | 

In [86]:
results_list_svm_pca_under.to_csv(
    "../../DATA/RESULTS/results_list_svm_pca_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [87]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_pca_full = (
    run_cross_validation(
        df_train_full_pca.drop(columns=["RainTomorrow"]),
        df_train_full_pca["RainTomorrow"],
        df_test_full_pca.drop(columns=["RainTomorrow"]),
        df_test_full_pca["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-24 13:05:20.210267 <====
====> 2024-11-24 13:05:20.221722 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 13:08:09.559817 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:02:49.338095


====> 2024-11-24 13:08:09.559817 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 13:10:54.486215 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:02:44.926398


====> 2024-11-24 13:10:54.486215 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 13:13:36.803498 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:02:42.317283


====> 2024-11-24 13:13:36.804460 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 13:16:22.537722 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:02:45.733262


====> 2024-11-24 13:16:22.537831 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 13:19:06.740982 | 

In [88]:
results_list_svm_pca_full.to_csv(
    "../../DATA/RESULTS/results_list_svm_pca_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [89]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_pca_smote = (
    run_cross_validation(
        df_train_smote_pca.drop(columns=["RainTomorrow"]),
        df_train_smote_pca["RainTomorrow"],
        df_test_smote_pca.drop(columns=["RainTomorrow"]),
        df_test_smote_pca["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-24 13:19:06.780223 <====
====> 2024-11-24 13:19:06.792744 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 13:34:32.126481 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:15:25.333737


====> 2024-11-24 13:34:32.128445 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 13:51:02.283230 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:16:30.154785


====> 2024-11-24 13:51:02.283732 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 14:07:34.810435 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:16:32.526703


====> 2024-11-24 14:07:34.810435 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 14:24:08.546598 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:16:33.736163


====> 2024-11-24 14:24:08.547602 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 14:41:05.129407 | 

In [90]:
results_list_svm_pca_smote.to_csv(
    "../../DATA/RESULTS/results_list_svm_pca_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [91]:
del results_list_svm_pca_smote, results_list_svm_pca_under, results_list_svm_pca_full
gc.collect()

26

#### CORRELAÇÃO

##### UNDERSAMPLING

In [37]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_corr_under = (
    run_cross_validation(
        df_train_under[filtro_correlacao],
        df_train_under["RainTomorrow"],
        df_test_under[filtro_correlacao],
        df_test_under["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-24 17:42:57.450548 <====
====> 2024-11-24 17:42:57.454554 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 17:44:30.898215 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:01:33.443661


====> 2024-11-24 17:44:30.899315 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 17:46:06.227864 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:01:35.328549


====> 2024-11-24 17:46:06.228880 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 17:47:38.809709 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:01:32.580829


====> 2024-11-24 17:47:38.810708 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 17:49:07.837890 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:01:29.027182


====> 2024-11-24 17:49:07.837890 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 17:50:55.936104 | 

In [38]:
results_list_svm_corr_under.to_csv(
    "../../DATA/RESULTS/results_list_svm_corr_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [34]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_corr_full = (
    run_cross_validation(
        df_train_full[filtro_correlacao],
        df_train_full["RainTomorrow"],
        df_test_full[filtro_correlacao],
        df_test_full["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-24 20:44:52.663327 <====
====> 2024-11-24 20:44:52.676990 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 23:38:02.879473 | FINALIZADO FOLD 1 | TEMPO TOTAL 2:53:10.202483


====> 2024-11-24 23:38:02.881394 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-25 02:41:59.130160 | FINALIZADO FOLD 2 | TEMPO TOTAL 3:03:56.248766


====> 2024-11-25 02:41:59.130160 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-25 04:06:35.958569 | FINALIZADO FOLD 3 | TEMPO TOTAL 1:24:36.828409


====> 2024-11-25 04:06:35.958569 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-25 06:57:07.075623 | FINALIZADO FOLD 4 | TEMPO TOTAL 2:50:31.117054


====> 2024-11-25 06:57:07.076123 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-25 08:09:42.299960 | 

In [35]:
results_list_svm_corr_full.to_csv(
    "../../DATA/RESULTS/results_list_svm_corr_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [36]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_corr_smote = (
    run_cross_validation(
        df_train_smote[filtro_correlacao],
        df_train_smote["RainTomorrow"],
        df_test_smote[filtro_correlacao],
        df_test_smote["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-25 08:09:42.388970 <====
====> 2024-11-25 08:09:42.403966 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-25 08:22:59.101950 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:13:16.697984


====> 2024-11-25 08:22:59.101950 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-25 08:36:45.335582 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:13:46.233632


====> 2024-11-25 08:36:45.335582 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-25 08:50:37.829167 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:13:52.493585


====> 2024-11-25 08:50:37.829167 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-25 09:04:34.245218 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:13:56.416051


====> 2024-11-25 09:04:34.245218 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-25 09:18:35.888146 | 

In [37]:
results_list_svm_corr_smote.to_csv(
    "../../DATA/RESULTS/results_list_svm_corr_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [None]:
del results_list_svm_corr_smote, results_list_svm_corr_full, results_list_svm_corr_under
gc.collect()

#### RANDOM FOREST - FEATURE IMPORTANCE

##### UNDERSAMPLING

In [40]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_rf_under = (
    run_cross_validation(
        df_train_under[filtro_random_forest],
        df_train_under["RainTomorrow"],
        df_test_under[filtro_random_forest],
        df_test_under["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-24 18:29:29.394547 <====
====> 2024-11-24 18:29:29.399608 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 18:32:55.290291 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:03:25.890683


====> 2024-11-24 18:32:55.291314 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 18:36:00.246119 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:03:04.954805


====> 2024-11-24 18:36:00.247104 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 18:38:55.674744 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:02:55.427640


====> 2024-11-24 18:38:55.675283 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 18:41:59.224165 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:03:03.548882


====> 2024-11-24 18:41:59.224739 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-24 18:44:55.395694 | 

In [41]:
results_list_svm_rf_under.to_csv(
    "../../DATA/RESULTS/results_list_svm_rf_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [38]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_rf_full = (
    run_cross_validation(
        df_train_full[filtro_random_forest],
        df_train_full["RainTomorrow"],
        df_test_full[filtro_random_forest],
        df_test_full["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-25 09:18:35.940454 <====
====> 2024-11-25 09:18:35.951192 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-25 09:33:15.146768 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:14:39.195576


====> 2024-11-25 09:33:15.146768 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-25 09:47:50.671285 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:14:35.524517


====> 2024-11-25 09:47:50.671285 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-25 10:02:29.261540 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:14:38.590255


====> 2024-11-25 10:02:29.262537 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-25 10:17:21.976350 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:14:52.713813


====> 2024-11-25 10:17:21.977361 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-25 10:32:21.022796 | 

In [39]:
results_list_svm_rf_full.to_csv(
    "../../DATA/RESULTS/results_list_svm_rf_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [34]:
param_grid_categorical = {
    "C": [10, 100],
    "kernel": ["rbf", "linear"],
    "gamma": [0.01, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_svm_rf_smote = (
    run_cross_validation(
        df_train_smote[filtro_random_forest],
        df_train_smote["RainTomorrow"],
        df_test_smote[filtro_random_forest],
        df_test_smote["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-25 18:16:48.558043 <====
====> 2024-11-25 18:16:48.574123 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-25 19:20:12.462904 | FINALIZADO FOLD 1 | TEMPO TOTAL 1:03:23.888781


====> 2024-11-25 19:20:12.463950 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-25 20:26:17.743330 | FINALIZADO FOLD 2 | TEMPO TOTAL 1:06:05.279380


====> 2024-11-25 20:26:17.744292 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-25 21:36:54.909887 | FINALIZADO FOLD 3 | TEMPO TOTAL 1:10:37.165595


====> 2024-11-25 21:36:54.910878 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-25 22:44:06.742668 | FINALIZADO FOLD 4 | TEMPO TOTAL 1:07:11.831790


====> 2024-11-25 22:44:06.743634 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-25 23:51:03.544238 | 

In [35]:
results_list_svm_rf_smote.to_csv(
    "../../DATA/RESULTS/results_list_svm_rf_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [None]:
del results_list_svm_rf_smote, results_list_svm_rf_full, results_list_svm_rf_under
gc.collect()

### MLP

#### TODAS AS CARACTERÍSTICAS

##### UNDERSAMPLING

In [50]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_under = (
    run_cross_validation(
        df_train_under.drop(columns=["RainTomorrow"]),
        df_train_under["RainTomorrow"],
        df_test_under.drop(columns=["RainTomorrow"]),
        df_test_under["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 19:13:23.080195 <====
====> 2024-11-23 19:13:23.083233 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:14:24.287887 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:01:01.204654


====> 2024-11-23 19:14:24.287887 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:15:14.253135 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:49.965248


====> 2024-11-23 19:15:14.254639 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:16:04.946751 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:50.692112


====> 2024-11-23 19:16:04.947804 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:17:03.385763 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:58.437959


====> 2024-11-23 19:17:03.386801 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:17:56.9098

In [51]:
results_list_mlp_under.to_csv(
    "../../DATA/RESULTS/results_list_mlp_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [52]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_full = (
    run_cross_validation(
        df_train_full.drop(columns=["RainTomorrow"]),
        df_train_full["RainTomorrow"],
        df_test_full.drop(columns=["RainTomorrow"]),
        df_test_full["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 19:17:56.947052 <====
====> 2024-11-23 19:17:56.954718 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:19:53.533429 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:01:56.578711


====> 2024-11-23 19:19:53.533429 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:21:54.891139 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:02:01.357710


====> 2024-11-23 19:21:54.891139 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:23:47.733852 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:01:52.842713


====> 2024-11-23 19:23:47.734832 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:25:39.584858 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:01:51.850026


====> 2024-11-23 19:25:39.584858 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:27:39.1909

In [53]:
results_list_mlp_full.to_csv(
    "../../DATA/RESULTS/results_list_mlp_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [54]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_smote = (
    run_cross_validation(
        df_train_smote.drop(columns=["RainTomorrow"]),
        df_train_smote["RainTomorrow"],
        df_test_smote.drop(columns=["RainTomorrow"]),
        df_test_smote["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 19:27:39.238780 <====
====> 2024-11-23 19:27:39.250313 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:30:11.251392 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:02:32.001079


====> 2024-11-23 19:30:11.252347 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:32:31.834326 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:02:20.581979


====> 2024-11-23 19:32:31.834326 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:34:47.403034 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:02:15.568708


====> 2024-11-23 19:34:47.403034 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:37:07.775053 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:02:20.372019


====> 2024-11-23 19:37:07.775053 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:39:36.4480

In [55]:
results_list_mlp_smote.to_csv(
    "../../DATA/RESULTS/results_list_mlp_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [56]:
del results_list_mlp_smote, results_list_mlp_under, results_list_mlp_full
gc.collect()

981

#### PCA

##### UNDERSAMPLING

In [57]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_pca_under = (
    run_cross_validation(
        df_train_under_pca.drop(columns=["RainTomorrow"]),
        df_train_under_pca["RainTomorrow"],
        df_test_under_pca.drop(columns=["RainTomorrow"]),
        df_test_under_pca["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 19:39:36.591353 <====
====> 2024-11-23 19:39:36.594352 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 24 candidates, totalling 72 fits


===> 2024-11-23 19:39:46.481149 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:09.886797


====> 2024-11-23 19:39:46.481149 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:39:55.952476 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:09.471327


====> 2024-11-23 19:39:55.952476 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:40:05.808987 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:09.856511


====> 2024-11-23 19:40:05.808987 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:40:17.185589 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:11.376602


====> 2024-11-23 19:40:17.186587 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:40:29.764080 | FINALIZADO FOLD 5 | TEMPO TOTAL 0:00:12.577493


Resultados na validação cruzada: {'f1': 0.5641451591333341, 'accuracy': 0.6329757777239247, 'precision': 0.5401231696699631, 

In [58]:
results_list_mlp_pca_under.to_csv(
    "../../DATA/RESULTS/results_list_mlp_pca_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [59]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_pca_full = (
    run_cross_validation(
        df_train_full_pca.drop(columns=["RainTomorrow"]),
        df_train_full_pca["RainTomorrow"],
        df_test_full_pca.drop(columns=["RainTomorrow"]),
        df_test_full_pca["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 19:40:29.793015 <====
====> 2024-11-23 19:40:29.800323 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:41:05.603275 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:35.802952


====> 2024-11-23 19:41:05.603922 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:41:35.974877 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:30.370955


====> 2024-11-23 19:41:35.975868 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:42:07.658196 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:31.682328


====> 2024-11-23 19:42:07.658196 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:42:43.474634 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:35.816438


====> 2024-11-23 19:42:43.475884 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:43:13.2185

In [60]:
results_list_mlp_pca_full.to_csv(
    "../../DATA/RESULTS/results_list_mlp_pca_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [61]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_pca_smote = (
    run_cross_validation(
        df_train_smote_pca.drop(columns=["RainTomorrow"]),
        df_train_smote_pca["RainTomorrow"],
        df_test_smote_pca.drop(columns=["RainTomorrow"]),
        df_test_smote_pca["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 19:43:13.249750 <====
====> 2024-11-23 19:43:13.261277 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:43:58.610957 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:45.349680


====> 2024-11-23 19:43:58.611951 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:44:40.715636 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:42.103685


====> 2024-11-23 19:44:40.716654 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:45:20.708592 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:39.991938


====> 2024-11-23 19:45:20.709684 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:46:06.844693 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:46.135009


====> 2024-11-23 19:46:06.844693 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:46:49.0316

In [62]:
results_list_mlp_pca_smote.to_csv(
    "../../DATA/RESULTS/results_list_mlp_pca_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [63]:
del results_list_mlp_pca_smote, results_list_mlp_pca_under, results_list_mlp_pca_full
gc.collect()

130

#### CORRELAÇÃO

##### UNDERSAMPLING

In [64]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_corr_under = (
    run_cross_validation(
        df_train_under[filtro_correlacao],
        df_train_under["RainTomorrow"],
        df_test_under[filtro_correlacao],
        df_test_under["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 19:46:49.165538 <====
====> 2024-11-23 19:46:49.170747 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 24 candidates, totalling 72 fits


===> 2024-11-23 19:47:11.818836 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:22.648089


====> 2024-11-23 19:47:11.819836 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:47:34.528907 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:22.709071


====> 2024-11-23 19:47:34.529910 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:47:56.396729 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:21.866819


====> 2024-11-23 19:47:56.396729 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:48:18.410731 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:22.014002


====> 2024-11-23 19:48:18.412331 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:48:40.372807 | FINALIZADO FOLD 5 | TEMPO TOTAL 0:00:21.960476


Resultados na validação cruzada: {'f1': 0.7576235356533327, 'accuracy': 0.7581981074275677, 'precision': 0.7606398146555641, 

In [65]:
results_list_mlp_corr_under.to_csv(
    "../../DATA/RESULTS/results_list_mlp_corr_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [66]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_corr_full = (
    run_cross_validation(
        df_train_full[filtro_correlacao],
        df_train_full["RainTomorrow"],
        df_test_full[filtro_correlacao],
        df_test_full["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 19:48:40.404516 <====
====> 2024-11-23 19:48:40.414335 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:49:14.300195 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:33.885860


====> 2024-11-23 19:49:14.300195 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:49:47.404807 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:33.104612


====> 2024-11-23 19:49:47.404807 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:50:20.764267 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:33.359460


====> 2024-11-23 19:50:20.764267 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:50:56.296135 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:35.531868


====> 2024-11-23 19:50:56.296135 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:51:34.0574

In [67]:
results_list_mlp_corr_full.to_csv(
    "../../DATA/RESULTS/results_list_mlp_corr_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [68]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_corr_smote = (
    run_cross_validation(
        df_train_smote[filtro_correlacao],
        df_train_smote["RainTomorrow"],
        df_test_smote[filtro_correlacao],
        df_test_smote["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 19:51:34.090099 <====
====> 2024-11-23 19:51:34.104190 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:52:37.862222 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:01:03.758032


====> 2024-11-23 19:52:37.863164 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:53:35.411626 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:57.548462


====> 2024-11-23 19:53:35.412633 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:54:31.561588 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:56.148955


====> 2024-11-23 19:54:31.561588 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:55:23.119586 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:51.557998


====> 2024-11-23 19:55:23.119586 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:56:17.3820

In [69]:
results_list_mlp_corr_smote.to_csv(
    "../../DATA/RESULTS/results_list_mlp_corr_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [70]:
del results_list_mlp_corr_smote, results_list_mlp_corr_full, results_list_mlp_corr_under
gc.collect()

130

#### RANDOM FOREST - FEATURE IMPORTANCE

##### UNDERSAMPLING

In [71]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_rf_under = (
    run_cross_validation(
        df_train_under[filtro_random_forest],
        df_train_under["RainTomorrow"],
        df_test_under[filtro_random_forest],
        df_test_under["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 19:56:17.531682 <====
====> 2024-11-23 19:56:17.535206 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:56:43.588800 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:26.053594


====> 2024-11-23 19:56:43.590309 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:57:12.335159 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:28.744850


====> 2024-11-23 19:57:12.336153 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:57:42.143740 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:29.807587


====> 2024-11-23 19:57:42.143740 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:58:07.799302 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:25.655562


====> 2024-11-23 19:58:07.800303 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:58:33.4338

In [72]:
results_list_mlp_rf_under.to_csv(
    "../../DATA/RESULTS/results_list_mlp_rf_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [73]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_rf_full = (
    run_cross_validation(
        df_train_full[filtro_random_forest],
        df_train_full["RainTomorrow"],
        df_test_full[filtro_random_forest],
        df_test_full["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 19:58:33.463047 <====
====> 2024-11-23 19:58:33.470064 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 19:59:20.508362 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:47.038298


====> 2024-11-23 19:59:20.508362 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 20:00:04.387910 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:43.879548


====> 2024-11-23 20:00:04.387910 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 20:00:55.156343 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:50.768433


====> 2024-11-23 20:00:55.157344 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 20:01:41.172508 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:46.015164


====> 2024-11-23 20:01:41.173507 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 20:02:26.7491

In [74]:
results_list_mlp_rf_full.to_csv(
    "../../DATA/RESULTS/results_list_mlp_rf_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [75]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_mlp_rf_smote = (
    run_cross_validation(
        df_train_smote[filtro_random_forest],
        df_train_smote["RainTomorrow"],
        df_test_smote[filtro_random_forest],
        df_test_smote["RainTomorrow"],
        MLPClassifier(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-23 20:02:26.780965 <====
====> 2024-11-23 20:02:26.791552 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 20:03:56.307886 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:01:29.516334


====> 2024-11-23 20:03:56.307886 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 20:05:24.480524 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:01:28.172638


====> 2024-11-23 20:05:24.481526 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 20:06:47.324831 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:01:22.843305


====> 2024-11-23 20:06:47.324831 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 20:08:10.264319 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:01:22.939488


====> 2024-11-23 20:08:10.264319 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-23 20:09:37.7281

In [76]:
results_list_mlp_rf_smote.to_csv(
    "../../DATA/RESULTS/results_list_mlp_rf_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [77]:
del results_list_mlp_rf_smote, results_list_mlp_rf_full, results_list_mlp_rf_under
gc.collect()

130

# FINAL DO PROCESSO

In [None]:
final_processo = datetime.now()
print(
    f"PROCESSO DE MODELAGEM FINALIZADO EM: {final_processo}\n\n => TOTAL UTILIZADO: {final_processo - inicio_processo}"
)

PROCESSO DE MODELAGEM FINALIZADO EM: 2024-11-25 23:51:03.590744

 => TOTAL UTILIZADO: 5:34:43.254063
