# IMPORTAÇÃO DE BIBLIOTECAS

In [1]:
# MANIPULAÇÃO DE DADOS
import pandas as pd
import numpy as np
from datetime import datetime

# VISUALIZAÇÃO DE DADOS
import matplotlib.pyplot as plt

# TRANSFORMAÇÕES
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from imblearn import over_sampling, under_sampling


# PREPARAÇÃO TREINO E AVALIAÇÃO
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# MODELOS UTILIZADOS
from mixed_naive_bayes import MixedNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# CONFIGURAÇÕES DE EXIBIÇÃO
import gc
import warnings

warnings.filterwarnings("ignore")

# pd.set_option('display.max_rows', None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)

In [None]:
inicio_processo = datetime.now()
print(f'PROCESSO DE MODELAGEM INICIADO EM: {inicio_processo}')

# IMPORTAÇÃO DE DADOS

In [2]:
weather_aus = pd.read_csv("../../DATA/weatherAUS.csv").dropna()

print(
    f"O dataset possui {weather_aus.shape[0]:,} instâncias (linhas) e {weather_aus.shape[1]:,} características (colunas)."
)

print(f"As características (colunas) do dataset são: {weather_aus.columns.to_list()}")

O dataset possui 56,420 instâncias (linhas) e 23 características (colunas).
As características (colunas) do dataset são: ['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RainTomorrow']


# FUNÇÕES PARA PRÉ-PROCESSAMENTO

## SEPARA TREINO E TESTE (80/20)

In [3]:
def split_data(df, target_column):
    """Split ``df`` into a stratified 80/20 train/test pair.

    Args:
        df: DataFrame holding the features and the target column.
        target_column: Name of the column used as target and for stratification.

    Returns:
        Tuple ``(df_train, df_test)``; each frame has the feature columns
        followed by the target column.
    """
    features = df.drop(columns=[target_column])
    target = df[target_column]

    # stratify=target keeps the class proportions equal in both partitions.
    parts = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )
    features_train, features_test, target_train, target_test = parts

    return (
        pd.concat([features_train, target_train], axis=1),
        pd.concat([features_test, target_test], axis=1),
    )

## AJUSTA O FORMATO DO ALVO

In [4]:
def adjust_data_types(df):
    """Return a copy of ``df`` with ``Date`` parsed and the target encoded.

    ``Date`` is parsed with the ``%Y-%m-%d`` format and ``RainTomorrow`` is
    mapped from ``"Yes"``/``"No"`` to ``1``/``0``.

    The input frame is left untouched: mapping an already-encoded target a
    second time would turn every value into NaN, so the conversion is done on
    a copy instead of in place.

    Args:
        df: DataFrame with string ``Date`` and ``RainTomorrow`` columns.

    Returns:
        A new DataFrame with the converted columns.
    """
    df = df.copy()
    df["Date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")
    df["RainTomorrow"] = df["RainTomorrow"].map({"Yes": 1, "No": 0})

    return df


## CRIAÇÃO DE VARIÁVEIS AUXILIARES

In [None]:
def create_auxiliar_columns(df):
    """Select the modelling columns and derive the auxiliary features.

    Steps:
      1. keep the fixed list of columns below;
      2. add ``RangeTemp`` (``MaxTemp - MinTemp``), ``month`` and ``season``;
      3. one-hot encode ``month``/``season`` and the two wind-direction
         columns (first level dropped in every case);
      4. concatenate numeric, wind dummies and time dummies.

    ``Date`` must already be a datetime column and ``RainTomorrow`` numeric.
    The input frame is not modified.

    Args:
        df: DataFrame containing at least the selected columns.

    Returns:
        Tuple ``(features, names)`` where ``features`` is the encoded frame
        (without ``month``, ``Date`` and ``season``) and ``names`` is a dict
        with the keys ``numerical_columns``, ``categorical_columns`` and
        ``time_encoded`` listing the corresponding output column names.
    """
    # Quarter index -> season; December-February maps to summer.
    season_by_quarter = {1: "summer", 2: "autumn", 3: "winter", 4: "spring"}

    def get_season(month_number) -> str:
        # .get keeps the original "unknown -> None" behaviour for NaN months.
        return season_by_quarter.get(month_number % 12 // 3 + 1)

    # Columns selected based on the previous evaluations.
    selected_columns = [
        "Date",
        "MinTemp",
        "MaxTemp",
        "Rainfall",
        "Evaporation",
        "Sunshine",
        "WindGustDir",
        "WindGustSpeed",
        "WindDir3pm",
        "WindSpeed9am",
        "WindSpeed3pm",
        "Humidity9am",
        "Humidity3pm",
        "Pressure3pm",
        "Cloud9am",
        "Cloud3pm",
        "RainTomorrow",
    ]
    # Explicit copy: the columns added below must not be written onto a
    # slice of the caller's frame.
    df = df[selected_columns].copy()

    df["RangeTemp"] = df["MaxTemp"] - df["MinTemp"]

    df["month"] = df["Date"].dt.month
    df["season"] = df["month"].apply(get_season)

    time_encoded = pd.get_dummies(
        df[["month", "season"]],
        columns=["month", "season"],
        drop_first=True,
        prefix=["month", "season"],
        dtype=int,
    )

    # Split categorical and numerical variables and encode the wind
    # directions. The non-numeric columns are named explicitly so the result
    # does not depend on how pandas labels string columns; "Date" and
    # "season" pass through here and are dropped before returning.
    wind_columns = ["WindGustDir", "WindDir3pm"]
    categorical_columns = pd.get_dummies(
        df[["Date", "WindGustDir", "WindDir3pm", "season"]],
        columns=wind_columns,
        drop_first=True,
        prefix=wind_columns,
        dtype=int,
    )

    numerical_columns = df.select_dtypes("number")

    df = pd.concat([numerical_columns, categorical_columns, time_encoded], axis=1)

    return df.drop(columns=["month", "Date", "season"]), {
        "numerical_columns": numerical_columns.drop(
            columns=["RainTomorrow", "month"]
        ).columns.to_list(),
        "categorical_columns": categorical_columns.drop(
            columns=["Date", "season"]
        ).columns.to_list(),
        "time_encoded": time_encoded.columns.to_list(),
    }


## APLICA AS TRANSFORMAÇÕES

In [None]:
def instance_transformations(n_components=8):
    """Create the unfitted transformers and samplers used in pre-processing.

    Args:
        n_components: Number of principal components kept by the PCA.

    Returns:
        Tuple ``(discretizer, scaler, pca, smote, under)``: a 5-bin ordinal
        k-means discretizer, a standard scaler, a PCA, a SMOTE over-sampler
        and a random under-sampler. Every seeded object uses seed 42.
    """
    seed = 42

    scaler = StandardScaler()
    pca = PCA(n_components=n_components, random_state=seed)
    discretizer = KBinsDiscretizer(
        n_bins=5, encode="ordinal", strategy="kmeans", random_state=seed
    )

    smote = over_sampling.SMOTE(random_state=seed)
    under = under_sampling.RandomUnderSampler(random_state=seed)

    return discretizer, scaler, pca, smote, under

In [7]:
def adjust_train_volume(df, target_column, smote, under):
    """Build an over-sampled and an under-sampled version of ``df``.

    Args:
        df: Training frame with features and target.
        target_column: Name of the target column.
        smote: Sampler exposing ``fit_resample(X, y)`` used for over-sampling.
        under: Sampler exposing ``fit_resample(X, y)`` used for under-sampling.

    Returns:
        Tuple ``(df_smote, df_under)``; each frame has the resampled features
        followed by the resampled target.
    """
    features = df.drop(columns=[target_column])
    target = df[target_column]

    def resampled_frame(sampler):
        # Resample from the original features/target and glue them back together.
        sampled_features, sampled_target = sampler.fit_resample(features, target)
        return pd.concat([sampled_features, sampled_target], axis=1)

    df_smote = resampled_frame(smote)
    df_under = resampled_frame(under)

    return df_smote, df_under

In [8]:
def fit_transformmations(df, cols, discretizer, scaler, pca, discrete_col="Rainfall"):
    """Fit the discretizer, the scaler and the PCA on ``df``.

    The discretizer is fitted on the raw ``discrete_col`` values and the
    scaler on the raw ``cols``. The PCA is fitted on the *scaled* ``cols``
    because ``transform_data`` applies the scaler before projecting; fitting
    it on raw values would make the projection use a mean and components
    that do not match the data it later receives.

    ``df`` itself is not modified.

    Args:
        df: Training frame.
        cols: Numeric columns to scale and feed to the PCA.
        discretizer: Object exposing ``fit(X)``.
        scaler: Object exposing ``fit(X)`` and ``transform(X)``.
        pca: Object exposing ``fit(X)``.
        discrete_col: Column to discretize.

    Returns:
        The same ``(discretizer, scaler, pca)`` objects, now fitted.
    """
    discretizer.fit(df[discrete_col].values.reshape(-1, 1))

    features = df[cols]
    scaler.fit(features)

    # Keep the column labels so the PCA is fitted on the same kind of input
    # (a labelled frame) that it receives at transform time.
    scaled_features = pd.DataFrame(
        scaler.transform(features), columns=features.columns, index=features.index
    )
    pca.fit(scaled_features)

    return discretizer, scaler, pca


In [9]:
def transform_data(df, cols, discretizer, scaler, pca, discrete_col="Rainfall"):
    """Apply the fitted discretizer, scaler and PCA to ``df``.

    The work is done on a copy: the same frame may be passed to this function
    several times with different fitted transformers, and writing the scaled
    values back into the caller's frame would make every later call scale
    (and discretize) already-scaled data.

    Args:
        df: Frame containing ``cols``, ``discrete_col`` and ``RainTomorrow``.
        cols: Numeric columns to scale and project.
        discretizer: Fitted object exposing ``transform(X)``.
        scaler: Fitted object exposing ``transform(X)``.
        pca: Fitted object exposing ``transform(X)`` and ``n_components_``.
        discrete_col: Column to discretize; the result is stored in
            ``<discrete_col>_Discretized``.

    Returns:
        Tuple ``(df_scaled, df_pca)``. ``df_scaled`` is the copy with scaled
        ``cols``, the extra discretized column and a fresh 0..n-1 index.
        ``df_pca`` holds the columns ``PC1..PCk`` plus ``RainTomorrow``.
    """
    # Never write into the caller's frame (see docstring).
    df = df.copy().reset_index(drop=True)

    # Discretize the raw values before they are replaced by the scaled ones.
    discretized = discretizer.transform(df[discrete_col].values.reshape(-1, 1))
    df[discrete_col + "_Discretized"] = np.asarray(discretized).ravel()

    df[cols] = scaler.transform(df[cols])

    pca_result = pca.transform(df[cols])

    df_pca = pd.DataFrame(
        data=pca_result,
        columns=[f"PC{comp}" for comp in range(1, pca.n_components_ + 1)],
    )
    # Both frames share the same 0..n-1 index, so the target aligns row by row.
    df_pca["RainTomorrow"] = df["RainTomorrow"]

    return df, df_pca

# INICIO DO PRÉ-PROCESSAMENTO

## SEPARA TREINO E TESTE (80/20)

In [25]:
df_train, df_test = split_data(weather_aus, "RainTomorrow")

## APLICAÇÕES SOBRE TREINO

### AJUSTA O TIPO DE DADO

In [26]:
df_train = adjust_data_types(df_train)

### CRIA VARIÁVEIS AUXILIARES

In [27]:
df_train, columns_names = create_auxiliar_columns(df_train)

### APLICA AS TRANSFORMAÇÕES

In [13]:
discretizer, scaler, pca, smote, under = instance_transformations()

#### AJUSTA VOLUME DA RESPOSTA

In [None]:
alvo = df_train.RainTomorrow.value_counts().to_frame("QTD").reset_index()
alvo["PERC"] = alvo.QTD / alvo.QTD.sum()
alvo

Unnamed: 0,RainTomorrow,QTD,PERC
0,0,35194,0.779732
1,1,9942,0.220268


In [None]:
df_train_smote, df_train_under = adjust_train_volume(
    df_train, "RainTomorrow", smote, under
)

In [None]:
alvo_smote = df_train_smote.RainTomorrow.value_counts().to_frame("QTD").reset_index()
alvo_smote["PERC"] = alvo_smote.QTD / alvo_smote.QTD.sum()
alvo_smote

Unnamed: 0,RainTomorrow,QTD,PERC
0,1,35194,0.5
1,0,35194,0.5


In [None]:
alvo_under = df_train_under.RainTomorrow.value_counts().to_frame("QTD").reset_index()
alvo_under["PERC"] = alvo_under.QTD / alvo_under.QTD.sum()
alvo_under

Unnamed: 0,RainTomorrow,QTD,PERC
0,0,9942,0.5
1,1,9942,0.5


#### TRANSFORMA OS DADOS

In [None]:
# Fit a dedicated set of transformers for the unbalanced training set.
# Fresh instances are created here so that fitting other datasets afterwards
# cannot overwrite what was learned from df_train (fit_transformmations
# returns the very objects it receives).
discretizer_full, scaler_full, pca_full = fit_transformmations(
    df_train,
    columns_names["numerical_columns"],
    *instance_transformations()[:3],
)

In [None]:
# Fit a dedicated set of transformers for the SMOTE-balanced training set.
# Fresh instances are created here so that fitting other datasets afterwards
# cannot overwrite what was learned from df_train_smote (fit_transformmations
# returns the very objects it receives).
discretizer_smote, scaler_smote, pca_smote = fit_transformmations(
    df_train_smote,
    columns_names["numerical_columns"],
    *instance_transformations()[:3],
)

In [None]:
# Fit the transformers for the under-sampled training set.
# NOTE(review): this call passes the module-level discretizer/scaler/pca
# instances; fit_transformmations returns those same objects, so any later
# fit on them would overwrite this one.
discretizer_under, scaler_under, pca_under = fit_transformmations(
    df_train_under, columns_names["numerical_columns"], discretizer, scaler, pca
)

In [None]:
# Transform the unbalanced training set with the transformers returned by
# its own fit, mirroring how the SMOTE and under-sampled sets are handled.
df_train_full, df_train_full_pca = transform_data(
    df_train,
    columns_names["numerical_columns"],
    discretizer_full,
    scaler_full,
    pca_full,
)

In [None]:
df_train_smote, df_train_smote_pca = transform_data(
    df_train_smote,
    columns_names["numerical_columns"],
    discretizer_smote,
    scaler_smote,
    pca_smote,
)

In [None]:
df_train_under, df_train_under_pca = transform_data(
    df_train_under,
    columns_names["numerical_columns"],
    discretizer_under,
    scaler_under,
    pca_under,
)

## APLICAÇÕES SOBRE TESTE

### AJUSTA O TIPO DE DADO

In [39]:
df_test = adjust_data_types(df_test)

### CRIA VARIÁVEIS AUXILIARES

In [40]:
df_test, columns_names = create_auxiliar_columns(df_test)

### TRANSFORMA OS DADOS

In [None]:
# Transform the test set with the transformers fitted for the unbalanced
# training set, mirroring the SMOTE and under-sampled test transforms.
df_test_full, df_test_full_pca = transform_data(
    df_test,
    columns_names["numerical_columns"],
    discretizer_full,
    scaler_full,
    pca_full,
)

In [None]:
df_test_smote, df_test_smote_pca = transform_data(
    df_test,
    columns_names["numerical_columns"],
    discretizer_smote,
    scaler_smote,
    pca_smote,
)

In [None]:
df_test_under, df_test_under_pca = transform_data(
    df_test,
    columns_names["numerical_columns"],
    discretizer_under,
    scaler_under,
    pca_under,
)

## LIBERA MEMÓRIA

In [None]:
del (
    df_train,
    df_test,
    weather_aus,
    discretizer_under,
    scaler_under,
    pca_under,
    discretizer_smote,
    scaler_smote,
    pca_smote,
    discretizer,
    scaler,
    pca,
    smote,
    under,
)
gc.collect()

58

# PROCESSO DE MODELAGEM

## FUNÇÕES DE CROSS VALIDATION

### NUMÉRICAS

In [None]:
def run_cross_validation(
    train_data, train_target, test_data, test_target, model, param_grid, k_folds=5
):
    """Run stratified k-fold CV with an inner grid search for each fold.

    For every outer fold a ``GridSearchCV`` (3 inner stratified folds, binary
    F1 scoring) is fitted on the fold's training part. The resulting best
    estimator is scored on the fold's validation part and on the test set
    using macro-averaged F1/precision/recall plus accuracy.

    Args:
        train_data: Training features (DataFrame, indexed positionally).
        train_target: Training target (Series).
        test_data: Test features.
        test_target: Test target.
        model: Unfitted estimator used as template for every grid search.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        k_folds: Number of outer folds.

    Returns:
        Tuple ``(avg_scores_validation, test_scores, best_params, full_log)``:
        the mean validation metrics, the mean test metrics across folds, the
        parameters of the fold whose inner-CV score was highest, and a frame
        with the per-fold metrics tagged by ``dataset``.
    """

    def macro_scores(y_true, y_pred):
        # The four reported metrics, in the order used by the result logs.
        return {
            "f1": f1_score(y_true, y_pred, average="macro"),
            "accuracy": accuracy_score(y_true, y_pred),
            "precision": precision_score(y_true, y_pred, average="macro"),
            "recall": recall_score(y_true, y_pred, average="macro"),
        }

    skf = StratifiedKFold(n_splits=k_folds)
    scores_validation = {"f1": [], "accuracy": [], "precision": [], "recall": []}
    scores_test = {"f1": [], "accuracy": [], "precision": [], "recall": []}
    best_params = None
    # -inf (instead of 0) so a best inner score of exactly 0 is still recorded.
    best_f1 = float("-inf")
    print(f"====> INICIANDO PROCESSAMENTO: {datetime.now()} <====")

    for counter, (train_index, valid_index) in enumerate(
        skf.split(train_data, train_target), start=1
    ):
        inicio = datetime.now()
        print(f"====> {inicio} | EXECUTANDO FOLD {counter} ")

        # Positional selection of the fold rows.
        X_train_fold = train_data.iloc[train_index]
        X_valid_fold = train_data.iloc[valid_index]
        y_train_fold = train_target.iloc[train_index]
        y_valid_fold = train_target.iloc[valid_index]

        # Hyper-parameter tuning with the inner cross-validation. The caller's
        # `model` is always the template; it is never replaced by a fitted
        # estimator from a previous fold.
        grid_search = GridSearchCV(
            model,
            param_grid,
            scoring="f1",
            cv=StratifiedKFold(n_splits=3),
            n_jobs=-1,
            verbose=3,
        )
        grid_search.fit(X_train_fold, y_train_fold)
        fold_model = grid_search.best_estimator_
        fold_f1 = grid_search.best_score_

        if fold_f1 > best_f1:
            best_params = grid_search.best_params_
            best_f1 = fold_f1

        # Metrics on the validation part of the fold.
        valid_scores = macro_scores(y_valid_fold, fold_model.predict(X_valid_fold))
        for metric, value in valid_scores.items():
            scores_validation[metric].append(value)

        # Metrics of the same fold model on the test set.
        test_fold_scores = macro_scores(test_target, fold_model.predict(test_data))
        for metric, value in test_fold_scores.items():
            scores_test[metric].append(value)

        final = datetime.now()
        print(
            f"===> {final} | FINALIZADO FOLD {counter} | TEMPO TOTAL {final - inicio}\n\n"
        )

    # Average every metric over the folds.
    avg_scores_validation = {
        metric: np.mean(values) for metric, values in scores_validation.items()
    }
    test_scores = {metric: np.mean(values) for metric, values in scores_test.items()}

    # Per-fold log, validation rows first and test rows after.
    log = pd.DataFrame(scores_validation)
    log["dataset"] = "Validação Cruzada"

    test_log = pd.DataFrame(scores_test)
    test_log["dataset"] = "Teste"

    full_log = pd.concat([log, test_log])

    return avg_scores_validation, test_scores, best_params, full_log

### CATEGÓRICAS

In [None]:
def run_cross_mix(
    train_data,
    train_target,
    test_data,
    test_target,
    alphas,
    k_folds=5,
    categorical_features=None,
):
    """Cross-validate ``MixedNB`` over a list of smoothing values.

    In every stratified fold one model per ``alpha`` is trained; the alpha
    with the highest macro F1 on the fold's validation part is the fold
    winner. The winner's validation metrics are logged and the winner (not
    simply the last model trained) is the one evaluated on the test set.

    Args:
        train_data: Training features (DataFrame, indexed positionally).
        train_target: Training target (Series).
        test_data: Test features.
        test_target: Test target.
        alphas: Iterable of smoothing values to try; must not be empty.
        k_folds: Number of stratified folds.
        categorical_features: Forwarded unchanged to ``MixedNB``.

    Returns:
        Tuple ``(avg_scores_validation, best_alpha, test_scores, full_log)``:
        mean validation metrics, the alpha with the highest validation macro
        F1 over all folds, mean test metrics, and the per-fold log frame.

    Raises:
        ValueError: If ``alphas`` is empty.
    """
    # Materialize once: the values are iterated again in every fold.
    alphas = list(alphas)
    if not alphas:
        raise ValueError("alphas must contain at least one value")

    def macro_scores(y_true, y_pred):
        # The four reported metrics, in the order used by the result logs.
        return {
            "f1": f1_score(y_true, y_pred, average="macro"),
            "accuracy": accuracy_score(y_true, y_pred),
            "precision": precision_score(y_true, y_pred, average="macro"),
            "recall": recall_score(y_true, y_pred, average="macro"),
        }

    skf = StratifiedKFold(n_splits=k_folds)
    scores_validation = {"f1": [], "accuracy": [], "precision": [], "recall": []}
    scores_test = {"f1": [], "accuracy": [], "precision": [], "recall": []}
    best_alpha = None
    best_f1 = 0
    print(f"====> INICIANDO PROCESSAMENTO: {datetime.now()} <====")

    # Stratified cross-validation.
    for counter, (train_index, valid_index) in enumerate(
        skf.split(train_data, train_target), start=1
    ):
        inicio = datetime.now()
        print(f"====> {inicio} | EXECUTANDO FOLD {counter} ")

        X_train_fold = train_data.iloc[train_index]
        X_valid_fold = train_data.iloc[valid_index]
        y_train_fold = train_target.iloc[train_index]
        y_valid_fold = train_target.iloc[valid_index]

        # Winner of this fold; the first alpha always initializes it.
        fold_scores = None
        fold_model = None
        fold_alpha = None

        for alpha in alphas:
            print(f"========> Testando: alpha = {alpha}")
            model = MixedNB(categorical_features=categorical_features, alpha=alpha)
            model.fit(X_train_fold, y_train_fold)

            candidate = macro_scores(y_valid_fold, model.predict(X_valid_fold))

            if fold_scores is None or candidate["f1"] > fold_scores["f1"]:
                fold_scores = candidate
                fold_model = model
                fold_alpha = alpha

        # Best alpha across all folds, by validation macro F1.
        if fold_scores["f1"] > best_f1:
            best_f1 = fold_scores["f1"]
            best_alpha = fold_alpha

        # Log the metrics of the fold winner.
        for metric, value in fold_scores.items():
            scores_validation[metric].append(value)

        # Evaluate the fold winner on the test set.
        test_fold_scores = macro_scores(test_target, fold_model.predict(test_data))
        for metric, value in test_fold_scores.items():
            scores_test[metric].append(value)

        final = datetime.now()
        print(
            f"====> {final} | FINALIZADO FOLD {counter} | TEMPO TOTAL {final - inicio}\n\n"
        )

    # Average every metric over the folds.
    avg_scores_validation = {
        metric: np.mean(values) for metric, values in scores_validation.items()
    }
    test_scores = {metric: np.mean(values) for metric, values in scores_test.items()}

    # Per-fold log, validation rows first and test rows after.
    log = pd.DataFrame(scores_validation)
    log["dataset"] = "Validação Cruzada"

    test_log = pd.DataFrame(scores_test)
    test_log["dataset"] = "Teste"

    full_log = pd.concat([log, test_log])

    return avg_scores_validation, best_alpha, test_scores, full_log


## MODELOS

### NAIVE BAYES

#### TODAS AS CARACTERÍSTICAS

##### UNDERSAMPLING

In [None]:
# Smoothing values to evaluate
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

# MixedNB on every feature of the under-sampled set. The categorical features
# are given as column positions; the two name lists are concatenated and
# iterated directly (wrapping the sum in another list would hand the whole
# list to get_loc as a single key).
results_validation, best_params, results_test, results_list_bayes_under = run_cross_mix(
    df_train_under.drop(columns=["RainTomorrow"]),
    df_train_under["RainTomorrow"],
    df_test_under.drop(columns=["RainTomorrow"]),
    df_test_under["RainTomorrow"],
    alphas,
    k_folds=5,
    categorical_features=[
        df_train_under.drop(columns="RainTomorrow").columns.get_loc(col)
        for col in columns_names["categorical_columns"]
        + columns_names["time_encoded"]
    ],
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_bayes_under.to_csv(
    "../../DATA/RESULTS/results_list_bayes_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [None]:
# Smoothing values to evaluate
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

# MixedNB on every feature of the unbalanced set. The categorical features
# are given as column positions; the two name lists are concatenated and
# iterated directly (wrapping the sum in another list would hand the whole
# list to get_loc as a single key).
results_validation, best_params, results_test, results_list_bayes_full = run_cross_mix(
    df_train_full.drop(columns=["RainTomorrow"]),
    df_train_full["RainTomorrow"],
    df_test_full.drop(columns=["RainTomorrow"]),
    df_test_full["RainTomorrow"],
    alphas,
    k_folds=5,
    categorical_features=[
        df_train_full.drop(columns="RainTomorrow").columns.get_loc(col)
        for col in columns_names["categorical_columns"]
        + columns_names["time_encoded"]
    ],
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_bayes_full.to_csv(
    "../../DATA/RESULTS/results_list_bayes_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [None]:
# Smoothing values to evaluate
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

# MixedNB on every feature of the SMOTE-balanced set. The categorical features
# are given as column positions; the two name lists are concatenated and
# iterated directly (wrapping the sum in another list would hand the whole
# list to get_loc as a single key).
results_validation, best_params, results_test, results_list_bayes_smote = run_cross_mix(
    df_train_smote.drop(columns=["RainTomorrow"]),
    df_train_smote["RainTomorrow"],
    df_test_smote.drop(columns=["RainTomorrow"]),
    df_test_smote["RainTomorrow"],
    alphas,
    k_folds=5,
    categorical_features=[
        df_train_smote.drop(columns="RainTomorrow").columns.get_loc(col)
        for col in columns_names["categorical_columns"]
        + columns_names["time_encoded"]
    ],
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_bayes_smote.to_csv(
    "../../DATA/RESULTS/results_list_bayes_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [None]:
del results_list_bayes_smote, results_list_bayes_under, results_list_bayes_full
gc.collect()

#### PCA

##### UNDERSAMPLING

In [None]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_bayes_pca_under = (
    run_cross_validation(
        df_train_under_pca.drop(columns=["RainTomorrow"]),
        df_train_under_pca["RainTomorrow"],
        df_test_under_pca.drop(columns=["RainTomorrow"]),
        df_test_under_pca["RainTomorrow"],
        GaussianNB(),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_bayes_pca_under.to_csv(
    "../../DATA/RESULTS/results_list_bayes_pca_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [None]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_bayes_pca_full = (
    run_cross_validation(
        df_train_full_pca.drop(columns=["RainTomorrow"]),
        df_train_full_pca["RainTomorrow"],
        df_test_full_pca.drop(columns=["RainTomorrow"]),
        df_test_full_pca["RainTomorrow"],
        GaussianNB(),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_bayes_pca_full.to_csv(
    "../../DATA/RESULTS/results_list_bayes_pca_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [None]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_bayes_pca_smote = (
    run_cross_validation(
        df_train_smote_pca.drop(columns=["RainTomorrow"]),
        df_train_smote_pca["RainTomorrow"],
        df_test_smote_pca.drop(columns=["RainTomorrow"]),
        df_test_smote_pca["RainTomorrow"],
        GaussianNB(),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_bayes_pca_smote.to_csv(
    "../../DATA/RESULTS/results_list_bayes_pca_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [None]:
del (
    results_list_bayes_pca_under,
    results_list_bayes_pca_smote,
    results_list_bayes_pca_full,
)
gc.collect()

#### CORRELAÇÃO

In [None]:
filtro_correlacao = ["Sunshine", "Humidity3pm", "Cloud9am", "Cloud3pm", "RangeTemp"]

##### UNDERSAMPLING

In [None]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_bayes_corr_under = (
    run_cross_validation(
        df_train_under[filtro_correlacao],
        df_train_under["RainTomorrow"],
        df_test_under[filtro_correlacao],
        df_test_under["RainTomorrow"],
        GaussianNB(),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_bayes_corr_under.to_csv(
    "../../DATA/RESULTS/results_list_bayes_corr_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [None]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_bayes_corr_full = (
    run_cross_validation(
        df_train_full[filtro_correlacao],
        df_train_full["RainTomorrow"],
        df_test_full[filtro_correlacao],
        df_test_full["RainTomorrow"],
        GaussianNB(),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_bayes_corr_full.to_csv(
    "../../DATA/RESULTS/results_list_bayes_corr_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [None]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_bayes_corr_smote = (
    run_cross_validation(
        df_train_smote[filtro_correlacao],
        df_train_smote["RainTomorrow"],
        df_test_smote[filtro_correlacao],
        df_test_smote["RainTomorrow"],
        GaussianNB(),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_bayes_corr_smote.to_csv(
    "../../DATA/RESULTS/results_list_bayes_corr_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [None]:
del (
    results_list_bayes_corr_smote,
    results_list_bayes_corr_full,
    results_list_bayes_corr_under,
)
gc.collect()

#### RANDOM FOREST - FEATURE IMPORTANCE

In [None]:
filtro_random_forest = [
    "Humidity3pm",
    "Sunshine",
    "Pressure3pm",
    "Cloud3pm",
    "RangeTemp",
    "WindGustSpeed",
    "Humidity9am",
    "Rainfall",
    "MinTemp",
    "MaxTemp",
    "Evaporation",
    "WindSpeed3pm",
    "WindSpeed9am",
    "Cloud9am",
    "Rainfall_Discretized",
]

##### UNDERSAMPLING

In [None]:
# Smoothing values to evaluate
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

# MixedNB on the random-forest-selected features of the under-sampled set.
# The categorical feature is passed by position within the selected columns,
# the same way the all-features MixedNB runs pass theirs.
results_validation, best_params, results_test, results_list_bayes_rf_under = (
    run_cross_mix(
        df_train_under[filtro_random_forest],
        df_train_under["RainTomorrow"],
        df_test_under[filtro_random_forest],
        df_test_under["RainTomorrow"],
        alphas,
        k_folds=5,
        categorical_features=[filtro_random_forest.index("Rainfall_Discretized")],
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_bayes_rf_under.to_csv(
    "../../DATA/RESULTS/results_list_bayes_rf_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [None]:
# Smoothing values to evaluate
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

# MixedNB on the random-forest-selected features of the unbalanced set.
# The categorical feature is passed by position within the selected columns,
# the same way the all-features MixedNB runs pass theirs.
results_validation, best_params, results_test, results_list_bayes_rf_full = (
    run_cross_mix(
        df_train_full[filtro_random_forest],
        df_train_full["RainTomorrow"],
        df_test_full[filtro_random_forest],
        df_test_full["RainTomorrow"],
        alphas,
        k_folds=5,
        categorical_features=[filtro_random_forest.index("Rainfall_Discretized")],
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_bayes_rf_full.to_csv(
    "../../DATA/RESULTS/results_list_bayes_rf_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [None]:
# Smoothing values to evaluate
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

# MixedNB on the random-forest-selected features of the SMOTE-balanced set.
# The categorical feature is passed by position within the selected columns,
# the same way the all-features MixedNB runs pass theirs.
results_validation, best_params, results_test, results_list_bayes_rf_smote = (
    run_cross_mix(
        df_train_smote[filtro_random_forest],
        df_train_smote["RainTomorrow"],
        df_test_smote[filtro_random_forest],
        df_test_smote["RainTomorrow"],
        alphas,
        k_folds=5,
        categorical_features=[filtro_random_forest.index("Rainfall_Discretized")],
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_bayes_rf_smote.to_csv(
    "../../DATA/RESULTS/results_list_bayes_rf_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [None]:
del results_list_bayes_rf_smote, results_list_bayes_rf_under, results_list_bayes_rf_full
gc.collect()

### RANDOM FOREST

#### TODAS AS CARACTERÍSTICAS

##### UNDERSAMPLING

In [None]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_under = (
    run_cross_validation(
        df_train_under.drop(columns=["RainTomorrow"]),
        df_train_under["RainTomorrow"],
        df_test_under.drop(columns=["RainTomorrow"]),
        df_test_under["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_rf_under.to_csv(
    "../../DATA/RESULTS/results_list_rf_under.csv", sep="|", index=False
)

##### SEM BALANCEAMENTO

In [None]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_full = (
    run_cross_validation(
        df_train_full.drop(columns=["RainTomorrow"]),
        df_train_full["RainTomorrow"],
        df_test_full.drop(columns=["RainTomorrow"]),
        df_test_full["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_rf_full.to_csv(
    "../../DATA/RESULTS/results_list_rf_full.csv", sep="|", index=False
)

##### OVERSAMPLING

In [None]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rf_smote = (
    run_cross_validation(
        df_train_smote.drop(columns=["RainTomorrow"]),
        df_train_smote["RainTomorrow"],
        df_test_smote.drop(columns=["RainTomorrow"]),
        df_test_smote["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
results_list_rf_smote.to_csv(
    "../../DATA/RESULTS/results_list_rf_smote.csv", sep="|", index=False
)

##### LIBERA MEMÓRIA

In [None]:
del results_list_rf_smote, results_list_rf_full, results_list_rf_under
gc.collect()

#### PCA

##### UNDERSAMPLING

In [None]:
# Random Forest grid: number of trees x features considered per split.
param_grid_categorical = dict(
    n_estimators=[500, 700, 1000],
    max_features=["sqrt", "log2", 10, None],
)

# Run the cross-validation helper (5 folds) on the undersampled PCA data
# and collect the best parameters plus the results table.
(
    results_validation,
    results_test,
    best_params,
    results_list_rf_pca_under,
) = run_cross_validation(
    df_train_under_pca.drop(columns="RainTomorrow"),
    df_train_under_pca["RainTomorrow"],
    df_test_under_pca.drop(columns="RainTomorrow"),
    df_test_under_pca["RainTomorrow"],
    RandomForestClassifier(max_depth=None, random_state=42, n_jobs=-1),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_rf_pca_under.to_csv(
    "../../DATA/RESULTS/results_list_rf_pca_under.csv",
    index=False,
    sep="|",
)

##### SEM BALANCEAMENTO

In [None]:
# Random Forest grid: number of trees x features considered per split.
param_grid_categorical = dict(
    n_estimators=[500, 700, 1000],
    max_features=["sqrt", "log2", 10, None],
)

# Run the cross-validation helper (5 folds) on the unbalanced PCA data
# and collect the best parameters plus the results table.
(
    results_validation,
    results_test,
    best_params,
    results_list_rf_pca_full,
) = run_cross_validation(
    df_train_full_pca.drop(columns="RainTomorrow"),
    df_train_full_pca["RainTomorrow"],
    df_test_full_pca.drop(columns="RainTomorrow"),
    df_test_full_pca["RainTomorrow"],
    RandomForestClassifier(max_depth=None, random_state=42, n_jobs=-1),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_rf_pca_full.to_csv(
    "../../DATA/RESULTS/results_list_rf_pca_full.csv",
    index=False,
    sep="|",
)

##### OVERSAMPLING

In [None]:
# Random Forest grid: number of trees x features considered per split.
param_grid_categorical = dict(
    n_estimators=[500, 700, 1000],
    max_features=["sqrt", "log2", 10, None],
)

# Run the cross-validation helper (5 folds) on the oversampled PCA data
# and collect the best parameters plus the results table.
(
    results_validation,
    results_test,
    best_params,
    results_list_rf_pca_smote,
) = run_cross_validation(
    df_train_smote_pca.drop(columns="RainTomorrow"),
    df_train_smote_pca["RainTomorrow"],
    df_test_smote_pca.drop(columns="RainTomorrow"),
    df_test_smote_pca["RainTomorrow"],
    RandomForestClassifier(max_depth=None, random_state=42, n_jobs=-1),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_rf_pca_smote.to_csv(
    "../../DATA/RESULTS/results_list_rf_pca_smote.csv",
    index=False,
    sep="|",
)

##### LIBERA MEMÓRIA

In [None]:
# Release the Random Forest + PCA result tables, then force a GC pass.
del results_list_rf_pca_under, results_list_rf_pca_full, results_list_rf_pca_smote
gc.collect()

#### CORRELAÇÃO

##### UNDERSAMPLING

In [None]:
# Random Forest grid: number of trees x features considered per split.
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Run the cross-validation helper (5 folds) on the undersampled data restricted
# to the correlation-selected columns (filtro_correlacao), matching the SVM and
# MLP correlation experiments. Passing every column here would only repeat the
# "all features" run under a different output name.
results_validation, results_test, best_params, results_list_rf_corr_under = (
    run_cross_validation(
        df_train_under[filtro_correlacao],
        df_train_under["RainTomorrow"],
        df_test_under[filtro_correlacao],
        df_test_under["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_rf_corr_under.to_csv(
    "../../DATA/RESULTS/results_list_rf_corr_under.csv",
    index=False,
    sep="|",
)

##### SEM BALANCEAMENTO

In [None]:
# Random Forest grid: number of trees x features considered per split.
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Run the cross-validation helper (5 folds) on the unbalanced data restricted
# to the correlation-selected columns (filtro_correlacao), matching the SVM and
# MLP correlation experiments. Passing every column here would only repeat the
# "all features" run under a different output name.
results_validation, results_test, best_params, results_list_rf_corr_full = (
    run_cross_validation(
        df_train_full[filtro_correlacao],
        df_train_full["RainTomorrow"],
        df_test_full[filtro_correlacao],
        df_test_full["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_rf_corr_full.to_csv(
    "../../DATA/RESULTS/results_list_rf_corr_full.csv",
    index=False,
    sep="|",
)

##### OVERSAMPLING

In [None]:
# Random Forest grid: number of trees x features considered per split.
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Run the cross-validation helper (5 folds) on the oversampled data restricted
# to the correlation-selected columns (filtro_correlacao), matching the SVM and
# MLP correlation experiments. Passing every column here would only repeat the
# "all features" run under a different output name.
results_validation, results_test, best_params, results_list_rf_corr_smote = (
    run_cross_validation(
        df_train_smote[filtro_correlacao],
        df_train_smote["RainTomorrow"],
        df_test_smote[filtro_correlacao],
        df_test_smote["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_rf_corr_smote.to_csv(
    "../../DATA/RESULTS/results_list_rf_corr_smote.csv",
    index=False,
    sep="|",
)

##### LIBERA MEMÓRIA

In [None]:
# Release the Random Forest + correlation-filter result tables, then force a GC pass.
del results_list_rf_corr_under, results_list_rf_corr_full, results_list_rf_corr_smote
gc.collect()

#### RANDOM FOREST - FEATURE IMPORTANCE

##### UNDERSAMPLING

In [None]:
# Random Forest grid: number of trees x features considered per split.
param_grid_categorical = dict(
    n_estimators=[500, 700, 1000],
    max_features=["sqrt", "log2", 10, None],
)

# Run the cross-validation helper (5 folds) on the undersampled data restricted
# to the columns in filtro_random_forest.
(
    results_validation,
    results_test,
    best_params,
    results_list_rf_rf_under,
) = run_cross_validation(
    df_train_under[filtro_random_forest],
    df_train_under["RainTomorrow"],
    df_test_under[filtro_random_forest],
    df_test_under["RainTomorrow"],
    RandomForestClassifier(max_depth=None, random_state=42, n_jobs=-1),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_rf_rf_under.to_csv(
    "../../DATA/RESULTS/results_list_rf_rf_under.csv",
    index=False,
    sep="|",
)

##### SEM BALANCEAMENTO

In [None]:
# Random Forest grid: number of trees x features considered per split.
param_grid_categorical = dict(
    n_estimators=[500, 700, 1000],
    max_features=["sqrt", "log2", 10, None],
)

# Run the cross-validation helper (5 folds) on the unbalanced data restricted
# to the columns in filtro_random_forest.
(
    results_validation,
    results_test,
    best_params,
    results_list_rf_rf_full,
) = run_cross_validation(
    df_train_full[filtro_random_forest],
    df_train_full["RainTomorrow"],
    df_test_full[filtro_random_forest],
    df_test_full["RainTomorrow"],
    RandomForestClassifier(max_depth=None, random_state=42, n_jobs=-1),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_rf_rf_full.to_csv(
    "../../DATA/RESULTS/results_list_rf_rf_full.csv",
    index=False,
    sep="|",
)

##### OVERSAMPLING

In [None]:
# Random Forest grid: number of trees x features considered per split.
param_grid_categorical = dict(
    n_estimators=[500, 700, 1000],
    max_features=["sqrt", "log2", 10, None],
)

# Run the cross-validation helper (5 folds) on the oversampled data restricted
# to the columns in filtro_random_forest.
(
    results_validation,
    results_test,
    best_params,
    results_list_rf_rf_smote,
) = run_cross_validation(
    df_train_smote[filtro_random_forest],
    df_train_smote["RainTomorrow"],
    df_test_smote[filtro_random_forest],
    df_test_smote["RainTomorrow"],
    RandomForestClassifier(max_depth=None, random_state=42, n_jobs=-1),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_rf_rf_smote.to_csv(
    "../../DATA/RESULTS/results_list_rf_rf_smote.csv",
    index=False,
    sep="|",
)

##### LIBERA MEMÓRIA

In [None]:
# Release the Random Forest + RF-importance-filter result tables, then force a GC pass.
del results_list_rf_rf_under, results_list_rf_rf_full, results_list_rf_rf_smote
gc.collect()

### SVM

#### TODAS AS CARACTERÍSTICAS

##### UNDERSAMPLING

In [None]:
# SVM grid: regularisation strength x kernel x kernel coefficient.
# NOTE(review): gamma has no effect with kernel="linear"; if the helper searches
# the full grid, each linear point is fitted once per gamma value — confirm.
param_grid_categorical = dict(
    C=[10, 100],
    kernel=["rbf", "linear"],
    gamma=[0.01, 0.1],
)

# Run the cross-validation helper (5 folds) on the undersampled data with all
# features and collect the best parameters plus the results table.
(
    results_validation,
    results_test,
    best_params,
    results_list_svm_under,
) = run_cross_validation(
    df_train_under.drop(columns="RainTomorrow"),
    df_train_under["RainTomorrow"],
    df_test_under.drop(columns="RainTomorrow"),
    df_test_under["RainTomorrow"],
    SVC(random_state=42),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_svm_under.to_csv(
    "../../DATA/RESULTS/results_list_svm_under.csv",
    index=False,
    sep="|",
)

##### SEM BALANCEAMENTO

In [None]:
# SVM grid: regularisation strength x kernel x kernel coefficient.
# NOTE(review): gamma has no effect with kernel="linear"; if the helper searches
# the full grid, each linear point is fitted once per gamma value — confirm.
param_grid_categorical = dict(
    C=[10, 100],
    kernel=["rbf", "linear"],
    gamma=[0.01, 0.1],
)

# Run the cross-validation helper (5 folds) on the unbalanced data with all
# features and collect the best parameters plus the results table.
(
    results_validation,
    results_test,
    best_params,
    results_list_svm_full,
) = run_cross_validation(
    df_train_full.drop(columns="RainTomorrow"),
    df_train_full["RainTomorrow"],
    df_test_full.drop(columns="RainTomorrow"),
    df_test_full["RainTomorrow"],
    SVC(random_state=42),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_svm_full.to_csv(
    "../../DATA/RESULTS/results_list_svm_full.csv",
    index=False,
    sep="|",
)

##### OVERSAMPLING

In [None]:
# SVM grid: regularisation strength x kernel x kernel coefficient.
# NOTE(review): gamma has no effect with kernel="linear"; if the helper searches
# the full grid, each linear point is fitted once per gamma value — confirm.
param_grid_categorical = dict(
    C=[10, 100],
    kernel=["rbf", "linear"],
    gamma=[0.01, 0.1],
)

# Run the cross-validation helper (5 folds) on the oversampled data with all
# features and collect the best parameters plus the results table.
(
    results_validation,
    results_test,
    best_params,
    results_list_svm_smote,
) = run_cross_validation(
    df_train_smote.drop(columns="RainTomorrow"),
    df_train_smote["RainTomorrow"],
    df_test_smote.drop(columns="RainTomorrow"),
    df_test_smote["RainTomorrow"],
    SVC(random_state=42),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_svm_smote.to_csv(
    "../../DATA/RESULTS/results_list_svm_smote.csv",
    index=False,
    sep="|",
)

##### LIBERA MEMÓRIA

In [None]:
# Release the SVM (all features) result tables, then force a GC pass.
del results_list_svm_under, results_list_svm_full, results_list_svm_smote
gc.collect()

#### PCA

##### UNDERSAMPLING

In [None]:
# SVM grid: regularisation strength x kernel x kernel coefficient.
# NOTE(review): gamma has no effect with kernel="linear"; if the helper searches
# the full grid, each linear point is fitted once per gamma value — confirm.
param_grid_categorical = dict(
    C=[10, 100],
    kernel=["rbf", "linear"],
    gamma=[0.01, 0.1],
)

# Run the cross-validation helper (5 folds) on the undersampled PCA data
# and collect the best parameters plus the results table.
(
    results_validation,
    results_test,
    best_params,
    results_list_svm_pca_under,
) = run_cross_validation(
    df_train_under_pca.drop(columns="RainTomorrow"),
    df_train_under_pca["RainTomorrow"],
    df_test_under_pca.drop(columns="RainTomorrow"),
    df_test_under_pca["RainTomorrow"],
    SVC(random_state=42),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_svm_pca_under.to_csv(
    "../../DATA/RESULTS/results_list_svm_pca_under.csv",
    index=False,
    sep="|",
)

##### SEM BALANCEAMENTO

In [None]:
# SVM grid: regularisation strength x kernel x kernel coefficient.
# NOTE(review): gamma has no effect with kernel="linear"; if the helper searches
# the full grid, each linear point is fitted once per gamma value — confirm.
param_grid_categorical = dict(
    C=[10, 100],
    kernel=["rbf", "linear"],
    gamma=[0.01, 0.1],
)

# Run the cross-validation helper (5 folds) on the unbalanced PCA data
# and collect the best parameters plus the results table.
(
    results_validation,
    results_test,
    best_params,
    results_list_svm_pca_full,
) = run_cross_validation(
    df_train_full_pca.drop(columns="RainTomorrow"),
    df_train_full_pca["RainTomorrow"],
    df_test_full_pca.drop(columns="RainTomorrow"),
    df_test_full_pca["RainTomorrow"],
    SVC(random_state=42),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_svm_pca_full.to_csv(
    "../../DATA/RESULTS/results_list_svm_pca_full.csv",
    index=False,
    sep="|",
)

##### OVERSAMPLING

In [None]:
# SVM grid: regularisation strength x kernel x kernel coefficient.
# NOTE(review): gamma has no effect with kernel="linear"; if the helper searches
# the full grid, each linear point is fitted once per gamma value — confirm.
param_grid_categorical = dict(
    C=[10, 100],
    kernel=["rbf", "linear"],
    gamma=[0.01, 0.1],
)

# Run the cross-validation helper (5 folds) on the oversampled PCA data
# and collect the best parameters plus the results table.
(
    results_validation,
    results_test,
    best_params,
    results_list_svm_pca_smote,
) = run_cross_validation(
    df_train_smote_pca.drop(columns="RainTomorrow"),
    df_train_smote_pca["RainTomorrow"],
    df_test_smote_pca.drop(columns="RainTomorrow"),
    df_test_smote_pca["RainTomorrow"],
    SVC(random_state=42),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_svm_pca_smote.to_csv(
    "../../DATA/RESULTS/results_list_svm_pca_smote.csv",
    index=False,
    sep="|",
)

##### LIBERA MEMÓRIA

In [None]:
# Release the SVM + PCA result tables, then force a GC pass.
del results_list_svm_pca_under, results_list_svm_pca_full, results_list_svm_pca_smote
gc.collect()

#### CORRELAÇÃO

##### UNDERSAMPLING

In [None]:
# SVM grid: regularisation strength x kernel x kernel coefficient.
# NOTE(review): gamma has no effect with kernel="linear"; if the helper searches
# the full grid, each linear point is fitted once per gamma value — confirm.
param_grid_categorical = dict(
    C=[10, 100],
    kernel=["rbf", "linear"],
    gamma=[0.01, 0.1],
)

# Run the cross-validation helper (5 folds) on the undersampled data restricted
# to the columns in filtro_correlacao.
(
    results_validation,
    results_test,
    best_params,
    results_list_svm_corr_under,
) = run_cross_validation(
    df_train_under[filtro_correlacao],
    df_train_under["RainTomorrow"],
    df_test_under[filtro_correlacao],
    df_test_under["RainTomorrow"],
    SVC(random_state=42),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_svm_corr_under.to_csv(
    "../../DATA/RESULTS/results_list_svm_corr_under.csv",
    index=False,
    sep="|",
)

##### SEM BALANCEAMENTO

In [None]:
# SVM grid: regularisation strength x kernel x kernel coefficient.
# NOTE(review): gamma has no effect with kernel="linear"; if the helper searches
# the full grid, each linear point is fitted once per gamma value — confirm.
param_grid_categorical = dict(
    C=[10, 100],
    kernel=["rbf", "linear"],
    gamma=[0.01, 0.1],
)

# Run the cross-validation helper (5 folds) on the unbalanced data restricted
# to the columns in filtro_correlacao.
(
    results_validation,
    results_test,
    best_params,
    results_list_svm_corr_full,
) = run_cross_validation(
    df_train_full[filtro_correlacao],
    df_train_full["RainTomorrow"],
    df_test_full[filtro_correlacao],
    df_test_full["RainTomorrow"],
    SVC(random_state=42),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_svm_corr_full.to_csv(
    "../../DATA/RESULTS/results_list_svm_corr_full.csv",
    index=False,
    sep="|",
)

##### OVERSAMPLING

In [None]:
# SVM grid: regularisation strength x kernel x kernel coefficient.
# NOTE(review): gamma has no effect with kernel="linear"; if the helper searches
# the full grid, each linear point is fitted once per gamma value — confirm.
param_grid_categorical = dict(
    C=[10, 100],
    kernel=["rbf", "linear"],
    gamma=[0.01, 0.1],
)

# Run the cross-validation helper (5 folds) on the oversampled data restricted
# to the columns in filtro_correlacao.
(
    results_validation,
    results_test,
    best_params,
    results_list_svm_corr_smote,
) = run_cross_validation(
    df_train_smote[filtro_correlacao],
    df_train_smote["RainTomorrow"],
    df_test_smote[filtro_correlacao],
    df_test_smote["RainTomorrow"],
    SVC(random_state=42),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_svm_corr_smote.to_csv(
    "../../DATA/RESULTS/results_list_svm_corr_smote.csv",
    index=False,
    sep="|",
)

##### LIBERA MEMÓRIA

In [None]:
# Release the SVM + correlation-filter result tables, then force a GC pass.
del results_list_svm_corr_under, results_list_svm_corr_full, results_list_svm_corr_smote
gc.collect()

#### RANDOM FOREST - FEATURE IMPORTANCE

##### UNDERSAMPLING

In [None]:
# SVM grid: regularisation strength x kernel x kernel coefficient.
# NOTE(review): gamma has no effect with kernel="linear"; if the helper searches
# the full grid, each linear point is fitted once per gamma value — confirm.
param_grid_categorical = dict(
    C=[10, 100],
    kernel=["rbf", "linear"],
    gamma=[0.01, 0.1],
)

# Run the cross-validation helper (5 folds) on the undersampled data restricted
# to the columns in filtro_random_forest.
(
    results_validation,
    results_test,
    best_params,
    results_list_svm_rf_under,
) = run_cross_validation(
    df_train_under[filtro_random_forest],
    df_train_under["RainTomorrow"],
    df_test_under[filtro_random_forest],
    df_test_under["RainTomorrow"],
    SVC(random_state=42),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_svm_rf_under.to_csv(
    "../../DATA/RESULTS/results_list_svm_rf_under.csv",
    index=False,
    sep="|",
)

##### SEM BALANCEAMENTO

In [None]:
# SVM grid: regularisation strength x kernel x kernel coefficient.
# NOTE(review): gamma has no effect with kernel="linear"; if the helper searches
# the full grid, each linear point is fitted once per gamma value — confirm.
param_grid_categorical = dict(
    C=[10, 100],
    kernel=["rbf", "linear"],
    gamma=[0.01, 0.1],
)

# Run the cross-validation helper (5 folds) on the unbalanced data restricted
# to the columns in filtro_random_forest.
(
    results_validation,
    results_test,
    best_params,
    results_list_svm_rf_full,
) = run_cross_validation(
    df_train_full[filtro_random_forest],
    df_train_full["RainTomorrow"],
    df_test_full[filtro_random_forest],
    df_test_full["RainTomorrow"],
    SVC(random_state=42),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_svm_rf_full.to_csv(
    "../../DATA/RESULTS/results_list_svm_rf_full.csv",
    index=False,
    sep="|",
)

##### OVERSAMPLING

In [None]:
# SVM grid: regularisation strength x kernel x kernel coefficient.
# NOTE(review): gamma has no effect with kernel="linear"; if the helper searches
# the full grid, each linear point is fitted once per gamma value — confirm.
param_grid_categorical = dict(
    C=[10, 100],
    kernel=["rbf", "linear"],
    gamma=[0.01, 0.1],
)

# Run the cross-validation helper (5 folds) on the oversampled data restricted
# to the columns in filtro_random_forest.
(
    results_validation,
    results_test,
    best_params,
    results_list_svm_rf_smote,
) = run_cross_validation(
    df_train_smote[filtro_random_forest],
    df_train_smote["RainTomorrow"],
    df_test_smote[filtro_random_forest],
    df_test_smote["RainTomorrow"],
    SVC(random_state=42),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_svm_rf_smote.to_csv(
    "../../DATA/RESULTS/results_list_svm_rf_smote.csv",
    index=False,
    sep="|",
)

##### LIBERA MEMÓRIA

In [None]:
# Release the SVM + RF-importance-filter result tables, then force a GC pass.
del results_list_svm_rf_under, results_list_svm_rf_full, results_list_svm_rf_smote
gc.collect()

### MLP

#### TODAS AS CARACTERÍSTICAS

##### UNDERSAMPLING

In [None]:
# MLP grid: hidden-layer size x activation x initial learning rate.
param_grid_categorical = dict(
    hidden_layer_sizes=[5, 10, 15],
    activation=["logistic", "relu"],
    learning_rate_init=[0.001, 0.01, 0.05, 0.1],
)

# Run the cross-validation helper on the undersampled data with all features.
# NOTE(review): k_folds=10 here vs. 5 in the RF/SVM cells — confirm intentional.
(
    results_validation,
    results_test,
    best_params,
    results_list_mlp_under,
) = run_cross_validation(
    df_train_under.drop(columns="RainTomorrow"),
    df_train_under["RainTomorrow"],
    df_test_under.drop(columns="RainTomorrow"),
    df_test_under["RainTomorrow"],
    MLPClassifier(random_state=42),
    param_grid_categorical,
    k_folds=10,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_mlp_under.to_csv(
    "../../DATA/RESULTS/results_list_mlp_under.csv",
    index=False,
    sep="|",
)

##### SEM BALANCEAMENTO

In [None]:
# MLP grid: hidden-layer size x activation x initial learning rate.
param_grid_categorical = dict(
    hidden_layer_sizes=[5, 10, 15],
    activation=["logistic", "relu"],
    learning_rate_init=[0.001, 0.01, 0.05, 0.1],
)

# Run the cross-validation helper on the unbalanced data with all features.
# NOTE(review): k_folds=10 here vs. 5 in the RF/SVM cells — confirm intentional.
(
    results_validation,
    results_test,
    best_params,
    results_list_mlp_full,
) = run_cross_validation(
    df_train_full.drop(columns="RainTomorrow"),
    df_train_full["RainTomorrow"],
    df_test_full.drop(columns="RainTomorrow"),
    df_test_full["RainTomorrow"],
    MLPClassifier(random_state=42),
    param_grid_categorical,
    k_folds=10,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_mlp_full.to_csv(
    "../../DATA/RESULTS/results_list_mlp_full.csv",
    index=False,
    sep="|",
)

##### OVERSAMPLING

In [None]:
# MLP grid: hidden-layer size x activation x initial learning rate.
param_grid_categorical = dict(
    hidden_layer_sizes=[5, 10, 15],
    activation=["logistic", "relu"],
    learning_rate_init=[0.001, 0.01, 0.05, 0.1],
)

# Run the cross-validation helper on the oversampled data with all features.
# NOTE(review): k_folds=10 here vs. 5 in the RF/SVM cells — confirm intentional.
(
    results_validation,
    results_test,
    best_params,
    results_list_mlp_smote,
) = run_cross_validation(
    df_train_smote.drop(columns="RainTomorrow"),
    df_train_smote["RainTomorrow"],
    df_test_smote.drop(columns="RainTomorrow"),
    df_test_smote["RainTomorrow"],
    MLPClassifier(random_state=42),
    param_grid_categorical,
    k_folds=10,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_mlp_smote.to_csv(
    "../../DATA/RESULTS/results_list_mlp_smote.csv",
    index=False,
    sep="|",
)

##### LIBERA MEMÓRIA

In [None]:
# Release the MLP (all features) result tables, then force a GC pass.
del results_list_mlp_under, results_list_mlp_full, results_list_mlp_smote
gc.collect()

#### PCA

##### UNDERSAMPLING

In [None]:
# MLP grid: hidden-layer size x activation x initial learning rate.
param_grid_categorical = dict(
    hidden_layer_sizes=[5, 10, 15],
    activation=["logistic", "relu"],
    learning_rate_init=[0.001, 0.01, 0.05, 0.1],
)

# Run the cross-validation helper on the undersampled PCA data.
# NOTE(review): k_folds=10 here vs. 5 in the RF/SVM cells — confirm intentional.
(
    results_validation,
    results_test,
    best_params,
    results_list_mlp_pca_under,
) = run_cross_validation(
    df_train_under_pca.drop(columns="RainTomorrow"),
    df_train_under_pca["RainTomorrow"],
    df_test_under_pca.drop(columns="RainTomorrow"),
    df_test_under_pca["RainTomorrow"],
    MLPClassifier(random_state=42),
    param_grid_categorical,
    k_folds=10,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_mlp_pca_under.to_csv(
    "../../DATA/RESULTS/results_list_mlp_pca_under.csv",
    index=False,
    sep="|",
)

##### SEM BALANCEAMENTO

In [None]:
# MLP grid: hidden-layer size x activation x initial learning rate.
param_grid_categorical = dict(
    hidden_layer_sizes=[5, 10, 15],
    activation=["logistic", "relu"],
    learning_rate_init=[0.001, 0.01, 0.05, 0.1],
)

# Run the cross-validation helper on the unbalanced PCA data.
# NOTE(review): k_folds=10 here vs. 5 in the RF/SVM cells — confirm intentional.
(
    results_validation,
    results_test,
    best_params,
    results_list_mlp_pca_full,
) = run_cross_validation(
    df_train_full_pca.drop(columns="RainTomorrow"),
    df_train_full_pca["RainTomorrow"],
    df_test_full_pca.drop(columns="RainTomorrow"),
    df_test_full_pca["RainTomorrow"],
    MLPClassifier(random_state=42),
    param_grid_categorical,
    k_folds=10,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_mlp_pca_full.to_csv(
    "../../DATA/RESULTS/results_list_mlp_pca_full.csv",
    index=False,
    sep="|",
)

##### OVERSAMPLING

In [None]:
# MLP grid: hidden-layer size x activation x initial learning rate.
param_grid_categorical = dict(
    hidden_layer_sizes=[5, 10, 15],
    activation=["logistic", "relu"],
    learning_rate_init=[0.001, 0.01, 0.05, 0.1],
)

# Run the cross-validation helper on the oversampled PCA data.
# NOTE(review): k_folds=10 here vs. 5 in the RF/SVM cells — confirm intentional.
(
    results_validation,
    results_test,
    best_params,
    results_list_mlp_pca_smote,
) = run_cross_validation(
    df_train_smote_pca.drop(columns="RainTomorrow"),
    df_train_smote_pca["RainTomorrow"],
    df_test_smote_pca.drop(columns="RainTomorrow"),
    df_test_smote_pca["RainTomorrow"],
    MLPClassifier(random_state=42),
    param_grid_categorical,
    k_folds=10,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_mlp_pca_smote.to_csv(
    "../../DATA/RESULTS/results_list_mlp_pca_smote.csv",
    index=False,
    sep="|",
)

##### LIBERA MEMÓRIA

In [None]:
# Release the MLP + PCA result tables, then force a GC pass.
del results_list_mlp_pca_under, results_list_mlp_pca_full, results_list_mlp_pca_smote
gc.collect()

#### CORRELAÇÃO

##### UNDERSAMPLING

In [None]:
# MLP grid: hidden-layer size x activation x initial learning rate.
param_grid_categorical = dict(
    hidden_layer_sizes=[5, 10, 15],
    activation=["logistic", "relu"],
    learning_rate_init=[0.001, 0.01, 0.05, 0.1],
)

# Run the cross-validation helper on the undersampled data restricted to the
# columns in filtro_correlacao.
# NOTE(review): k_folds=10 here vs. 5 in the RF/SVM cells — confirm intentional.
(
    results_validation,
    results_test,
    best_params,
    results_list_mlp_corr_under,
) = run_cross_validation(
    df_train_under[filtro_correlacao],
    df_train_under["RainTomorrow"],
    df_test_under[filtro_correlacao],
    df_test_under["RainTomorrow"],
    MLPClassifier(random_state=42),
    param_grid_categorical,
    k_folds=10,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_mlp_corr_under.to_csv(
    "../../DATA/RESULTS/results_list_mlp_corr_under.csv",
    index=False,
    sep="|",
)

##### SEM BALANCEAMENTO

In [None]:
# MLP grid: hidden-layer size x activation x initial learning rate.
param_grid_categorical = dict(
    hidden_layer_sizes=[5, 10, 15],
    activation=["logistic", "relu"],
    learning_rate_init=[0.001, 0.01, 0.05, 0.1],
)

# Run the cross-validation helper on the unbalanced data restricted to the
# columns in filtro_correlacao.
# NOTE(review): k_folds=10 here vs. 5 in the RF/SVM cells — confirm intentional.
(
    results_validation,
    results_test,
    best_params,
    results_list_mlp_corr_full,
) = run_cross_validation(
    df_train_full[filtro_correlacao],
    df_train_full["RainTomorrow"],
    df_test_full[filtro_correlacao],
    df_test_full["RainTomorrow"],
    MLPClassifier(random_state=42),
    param_grid_categorical,
    k_folds=10,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_mlp_corr_full.to_csv(
    "../../DATA/RESULTS/results_list_mlp_corr_full.csv",
    index=False,
    sep="|",
)

##### OVERSAMPLING

In [None]:
# MLP grid: hidden-layer size x activation x initial learning rate.
param_grid_categorical = dict(
    hidden_layer_sizes=[5, 10, 15],
    activation=["logistic", "relu"],
    learning_rate_init=[0.001, 0.01, 0.05, 0.1],
)

# Run the cross-validation helper on the oversampled data restricted to the
# columns in filtro_correlacao.
# NOTE(review): k_folds=10 here vs. 5 in the RF/SVM cells — confirm intentional.
(
    results_validation,
    results_test,
    best_params,
    results_list_mlp_corr_smote,
) = run_cross_validation(
    df_train_smote[filtro_correlacao],
    df_train_smote["RainTomorrow"],
    df_test_smote[filtro_correlacao],
    df_test_smote["RainTomorrow"],
    MLPClassifier(random_state=42),
    param_grid_categorical,
    k_folds=10,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_mlp_corr_smote.to_csv(
    "../../DATA/RESULTS/results_list_mlp_corr_smote.csv",
    index=False,
    sep="|",
)

##### LIBERA MEMÓRIA

In [None]:
# Release the MLP + correlation-filter result tables, then force a GC pass.
del results_list_mlp_corr_under, results_list_mlp_corr_full, results_list_mlp_corr_smote
gc.collect()

#### RANDOM FOREST - FEATURE IMPORTANCE

##### UNDERSAMPLING

In [None]:
# MLP grid: hidden-layer size x activation x initial learning rate.
param_grid_categorical = dict(
    hidden_layer_sizes=[5, 10, 15],
    activation=["logistic", "relu"],
    learning_rate_init=[0.001, 0.01, 0.05, 0.1],
)

# Run the cross-validation helper on the undersampled data restricted to the
# columns in filtro_random_forest.
# NOTE(review): k_folds=10 here vs. 5 in the RF/SVM cells — confirm intentional.
(
    results_validation,
    results_test,
    best_params,
    results_list_mlp_rf_under,
) = run_cross_validation(
    df_train_under[filtro_random_forest],
    df_train_under["RainTomorrow"],
    df_test_under[filtro_random_forest],
    df_test_under["RainTomorrow"],
    MLPClassifier(random_state=42),
    param_grid_categorical,
    k_folds=10,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_mlp_rf_under.to_csv(
    "../../DATA/RESULTS/results_list_mlp_rf_under.csv",
    index=False,
    sep="|",
)

##### SEM BALANCEAMENTO

In [None]:
# MLP grid: hidden-layer size x activation x initial learning rate.
param_grid_categorical = dict(
    hidden_layer_sizes=[5, 10, 15],
    activation=["logistic", "relu"],
    learning_rate_init=[0.001, 0.01, 0.05, 0.1],
)

# Run the cross-validation helper on the unbalanced data restricted to the
# columns in filtro_random_forest.
# NOTE(review): k_folds=10 here vs. 5 in the RF/SVM cells — confirm intentional.
(
    results_validation,
    results_test,
    best_params,
    results_list_mlp_rf_full,
) = run_cross_validation(
    df_train_full[filtro_random_forest],
    df_train_full["RainTomorrow"],
    df_test_full[filtro_random_forest],
    df_test_full["RainTomorrow"],
    MLPClassifier(random_state=42),
    param_grid_categorical,
    k_folds=10,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_mlp_rf_full.to_csv(
    "../../DATA/RESULTS/results_list_mlp_rf_full.csv",
    index=False,
    sep="|",
)

##### OVERSAMPLING

In [None]:
# MLP grid: hidden-layer size x activation x initial learning rate.
param_grid_categorical = dict(
    hidden_layer_sizes=[5, 10, 15],
    activation=["logistic", "relu"],
    learning_rate_init=[0.001, 0.01, 0.05, 0.1],
)

# Run the cross-validation helper on the oversampled data restricted to the
# columns in filtro_random_forest.
# NOTE(review): k_folds=10 here vs. 5 in the RF/SVM cells — confirm intentional.
(
    results_validation,
    results_test,
    best_params,
    results_list_mlp_rf_smote,
) = run_cross_validation(
    df_train_smote[filtro_random_forest],
    df_train_smote["RainTomorrow"],
    df_test_smote[filtro_random_forest],
    df_test_smote["RainTomorrow"],
    MLPClassifier(random_state=42),
    param_grid_categorical,
    k_folds=10,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

In [None]:
# Persist this experiment's results table as a pipe-delimited CSV, without the index.
results_list_mlp_rf_smote.to_csv(
    "../../DATA/RESULTS/results_list_mlp_rf_smote.csv",
    index=False,
    sep="|",
)

##### LIBERA MEMÓRIA

In [None]:
# Release the MLP + RF-importance-filter result tables, then force a GC pass.
del results_list_mlp_rf_under, results_list_mlp_rf_full, results_list_mlp_rf_smote
gc.collect()

In [None]:
# Record the end timestamp and report the elapsed time since inicio_processo.
final_processo = datetime.now()
print(
    f'PROCESSO DE MODELAGEM FINALIZADO EM: {final_processo}\n\n'
    f' => TOTAL UTILIZADO: {final_processo - inicio_processo}'
)