# IMPORTA BIBLIOTECAS

In [1]:
# MANIPULAÇÃO DE DADOS
import pandas as pd
import numpy as np
from datetime import datetime

# VISUALIZAÇÃO DE DADOS
import matplotlib.pyplot as plt

# TRANSFORMAÇÕES
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from imblearn import over_sampling


# PREPARAÇÃO TREINO E AVALIAÇÃO
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# MODELOS UTILIZADOS
from mixed_naive_bayes import MixedNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# CONFIGURAÇÕES DE EXIBIÇÃO
import warnings

warnings.filterwarnings("ignore")

# pd.set_option('display.max_rows', None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)

# IMPORTAÇÃO DE DADOS

In [2]:
weather_aus = pd.read_csv("../DATA/weatherAUS.csv").dropna()

print(
    f"O dataset possui {weather_aus.shape[0]:,} instâncias (linhas) e {weather_aus.shape[1]:,} características (colunas)."
)

print(f"As características (colunas) do dataset são: {weather_aus.columns.to_list()}")

O dataset possui 56,420 instâncias (linhas) e 23 características (colunas).
As características (colunas) do dataset são: ['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RainTomorrow']


# FUNÇÕES PARA PRÉ-PROCESSAMENTO


In [3]:
def split_data(df, target_column):
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Dividir a base em treino e teste, mantendo a proporção das classes
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    df_train = pd.concat([X_train, y_train], axis=1)
    df_test = pd.concat([X_test, y_test], axis=1)

    return df_train, df_test

In [4]:
def adjust_data_types(df):
    df.RainTomorrow = df.RainTomorrow.map({"Yes": 1, "No": 0})

    return df


In [5]:
def create_auxiliar_columns(df):
    # COLUNAS SELECIONADAS COM BASE NAS AVALIAÇÕES ANTERIORES
    df = df[
        [
            "MinTemp",
            "MaxTemp",
            "Rainfall",
            "Evaporation",
            "Sunshine",
            "WindGustSpeed",
            "WindSpeed9am",
            "WindSpeed3pm",
            "Humidity9am",
            "Humidity3pm",
            "Pressure3pm",
            "Cloud9am",
            "Cloud3pm",
            "RainTomorrow",
            "WindGustDir",
            "WindDir3pm",
        ]
    ]

    df["RangeTemp"] = df["MaxTemp"] - df["MinTemp"]

    # SEPARA AS VARIÁVEIS CATEGÓRICAS E NUMÉRICAS
    # CATEGORIZA AS VARIÁVEIS NECESSÁRIAS
    categorical_columns = pd.get_dummies(
        df.select_dtypes(include=["object", "datetime64"]),
        columns=["WindGustDir", "WindDir3pm"],
        drop_first=True,
        prefix=["WindGustDir", "WindDir3pm"],
        dtype=int,
    )

    numerical_columns = df.select_dtypes(include=["int64", "float64"])

    df = pd.concat([numerical_columns, categorical_columns], axis=1)

    return df


In [6]:
def instance_transformations(n_components=5):
    discretizer = KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="kmeans", random_state=42)
    smote = over_sampling.SMOTE(random_state=42)

    scaler = StandardScaler()
    pca = PCA(n_components=n_components, random_state=42)

    return discretizer, scaler, pca, smote

In [7]:
def adjust_train_volume(df, target_column, smote):
    X = df.drop(columns=[target_column])
    y = df[target_column]

    X_res, y_res = smote.fit_resample(X, y)
    df_smote = pd.concat([X_res, y_res], axis=1)

    return df_smote

In [8]:
def fit_transformmations(df, cols, discretizer, scaler, pca, discrete_col="Rainfall"):
    discretizer.fit(df[discrete_col].values.reshape(-1, 1))

    scaler.fit(df[cols])
    pca.fit(df[cols])

    return discretizer, scaler, pca


In [9]:
def transform_data(df, cols, discretizer, scaler, pca, discrete_col="Rainfall"):
    df[discrete_col + "_Discretized"] = discretizer.transform(
        df[discrete_col].values.reshape(-1, 1)
    )
    df[cols] = scaler.transform(df[cols])
    df = df.reset_index(drop=True)

    pca_result = pca.transform(df[cols])

    df_pca = pd.DataFrame(
        data=pca_result,
        columns=[f"PC{comp}" for comp in range(1, pca.n_components_ + 1)],
    )
    df_pca = df_pca.reset_index(drop=True)
    df_pca["RainTomorrow"] = df["RainTomorrow"]

    return df, df_pca

# INICIO PRÉ-PROCESSAMENTO

SEPARA EM TREINO E TESTE

In [10]:
df_train, df_test = split_data(weather_aus, "RainTomorrow")

## PRÉ-PROCESSAMENTO NA BASE DE TREINO

AJUSTA TIPOS DE DADOS NO TREINO

In [11]:
df_train = adjust_data_types(df_train)

CRIA VARIÁVEIS AUXILIARES

In [12]:
df_train = create_auxiliar_columns(df_train)

CRIA AS INSTÂNCIAS DAS TRANSFORMAÇÕES

In [13]:
discretizer, scaler, pca, smote = instance_transformations()

AJUSTA O VOLUME DE TREINO

In [14]:
(
    df_train.RainTomorrow.value_counts(dropna=False),
    df_train.RainTomorrow.value_counts(normalize=True, dropna=False),
)

(RainTomorrow
 0    35194
 1     9942
 Name: count, dtype: int64,
 RainTomorrow
 0    0.779732
 1    0.220268
 Name: proportion, dtype: float64)

In [15]:
# df_train = adjust_train_volume(df_train, "RainTomorrow", smote)

In [16]:
(
    df_train.RainTomorrow.value_counts(dropna=False),
    df_train.RainTomorrow.value_counts(normalize=True, dropna=False),
)

(RainTomorrow
 0    35194
 1     9942
 Name: count, dtype: int64,
 RainTomorrow
 0    0.779732
 1    0.220268
 Name: proportion, dtype: float64)

TREINA AS TRANSFORMAÇÕES COM A BASE DE TREINO

In [17]:
continuous_columns = [
    "MinTemp",
    "MaxTemp",
    "Rainfall",
    "Evaporation",
    "Sunshine",
    "WindGustSpeed",
    "WindSpeed9am",
    "WindSpeed3pm",
    "Humidity9am",
    "Humidity3pm",
    "Pressure3pm",
    "Cloud9am",
    "Cloud3pm",
    "RangeTemp",
]

categorical_columns = [
    "WindGustDir_ENE",
    "WindGustDir_ESE",
    "WindGustDir_N",
    "WindGustDir_NE",
    "WindGustDir_NNE",
    "WindGustDir_NNW",
    "WindGustDir_NW",
    "WindGustDir_S",
    "WindGustDir_SE",
    "WindGustDir_SSE",
    "WindGustDir_SSW",
    "WindGustDir_SW",
    "WindGustDir_W",
    "WindGustDir_WNW",
    "WindGustDir_WSW",
    "WindDir3pm_ENE",
    "WindDir3pm_ESE",
    "WindDir3pm_N",
    "WindDir3pm_NE",
    "WindDir3pm_NNE",
    "WindDir3pm_NNW",
    "WindDir3pm_NW",
    "WindDir3pm_S",
    "WindDir3pm_SE",
    "WindDir3pm_SSE",
    "WindDir3pm_SSW",
    "WindDir3pm_SW",
    "WindDir3pm_W",
    "WindDir3pm_WNW",
    "WindDir3pm_WSW",
    "Rainfall_Discretized",
]

discretizer, scaler, pca = fit_transformmations(
    df_train, continuous_columns, discretizer, scaler, pca
)

APLICA AS TRANSFORMAÇÕES NA BASE DE TREINO

In [18]:
df_train, df_train_pca = transform_data(
    df_train, continuous_columns, discretizer, scaler, pca
)

## APLICAÇÕES SOBRE BASE DE TESTE

AJUSTA OS TIPOS DE DADOS

In [19]:
df_test = adjust_data_types(df_test)

CRIA AS COLUNAS AUXILIARES

In [20]:
df_test = create_auxiliar_columns(df_test)

APLICA AS TRANSFORMAÇÕES JÁ TREINADAS

In [21]:
df_test, df_test_pca = transform_data(
    df_test, continuous_columns, discretizer, scaler, pca
)

In [22]:
(
    df_test.RainTomorrow.value_counts(dropna=False),
    df_test.RainTomorrow.value_counts(normalize=True, dropna=False),
)

(RainTomorrow
 0    8799
 1    2485
 Name: count, dtype: int64,
 RainTomorrow
 0    0.779777
 1    0.220223
 Name: proportion, dtype: float64)

# PROCESSO DE MODELAGEM

FUNÇÃO DE APLICAÇÃO DE MODELOS

In [23]:
def run_cross_validation(
    train_data, train_target, test_data, test_target, model, param_grid, k_folds=10
):
    skf = StratifiedKFold(n_splits=k_folds)
    scores_validation = {"f1": [], "accuracy": [], "precision": [], "recall": []}
    scores_test = {"f1": [], "accuracy": [], "precision": [], "recall": []}
    best_params = None
    best_model = None
    best_f1 = 0
    counter = 1
    print(f"====> INICIANDO PROCESSAMENTO: {datetime.now()} <====")

    for train_index, valid_index in skf.split(train_data, train_target):
        inicio = datetime.now()
        print(f"====> {inicio} | EXECUTANDO FOLD {counter} ")
        
        # Use .iloc to select rows based on indices
        X_train_fold, X_valid_fold = (
            train_data.iloc[train_index],
            train_data.iloc[valid_index],
        )
        y_train_fold, y_valid_fold = (
            train_target.iloc[train_index],
            train_target.iloc[valid_index],
        )

        # Etapa 2: Treinamento e ajuste de hiperparâmetros com validação cruzada interna
        grid_search = GridSearchCV(
            model, param_grid, scoring="f1", cv=StratifiedKFold(n_splits=3), n_jobs=-1, verbose=3
        )
        grid_search.fit(X_train_fold, y_train_fold)
        model = grid_search.best_estimator_
        f1 = grid_search.best_score_

        if f1 > best_f1:
            best_model = model
            best_params = grid_search.best_params_
            best_f1 = f1

        # Avaliação nos dados de validação
        y_valid_pred = model.predict(X_valid_fold)

        # Cálculo das métricas no fold
        scores_validation["f1"].append(
            f1_score(y_valid_fold, y_valid_pred, average="macro")
        )
        scores_validation["accuracy"].append(accuracy_score(y_valid_fold, y_valid_pred))
        scores_validation["precision"].append(
            precision_score(y_valid_fold, y_valid_pred, average="macro")
        )
        scores_validation["recall"].append(
            recall_score(y_valid_fold, y_valid_pred, average="macro")
        )

        # Avaliar na base de teste com o melhor modelo
        y_test_pred = model.predict(test_data)
        f1_test = f1_score(test_target, y_test_pred, average="macro")
        accuracy_test = accuracy_score(test_target, y_test_pred)
        precision_test = precision_score(test_target, y_test_pred, average="macro")
        recall_test = recall_score(test_target, y_test_pred, average="macro")

        # Armazenar as métricas na base de teste
        scores_test["f1"].append(f1_test)
        scores_test["accuracy"].append(accuracy_test)
        scores_test["precision"].append(precision_test)
        scores_test["recall"].append(recall_test)
        
        counter += 1
        final = datetime.now()
        print(f"===> {final} | FINALIZADO FOLD {counter - 1} | TEMPO TOTAL {final - inicio}\n")

    # Calcula as médias das métricas nos folds de validação
    avg_scores_validation = {
        metric: np.mean(values) for metric, values in scores_validation.items()
    }
    test_scores = {metric: np.mean(values) for metric, values in scores_test.items()}

    # Criar log dos resultados
    log = pd.DataFrame(scores_validation)
    log["dataset"] = "Validação Cruzada"

    test_log = pd.DataFrame(scores_test)
    test_log["dataset"] = "Teste"

    # Concatenar os logs da validação cruzada e do teste
    full_log = pd.concat([log, test_log])

    return avg_scores_validation, test_scores, best_params, full_log

In [24]:
def run_cross_mix(
    train_data,
    train_target,
    test_data,
    test_target,
    alphas,
    k_folds=10,
    categorical_features=None,
):
    skf = StratifiedKFold(n_splits=k_folds)
    scores_validation = {"f1": [], "accuracy": [], "precision": [], "recall": []}
    scores_test = {"f1": [], "accuracy": [], "precision": [], "recall": []}
    best_model = None
    best_alpha = None
    best_f1 = 0

    # Validação cruzada estratificada
    for train_index, valid_index in skf.split(train_data, train_target):
        X_train_fold, X_valid_fold = (
            train_data.iloc[train_index],
            train_data.iloc[valid_index],
        )
        y_train_fold, y_valid_fold = (
            train_target.iloc[train_index],
            train_target.iloc[valid_index],
        )

        # Iterar pelos valores de alpha
        for alpha in alphas:
            # Treina o modelo com o valor de alpha atual
            model = MixedNB(categorical_features=categorical_features, alpha=alpha)
            model.fit(X_train_fold, y_train_fold)

            # Avaliação nos dados de validação
            y_valid_pred = model.predict(X_valid_fold)
            f1 = f1_score(y_valid_fold, y_valid_pred, average="macro")
            accuracy = accuracy_score(y_valid_fold, y_valid_pred)
            precision = precision_score(y_valid_fold, y_valid_pred, average="macro")
            recall = recall_score(y_valid_fold, y_valid_pred, average="macro")

            # Seleciona o melhor modelo baseado na métrica F1-score
            if f1 > best_f1:
                best_f1 = f1
                best_model = model
                best_alpha = alpha

        # Armazenar as métricas para o fold atual
        scores_validation["f1"].append(f1)
        scores_validation["accuracy"].append(accuracy)
        scores_validation["precision"].append(precision)
        scores_validation["recall"].append(recall)

        # Avaliar na base de teste com o melhor modelo
        y_test_pred = model.predict(test_data)
        f1_test = f1_score(test_target, y_test_pred, average="macro")
        accuracy_test = accuracy_score(test_target, y_test_pred)
        precision_test = precision_score(test_target, y_test_pred, average="macro")
        recall_test = recall_score(test_target, y_test_pred, average="macro")

        # Armazenar as métricas na base de teste
        scores_test["f1"].append(f1_test)
        scores_test["accuracy"].append(accuracy_test)
        scores_test["precision"].append(precision_test)
        scores_test["recall"].append(recall_test)

    # Calcular as médias das métricas de validação cruzada
    avg_scores_validation = {
        metric: np.mean(values) for metric, values in scores_validation.items()
    }
    test_scores = {metric: np.mean(values) for metric, values in scores_test.items()}

    # Criar log dos resultados
    log = pd.DataFrame(scores_validation)
    log["dataset"] = "Validação Cruzada"

    test_log = pd.DataFrame(scores_test)
    test_log["dataset"] = "Teste"

    # Concatenar os logs da validação cruzada e do teste
    full_log = pd.concat([log, test_log])

    return avg_scores_validation, best_alpha, test_scores, full_log


## BAYES

### TODAS AS CARACTERÍSTICAS

In [25]:
# Parâmetros do modelo
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

# Executar para diferentes modelos e conjuntos de dados
results_validation, best_params, results_test, results_list_full = (
    run_cross_mix(
        df_train.drop(columns=["RainTomorrow"]),
        df_train["RainTomorrow"],
        df_test.drop(columns=["RainTomorrow"]),
        df_test["RainTomorrow"],
        alphas,
        k_folds=5,
        categorical_features=[
            df_train.drop(columns="RainTomorrow").columns.get_loc(col)
            for col in categorical_columns
        ],
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

Resultados na validação cruzada: {'f1': 0.7296689905352627, 'accuracy': 0.7943990703025074, 'precision': 0.7149275575461388, 'recall': 0.7578483264273889}
Resultados na base de teste: {'f1': 0.7340883015629052, 'accuracy': 0.7981566820276498, 'precision': 0.7190936055485329, 'recall': 0.7621159894930442}
Melhores parâmetros escolhidos: 0.0


In [26]:
results_list_full

Unnamed: 0,f1,accuracy,precision,recall,dataset
0,0.733274,0.797962,0.718511,0.760423,Validação Cruzada
1,0.728845,0.793619,0.714112,0.757099,Validação Cruzada
2,0.72996,0.796056,0.715631,0.75589,Validação Cruzada
3,0.72543,0.79096,0.710917,0.753524,Validação Cruzada
4,0.730836,0.793398,0.715466,0.762306,Validação Cruzada
0,0.733599,0.797767,0.718635,0.761606,Teste
1,0.733833,0.797944,0.718851,0.761864,Teste
2,0.73434,0.798653,0.719429,0.761886,Teste
3,0.735009,0.798653,0.719876,0.763474,Teste
4,0.73366,0.797767,0.718676,0.76175,Teste


### PCA

In [27]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_pca = run_cross_validation(
    df_train_pca.drop(columns=["RainTomorrow"]),
    df_train_pca["RainTomorrow"],
    df_test_pca.drop(columns=["RainTomorrow"]),
    df_test_pca["RainTomorrow"],
    GaussianNB(),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-07 10:08:19.884506 <====
====> 2024-11-07 10:08:19.894124 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-07 10:08:23.683040 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:03.788916

====> 2024-11-07 10:08:23.684038 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-07 10:08:23.974432 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.290394

====> 2024-11-07 10:08:23.974432 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-07 10:08:24.268128 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.293696

====> 2024-11-07 10:08:24.268128 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-07 10:08:24.564581 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.296453

====> 2024-11-07 10:08:24.564581 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-07 10:08:24.834439 | FINA

In [28]:
results_list_pca

Unnamed: 0,f1,accuracy,precision,recall,dataset
0,0.651889,0.817235,0.767812,0.628322,Validação Cruzada
1,0.646924,0.812784,0.751365,0.624937,Validação Cruzada
2,0.645609,0.812895,0.752616,0.623749,Validação Cruzada
3,0.647407,0.812452,0.749164,0.62545,Validação Cruzada
4,0.650158,0.814446,0.755792,0.627451,Validação Cruzada
0,0.64731,0.815314,0.762573,0.624721,Teste
1,0.649709,0.815757,0.762382,0.626738,Teste
2,0.648382,0.815402,0.76195,0.625644,Teste
3,0.652945,0.816909,0.764795,0.629353,Teste
4,0.651923,0.8162,0.76237,0.62861,Teste


### CORRELAÇÃO

In [29]:
# Parâmetros do modelo
alphas = [0.0, 0.01, 0.1, 0.5, 1.0]

filtro_correlacao = [
    "MinTemp",
    "MaxTemp",
    "Rainfall",
    "Evaporation",
    "Sunshine",
    "WindGustSpeed",
    "WindSpeed3pm",
    "Humidity9am",
    "Humidity3pm",
    "Pressure3pm",
    "Cloud9am",
    "Cloud3pm",
    "RangeTemp",
    "Rainfall_Discretized",
]

# Executar para diferentes modelos e conjuntos de dados
results_validation, best_params, results_test, results_list_corr = (
    run_cross_mix(
        df_train[filtro_correlacao],
        df_train["RainTomorrow"],
        df_test[filtro_correlacao],
        df_test["RainTomorrow"],
        alphas,
        k_folds=5,
        categorical_features=[13],
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

Resultados na validação cruzada: {'f1': 0.729130514728624, 'accuracy': 0.7941110430339464, 'precision': 0.7144677407519489, 'recall': 0.7570501118962696}
Resultados na base de teste: {'f1': 0.7343663709477923, 'accuracy': 0.7981921304501949, 'precision': 0.7192924424334632, 'recall': 0.7627162497658985}
Melhores parâmetros escolhidos: 0.0


In [30]:
results_list_corr

Unnamed: 0,f1,accuracy,precision,recall,dataset
0,0.733689,0.798073,0.718824,0.761215,Validação Cruzada
1,0.728303,0.793619,0.713735,0.755836,Validação Cruzada
2,0.729459,0.795945,0.715265,0.754916,Validação Cruzada
3,0.725111,0.790628,0.710599,0.753311,Validação Cruzada
4,0.729091,0.79229,0.713916,0.759972,Validação Cruzada
0,0.734076,0.797944,0.719015,0.762442,Teste
1,0.734568,0.798387,0.719492,0.76287,Teste
2,0.734411,0.798476,0.719416,0.762349,Teste
3,0.734233,0.797855,0.719093,0.762962,Teste
4,0.734543,0.798298,0.719446,0.762958,Teste


### RANDOM FOREST - FEATURE IMPORTANCE

In [31]:
# Parâmetros do modelo
param_grid_categorical = {"var_smoothing": [0.0, 0.01, 0.1, 0.5, 1.0]}

caracteristicas_filtro = [
    "Humidity3pm",
    "Cloud3pm",
    "Sunshine",
    "Rainfall",
    "Pressure3pm",
    "Cloud9am",
    "WindGustSpeed",
    "RangeTemp",
    "Humidity9am",
    "MinTemp",
]

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rd = run_cross_validation(
    df_train[caracteristicas_filtro],
    df_train["RainTomorrow"],
    df_test[caracteristicas_filtro],
    df_test["RainTomorrow"],
    GaussianNB(),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-07 10:08:26.322979 <====
====> 2024-11-07 10:08:26.333503 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-07 10:08:26.701785 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:00.368282

====> 2024-11-07 10:08:26.701785 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-07 10:08:27.034261 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:00.332476

====> 2024-11-07 10:08:27.034261 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-07 10:08:27.364185 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:00.329924

====> 2024-11-07 10:08:27.364185 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-07 10:08:27.682542 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:00.318357

====> 2024-11-07 10:08:27.682542 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 5 candidates, totalling 15 fits
===> 2024-11-07 10:08:28.004560 | FINA

In [32]:
results_list_rd

Unnamed: 0,f1,accuracy,precision,recall,dataset
0,0.738495,0.803611,0.724036,0.763324,Validação Cruzada
1,0.732757,0.801041,0.71951,0.754284,Validação Cruzada
2,0.735917,0.804254,0.722883,0.756454,Validação Cruzada
3,0.727244,0.794395,0.713254,0.752297,Validação Cruzada
4,0.735195,0.798826,0.72006,0.763621,Validação Cruzada
0,0.739033,0.803261,0.724179,0.765274,Teste
1,0.739156,0.805388,0.725235,0.762017,Teste
2,0.739494,0.80592,0.725695,0.761925,Teste
3,0.73824,0.802021,0.723189,0.765489,Teste
4,0.73878,0.802818,0.723842,0.765423,Teste


## RANDOM FOREST - CLASSIFIER

### TODAS AS CARACTERÍSTICAS

In [33]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_full = run_cross_validation(
    df_train.drop(columns=["RainTomorrow"]),
    df_train["RainTomorrow"],
    df_test.drop(columns=["RainTomorrow"]),
    df_test["RainTomorrow"],
    RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-07 10:08:28.051561 <====
====> 2024-11-07 10:08:28.058678 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-07 10:15:12.143682 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:06:44.085004

====> 2024-11-07 10:15:12.143682 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-07 10:22:18.325774 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:07:06.182092

====> 2024-11-07 10:22:18.326767 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-07 10:28:45.738219 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:06:27.411452

====> 2024-11-07 10:28:45.739218 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-07 10:35:22.797450 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:06:37.058232

====> 2024-11-07 10:35:22.798451 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-07 10:42:07.517965 |

In [34]:
results_list_full

Unnamed: 0,f1,accuracy,precision,recall,dataset
0,0.772136,0.859216,0.813603,0.747222,Validação Cruzada
1,0.772699,0.861083,0.820225,0.745541,Validação Cruzada
2,0.772653,0.861748,0.822806,0.744591,Validação Cruzada
3,0.768465,0.855766,0.805085,0.745628,Validação Cruzada
4,0.771291,0.859089,0.813969,0.745954,Validação Cruzada
0,0.768863,0.858295,0.813832,0.742809,Teste
1,0.768149,0.858472,0.815393,0.741334,Teste
2,0.767526,0.859979,0.822408,0.738257,Teste
3,0.764204,0.855725,0.809527,0.738273,Teste
4,0.763539,0.85537,0.808953,0.737613,Teste


### PCA

In [25]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_pca = run_cross_validation(
    df_train_pca.drop(columns=["RainTomorrow"]),
    df_train_pca["RainTomorrow"],
    df_test_pca.drop(columns=["RainTomorrow"]),
    df_test_pca["RainTomorrow"],
    RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-07 11:22:45.069247 <====
====> 2024-11-07 11:22:45.077546 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 12 candidates, totalling 36 fits


===> 2024-11-07 11:30:55.209521 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:08:10.131975

====> 2024-11-07 11:30:55.209521 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-07 11:38:43.939856 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:07:48.730335

====> 2024-11-07 11:38:43.939856 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-07 11:46:38.755321 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:07:54.815465

====> 2024-11-07 11:46:38.755321 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-07 11:54:30.385848 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:07:51.630527

====> 2024-11-07 11:54:30.385848 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-07 12:02:26.722473 | FINALIZADO FOLD 5 | TEMPO TOTAL 0:07:56.336625

Resultados na validação cruzada: {'f1': 0.7392379290381368, 'accuracy': 0.8421658644408427, 'precision': 0.7865196609375363, 'reca

In [26]:
results_list_pca

Unnamed: 0,f1,accuracy,precision,recall,dataset
0,0.742201,0.843708,0.788993,0.717078,Validação Cruzada
1,0.738087,0.843913,0.79366,0.710724,Validação Cruzada
2,0.741128,0.843802,0.790124,0.715397,Validação Cruzada
3,0.732697,0.835715,0.770935,0.710934,Validação Cruzada
4,0.742076,0.843691,0.788887,0.71695,Validação Cruzada
0,0.733585,0.839685,0.782633,0.708352,Teste
1,0.736531,0.8419,0.787665,0.710494,Teste
2,0.735644,0.841546,0.787226,0.709545,Teste
3,0.736425,0.840482,0.782946,0.711751,Teste
4,0.73476,0.840748,0.785244,0.709034,Teste


### CORRELAÇÃO

In [27]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

filtro_correlacao = [
    "MinTemp",
    "MaxTemp",
    "Rainfall",
    "Evaporation",
    "Sunshine",
    "WindGustSpeed",
    "WindSpeed3pm",
    "Humidity9am",
    "Humidity3pm",
    "Pressure3pm",
    "Cloud9am",
    "Cloud3pm",
    "RangeTemp",
    "Rainfall_Discretized",
]

# Executar para diferentes modelos e conjuntos de dados
results_validation, best_params, results_test, results_list_corr = (
    run_cross_validation(
        df_train[filtro_correlacao],
        df_train["RainTomorrow"],
        df_test[filtro_correlacao],
        df_test["RainTomorrow"],
        RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-07 12:02:26.912570 <====
====> 2024-11-07 12:02:26.922090 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-07 12:08:10.273799 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:05:43.351709

====> 2024-11-07 12:08:10.273799 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-07 12:13:52.062492 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:05:41.788693

====> 2024-11-07 12:13:52.062492 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-07 12:19:35.739005 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:05:43.676513

====> 2024-11-07 12:19:35.739005 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-07 12:25:36.950514 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:06:01.211509

====> 2024-11-07 12:25:36.950514 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-07 12:31:33.121388 |

In [28]:
results_list_corr

Unnamed: 0,f1,accuracy,precision,recall,dataset
0,0.775512,0.861763,0.819249,0.749577,Validação Cruzada
1,0.771429,0.860086,0.817871,0.744721,Validação Cruzada
2,0.774701,0.861748,0.82007,0.748201,Validação Cruzada
3,0.763784,0.853107,0.800671,0.741036,Validação Cruzada
4,0.775267,0.860862,0.815921,0.75052,Validação Cruzada
0,0.768475,0.857675,0.811932,0.742988,Teste
1,0.767381,0.858029,0.814602,0.740617,Teste
2,0.768451,0.859093,0.817482,0.74101,Teste
3,0.764846,0.855548,0.808144,0.739603,Teste
4,0.766202,0.857231,0.812931,0.739672,Teste


### RAIN FOREST - FEATURE IMPORTANCE

In [29]:
param_grid_categorical = {
    "n_estimators": [500, 700, 1000],
    "max_features": ["sqrt", "log2", 10, None],
}

caracteristicas_filtro = [
    "Humidity3pm",
    "Cloud3pm",
    "Sunshine",
    "Rainfall",
    "Pressure3pm",
    "Cloud9am",
    "WindGustSpeed",
    "RangeTemp",
    "Humidity9am",
    "MinTemp",
]

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rd = run_cross_validation(
    df_train[caracteristicas_filtro],
    df_train["RainTomorrow"],
    df_test[caracteristicas_filtro],
    df_test["RainTomorrow"],
    RandomForestClassifier(n_jobs=-1, random_state=42, max_depth=None),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-07 12:31:33.461413 <====
====> 2024-11-07 12:31:33.471504 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-07 12:36:39.466892 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:05:05.995388

====> 2024-11-07 12:36:39.466892 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-07 12:41:38.479499 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:04:59.012607

====> 2024-11-07 12:41:38.479499 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-07 12:46:32.453414 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:04:53.973915

====> 2024-11-07 12:46:32.453414 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-07 12:51:28.874736 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:04:56.421322

====> 2024-11-07 12:51:28.874736 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 12 candidates, totalling 36 fits
===> 2024-11-07 12:56:26.271803 |

In [30]:
results_list_rd

Unnamed: 0,f1,accuracy,precision,recall,dataset
0,0.772623,0.860767,0.819002,0.745872,Validação Cruzada
1,0.768832,0.8592,0.81765,0.741448,Validação Cruzada
2,0.767764,0.858646,0.816577,0.740436,Validação Cruzada
3,0.763338,0.855101,0.808184,0.737622,Validação Cruzada
4,0.770653,0.859422,0.81602,0.744363,Validação Cruzada
0,0.76393,0.856079,0.811193,0.737345,Teste
1,0.76643,0.85794,0.815433,0.739116,Teste
2,0.767432,0.859093,0.818809,0.739277,Teste
3,0.768084,0.858561,0.815824,0.741102,Teste
4,0.765626,0.857763,0.815738,0.737992,Teste


## SVM

### TODAS AS CARACTERÍSTICAS

In [25]:
param_grid_categorical = {
    'C': [10, 100],
    'kernel': ['rbf', 'linear'],
    'gamma': [0.01, 0.1]
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_full = run_cross_validation(
    df_train.drop(columns=["RainTomorrow"]),
    df_train["RainTomorrow"],
    df_test.drop(columns=["RainTomorrow"]),
    df_test["RainTomorrow"],
    SVC(random_state=42),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-07 15:58:47.068242 <====
====> 2024-11-07 15:58:47.068242 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 16:39:16.489588 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:40:29.421346

====> 2024-11-07 16:39:16.489588 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 17:21:35.292472 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:42:18.802884

====> 2024-11-07 17:21:35.292472 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 18:04:09.977381 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:42:34.684909

====> 2024-11-07 18:04:09.977381 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 18:45:11.773897 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:41:01.796516

====> 2024-11-07 18:45:11.773897 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 19:28:01.332865 | FINA

In [26]:
results_list_full

Unnamed: 0,f1,accuracy,precision,recall,dataset
0,0.770272,0.853899,0.797521,0.751568,Validação Cruzada
1,0.770828,0.855655,0.802584,0.749995,Validação Cruzada
2,0.770362,0.854991,0.800702,0.750184,Validação Cruzada
3,0.771538,0.854326,0.797634,0.753368,Validação Cruzada
4,0.767196,0.852553,0.795859,0.7479,Validação Cruzada
0,0.763989,0.851471,0.7952,0.743631,Teste
1,0.758471,0.848547,0.790724,0.737857,Teste
2,0.756086,0.847395,0.789098,0.735242,Teste
3,0.762606,0.850319,0.792806,0.742748,Teste
4,0.763908,0.851471,0.795268,0.743487,Teste


### PCA

In [27]:
param_grid_categorical = {
    'C': [10, 100],
    'kernel': ['rbf', 'linear'],
    'gamma': [0.01, 0.1]
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_pca_linear = run_cross_validation(
    df_train_pca.drop(columns=["RainTomorrow"]),
    df_train_pca["RainTomorrow"],
    df_test_pca.drop(columns=["RainTomorrow"]),
    df_test_pca["RainTomorrow"],
    SVC(random_state=42),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-07 19:28:01.380466 <====
====> 2024-11-07 19:28:01.391140 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 19:30:52.368341 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:02:50.977201

====> 2024-11-07 19:30:52.369343 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 19:33:39.171600 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:02:46.802257

====> 2024-11-07 19:33:39.172566 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 19:36:27.374044 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:02:48.201478

====> 2024-11-07 19:36:27.375040 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 19:39:12.872888 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:02:45.497848

====> 2024-11-07 19:39:12.873889 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 19:42:00.675893 | FINA

In [28]:
results_list_pca_linear

Unnamed: 0,f1,accuracy,precision,recall,dataset
0,0.737194,0.845813,0.802006,0.707607,Validação Cruzada
1,0.728986,0.841254,0.792863,0.700363,Validação Cruzada
2,0.731697,0.841697,0.791549,0.70376,Validação Cruzada
3,0.721621,0.838374,0.788925,0.692966,Validação Cruzada
4,0.729198,0.840146,0.788193,0.701683,Validação Cruzada
0,0.72644,0.840305,0.791544,0.697776,Teste
1,0.729784,0.840748,0.789861,0.701959,Teste
2,0.728862,0.840482,0.789769,0.700922,Teste
3,0.725234,0.840482,0.793529,0.696013,Teste
4,0.730102,0.840659,0.789217,0.70248,Teste


### CORRELAÇÃO

In [25]:
param_grid_categorical = {
    'C': [10, 100],
    'kernel': ['rbf', 'linear'],
    'gamma': [0.01, 0.1]
}


filtro_correlacao = [
    "MinTemp",
    "MaxTemp",
    "Rainfall",
    "Evaporation",
    "Sunshine",
    "WindGustSpeed",
    "WindSpeed3pm",
    "Humidity9am",
    "Humidity3pm",
    "Pressure3pm",
    "Cloud9am",
    "Cloud3pm",
    "RangeTemp",
    "Rainfall_Discretized",
]

# Executar para diferentes modelos e conjuntos de dados
results_validation, best_params, results_test, results_list_corr = (
    run_cross_validation(
        df_train[filtro_correlacao],
        df_train["RainTomorrow"],
        df_test[filtro_correlacao],
        df_test["RainTomorrow"],
        SVC(random_state=42),
        param_grid_categorical,
        k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-07 20:50:24.709700 <====
====> 2024-11-07 20:50:24.720645 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 21:04:10.235880 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:13:45.515235

====> 2024-11-07 21:04:10.235880 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 21:17:15.176439 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:13:04.940559

====> 2024-11-07 21:17:15.177396 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 21:29:51.827679 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:12:36.650283

====> 2024-11-07 21:29:51.828540 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 21:42:28.316546 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:12:36.488006

====> 2024-11-07 21:42:28.317543 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 21:55:37.066217 | FINA

In [26]:
results_list_corr

Unnamed: 0,f1,accuracy,precision,recall,dataset
0,0.766103,0.858883,0.819899,0.73727,Validação Cruzada
1,0.761829,0.85776,0.821027,0.731687,Validação Cruzada
2,0.765405,0.858646,0.81967,0.736466,Validação Cruzada
3,0.762196,0.855323,0.810334,0.735418,Validação Cruzada
4,0.766998,0.858536,0.81711,0.739282,Validação Cruzada
0,0.758568,0.856345,0.819149,0.728275,Teste
1,0.761101,0.856966,0.818358,0.731561,Teste
2,0.760562,0.856966,0.819092,0.730694,Teste
3,0.761509,0.856168,0.814507,0.733215,Teste
4,0.759157,0.855991,0.816826,0.729636,Teste


### RANDOM FOREST - FEATURE IMPORTANDCE

In [27]:
param_grid_categorical = {
    'C': [10, 100],
    'kernel': ['rbf', 'linear'],
    'gamma': [0.01, 0.1]
}

caracteristicas_filtro = [
    "Humidity3pm",
    "Cloud3pm",
    "Sunshine",
    "Rainfall",
    "Pressure3pm",
    "Cloud9am",
    "WindGustSpeed",
    "RangeTemp",
    "Humidity9am",
    "MinTemp",
]

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rd = run_cross_validation(
    df_train[caracteristicas_filtro],
    df_train["RainTomorrow"],
    df_test[caracteristicas_filtro],
    df_test["RainTomorrow"],
    SVC(random_state=42),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-07 21:55:37.106019 <====
====> 2024-11-07 21:55:37.110610 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 22:06:50.859647 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:11:13.749037

====> 2024-11-07 22:06:50.860605 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 22:14:59.808668 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:08:08.948063

====> 2024-11-07 22:14:59.808668 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 22:25:42.868374 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:10:43.059706

====> 2024-11-07 22:25:42.868374 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 22:36:44.646299 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:11:01.777925

====> 2024-11-07 22:36:44.646299 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 8 candidates, totalling 24 fits
===> 2024-11-07 22:47:37.047757 | FINA

In [28]:
results_list_rd

Unnamed: 0,f1,accuracy,precision,recall,dataset
0,0.763346,0.85689,0.815286,0.73527,Validação Cruzada
1,0.760695,0.858536,0.826176,0.728939,Validação Cruzada
2,0.758655,0.853772,0.808493,0.731536,Validação Cruzada
3,0.754982,0.852332,0.807177,0.727364,Validação Cruzada
4,0.75999,0.854547,0.809932,0.732755,Validação Cruzada
0,0.75616,0.853952,0.812213,0.727319,Teste
1,0.757111,0.856523,0.822001,0.72579,Teste
2,0.761259,0.857409,0.820046,0.731267,Teste
3,0.756809,0.854041,0.811764,0.728242,Teste
4,0.757378,0.854927,0.81469,0.728088,Teste


## MLP

### TODAS AS CARACTERÍSTICAS

In [29]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1]
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_full = run_cross_validation(
    df_train.drop(columns=["RainTomorrow"]),
    df_train["RainTomorrow"],
    df_test.drop(columns=["RainTomorrow"]),
    df_test["RainTomorrow"],
    MLPClassifier(random_state=42),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-07 22:47:37.084174 <====
====> 2024-11-07 22:47:37.091158 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 22:49:06.173037 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:01:29.081879

====> 2024-11-07 22:49:06.173037 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 22:50:39.491742 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:01:33.318705

====> 2024-11-07 22:50:39.491742 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 22:52:17.236233 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:01:37.744491

====> 2024-11-07 22:52:17.236233 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 22:53:40.684858 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:01:23.448625

====> 2024-11-07 22:53:40.684858 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 22:55:18.373425 |

In [30]:
results_list_full

Unnamed: 0,f1,accuracy,precision,recall,dataset
0,0.772742,0.857333,0.806244,0.751064,Validação Cruzada
1,0.771778,0.858203,0.810335,0.748022,Validação Cruzada
2,0.775228,0.859643,0.811497,0.752265,Validação Cruzada
3,0.777055,0.857428,0.80236,0.759146,Validação Cruzada
4,0.777099,0.860086,0.811023,0.755076,Validação Cruzada
0,0.763077,0.85218,0.798255,0.741053,Teste
1,0.768558,0.855902,0.805451,0.745606,Teste
2,0.764169,0.853864,0.802861,0.740689,Teste
3,0.769948,0.85537,0.8023,0.748874,Teste
4,0.770295,0.856434,0.805511,0.747968,Teste


### PCA

In [31]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1]
}

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_pca = run_cross_validation(
    df_train_pca.drop(columns=["RainTomorrow"]),
    df_train_pca["RainTomorrow"],
    df_test_pca.drop(columns=["RainTomorrow"]),
    df_test_pca["RainTomorrow"],
    MLPClassifier(random_state=42),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-07 22:55:18.422115 <====
====> 2024-11-07 22:55:18.427075 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 22:55:54.245380 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:35.818305

====> 2024-11-07 22:55:54.245380 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 22:56:32.637162 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:38.391782

====> 2024-11-07 22:56:32.638117 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 22:57:08.880740 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:36.242623

====> 2024-11-07 22:57:08.880740 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 22:57:45.697555 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:36.816815

====> 2024-11-07 22:57:45.698559 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 22:58:15.905675 |

In [32]:
results_list_pca

Unnamed: 0,f1,accuracy,precision,recall,dataset
0,0.735989,0.84249,0.790448,0.709083,Validação Cruzada
1,0.723914,0.834497,0.773349,0.699275,Validação Cruzada
2,0.738676,0.836712,0.77018,0.719153,Validação Cruzada
3,0.728204,0.834718,0.770873,0.705241,Validação Cruzada
4,0.730765,0.832281,0.76262,0.711439,Validação Cruzada
0,0.730274,0.839419,0.784561,0.70385,Teste
1,0.730422,0.838355,0.780777,0.705045,Teste
2,0.73956,0.838444,0.774438,0.718674,Teste
3,0.730367,0.837026,0.776479,0.706214,Teste
4,0.730127,0.834101,0.767877,0.70867,Teste


### CORRELAÇÃO

In [33]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1]
}

filtro_correlacao = [
    "MinTemp",
    "MaxTemp",
    "Rainfall",
    "Evaporation",
    "Sunshine",
    "WindGustSpeed",
    "WindSpeed3pm",
    "Humidity9am",
    "Humidity3pm",
    "Pressure3pm",
    "Cloud9am",
    "Cloud3pm",
    "RangeTemp",
    "Rainfall_Discretized",
]

# Executar para diferentes modelos e conjuntos de dados
results_validation_v2, best_params_v2, results_test_v2, results_list_corr_v2 = (
    run_cross_validation(
        df_train[filtro_correlacao],
        df_train["RainTomorrow"],
        df_test[filtro_correlacao],
        df_test["RainTomorrow"],
    MLPClassifier(random_state=42),
    param_grid_categorical,
    k_folds=5,
    )
)

print("Resultados na validação cruzada:", results_validation_v2)
print("Resultados na base de teste:", results_test_v2)
print("Melhores parâmetros escolhidos:", best_params_v2)

====> INICIANDO PROCESSAMENTO: 2024-11-07 22:58:15.946074 <====
====> 2024-11-07 22:58:15.955827 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 22:59:02.475894 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:46.520067

====> 2024-11-07 22:59:02.476894 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 22:59:45.556796 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:43.079902

====> 2024-11-07 22:59:45.556796 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 23:00:29.329596 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:43.772800

====> 2024-11-07 23:00:29.329596 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 23:01:15.788577 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:46.458981

====> 2024-11-07 23:01:15.788577 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 23:02:04.731855 |

In [34]:
results_list_corr_v2

Unnamed: 0,f1,accuracy,precision,recall,dataset
0,0.769653,0.858219,0.81277,0.744238,Validação Cruzada
1,0.762752,0.85211,0.798513,0.740508,Validação Cruzada
2,0.743469,0.848898,0.807077,0.713611,Validação Cruzada
3,0.760715,0.849673,0.792371,0.740277,Validação Cruzada
4,0.765364,0.855212,0.806371,0.740941,Validação Cruzada
0,0.763335,0.854307,0.805278,0.738663,Teste
1,0.76916,0.85475,0.80102,0.748332,Teste
2,0.743855,0.851028,0.815879,0.712016,Teste
3,0.766953,0.854573,0.802525,0.744609,Teste
4,0.762535,0.854573,0.807111,0.736957,Teste


### RANDOM FOREST - FEATURE IMPORTANCE

In [35]:
param_grid_categorical = {
    "hidden_layer_sizes": [5, 10, 15],
    "activation": ["logistic", "relu"],
    "learning_rate_init": [0.001, 0.01, 0.05, 0.1]
}

caracteristicas_filtro = [
    "Humidity3pm",
    "Cloud3pm",
    "Sunshine",
    "Rainfall",
    "Pressure3pm",
    "Cloud9am",
    "WindGustSpeed",
    "RangeTemp",
    "Humidity9am",
    "MinTemp",
]

# Executar a função e obter os melhores parâmetros
results_validation, results_test, best_params, results_list_rd = run_cross_validation(
    df_train[caracteristicas_filtro],
    df_train["RainTomorrow"],
    df_test[caracteristicas_filtro],
    df_test["RainTomorrow"],
    MLPClassifier(random_state=42),
    param_grid_categorical,
    k_folds=5,
)

print("Resultados na validação cruzada:", results_validation)
print("Resultados na base de teste:", results_test)
print("Melhores parâmetros escolhidos:", best_params)

====> INICIANDO PROCESSAMENTO: 2024-11-07 23:02:04.766868 <====
====> 2024-11-07 23:02:04.773424 | EXECUTANDO FOLD 1 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 23:02:42.790517 | FINALIZADO FOLD 1 | TEMPO TOTAL 0:00:38.017093

====> 2024-11-07 23:02:42.791517 | EXECUTANDO FOLD 2 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 23:03:22.798183 | FINALIZADO FOLD 2 | TEMPO TOTAL 0:00:40.006666

====> 2024-11-07 23:03:22.799100 | EXECUTANDO FOLD 3 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 23:04:06.705586 | FINALIZADO FOLD 3 | TEMPO TOTAL 0:00:43.906486

====> 2024-11-07 23:04:06.705586 | EXECUTANDO FOLD 4 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 23:04:45.661706 | FINALIZADO FOLD 4 | TEMPO TOTAL 0:00:38.956120

====> 2024-11-07 23:04:45.662892 | EXECUTANDO FOLD 5 
Fitting 3 folds for each of 24 candidates, totalling 72 fits
===> 2024-11-07 23:05:23.785495 |

In [36]:
results_list_rd

Unnamed: 0,f1,accuracy,precision,recall,dataset
0,0.757225,0.856668,0.822704,0.72575,Validação Cruzada
1,0.759138,0.853329,0.80647,0.732813,Validação Cruzada
2,0.75251,0.852997,0.812896,0.722737,Validação Cruzada
3,0.744727,0.849784,0.809242,0.71454,Validação Cruzada
4,0.755981,0.848455,0.792494,0.733721,Validação Cruzada
0,0.749332,0.852269,0.813933,0.718731,Teste
1,0.761384,0.853421,0.804162,0.736507,Teste
2,0.751169,0.853775,0.818083,0.719841,Teste
3,0.748139,0.851737,0.813206,0.717524,Teste
4,0.760687,0.851471,0.798122,0.737856,Teste


# Gráficos

In [37]:
# import numpy as np
# import matplotlib.pyplot as plt

# # Novos dados para Naive Bayes
# data_all_features_bayes = {
#     'f1': [0.726119, 0.725592, 0.724683, 0.722375, 0.722874, 0.722682, 0.722375, 0.722934, 0.722738, 0.723016],
#     'accuracy': [0.786335, 0.785537, 0.784030, 0.779599, 0.780308, 0.779865, 0.779599, 0.779954, 0.779865, 0.780220],
#     'precision': [0.710649, 0.710145, 0.709287, 0.707277, 0.707704, 0.707553, 0.707277, 0.707790, 0.707607, 0.707847],
#     'recall': [0.762794, 0.762860, 0.763194, 0.765117, 0.765139, 0.765432, 0.765117, 0.765922, 0.765576, 0.765659]
# }

# data_pca_bayes = {
#     'f1': [0.732789, 0.730902, 0.731559, 0.732407, 0.732860, 0.731385, 0.731249, 0.731013, 0.731946, 0.732835],
#     'accuracy': [0.797501, 0.794045, 0.796172, 0.797235, 0.796881, 0.795108, 0.794842, 0.794222, 0.795374, 0.796792],
#     'precision': [0.718004, 0.715667, 0.716736, 0.717659, 0.717848, 0.716296, 0.716126, 0.715792, 0.716767, 0.717803],
#     'recall': [0.760136, 0.761385, 0.759428, 0.759677, 0.761327, 0.760768, 0.760886, 0.761354, 0.761660, 0.761414]
# }

# data_correlation_filter_bayes = {
#     'f1': [0.714244, 0.714128, 0.713911, 0.714210, 0.715088, 0.714673, 0.714755, 0.714562, 0.714210, 0.714485],
#     'accuracy': [0.766838, 0.766484, 0.766306, 0.766572, 0.767636, 0.767015, 0.767104, 0.766838, 0.766572, 0.766927],
#     'precision': [0.701261, 0.701246, 0.701052, 0.701312, 0.701982, 0.701704, 0.701771, 0.701635, 0.701312, 0.701514],
#     'recall': [0.766897, 0.767247, 0.766989, 0.767304, 0.767697, 0.767732, 0.767789, 0.767763, 0.767304, 0.767387]
# }

# data_rf_filter_bayes = {
#     'f1': [0.711686, 0.711076, 0.710872, 0.711643, 0.712703, 0.711969, 0.711686, 0.711441, 0.711643, 0.712621],
#     'accuracy': [0.761343, 0.760457, 0.760457, 0.761521, 0.762673, 0.761875, 0.761343, 0.761078, 0.761521, 0.762584],
#     'precision': [0.700327, 0.699963, 0.699700, 0.700191, 0.701012, 0.700443, 0.700327, 0.700138, 0.700191, 0.700949],
#     'recall': [0.770304, 0.770313, 0.769736, 0.769840, 0.770579, 0.770068, 0.770304, 0.770134, 0.769840, 0.770522]
# }

# # Dados para Random Forest
# data_all_features_rf = {
#     'f1': [0.768612, 0.767213, 0.771389, 0.780573, 0.781157, 0.780052, 0.779104, 0.778486, 0.783918, 0.780674],
#     'accuracy': [0.852889, 0.852623, 0.853509, 0.850939, 0.851117, 0.850939, 0.849433, 0.849344, 0.852800, 0.851382],
#     'precision': [0.795707, 0.796050, 0.795338, 0.783812, 0.783957, 0.784009, 0.781317, 0.781348, 0.786345, 0.784684],
#     'recall': [0.750027, 0.747835, 0.754323, 0.777509, 0.778489, 0.776354, 0.776976, 0.775764, 0.781590, 0.776926]
# }

# data_pca_rf = {
#     'f1': [0.727469, 0.727767, 0.727271, 0.728724, 0.728118, 0.728312, 0.730578, 0.728352, 0.729553, 0.728998],
#     'accuracy': [0.794399, 0.794576, 0.793867, 0.795108, 0.794222, 0.794488, 0.796437, 0.794399, 0.795640, 0.795197],
#     'precision': [0.713401, 0.713660, 0.713084, 0.714479, 0.713771, 0.713991, 0.716171, 0.713987, 0.715214, 0.714692],
#     'recall': [0.752805, 0.753208, 0.753186, 0.754559, 0.754568, 0.754595, 0.756711, 0.754826, 0.755622, 0.755049]
# }

# data_correlation_filter_rf = {
#     'f1': [0.777317, 0.777999, 0.781135, 0.781051, 0.782910, 0.782677, 0.781339, 0.782004, 0.780691, 0.782620],
#     'accuracy': [0.851117, 0.850939, 0.852446, 0.847749, 0.848724, 0.848901, 0.847395, 0.847838, 0.847306, 0.848458],
#     'precision': [0.785554, 0.784854, 0.786748, 0.777755, 0.779053, 0.779399, 0.777117, 0.777737, 0.777077, 0.778660],
#     'recall': [0.770114, 0.771878, 0.776020, 0.784559, 0.787061, 0.786164, 0.785920, 0.786637, 0.784564, 0.786891]
# }

# # Função para calcular média e desvio padrão
# def calculate_mean_std(data):
#     return {
#         'mean': np.mean(data),
#         'std': np.std(data)
#     }

# # Armazenar resultados para Naive Bayes
# results_bayes = {
#     'All Features': {metric: calculate_mean_std(data_all_features_bayes[metric]) for metric in data_all_features_bayes},
#     'PCA': {metric: calculate_mean_std(data_pca_bayes[metric]) for metric in data_pca_bayes},
#     'RF Filter': {metric: calculate_mean_std(data_rf_filter_bayes[metric]) for metric in data_rf_filter_bayes},
#     'Correlation Filter': {metric: calculate_mean_std(data_correlation_filter_bayes[metric]) for metric in data_correlation_filter_bayes}
# }

# # Armazenar resultados para Random Forest
# results_rf = {
#     'All Features': {metric: calculate_mean_std(data_all_features_rf[metric]) for metric in data_all_features_rf},
#     'PCA': {metric: calculate_mean_std(data_pca_rf[metric]) for metric in data_pca_rf},
#     'RF Filter': {metric: calculate_mean_std(data_rf_filter_bayes[metric]) for metric in data_rf_filter_bayes}, # using data from bayes as a workaround
#     'Correlation Filter': {metric: calculate_mean_std(data_correlation_filter_rf[metric]) for metric in data_correlation_filter_rf}
# }

# # Criar os gráficos
# metrics = ['f1', 'accuracy', 'precision', 'recall']
# x = np.arange(len(results_bayes))  # Localização dos grupos
# width = 0.35  # Largura das barras

# fig, axs = plt.subplots(2, 2, figsize=(11, 8))
# fig.suptitle('Comparação entre Naive Bayes e Random Forest')

# # Iterar sobre as métricas e criar gráficos em subplots
# for i, metric in enumerate(metrics):
#     ax = axs[i//2, i%2]

#     # Dados para Naive Bayes
#     bayes_means = [results_bayes[model][metric]['mean'] for model in results_bayes]
#     bayes_stds = [results_bayes[model][metric]['std'] for model in results_bayes]

#     # Dados para Random Forest
#     rf_means = [results_rf[model][metric]['mean'] for model in results_rf]
#     rf_stds = [results_rf[model][metric]['std'] for model in results_rf]

#     # Plot das barras para Naive Bayes
#     rects1 = ax.bar(x - width/2, bayes_means, width, label='Naive Bayes', yerr=bayes_stds, capsize=5)

#     # Plot das barras para Random Forest
#     rects2 = ax.bar(x + width/2, rf_means, width, label='Random Forest', yerr=rf_stds, capsize=5)

#     # Personalização dos subplots
#     ax.set_title(metric.capitalize())
#     ax.set_xticks(x)
#     ax.set_xticklabels(results_bayes.keys())
#     ax.legend(loc='upper right')
#     ax.set_ylim(0.4, 1)
#     ax.bar_label(rects1, fmt='%.3f', padding=3)
#     ax.bar_label(rects2, fmt='%.3f', padding=3)

# fig.tight_layout(rect=[0, 0, 1, 0.96])
# plt.show()