In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_validate
from sklearn.metrics import accuracy_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier, Perceptron
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from lightgbm import LGBMClassifier
import optuna

df_train = pd.read_csv("training.csv")[["Temperatura", "pH", "Bw", "Crec"]]
df_validation = pd.read_csv("validation.csv")[["Temperatura", "pH", "Bw", "Crec"]]
df = pd.concat([df_train,df_validation])

X = df.drop("Crec", axis=1)
y = df["Crec"]

X_train, X_test, y_train, y_test = train_test_split(
                df.drop("Crec", axis=1),
                df["Crec"],
                train_size = 0.8,
                random_state = 123,
                shuffle = True
            )

cv = RepeatedKFold(n_splits = 5, n_repeats=2, random_state=123)


In [None]:
### MultinomialNB

def objective(trial):
        params = {
                "alpha" : trial.suggest_float('alpha', 0.01, 10.0, log=True),
                "fit_prior" : trial.suggest_categorical('fit_prior', [True, False])
                }

        model = MultinomialNB(
                **params
                )

        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        score = roc_auc_score(y_test, predictions)
        return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5, show_progress_bar=False, timeout=60*10)

print('Mejores hiperparámetros:', study.best_params)
print('Mejor score:', study.best_value)

model_best = MultinomialNB(
                **study.best_params)

cv_scores_NB = cross_validate(model_best, 
                               X = X, 
                               y = y, 
                               cv = cv,
                               scoring = ("roc_auc", "accuracy"),
                               return_estimator = True, 
                               return_indices = True,
                               return_train_score = True
                               )

cv_scores_NB["test_roc_auc"], cv_scores_NB["test_accuracy"]

[I 2025-07-02 13:47:21,820] A new study created in memory with name: no-name-578e3e6e-9c7f-4662-8709-954ecc51343b
[I 2025-07-02 13:47:21,826] Trial 0 finished with value: 0.5 and parameters: {'alpha': 0.0702464115746851, 'fit_prior': True}. Best is trial 0 with value: 0.5.
[I 2025-07-02 13:47:21,831] Trial 1 finished with value: 0.5 and parameters: {'alpha': 2.4668045213832963, 'fit_prior': True}. Best is trial 0 with value: 0.5.
[I 2025-07-02 13:47:21,836] Trial 2 finished with value: 0.53232737545499 and parameters: {'alpha': 7.582341567487446, 'fit_prior': False}. Best is trial 2 with value: 0.53232737545499.
[I 2025-07-02 13:47:21,841] Trial 3 finished with value: 0.5 and parameters: {'alpha': 3.293239983060004, 'fit_prior': True}. Best is trial 2 with value: 0.53232737545499.
[I 2025-07-02 13:47:21,847] Trial 4 finished with value: 0.5 and parameters: {'alpha': 2.2964400888986827, 'fit_prior': True}. Best is trial 2 with value: 0.53232737545499.


Mejores hiperparámetros: {'alpha': 7.582341567487446, 'fit_prior': False}
Mejor score: 0.53232737545499


(array([0.59126623, 0.55211706, 0.55079873, 0.60964895, 0.58847599,
        0.60748153, 0.58063376, 0.55258165, 0.57853446, 0.57345256]),
 array([0.53426249, 0.4970964 , 0.50058072, 0.54936121, 0.53368177,
        0.54936121, 0.51103368, 0.50522648, 0.5261324 , 0.52206736]))

In [None]:
## Perceptron

def objective(trial):
        params = {
                'n_estimators': trial.suggest_int('n_estimators', 1000, 100000, step=10),
                'max_depth': trial.suggest_int('max_depth', 3, 12),
                'scale_pos_weight': trial.suggest_int('scale_pos_weight', 1, 5),
                'learning_rate': trial.suggest_float('learning_rate', 0.00001, 0.01),
                'reg_lambda': trial.suggest_float('reg_lambda', 0.0001, 0.1, log=True),
                'reg_alpha': trial.suggest_float('reg_alpha', 0.0001, 0.1, log=True),
                'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1),
                'subsample': trial.suggest_float('subsample', 0.1, 1),
                }

        model = Perceptron(
                n_jobs         = -1,
                random_state   = 123,
                verbosity            = 0,
                enable_categorical = True,
                **params
                )

        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        score = roc_auc_score(y_test, predictions)
        return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5, show_progress_bar=False, timeout=60*10)

print('Mejores hiperparámetros:', study.best_params)
print('Mejor score:', study.best_value)

model_best = XGBClassifier(
                n_jobs = -1,
                random_state = 123,
                verbose = 0,
                **study.best_params)

cv_scores_XGB = cross_validate(model_best, 
                               X = X, 
                               y = y, 
                               cv = cv,
                               scoring = ("roc_auc", "accuracy"),
                               return_estimator = True, 
                               return_indices = True,
                               return_train_score = True,
                               n_jobs = -1
                               )

cv_scores_XGB["test_roc_auc"], cv_scores_XGB["test_accuracy"]


In [None]:
## SGDCLassifier

In [None]:
## PassiveAgressive

In [None]:
## MLPClassifier

In [22]:
### XGB training

def objective(trial):
        params = {
                'n_estimators': trial.suggest_int('n_estimators', 1000, 100000, step=10),
                'max_depth': trial.suggest_int('max_depth', 3, 12),
                'scale_pos_weight': trial.suggest_int('scale_pos_weight', 1, 5),
                'learning_rate': trial.suggest_float('learning_rate', 0.00001, 0.01),
                'reg_lambda': trial.suggest_float('reg_lambda', 0.0001, 0.1, log=True),
                'reg_alpha': trial.suggest_float('reg_alpha', 0.0001, 0.1, log=True),
                'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1),
                'subsample': trial.suggest_float('subsample', 0.1, 1),
                }

        model = XGBClassifier(
                n_jobs         = -1,
                random_state   = 123,
                verbosity            = 0,
                enable_categorical = True,
                **params
                )

        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        score = roc_auc_score(y_test, predictions)
        return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5, show_progress_bar=False, timeout=60*10)

print('Mejores hiperparámetros:', study.best_params)
print('Mejor score:', study.best_value)

model_best = XGBClassifier(
                n_jobs = -1,
                random_state = 123,
                verbose = 0,
                **study.best_params)

cv_scores_XGB = cross_validate(model_best, 
                               X = X, 
                               y = y, 
                               cv = cv,
                               scoring = ("roc_auc", "accuracy"),
                               return_estimator = True, 
                               return_indices = True,
                               return_train_score = True,
                               n_jobs = -1
                               )

cv_scores_XGB["test_roc_auc"], cv_scores_XGB["test_accuracy"]


[I 2025-07-02 13:47:58,058] A new study created in memory with name: no-name-8dfe2e63-789e-41b2-838c-c8dadef937cd
[I 2025-07-02 13:48:21,256] Trial 0 finished with value: 0.939873417721519 and parameters: {'n_estimators': 51710, 'max_depth': 4, 'scale_pos_weight': 5, 'learning_rate': 0.0022806397747399914, 'reg_lambda': 0.0001513059369971646, 'reg_alpha': 0.0007792189163291182, 'colsample_bynode': 0.5395412132557464, 'subsample': 0.6408707533749411}. Best is trial 0 with value: 0.939873417721519.
[I 2025-07-02 13:48:37,888] Trial 1 finished with value: 0.9444966588797739 and parameters: {'n_estimators': 26970, 'max_depth': 5, 'scale_pos_weight': 3, 'learning_rate': 0.0032818067412424025, 'reg_lambda': 0.07844684453532094, 'reg_alpha': 0.0009025970686431106, 'colsample_bynode': 0.8224044884660988, 'subsample': 0.7553152913191377}. Best is trial 1 with value: 0.9444966588797739.
[I 2025-07-02 13:48:58,499] Trial 2 finished with value: 0.9444966588797739 and parameters: {'n_estimators': 4

Mejores hiperparámetros: {'n_estimators': 26970, 'max_depth': 5, 'scale_pos_weight': 3, 'learning_rate': 0.0032818067412424025, 'reg_lambda': 0.07844684453532094, 'reg_alpha': 0.0009025970686431106, 'colsample_bynode': 0.8224044884660988, 'subsample': 0.7553152913191377}
Mejor score: 0.9444966588797739


Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# LGBMCLassifier

def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 10, 510, step=100),
            'max_depth': trial.suggest_int('max_depth', -1, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.01),
            'subsample': trial.suggest_float('subsample', 0.01, 1),
            'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
            "num_leaves": trial.suggest_int("num_leaves", 1024, 2000)
        }

        model = LGBMClassifier(
                n_jobs         = -1,
                random_state   = 123,
                verbosity      = 0,
                **params
                )

        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        score = roc_auc_score(y_test, predictions)
        return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5, show_progress_bar=False, timeout=60*10)

print('Mejores hiperparámetros:', study.best_params)
print('Mejor score:', study.best_value)

model_best = LGBMClassifier(
                n_jobs = -1,
                random_state = 123,
                verbose = 0,
                **study.best_params)

cv_scores_LGBM = cross_validate(model_best, 
                               X = X, 
                               y = y, 
                               cv = cv,
                               scoring = ("roc_auc", "accuracy"),
                               return_estimator = True, 
                               return_indices = True,
                               return_train_score = True,
                               n_jobs = -1
                               )

cv_scores_LGBM["test_roc_auc"], cv_scores_LGBM["test_accuracy"]


[I 2025-06-11 09:54:40,905] A new study created in memory with name: no-name-4afa6192-599a-4ab1-a943-9bc3d55cb950




[I 2025-06-11 09:54:41,309] Trial 0 finished with value: 0.9381917682448023 and parameters: {'n_estimators': 110, 'max_depth': -1, 'learning_rate': 0.0026184286169644447, 'subsample': 0.10454916254113333, 'boosting_type': 'gbdt', 'num_leaves': 1533}. Best is trial 0 with value: 0.9381917682448023.
[I 2025-06-11 09:54:41,328] Trial 1 finished with value: 0.5 and parameters: {'n_estimators': 10, 'max_depth': 5, 'learning_rate': 0.002754608325983556, 'subsample': 0.7773909517357265, 'boosting_type': 'dart', 'num_leaves': 1212}. Best is trial 0 with value: 0.9381917682448023.




[I 2025-06-11 09:54:41,602] Trial 2 finished with value: 0.5 and parameters: {'n_estimators': 310, 'max_depth': 3, 'learning_rate': 0.00014338383997132575, 'subsample': 0.49279586717424906, 'boosting_type': 'dart', 'num_leaves': 1478}. Best is trial 0 with value: 0.9381917682448023.




[I 2025-06-11 09:54:41,930] Trial 3 finished with value: 0.6873781364388659 and parameters: {'n_estimators': 310, 'max_depth': 1, 'learning_rate': 0.0064746614674962314, 'subsample': 0.7273148358422874, 'boosting_type': 'dart', 'num_leaves': 1187}. Best is trial 0 with value: 0.9381917682448023.




[I 2025-06-11 09:54:42,379] Trial 4 finished with value: 0.9323942641071796 and parameters: {'n_estimators': 210, 'max_depth': 7, 'learning_rate': 0.004396595234537054, 'subsample': 0.5656666898192682, 'boosting_type': 'gbdt', 'num_leaves': 1464}. Best is trial 0 with value: 0.9381917682448023.


Mejores hiperparámetros: {'n_estimators': 110, 'max_depth': -1, 'learning_rate': 0.0026184286169644447, 'subsample': 0.10454916254113333, 'boosting_type': 'gbdt', 'num_leaves': 1533}
Mejor score: 0.9381917682448023


In [None]:
# RFClassifier

def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 10, 1000, step=10),
            'max_depth': trial.suggest_int('max_depth', 1, 50),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 1e+5, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 1e+5, log=True),
            'gamma': trial.suggest_float('gamma', 1e-5, 1e+5, log=True),
            'subsample': trial.suggest_float('subsample', 0.1, 1),
            'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1),
            'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.01)
            }


        model = XGBRFClassifier(
                tree_method = 'hist',
                grow_policy = 'depthwise',
                n_jobs         = -1,
                random_state   = 123,
                verbosity            = 0,
                **params
                )

        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        score = roc_auc_score(y_test, predictions)
        return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15, show_progress_bar=False, timeout=60*10)

print('Mejores hiperparámetros:', study.best_params)
print('Mejor score:', study.best_value)

model_best = XGBRFClassifier(
                n_jobs = -1,
                random_state = 123,
                verbose = 0,
                **study.best_params)

cv_scores_RF = cross_validate(model_best, 
                               X = X, 
                               y = y, 
                               cv = cv,
                               scoring = ("roc_auc", "accuracy"),
                               return_estimator = True,   
                               return_indices = True,
                               return_train_score = True,
                               n_jobs = -1
                               )

cv_scores_RF["test_roc_auc"], cv_scores_RF["test_accuracy"]

[I 2025-06-11 11:13:26,363] A new study created in memory with name: no-name-ad8e5daa-d33e-459b-a0c9-d67edbc27eec
[I 2025-06-11 11:13:26,550] Trial 0 finished with value: 0.5 and parameters: {'n_estimators': 490, 'max_depth': 31, 'reg_lambda': 0.00839006395759614, 'reg_alpha': 1067.6476942177674, 'gamma': 0.540481699040346, 'subsample': 0.45960466489027807, 'colsample_bynode': 0.20861805461827745, 'learning_rate': 0.008280248614551825}. Best is trial 0 with value: 0.5.
[I 2025-06-11 11:13:26,657] Trial 1 finished with value: 0.5 and parameters: {'n_estimators': 510, 'max_depth': 38, 'reg_lambda': 0.0025729510336180094, 'reg_alpha': 4.310626832363637, 'gamma': 53247.830230892774, 'subsample': 0.7827302978831746, 'colsample_bynode': 0.6925718792155103, 'learning_rate': 0.008589439854720902}. Best is trial 0 with value: 0.5.
[I 2025-06-11 11:13:26,841] Trial 2 finished with value: 0.5 and parameters: {'n_estimators': 510, 'max_depth': 11, 'reg_lambda': 20343.529030344398, 'reg_alpha': 0.0

Mejores hiperparámetros: {'n_estimators': 490, 'max_depth': 31, 'reg_lambda': 0.00839006395759614, 'reg_alpha': 1067.6476942177674, 'gamma': 0.540481699040346, 'subsample': 0.45960466489027807, 'colsample_bynode': 0.20861805461827745, 'learning_rate': 0.008280248614551825}
Mejor score: 0.5
