In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef

In [2]:
def get_metrics(y_true, y_pred):

    return {
        "precision" : precision_score(y_true, y_pred),
        "recall" : recall_score(y_true, y_pred),
        "f1_score" : f1_score(y_true, y_pred),
        "mcc" : matthews_corrcoef(y_true, y_pred)
    }

In [3]:
data = load_breast_cancer()
X = data.data
y = data.target

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=84, test_size=.4)


In [5]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

In [6]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [7]:
predictions = rf.predict(scaler.transform(X_test))
get_metrics(y_test, predictions)

{'precision': 0.9858156028368794,
 'recall': 0.9328859060402684,
 'f1_score': 0.9586206896551724,
 'mcc': 0.8890319438476415}

- Con validación cruzada

In [8]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

rf2 = RandomForestClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(rf2, X_train, y_train, cv=cv, scoring="precision")
scores

array([0.9       , 0.95      , 0.91304348, 1.        , 0.95238095,
       1.        , 1.        , 0.95454545, 0.95      , 1.        ])

In [9]:
scores = cross_validate(rf2, X_train, y_train, cv=cv, scoring=["accuracy", "precision", "f1", "recall"])
scores

{'fit_time': array([0.14022779, 0.13544989, 0.13103342, 0.14112782, 0.14006972,
        0.13739753, 0.15101719, 0.14235592, 0.14212871, 0.1428256 ]),
 'score_time': array([0.01018763, 0.00784922, 0.00844622, 0.00874901, 0.00937915,
        0.00882554, 0.01161313, 0.00773096, 0.00780177, 0.01034284]),
 'test_accuracy': array([0.85714286, 0.91176471, 0.94117647, 1.        , 0.94117647,
        1.        , 1.        , 0.97058824, 0.94117647, 0.97058824]),
 'test_precision': array([0.9       , 0.95      , 0.91304348, 1.        , 0.95238095,
        1.        , 1.        , 0.95454545, 0.95      , 1.        ]),
 'test_f1': array([0.87804878, 0.92682927, 0.95454545, 1.        , 0.95238095,
        1.        , 1.        , 0.97674419, 0.95      , 0.97435897]),
 'test_recall': array([0.85714286, 0.9047619 , 1.        , 1.        , 0.95238095,
        1.        , 1.        , 1.        , 0.95      , 0.95      ])}

In [10]:
scores["test_accuracy"].std()

np.float64(0.04307286292287588)

In [11]:
rf2.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [12]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV
from scipy.stats import randint
from sklearn.pipeline import Pipeline

In [13]:
param_grid = {
    "model__n_estimators": [50, 100, 150, 200, 250, 500],
    "model__max_depth": [3,4,5,6],
    "model__min_samples_split": [2,3,4],
}

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", RandomForestClassifier(random_state=42))
])

grid = GridSearchCV(pipe, param_grid, cv=cv, scoring="recall", n_jobs=-1)
grid.fit(X_train, y_train)

grid.best_params_, grid.best_score_

({'model__max_depth': 6,
  'model__min_samples_split': 4,
  'model__n_estimators': 50},
 np.float64(0.9709523809523809))

In [14]:
predictions_gs = grid.predict(scaler.transform(X_test))
get_metrics(y_test, predictions_gs)

{'precision': 0.9788732394366197,
 'recall': 0.9328859060402684,
 'f1_score': 0.9553264604810997,
 'mcc': 0.878604121829926}

In [15]:
param_dist = {
    "model__n_estimators": randint(50, 400),
    "model__max_depth": randint(3, 20),
    "model__min_samples_split": randint(2, 10),
}

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", RandomForestClassifier(random_state=42))
])

random_search = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=20,
    cv=cv,
    scoring="recall",
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)
random_search.best_params_, random_search.best_score_


({'model__max_depth': 9,
  'model__min_samples_split': 5,
  'model__n_estimators': 398},
 np.float64(0.9614285714285714))

In [16]:
predictions_rs = random_search.predict(scaler.transform(X_test))
get_metrics(y_test, predictions_rs)

{'precision': 0.9722222222222222,
 'recall': 0.9395973154362416,
 'f1_score': 0.9556313993174061,
 'mcc': 0.8769405693822121}

In [17]:
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 20, 5000)
    max_depth = trial.suggest_int("max_depth", 3, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("model", RandomForestClassifier(random_state=42, n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split))
    ])
    
    score = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="precision").mean()
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10, n_jobs=-1)

study.best_params, study.best_value

[I 2025-12-05 14:29:27,736] A new study created in memory with name: no-name-e15cdda1-6146-4383-9df5-de1353fda8f9
[I 2025-12-05 14:29:54,128] Trial 5 finished with value: 0.9626679841897234 and parameters: {'n_estimators': 212, 'max_depth': 11, 'min_samples_split': 19}. Best is trial 5 with value: 0.9626679841897234.
[I 2025-12-05 14:30:11,233] Trial 8 finished with value: 0.9576679841897233 and parameters: {'n_estimators': 392, 'max_depth': 3, 'min_samples_split': 20}. Best is trial 5 with value: 0.9626679841897234.
[I 2025-12-05 14:30:50,535] Trial 1 finished with value: 0.9576679841897233 and parameters: {'n_estimators': 805, 'max_depth': 5, 'min_samples_split': 7}. Best is trial 5 with value: 0.9626679841897234.
[I 2025-12-05 14:31:11,504] Trial 9 finished with value: 0.9576679841897233 and parameters: {'n_estimators': 1064, 'max_depth': 9, 'min_samples_split': 16}. Best is trial 5 with value: 0.9626679841897234.
[I 2025-12-05 14:32:36,223] Trial 6 finished with value: 0.9576679841

({'n_estimators': 212, 'max_depth': 11, 'min_samples_split': 19},
 0.9626679841897234)