In [2]:
import pandas as pd
import seaborn as sns
import time
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tqdm import tqdm

# Cargar el dataset
diamonds = sns.load_dataset("diamonds")

# Dividir datos en características (X) y objetivo (y)
X = diamonds.drop("cut", axis=1)
y = diamonds["cut"]

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identificar columnas categóricas y numéricas
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()
numerical_features = X.select_dtypes(include=["float64", "int64"]).columns.tolist()

# Preprocesamiento
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), categorical_features),
        ("num", StandardScaler(), numerical_features),
    ]
)

# Pipeline
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", GradientBoostingClassifier(random_state=42)),
    ]
)

# Espacio de búsqueda para RandomizedSearchCV
param_dist = {
    "classifier__learning_rate": [0.01, 0.05, 0.1],
    "classifier__n_estimators": [100, 200, 300],
    "classifier__max_depth": [3, 5, 7],
    "classifier__min_samples_leaf": [5, 10, 20],
    "classifier__subsample": [0.6, 0.8, 1.0],
    "classifier__max_features": [0.5, 0.75, 1.0],
}

# Configuración de RandomizedSearchCV
n_iter = 20
cv = 3
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=n_iter,  # Número de combinaciones aleatorias
    cv=cv,  # Validación cruzada
    scoring="accuracy",  # Métrica de evaluación
    n_jobs=-1,  # Usar todos los núcleos disponibles
    random_state=42,
    verbose=0,  # No imprimir logs, usaremos tqdm
)

# Medir tiempo de ejecución
start_time = time.time()

# Agregar barra de progreso
with tqdm(total=n_iter * cv) as pbar:
    for i in range(n_iter):
        random_search.fit(X_train, y_train)
        pbar.update(cv)

elapsed_time = time.time() - start_time
print(f"\nTiempo total de ejecución: {elapsed_time:.2f} segundos")

# Obtener los mejores parámetros y el mejor modelo
print("Best Parameters:", random_search.best_params_)
best_model = random_search.best_estimator_

# Evaluar en el conjunto de prueba
y_pred = best_model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))


 20%|██        | 12/60 [1:39:56<6:39:47, 499.73s/it]


KeyboardInterrupt: 