1. Bibliotecas de importación

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

2. Datos de carga

In [2]:
# Load the diamonds dataset from Seaborn
diamonds = sns.load_dataset("diamonds")

# Split data into features and target
X = diamonds.drop("cut", axis=1)
y = diamonds["cut"]

3. Divide los datos

In [3]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
   X, y, test_size=0.2, random_state=42
)

4. Definir rasgos categóricos y numéricos

In [4]:
# Define categorical and numerical features
categorical_features = X.select_dtypes(
   include=["object"]
).columns.tolist()

numerical_features = X.select_dtypes(
   include=["float64", "int64"]
).columns.tolist()

5. Definir pasos de preprocesamiento para características categóricas y numéricas

In [5]:
preprocessor = ColumnTransformer(
   transformers=[
       ("cat", OneHotEncoder(), categorical_features),
       ("num", StandardScaler(), numerical_features),
   ]
)

6. Crear una canalización de Clasificador de Aumento Gradiente

In [51]:
pipeline = Pipeline(
   [
       ("preprocessor", preprocessor),
       ("classifier", GradientBoostingClassifier(
                                                 learning_rate=0.1 # Tasa de aprendizaje
                                                ,n_estimators=15 # El número de árboles
                                                ,max_depth=10 # Profundidad máxima
                                                ,min_samples_leaf=10 # Número mínimo de muestras por hoja
                                                ,subsample=0.8 # Frecuencia de submuestreo
                                                ,max_features=0.75 # Frecuencia de muestreo de características
                                                ,n_iter_no_change=20  # Parada temprana después de 20 iteraciones sin mejora
                                                ,criterion='squared_error'
                                                ,random_state=42)
                                                ),
   ]
)


7. CV y formación

In [52]:
# Perform 5-fold cross-validation
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)

# Fit the model on the training data
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Generate classification report
report = classification_report(y_test, y_pred)


8. Informar de los resultados finales

In [50]:
print(f"Mean Cross-Validation Accuracy: {cv_scores.mean():.4f}")
print("\nClassification Report:")
print(report)

Mean Cross-Validation Accuracy: 0.7649

Classification Report:
              precision    recall  f1-score   support

        Fair       0.92      0.91      0.92       335
        Good       0.81      0.68      0.74      1004
       Ideal       0.82      0.93      0.87      4292
     Premium       0.69      0.86      0.76      2775
   Very Good       0.74      0.39      0.52      2382

    accuracy                           0.77     10788
   macro avg       0.80      0.75      0.76     10788
weighted avg       0.77      0.77      0.75     10788

