
# Sesión 08 — Clasificación con *Wine Dataset* (scikit-learn)

Este cuaderno reproduce un flujo típico de clasificación con **Python 3.x** usando el conjunto de datos **Wine**.  
Incluye: carga de datos, exploración, separación *train/test*, estandarización, ajuste de varios clasificadores, evaluación con métricas clásicas, validación cruzada, curvas ROC (one-vs-rest) e importancia de variables.

> Requisitos: `pandas`, `numpy`, `scikit-learn`, `matplotlib`


In [None]:

# %%
# Importaciones básicas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, classification_report,
    confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Display settings: three decimals in pandas and NumPy output, and no
# scientific notation for small NumPy values
np.set_printoptions(suppress=True, precision=3)
pd.options.display.precision = 3


## 1) Carga del dataset

In [None]:

# %%
# Load Wine as pandas objects so the features keep their column names
wine = load_wine(as_frame=True)
X, y = wine.data, wine.target
feature_names = list(X.columns)
target_names = wine.target_names

print("Dimensiones X:", X.shape)
print("Clases:", target_names)
# Last expression of the cell: the notebook renders the first rows
X.head()


## 2) Exploración rápida de datos (EDA)

In [None]:

# %%
# Per-column summary statistics of the feature table; as the last expression
# of the cell, the notebook renders the resulting table
X.describe()


In [None]:

# %%
# Simple histograms for a handful of features (plain matplotlib, no seaborn)
cols = feature_names[:6]  # only the first six, to keep the figure readable
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12, 7))
axes = axes.ravel()  # 2x3 grid -> flat sequence, one axis per feature
for ax, col in zip(axes, cols):
    ax.hist(X[col], bins=20)
    ax.set(title=col, xlabel=col, ylabel="Frecuencia")
plt.tight_layout()
plt.show()


## 3) Partición *train/test*

In [None]:

# %%
# Hold out 25% of the rows for testing; stratifying on y keeps the class
# proportions similar in both parts, and the fixed seed makes the split repeatable
splits = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = splits
print("Train:", X_train.shape, " Test:", X_test.shape)


## 4) Entrenamiento y evaluación de modelos

In [None]:

# %%
# Helper that fits a model and reports its test-set performance
def evaluar_modelo(nombre, pipe, X_train, X_test, y_train, y_test):
    """Fit ``pipe`` on the training split and report how it does on the test split.

    Prints accuracy and macro-averaged precision/recall/F1, prints the
    per-class classification report, and shows the confusion matrix.
    Class labels for the report and the plot are read from the
    notebook-level ``target_names``.

    Args:
        nombre: Display name used in the printed header and the plot title.
        pipe: Estimator or pipeline exposing ``fit`` and ``predict``; it is
            fitted in place.
        X_train: Features used for fitting.
        X_test: Features used for evaluation.
        y_train: Targets used for fitting.
        y_test: Targets used for evaluation.

    Returns:
        dict with keys ``"name"``, ``"pipeline"`` (the fitted ``pipe``),
        ``"y_pred"`` (test-set predictions) and ``"acc"`` (test accuracy).
    """
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Headline numbers; zero_division=0 scores an undefined precision/recall as 0
    acc = accuracy_score(y_test, y_pred)
    macro_scores = precision_recall_fscore_support(
        y_test, y_pred, average='macro', zero_division=0
    )
    prec, rec, f1 = macro_scores[:3]  # fourth entry (support) is not needed

    print(f"\n=== {nombre} ===")
    print(f"Accuracy: {acc:.3f} | Precision (macro): {prec:.3f} | Recall (macro): {rec:.3f} | F1 (macro): {f1:.3f}")
    print("\nReporte de Clasificación:")
    report = classification_report(
        y_test, y_pred, target_names=target_names, zero_division=0
    )
    print(report)

    # Confusion matrix drawn on its own figure; the title goes on that figure's axes
    matrix_plot = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_test, y_pred),
        display_labels=target_names,
    )
    matrix_plot.plot(xticks_rotation=45)
    plt.title(f"Matriz de confusión — {nombre}")
    plt.tight_layout()
    plt.show()

    return {
        "name": nombre,
        "pipeline": pipe,
        "y_pred": y_pred,
        "acc": acc,
    }


In [None]:

# %%
# Logistic regression on standardized features. The scaler is part of the
# pipeline, so it is fitted together with the classifier on whatever data the
# pipeline is fitted on (the training split inside evaluar_modelo).
logreg = Pipeline([
    ("scaler", StandardScaler()),
    # `multi_class` is intentionally not passed: the argument is deprecated
    # since scikit-learn 1.5 and "auto" was already the default behaviour.
    ("clf", LogisticRegression(max_iter=500))
])
res_logreg = evaluar_modelo("Logistic Regression", logreg, X_train, X_test, y_train, y_test)


In [None]:

# %%
# Single decision tree with a fixed seed and no depth limit (max_depth=None).
# No scaler is used here — presumably because tree splits do not depend on
# feature scale.
tree = DecisionTreeClassifier(random_state=42, max_depth=None)
# evaluar_modelo fits `tree` in place and returns its test-set results
res_tree = evaluar_modelo("Decision Tree", tree, X_train, X_test, y_train, y_test)


In [None]:

# %%
# Random forest of 300 trees with a fixed seed and no depth limit; like the
# single tree above, it is trained on the unscaled features.
rf = RandomForestClassifier(random_state=42, n_estimators=300, max_depth=None)
# evaluar_modelo fits `rf` in place and returns its test-set results
res_rf = evaluar_modelo("Random Forest", rf, X_train, X_test, y_train, y_test)


In [None]:

# %%
# RBF-kernel SVM on standardized features (scaler and classifier in one pipeline).
# NOTE(review): probability=True enables predict_proba, but evaluar_modelo only
# calls predict() — confirm whether probability estimates are needed here.
svm_rbf = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", SVC(kernel="rbf", probability=True, random_state=42))
])
res_svm = evaluar_modelo("SVM (RBF)", svm_rbf, X_train, X_test, y_train, y_test)


### 4.1) Validación cruzada (accuracy, k=5)

In [None]:

# %%
# 5-fold stratified cross-validation on the full dataset; the same shuffled
# splitter (fixed seed) is used for every model so the scores are comparable
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
modelos = {
    "LogReg": logreg,
    "DecisionTree": tree,
    "RandomForest": rf,
    "SVM-RBF": svm_rbf,
}
for nombre, pipe in modelos.items():
    scores = cross_val_score(pipe, X, y, scoring="accuracy", cv=cv)
    print(f"{nombre}: mean={scores.mean():.3f} ± {scores.std():.3f}")


## 5) Curvas ROC (One-vs-Rest)

In [None]:

# %%
from sklearn.multiclass import OneVsRestClassifier

def plot_roc_ovr(nombre, estimator, X_train, X_test, y_train, y_test):
    """Plot one-vs-rest ROC curves, one per class, for ``estimator``.

    The targets are binarized into one indicator column per class, a
    ``OneVsRestClassifier`` wrapping ``estimator`` is fitted on the training
    split, and each class's test-set scores are turned into a ROC curve whose
    AUC is shown in the legend.

    Args:
        nombre: Model name shown in the plot title.
        estimator: Base classifier handed to ``OneVsRestClassifier``.
        X_train: Features used for fitting.
        X_test: Features used for scoring.
        y_train: Class labels used for fitting; the set of classes is taken
            from these.
        y_test: Class labels used to build the ROC curves.

    Notes:
        Legend entries are read from the notebook-level ``target_names`` by
        class position.
        NOTE(review): one indicator column per class is assumed;
        ``label_binarize`` is documented to return a single column for
        two-class targets, so confirm before reusing this on a binary problem.
    """
    # Take the label set from the training targets that were passed in, not
    # from a notebook-level variable, so the function works on any split.
    classes = np.unique(y_train)
    n_classes = len(classes)
    y_train_bin = label_binarize(y_train, classes=classes)
    y_test_bin = label_binarize(y_test, classes=classes)

    clf = OneVsRestClassifier(estimator)
    clf.fit(X_train, y_train_bin)

    if hasattr(clf, "predict_proba"):
        y_score = clf.predict_proba(X_test)
    else:
        # No probabilities available: fall back to decision scores, rescaled
        # per column to [0, 1]. The rescaling is monotone, so it does not
        # change the ROC curves; the epsilon guards against a constant column.
        scores = clf.decision_function(X_test)
        if scores.ndim == 1:
            scores = scores[:, None]
        minv = scores.min(axis=0, keepdims=True)
        maxv = scores.max(axis=0, keepdims=True)
        y_score = (scores - minv) / (maxv - minv + 1e-12)

    plt.figure(figsize=(7, 5))
    for i in range(n_classes):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f"Clase {target_names[i]} (AUC = {roc_auc:.2f})")
    # Diagonal reference line: the ROC of a random classifier
    plt.plot([0, 1], [0, 1], linestyle="--")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.title(f"ROC One-vs-Rest — {nombre}")
    plt.legend()
    plt.tight_layout()
    plt.show()

# Reuse the scaled pipelines evaluated in section 4 so the ROC curves describe
# the same models as the metrics reported there (the bare estimators would be
# trained on unstandardized features). OneVsRestClassifier fits clones of the
# estimator it is given, so the already-fitted pipelines are left untouched.
plot_roc_ovr("Logistic Regression", logreg, X_train, X_test, y_train, y_test)
plot_roc_ovr("SVM (RBF)", svm_rbf, X_train, X_test, y_train, y_test)


## 6) Importancia de variables (árboles/bosques)

In [None]:

# %%
# Rank the features by the impurity-based importance of a random forest
# fitted on the (unscaled) training split.
top_k = 10  # how many features to list and plot; defined once, used everywhere

rf_fit = RandomForestClassifier(random_state=42, n_estimators=300)
rf_fit.fit(X_train, y_train)

importances = rf_fit.feature_importances_
idx = np.argsort(importances)[::-1]  # feature indices, most important first
top_idx = idx[:top_k]  # may hold fewer than top_k entries if there are fewer features

print(f"Top {top_k} variables por importancia:")
for i in top_idx:
    print(f"{feature_names[i]:30s} {importances[i]:.4f}")

plt.figure(figsize=(8,5))
# Size the bars from what was actually selected so bars and tick labels always match
positions = range(len(top_idx))
plt.bar(positions, importances[top_idx])
plt.xticks(positions, [feature_names[i] for i in top_idx], rotation=45, ha='right')
plt.ylabel("Importancia (Gini)")
plt.title("Importancia de variables — Random Forest")
plt.tight_layout()
plt.show()



## 7) Conclusiones rápidas
- El conjunto **Wine** tiene **3 clases** y **13 atributos**; responde bien tanto a clasificadores lineales con estandarización previa (*scaling*) como a métodos no lineales.
- Revise *accuracy*, *macro-F1* y matrices de confusión para comparar modelos.
- Use validación cruzada para estimar rendimiento fuera de muestra.

> Lecturas sugeridas: Han, Kamber & Pei (2012), *Data Mining: Concepts and Techniques*; Hastie, Tibshirani & Friedman (2009), *The Elements of Statistical Learning*.
