In [None]:
# ============================================================
# 📌 Cell 1: Install the required libraries
# ============================================================
# Uncomment to install the dependencies into the kernel's environment:
# %pip install scikit-learn matplotlib seaborn pandas


In [None]:
# ============================================================
# 📌 Celda 2: Importar librerías
# ============================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# ============================================================
# 📌 Cell 3: Load the dataset (Wine Quality - red wine)
# ============================================================
# The CSV is read straight from the UCI repository; its fields are separated by ";".
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
wine = pd.read_csv(filepath_or_buffer=url, delimiter=";")

# Number of wines per raw quality score (displayed as the cell output).
wine.quality.value_counts()



In [None]:
# Peek at the first five rows of the raw table.
wine.head(n=5)

In [None]:
# Binary target: 1 when quality >= 6 ("good"), 0 when quality < 6 ("bad").
wine["quality_label"] = wine["quality"].ge(6).astype(int)

# Target vector first, then the feature matrix: every column except the raw
# score and the label derived from it.
y = wine["quality_label"]
X = wine.drop(["quality", "quality_label"], axis=1)

In [None]:
# Size of the feature matrix and relative frequency of each class in the target.
print("Dimensiones de X:", X.shape)
print("Distribuci√≥n de clases:\n", y.value_counts(normalize=True, sort=True))

# Preview again: the frame now also carries the "quality_label" column.
wine.head(n=5)

In [None]:
# ============================================================
# 📌 Cell 4: Split the data into training and test sets
# ============================================================
# 70/30 hold-out split, stratified on y; the fixed random_state makes the
# partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

print("Tama√±o entrenamiento:", X_train.shape)
print("Tama√±o prueba:", X_test.shape)


In [None]:
# ============================================================
# 📌 Cell 5: "Large" tree with Gini (no pruning)
# ============================================================
# No depth or leaf-size limits are set; fit() returns the fitted estimator.
clf_big_gini = DecisionTreeClassifier(random_state=42, criterion="gini").fit(X_train, y_train)

# Predictions on both partitions, used to compare train vs test accuracy.
y_train_pred, y_test_pred = (clf_big_gini.predict(part) for part in (X_train, X_test))

print("üîπ √Årbol grande - Gini (sin poda)")
print("Accuracy Entrenamiento:", accuracy_score(y_train, y_train_pred))
print("Accuracy Prueba:", accuracy_score(y_test, y_test_pred))



In [None]:
# 📌 Per-class metrics on the test set
print("\nReporte en test:\n", classification_report(y_test, y_test_pred, target_names=["Mala","Buena"]))

# 📌 Confusion matrix
# fmt="d" prints the cell counts as plain integers; seaborn's default
# annotation format (".2g") would show counts of 100 or more in scientific
# notation (e.g. 1.5e+02).
sns.heatmap(confusion_matrix(y_test, y_test_pred), annot=True, fmt="d", cmap="Blues",
            xticklabels=["Mala","Buena"], yticklabels=["Mala","Buena"])
plt.xlabel("Predicci√≥n")
plt.ylabel("Real")
plt.title("Matriz de confusi√≥n - √Årbol grande (Gini)")
plt.show()

In [None]:
# 📌 Visualization of the full (unpruned) tree
plt.figure(figsize=(20,10))
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter and reject a pandas Index.
plot_tree(clf_big_gini, filled=True, feature_names=list(X.columns), class_names=["Mala","Buena"], fontsize=8)
plt.title("√Årbol grande (sin poda, criterio Gini)")
plt.show()

In [None]:
# Example of how to read a node of the tree:
#
# alcohol <= 10.525
# 👉 The feature used to split at this node is "alcohol".
#    If the alcohol value is <= 10.525, the sample goes to the left branch.
#    If it is greater than 10.525, the sample goes to the right branch.
#
# gini = 0.498
# 👉 Gini impurity of the node.
#    It ranges from 0 (pure, all samples belong to the same class) to 0.5 (maximum mix in the binary case).
#    Here 0.498 means the classes are almost balanced (roughly half and half).
#
# samples = 1119
# 👉 Total number of samples (dataset rows) that reach this node.
#
# value = [521.0, 598.0]
# 👉 Class distribution at this node:
#    - 521 wines belong to class "Mala" (0).
#    - 598 wines belong to class "Buena" (1).
#
# class = Buena
# 👉 Majority class at this node.
#    Since 598 > 521, the default prediction at this node would be "Buena".


In [None]:
# ============================================================
# 📌 Cell 6: Pruning with max_depth
# ============================================================
# Same Gini tree as before, but its depth is capped at 5 levels.
clf_pruned = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
clf_pruned.fit(X_train, y_train)

y_train_pred_pruned = clf_pruned.predict(X_train)
y_test_pred_pruned = clf_pruned.predict(X_test)

print("üîπ √Årbol podado - Gini (max_depth=5)")
print("Accuracy Entrenamiento:", accuracy_score(y_train, y_train_pred_pruned))
print("Accuracy Prueba:", accuracy_score(y_test, y_test_pred_pruned))

# 📌 Visualization of the pruned tree
plt.figure(figsize=(20,10))
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter and reject a pandas Index.
plot_tree(clf_pruned, filled=True, feature_names=list(X.columns), class_names=["Mala","Buena"], fontsize=10)
plt.title("√Årbol podado (max_depth=5, criterio Gini)")
plt.show()



In [None]:
# ============================================================
# 📌 Cell 7: Switch the criterion to "entropy"
# ============================================================
# Same depth cap as the pruned Gini tree, so only the split criterion differs.
clf_entropy = DecisionTreeClassifier(criterion="entropy", max_depth=5, random_state=42)
clf_entropy.fit(X_train, y_train)

y_pred_entropy = clf_entropy.predict(X_test)

print("üîπ √Årbol podado - Entrop√≠a (max_depth=5)")
print("Accuracy en test:", accuracy_score(y_test, y_pred_entropy))

# 📌 Visualization of the entropy tree
plt.figure(figsize=(20,10))
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter and reject a pandas Index.
plot_tree(clf_entropy, filled=True, feature_names=list(X.columns), class_names=["Mala","Buena"], fontsize=10)
plt.title("√Årbol podado (max_depth=5, criterio Entrop√≠a)")
plt.show()

# 📌 Explanation:
# - The default criterion in DecisionTreeClassifier is "gini".
# - Gini measures the probability of misclassifying a sample if it were labeled
#   at random according to the class distribution.
# - Entropy measures uncertainty (information theory).
# - Effects (commonly cited rules of thumb):
#   Gini → faster to compute, favors majority classes.
#   Entropy → may balance splits better when there are several classes.
# - In practice, the difference in performance is usually small.



In [None]:
# ============================================================
# 📌 Cell 8: Systematic hyperparameter search with GridSearchCV
# ============================================================
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter (2 * 4 * 3 * 3 * 3 = 216 combinations).
param_grid = dict(
    criterion=["gini", "entropy"],         # impurity function
    max_depth=[3, 5, 7, None],             # maximum depth
    min_samples_split=[2, 10, 20],         # minimum number of samples needed to split a node
    min_samples_leaf=[1, 5, 10],           # minimum number of samples per leaf
    max_features=[None, "sqrt", "log2"],   # maximum number of features considered at each split
)

# Exhaustive search scored by 5-fold cross-validated accuracy on the training
# set, using every available core. fit() returns the fitted search object.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Best model found by the search
best_tree = grid_search.best_estimator_

print("üîπ Mejor combinaci√≥n de hiperpar√°metros:", grid_search.best_params_)
print("üîπ Mejor accuracy en validaci√≥n cruzada:", grid_search.best_score_)





In [None]:
# Evaluate the selected tree on the held-out test set
y_pred_best = best_tree.predict(X_test)

print("\nAccuracy en test:", accuracy_score(y_test, y_pred_best))
print("\nReporte en test:\n", classification_report(y_test, y_pred_best, target_names=["Mala","Buena"]))

# 📌 Confusion matrix
# fmt="d" prints the cell counts as plain integers; seaborn's default
# annotation format (".2g") would show counts of 100 or more in scientific
# notation (e.g. 1.5e+02).
sns.heatmap(confusion_matrix(y_test, y_pred_best), annot=True, fmt="d", cmap="Purples",
            xticklabels=["Mala","Buena"], yticklabels=["Mala","Buena"])
plt.xlabel("Predicci√≥n")
plt.ylabel("Real")
plt.title("Matriz de confusi√≥n - √Årbol optimizado con GridSearchCV")
plt.show()



In [None]:
# 📌 Visualization of the best tree
plt.figure(figsize=(20,10))
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter and reject a pandas Index.
plot_tree(best_tree, filled=True, feature_names=list(X.columns), class_names=["Mala","Buena"], fontsize=9)
plt.title("√Årbol optimizado con GridSearchCV")
plt.show()

In [None]:
# ============================================================
# 📌 Cell 9: Final comparison of the models
# ============================================================
# Test-set accuracy of each model, keyed by display name (insertion order is
# the order in which the models were trained above).
acc_results = {
    label: accuracy_score(y_test, predictions)
    for label, predictions in (
        ("√Årbol grande (Gini)", y_test_pred),
        ("√Årbol podado (Gini)", y_test_pred_pruned),
        ("√Årbol podado (Entrop√≠a)", y_pred_entropy),
        ("√Årbol optimizado (GridSearchCV)", y_pred_best),
    )
}

# One line per model, accuracy rounded to four decimals.
for model_name, test_acc in acc_results.items():
    print("{}: {:.4f}".format(model_name, test_acc))



In [None]:
# ============================================================
# 📌 Cell 10: Train vs test accuracy comparison
# ============================================================

# Training accuracy
train_acc_results = {
    "√Årbol grande (Gini)": accuracy_score(y_train, y_train_pred),
    "√Årbol podado (Gini)": accuracy_score(y_train, y_train_pred_pruned),
    "√Årbol podado (Entrop√≠a)": accuracy_score(y_train, clf_entropy.predict(X_train)),
    "√Årbol optimizado (GridSearchCV)": accuracy_score(y_train, best_tree.predict(X_train)),
}

# Test accuracy (already computed in Cell 9)
test_acc_results = acc_results

# One row per model, one column per data set ("Train", "Test"). A DataFrame
# built from a dict of dicts already has that layout (outer keys -> columns,
# inner keys -> index), so it must not be transposed: with the models as rows
# the bar chart groups the bars by model, and the two colors and the legend
# entries correspond to Train and Test.
df_acc = pd.DataFrame({
    "Train": train_acc_results,
    "Test": test_acc_results
})

# 📊 Comparative bar chart with value labels
ax = df_acc.plot(kind="bar", figsize=(10,6), rot=15, color=["skyblue","salmon"])
plt.ylabel("Accuracy")
plt.title("Comparaci√≥n Accuracy en Train vs Test - √Årboles de Decisi√≥n")
plt.ylim(0,1)
plt.legend(title="Conjunto")

# Write the numeric value on top of each bar
for p in ax.patches:
    ax.annotate(
        f"{p.get_height():.2f}",             # text with 2 decimals
        (p.get_x() + p.get_width() / 2, p.get_height()), # anchor: top center of the bar
        ha="center", va="bottom", fontsize=9, color="black", xytext=(0,3), textcoords="offset points"
    )

plt.show()



## 📌 Conclusión

Aunque el **árbol optimizado con GridSearchCV** muestra el **menor accuracy en el set de prueba**, es el modelo más recomendable porque alcanza un mejor equilibrio entre **complejidad y capacidad de generalización**.  
El **árbol grande con Gini** logra la mayor exactitud en prueba, pero lo hace a costa de un **sobreajuste evidente**: memoriza en exceso el conjunto de entrenamiento y pierde robustez ante datos nuevos.  

En cambio, el **árbol optimizado** ha sido ajustado mediante **validación cruzada**, lo que asegura que su desempeño no dependa de un único set de datos, sino que refleje un comportamiento más **estable, consistente e interpretable**.  
En aplicaciones reales, donde los datos futuros nunca son idénticos a los de entrenamiento, es preferible un modelo con un accuracy ligeramente menor, pero que sea mucho más **confiable y generalizable**.
