In [10]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, 
    confusion_matrix, 
    classification_report, 
    adjusted_rand_score, 
    silhouette_score
)
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from ucimlrepo import fetch_ucirepo

In [11]:
# 1. Load the dataset (UCI repository entry id=19) and split it into
#    the feature table and the target table.
car_evaluation = fetch_ucirepo(id=19)
X, y = car_evaluation.data.features, car_evaluation.data.targets

# Show how many rows each class has before any resampling.
print("\nDistribución original de clases:", y.value_counts(), sep="\n")


Distribución original de clases:
class
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64


In [12]:
# 2. Preprocessing
# Each feature gets an explicit level ordering so the integer codes follow
# the listed order. The lists are matched to columns by position, so this
# assumes X's columns arrive in the order named in the trailing comments
# (TODO confirm against the fetched frame).
price_levels = ['low', 'med', 'high', 'vhigh']  # shared scale for buying and maint
encoder = OrdinalEncoder(
    categories=[
        list(price_levels),             # buying
        list(price_levels),             # maint
        ['2', '3', '4', '5more'],       # doors
        ['2', '4', 'more'],             # persons
        ['small', 'med', 'big'],        # lug_boot
        ['low', 'med', 'high'],         # safety
    ]
)
X_encoded = encoder.fit_transform(X)

# Encode the target labels as integers; the target table is flattened to 1-D first.
label_encoder = LabelEncoder()
y_flat = y.values.ravel()
y_encoded = label_encoder.fit_transform(y_flat)

In [13]:
# 3. Stratified train/test split: 30% of the rows are held out for testing,
#    and stratifying on the encoded labels keeps the class proportions in
#    both partitions.
split_options = dict(test_size=0.3, stratify=y_encoded, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y_encoded, **split_options
)


In [14]:
# 4. Class balancing with SMOTE.
# Only the training partition is resampled; X_test / y_test are not touched here.
# NOTE(review): SMOTE builds synthetic rows by interpolating between neighbours,
# so the resampled X_train may hold non-integer ordinal codes. Consider
# imblearn's SMOTEN for purely categorical features - confirm before changing.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

balanced_counts = pd.Series(y_train).value_counts()
print("\nDistribución después de SMOTE:")
print(balanced_counts)


Distribución después de SMOTE:
2    847
0    847
1    847
3    847
Name: count, dtype: int64


In [15]:
# 5. Train every candidate model, evaluate it on the held-out test split,
#    save its confusion-matrix figure and persist the fitted estimator.

# Neither plt.savefig nor joblib.dump creates missing folders, so make sure
# both output directories exist before the loop writes into them.
os.makedirs('static', exist_ok=True)
os.makedirs('model', exist_ok=True)

models = {
    "Random Forest": RandomForestClassifier(
        n_estimators=200,
        class_weight='balanced',
        random_state=42
    ),
    "Árbol de Decisión": DecisionTreeClassifier(
        max_depth=8,
        class_weight='balanced',
        random_state=42
    ),
    "Red Neuronal": MLPClassifier(
        hidden_layer_sizes=(100, 50),
        activation='relu',
        early_stopping=True,
        max_iter=1000,
        random_state=42
    )
}

# Per-model summary metrics, keyed by model display name; read by the
# comparison cell further down.
model_metrics = {}

for name, model in models.items():
    print(f"\n=== Entrenando {name} ===")

    # Fit on the (resampled) training split and predict the test split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluation: overall accuracy plus the per-class report as a dict so the
    # macro / weighted averages can be looked up by key.
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(
        y_test,
        y_pred,
        target_names=label_encoder.classes_,
        output_dict=True
    )

    # Keep the headline numbers for the cross-model comparison.
    model_metrics[name] = {
        'accuracy': accuracy,
        'f1_macro': report['macro avg']['f1-score'],
        'f1_weighted': report['weighted avg']['f1-score']
    }

    # Per-model printout (the value labelled "Precisión" is the accuracy score).
    print(f"\nPrecisión: {accuracy:.4f}")
    print(f"F1-score (Macro): {report['macro avg']['f1-score']:.4f}")
    print(f"F1-score (Weighted): {report['weighted avg']['f1-score']:.4f}")
    print(pd.DataFrame(report).transpose().round(4))

    # Filesystem-friendly name shared by the figure and the pickle:
    # lower-cased, spaces replaced with underscores.
    file_stem = name.lower().replace(" ", "_")

    # Confusion matrix heatmap, saved to disk and closed so figures do not
    # accumulate across loop iterations.
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=label_encoder.classes_,
        yticklabels=label_encoder.classes_
    )
    plt.title(f'Matriz de Confusión - {name}')
    plt.xlabel('Predicho')
    plt.ylabel('Real')
    plt.savefig(f'static/confusion_{file_stem}.png')
    plt.close()

    # Persist the fitted model.
    joblib.dump(model, f'model/{file_stem}_model.pkl')



=== Entrenando Random Forest ===

Precisión: 0.9807
F1-score (Macro): 0.9742
F1-score (Weighted): 0.9808
              precision  recall  f1-score   support
acc              0.9487  0.9652    0.9569  115.0000
good             0.9545  1.0000    0.9767   21.0000
unacc            0.9917  0.9862    0.9890  363.0000
vgood            1.0000  0.9500    0.9744   20.0000
accuracy         0.9807  0.9807    0.9807    0.9807
macro avg        0.9737  0.9754    0.9742  519.0000
weighted avg     0.9810  0.9807    0.9808  519.0000

=== Entrenando Árbol de Decisión ===

Precisión: 0.9518
F1-score (Macro): 0.8913
F1-score (Weighted): 0.9535
              precision  recall  f1-score   support
acc              0.9027  0.8870    0.8947  115.0000
good             0.7143  0.9524    0.8163   21.0000
unacc            1.0000  0.9697    0.9846  363.0000
vgood            0.7692  1.0000    0.8696   20.0000
accuracy         0.9518  0.9518    0.9518    0.9518
macro avg        0.8465  0.9523    0.8913  519.0000
weig

In [16]:
# 6. Compare the trained models and report the best one.
print("\n" + "="*60)
print("COMPARACIÓN FINAL DE MODELOS")
print("="*60)

# One row per model, columns = stored metrics, best accuracy first.
results_df = pd.DataFrame.from_dict(model_metrics, orient='index')
results_df = results_df.sort_values(by='accuracy', ascending=False)
print("\nResumen de métricas por modelo:")
print(results_df)

# The best model is the first row after sorting by accuracy.
best_model_name = results_df.index[0]
best_accuracy = results_df.iloc[0]['accuracy']
best_f1_macro = results_df.iloc[0]['f1_macro']

print("\n" + "="*60)
print(f"MEJOR MODELO: {best_model_name}")
print(f"Precisión: {best_accuracy:.4f}")
print(f"F1-score (Macro): {best_f1_macro:.4f}")
print("="*60)

# Bar chart comparing accuracy across models.
# plt.savefig does not create missing folders, so ensure the target exists.
os.makedirs('static', exist_ok=True)

plt.figure(figsize=(10, 6))
results_df['accuracy'].plot(kind='bar', color='skyblue')
plt.title('Comparación de Precisión entre Modelos')
plt.ylabel('Precisión')

# Keep the 0.8 floor when every model clears it, but lower the floor when a
# model scores below it; a fixed 0.8 floor would clip that bar.
y_floor = min(0.8, max(0.0, results_df['accuracy'].min() - 0.05))
plt.ylim(y_floor, 1.0)
plt.xticks(rotation=15)

# Annotate each bar with its value, slightly above the bar top.
for i, v in enumerate(results_df['accuracy']):
    plt.text(i, v + 0.005, f'{v:.4f}', ha='center')
plt.savefig('static/model_comparison.png')
plt.close()


COMPARACIÓN FINAL DE MODELOS

Resumen de métricas por modelo:
                   accuracy  f1_macro  f1_weighted
Random Forest      0.980732  0.974237     0.980792
Red Neuronal       0.953757  0.923463     0.954186
Árbol de Decisión  0.951830  0.891311     0.953457

MEJOR MODELO: Random Forest
Precisión: 0.9807
F1-score (Macro): 0.9742


In [17]:
# 7. Clustering sanity check: run unsupervised k-means on the encoded features
#    and measure how well the clusters line up with the true classes.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases; consider pinning it for reproducible scores.
kmeans = KMeans(n_clusters=4, random_state=42)
clusters = kmeans.fit_predict(X_encoded)

# Agreement with the true labels (ARI) and cluster cohesion (silhouette).
ari = adjusted_rand_score(y_encoded, clusters)
silhouette = silhouette_score(X_encoded, clusters)

print(
    "\n=== Validación con Clustering ===",
    f"Adjusted Rand Index: {ari:.2f}",
    f"Silhouette Score: {silhouette:.2f}",
    sep="\n",
)


=== Validación con Clustering ===
Adjusted Rand Index: 0.01
Silhouette Score: 0.16


In [18]:
# 8. Persist the fitted encoders next to the saved models, so the same feature
#    and label encodings can be reloaded later.
# joblib.dump does not create missing folders; make sure 'model/' exists so
# this cell also works when it is run on its own.
os.makedirs('model', exist_ok=True)
joblib.dump(encoder, 'model/encoder.pkl')
joblib.dump(label_encoder, 'model/label_encoder.pkl')

['model/label_encoder.pkl']