# 🌞 CLASIFICACIÓN ML: PREDICCIÓN DE CAPACIDAD DE PAGO PARA PANELES SOLARES

**Objetivo:** Desarrollar un modelo de Machine Learning que prediga si un cliente puede pagar por paneles solares.

**Variable objetivo:** `Puede_Pagar_Solar` (Sí/No)

**Dataset:** Paneles_solares_con_outliers.xlsx - Hoja: Datos Limpios

---

## 📚 1. IMPORTAR LIBRERÍAS

In [None]:
# Manipulaci√≥n de datos
import pandas as pd
import numpy as np

# Visualizaci√≥n
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning - Preprocesamiento
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Machine Learning - Modelos
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Machine Learning - M√©tricas
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve,
    precision_recall_curve, auc
)

# Guardar modelos
import joblib
import json
from datetime import datetime

# Notebook-wide configuration: silence warnings and set plotting defaults.
import warnings

warnings.filterwarnings('ignore')

# Seaborn grid theme plus the default figure size and font size for charts.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6), 'font.size': 10})

# Confirm the setup ran and stamp the execution time.
run_timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print("‚úì Librer√≠as importadas correctamente")
print(f"Fecha de ejecuci√≥n: {run_timestamp}")

## 📊 2. CARGA Y EXPLORACIÓN DE DATOS

In [None]:
# Read the cleaned worksheet of the solar-panel workbook into a DataFrame.
df = pd.read_excel('Paneles_solares_con_outliers.xlsx', sheet_name='Datos Limpios')

banner = "=" * 70
n_rows, n_cols = df.shape

# Structural overview: dimensions, column names, dtypes and null counts.
print(banner)
print("INFORMACI√ìN DEL DATASET")
print(banner)
print(f"\nDimensiones: {df.shape}")
print(f"Registros: {n_rows}")
print(f"Variables: {n_cols}")
print(f"\nColumnas:\n{df.columns.tolist()}")
print(f"\nTipos de datos:\n{df.dtypes}")
print(f"\nValores nulos:\n{df.isnull().sum()}")

# Header for the preview; the trailing expression is rendered by the notebook.
print("\n" + banner)
print("PRIMERAS 10 FILAS")
print(banner)
df.head(10)

In [None]:
# Class balance of the target column `Puede_Pagar_Solar`: printed counts and
# percentages, a bar chart, a pie chart and a simple imbalance ratio.
print("="*70)
print("AN√ÅLISIS DE VARIABLE OBJETIVO: Puede_Pagar_Solar")
print("="*70)

# Counts ordered by descending frequency (pandas default) for the printout.
class_counts = df['Puede_Pagar_Solar'].value_counts()

print("\nDistribuci√≥n de clases:")
print(class_counts)

print("\nPorcentaje:")
print(df['Puede_Pagar_Solar'].value_counts(normalize=True) * 100)

# For the charts, order the classes by label rather than by frequency.
# value_counts() puts the majority class first, so hard-coded tick labels and
# colors would be attached to the wrong bar whenever the second label is the
# majority class. Sorting by label fixes the order; the tick labels are then
# taken from the data itself.
plot_counts = class_counts.sort_index()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
colors = ['#e74c3c', '#2ecc71']

# Bar chart of absolute frequencies.
plot_counts.plot(kind='bar', ax=axes[0], color=colors)
axes[0].set_title('Distribuci√≥n de Clases', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Puede Pagar Solar', fontsize=12)
axes[0].set_ylabel('Frecuencia', fontsize=12)
axes[0].set_xticklabels(plot_counts.index, rotation=0)
axes[0].grid(axis='y', alpha=0.3)

# Pie chart of class proportions, same class order and colors as the bars.
plot_counts.plot(kind='pie', ax=axes[1], autopct='%1.1f%%',
                 colors=colors, startangle=90)
axes[1].set_title('Proporci√≥n de Clases', fontsize=14, fontweight='bold')
axes[1].set_ylabel('')

plt.tight_layout()
plt.show()

# Imbalance ratio: size of the largest class over the smallest one.
imbalance_ratio = class_counts.max() / class_counts.min()
print(f"\nRatio de desbalance: {imbalance_ratio:.2f}:1")

if imbalance_ratio > 1.5:
    print("‚ö†Ô∏è ADVERTENCIA: Dataset desbalanceado. Considerar t√©cnicas de balanceo.")
else:
    print("‚úì Dataset balanceado")

In [None]:
# Summary statistics of the numeric columns; the trailing expression is
# rendered by the notebook.
banner = "=" * 70
print(banner)
print("ESTAD√çSTICAS DESCRIPTIVAS - VARIABLES NUM√âRICAS")
print(banner)
df.describe()

In [None]:
# Frequency table and cardinality for each categorical column.
banner = "=" * 70
print(banner)
print("AN√ÅLISIS DE VARIABLES CATEG√ìRICAS")
print(banner)

categorical_cols = ['Sector', 'Ciudad', 'Validar']

for col in categorical_cols:
    column_values = df[col]
    print(f"\n{col}:")
    print(column_values.value_counts())
    print(f"Total de categor√≠as √∫nicas: {column_values.nunique()}")

## 📊 3. ANÁLISIS EXPLORATORIO DE DATOS (EDA)

In [None]:
# One histogram per numeric feature, laid out on a 2x2 grid.
numeric_cols = ['Consumo_kWh_Mensual', 'Estrato', 'Area_m2', 'Factura_Mensual_COP']

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

for ax, col in zip(axes, numeric_cols):
    # Nulls are dropped only for plotting; `df` itself is not modified.
    values = df[col].dropna()
    ax.hist(values, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribuci√≥n de {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col, fontsize=10)
    ax.set_ylabel('Frecuencia', fontsize=10)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Boxplot of each numeric feature, grouped by the target class.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

for ax, col in zip(axes, numeric_cols):
    df.boxplot(column=col, by='Puede_Pagar_Solar', ax=ax)
    ax.set_title(f'{col} por Capacidad de Pago', fontsize=12, fontweight='bold')
    ax.set_xlabel('Puede Pagar Solar', fontsize=10)
    ax.set_ylabel(col, fontsize=10)
    # Clear the figure-level title after each grouped boxplot call.
    fig.suptitle('')

plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap of the numeric features, followed by a ranked list of
# the strongest pairwise correlations.
plt.figure(figsize=(10, 8))
correlation_matrix = df[numeric_cols].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Matriz de Correlaci√≥n - Variables Num√©ricas', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

print("\nCorrelaciones m√°s fuertes:")
# Keep only the strict upper triangle so that (a) the diagonal is removed by
# position instead of by a float comparison with 1, and (b) each pair is
# listed once instead of as both (A, B) and (B, A).
upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
corr_pairs = correlation_matrix.where(upper_triangle).stack().dropna()
# Rank by absolute value: a strong negative correlation is as "strong" as a
# positive one and must not sink to the bottom of the list.
corr_pairs = corr_pairs.sort_values(key=lambda s: s.abs(), ascending=False)
print(corr_pairs.head(10))

## 🔧 4. PREPROCESAMIENTO DE DATOS

In [None]:
# Build the modelling table `df_model` from `df`: drop unused columns,
# binarize the target, label-encode the categorical columns and impute nulls
# in the numeric columns with the median.
# NOTE(review): the encoders and the imputer are fitted on the whole dataset.
# If a train/test split follows, fitting them on the training part only
# would avoid leaking test-set information.
df_model = df.copy()

print("="*70)
print("PREPROCESAMIENTO DE DATOS")
print("="*70)

# 1. Drop the columns that are not used as predictors.
columns_to_drop = ['ID_Cliente', 'Validar']
df_model = df_model.drop(columns=columns_to_drop)
print(f"\n‚úì Columnas eliminadas: {columns_to_drop}")

# 2. Map the target labels to 0/1. Series.map() turns any label missing from
#    the mapping into NaN, so fail loudly here instead of letting NaN targets
#    reach the model.
target_mapping = {'No': 0, 'S√≠': 1}
mapped_target = df_model['Puede_Pagar_Solar'].map(target_mapping)
unmapped_labels = df_model.loc[mapped_target.isnull(), 'Puede_Pagar_Solar']
if not unmapped_labels.empty:
    raise ValueError(
        "Etiquetas no reconocidas en Puede_Pagar_Solar: "
        f"{unmapped_labels.unique().tolist()}"
    )
df_model['Puede_Pagar_Solar'] = mapped_target.astype(int)
print("\n‚úì Variable objetivo convertida a binaria (No=0, S√≠=1)")

# 3. Label-encode the categorical columns; the fitted encoders are kept in
#    `label_encoders`, keyed by column name.
categorical_features = ['Sector', 'Ciudad']

print("\n‚úì Codificando variables categ√≥ricas:")
label_encoders = {}

for col in categorical_features:
    le = LabelEncoder()
    df_model[col] = le.fit_transform(df_model[col])
    label_encoders[col] = le
    print(f"   - {col}: {len(le.classes_)} categor√≠as")

# 4. Median-impute nulls in the numeric columns.
print("\n‚úì Imputando valores nulos:")
imputer = SimpleImputer(strategy='median')
numeric_features = ['Consumo_kWh_Mensual', 'Area_m2', 'Factura_Mensual_COP']

for col in numeric_features:
    # Count the nulls BEFORE imputing; counting afterwards always reports 0.
    n_missing = int(df_model[col].isnull().sum())
    if n_missing > 0:
        # fit_transform returns an (n, 1) array; flatten it before assigning.
        df_model[col] = imputer.fit_transform(df_model[[col]]).ravel()
        print(f"   - {col}: {n_missing} nulos imputados")

print(f"\n‚úì Valores nulos restantes: {df_model.isnull().sum().sum()}")
print(f"\n‚úì Dataset preprocesado: {df_model.shape}")

df_model.head()

In [None]:
# Split the modelling table into the predictor matrix X and target vector y.
target_column = 'Puede_Pagar_Solar'
y = df_model[target_column]
X = df_model.drop(columns=target_column)

banner = "=" * 70
print(banner)
print("SEPARACI√ìN DE FEATURES Y TARGET")
print(banner)
print(f"\nFeatures (X): {X.shape}")
print(f"Target (y): {y.shape}")

# Numbered list of predictor names, starting at 1.
print(f"\nVariables predictoras:")
for position, feature_name in enumerate(X.columns, start=1):
    print(f"  {position}. {feature_name}")

print(f"\nDistribuci√≥n de clases en y:")
print(y.value_counts())

## 🔀 5. DIVISIÓN DE DATOS Y ESCALADO

In [None]:
# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

banner = "=" * 70
n_total = len(X)
n_train = X_train.shape[0]
n_test = X_test.shape[0]

print(banner)
print("DIVISI√ìN DE DATOS (80% TRAIN / 20% TEST)")
print(banner)
print(f"\nTrain: {n_train} muestras ({n_train/n_total*100:.1f}%)")
print(f"Test:  {n_test} muestras ({n_test/n_total*100:.1f}%)")

# Class counts and percentages for each partition.
partitions = (
    ("\nDistribuci√≥n de clases en Train:", y_train),
    ("\nDistribuci√≥n de clases en Test:", y_test),
)
for heading, labels in partitions:
    print(heading)
    print(labels.value_counts())
    print(labels.value_counts(normalize=True) * 100)

In [None]:
# Standardize the features: the scaler is fitted on the training partition
# and then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

banner = "=" * 70
print(banner)
print("ESCALADO DE FEATURES (StandardScaler)")
print(banner)
print("\n‚úì Features escaladas")

# Per-feature mean and standard deviation of the scaled training matrix.
train_means = X_train_scaled.mean(axis=0)
train_stds = X_train_scaled.std(axis=0)
print(f"\nMedia de X_train_scaled: {train_means}")
print(f"Desviaci√≥n est√°ndar de X_train_scaled: {train_stds}")

## 🤖 6. ENTRENAMIENTO DE MODELOS DE CLASIFICACIÓN

In [None]:
# Train and evaluate a set of candidate classifiers on the scaled data.
# For every model, `results[name]` stores the fitted estimator, train/test
# accuracy, precision, recall, F1, AUC-ROC (None when the model exposes no
# predict_proba), the 5-fold CV accuracy mean/std on the training partition,
# and the test-set predictions and probabilities.
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42)
}

print("="*70)
print(f"ENTRENANDO {len(models)} MODELOS DE CLASIFICACI√ìN")
print("="*70)

# Per-model metrics and predictions, keyed by model name.
results = {}

for name, model in models.items():
    print(f"\n{'='*70}")
    print(f"Modelo: {name}")
    print(f"{'='*70}")

    # Fit on the scaled training partition.
    model.fit(X_train_scaled, y_train)

    # Hard predictions for both partitions.
    y_pred_train = model.predict(X_train_scaled)
    y_pred_test = model.predict(X_test_scaled)

    # Positive-class probabilities and AUC-ROC, only when the model offers
    # predict_proba; otherwise both stay None.
    if hasattr(model, "predict_proba"):
        y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
        auc_roc = roc_auc_score(y_test, y_pred_proba)
    else:
        y_pred_proba = None
        auc_roc = None

    # Train-side metric (used to compare against test accuracy).
    train_accuracy = accuracy_score(y_train, y_pred_train)

    # Test-side metrics.
    test_accuracy = accuracy_score(y_test, y_pred_test)
    precision = precision_score(y_test, y_pred_test)
    recall = recall_score(y_test, y_pred_test)
    f1 = f1_score(y_test, y_pred_test)

    # 5-fold cross-validated accuracy on the (already scaled) training data.
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')

    results[name] = {
        'model': model,
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc_roc': auc_roc,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'y_pred': y_pred_test,
        'y_pred_proba': y_pred_proba
    }

    print(f"\nAccuracy (Train): {train_accuracy:.4f}")
    print(f"Accuracy (Test):  {test_accuracy:.4f}")
    print(f"Precision:        {precision:.4f}")
    print(f"Recall:           {recall:.4f}")
    print(f"F1-Score:         {f1:.4f}")
    # Compare against None explicitly: a bare truthiness test would hide a
    # legitimate AUC of 0.0.
    if auc_roc is not None:
        print(f"AUC-ROC:          {auc_roc:.4f}")
    print(f"CV Accuracy:      {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

print("\n" + "="*70)
print("‚úì ENTRENAMIENTO COMPLETADO")
print("="*70)

## 📊 7. COMPARACIÓN DE MODELOS

In [None]:
# Assemble one row per trained model, rank the rows by F1-Score and report
# the top-ranked model.
comparison_rows = []
for model_name, r in results.items():
    comparison_rows.append({
        'Modelo': model_name,
        'Accuracy (Train)': r['train_accuracy'],
        'Accuracy (Test)': r['test_accuracy'],
        'Precision': r['precision'],
        'Recall': r['recall'],
        'F1-Score': r['f1_score'],
        # Models without an AUC are recorded as 0.
        'AUC-ROC': r['auc_roc'] if r['auc_roc'] else 0,
        'CV Mean': r['cv_mean'],
        'CV Std': r['cv_std'],
    })

comparison_df = pd.DataFrame(comparison_rows)

# Highest F1-Score first.
comparison_df = comparison_df.sort_values('F1-Score', ascending=False)

banner = "=" * 70
print(banner)
print("COMPARACI√ìN DE MODELOS")
print(banner)
print(comparison_df.to_string(index=False))

# The first row after sorting is the selected model.
top_row = comparison_df.iloc[0]
best_model_name = top_row['Modelo']
print(f"\nüèÜ MEJOR MODELO: {best_model_name}")
print(f"   F1-Score: {top_row['F1-Score']:.4f}")
print(f"   AUC-ROC:  {top_row['AUC-ROC']:.4f}")

In [None]:
# 2x2 dashboard comparing all models: accuracy, classification metrics,
# AUC-ROC and cross-validated accuracy.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
model_labels = comparison_df['Modelo']

# Panels 1 and 2 are grouped bar charts that share the same formatting.
grouped_bar_specs = [
    (axes[0, 0], ['Accuracy (Train)', 'Accuracy (Test)'], ['#3498db', '#e74c3c'],
     'Accuracy: Train vs Test', 'Accuracy', ['Train', 'Test']),
    (axes[0, 1], ['Precision', 'Recall', 'F1-Score'], ['#2ecc71', '#f39c12', '#9b59b6'],
     'M√©tricas de Clasificaci√≥n', 'Score', ['Precision', 'Recall', 'F1-Score']),
]
for ax, metric_columns, palette, title, y_label, legend_labels in grouped_bar_specs:
    comparison_df.plot(x='Modelo', y=metric_columns, kind='bar', ax=ax, color=palette)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_xlabel('')
    ax.legend(legend_labels)
    ax.grid(axis='y', alpha=0.3)
    ax.set_xticklabels(model_labels, rotation=45, ha='right')

# Panel 3: horizontal bars, restricted to rows with an AUC-ROC above 0.
auc_ax = axes[1, 0]
comparison_df_with_auc = comparison_df[comparison_df['AUC-ROC'] > 0]
comparison_df_with_auc.plot(x='Modelo', y='AUC-ROC', kind='barh',
                            ax=auc_ax, color='#1abc9c', legend=False)
auc_ax.set_title('AUC-ROC por Modelo', fontsize=14, fontweight='bold')
auc_ax.set_xlabel('AUC-ROC', fontsize=12)
auc_ax.set_ylabel('')
auc_ax.grid(axis='x', alpha=0.3)

# Panel 4: mean CV accuracy with the CV standard deviation as error bars.
cv_ax = axes[1, 1]
x_pos = np.arange(len(comparison_df))
cv_ax.bar(x_pos, comparison_df['CV Mean'], yerr=comparison_df['CV Std'],
          color='#34495e', alpha=0.7, capsize=5)
cv_ax.set_title('Cross-Validation Accuracy (5-Fold)', fontsize=14, fontweight='bold')
cv_ax.set_ylabel('Accuracy', fontsize=12)
cv_ax.set_xlabel('Modelo', fontsize=12)
cv_ax.set_xticks(x_pos)
cv_ax.set_xticklabels(model_labels, rotation=45, ha='right')
cv_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 🔍 8. ANÁLISIS DETALLADO DEL MEJOR MODELO

In [None]:
# Pull the selected model and its stored test-set outputs out of `results`.
best_entry = results[best_model_name]
best_model = best_entry['model']
y_pred_best = best_entry['y_pred']
y_pred_proba_best = best_entry['y_pred_proba']

banner = "=" * 70
print(banner)
print(f"AN√ÅLISIS DETALLADO: {best_model_name}")
print(banner)

# Per-class precision / recall / F1 on the test partition.
class_names = ['No Puede Pagar', 'S√≠ Puede Pagar']
print("\nREPORTE DE CLASIFICACI√ìN:\n")
print(classification_report(y_test, y_pred_best, target_names=class_names))

# Confusion matrix of true labels versus predicted labels.
cm = confusion_matrix(y_test, y_pred_best)
print("\nMATRIZ DE CONFUSI√ìN:\n")
print(cm)

In [None]:
# Two heatmaps of the confusion matrix: raw counts and row-normalized rates.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
counts_ax, rates_ax = axes
class_labels = ['No', 'S√≠']

# Left panel: absolute counts.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=counts_ax,
            xticklabels=class_labels, yticklabels=class_labels)
counts_ax.set_title(f'Matriz de Confusi√≥n - {best_model_name}', fontsize=14, fontweight='bold')
counts_ax.set_ylabel('Valor Real', fontsize=12)
counts_ax.set_xlabel('Predicci√≥n', fontsize=12)

# Right panel: each row divided by its own total.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = cm.astype('float') / row_totals
sns.heatmap(cm_normalized, annot=True, fmt='.2%', cmap='Greens', ax=rates_ax,
            xticklabels=class_labels, yticklabels=class_labels)
rates_ax.set_title(f'Matriz de Confusi√≥n Normalizada - {best_model_name}', fontsize=14, fontweight='bold')
rates_ax.set_ylabel('Valor Real', fontsize=12)
rates_ax.set_xlabel('Predicci√≥n', fontsize=12)

plt.tight_layout()
plt.show()

# Flatten the 2x2 matrix row by row into its four cells.
tn, fp, fn, tp = cm.ravel()
print(f"\nINTERPRETACI√ìN DE LA MATRIZ:")
print(f"  Verdaderos Negativos (TN): {tn} - Correctamente clasificados como NO")
print(f"  Falsos Positivos (FP):     {fp} - Incorrectamente clasificados como S√ç")
print(f"  Falsos Negativos (FN):     {fn} - Incorrectamente clasificados como NO")
print(f"  Verdaderos Positivos (TP): {tp} - Correctamente clasificados como S√ç")

In [None]:
# ROC and precision-recall curves for the selected model. Both need the
# positive-class probabilities, so the plots are skipped when none exist.
if y_pred_proba_best is None:
    print("\n‚ö†Ô∏è Este modelo no soporta predict_proba, no se pueden generar curvas ROC/PR")
else:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    roc_ax, pr_ax = axes

    # ROC curve with the diagonal as the random-classifier reference.
    fpr, tpr, thresholds_roc = roc_curve(y_test, y_pred_proba_best)
    roc_auc = auc(fpr, tpr)

    roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
    roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
    roc_ax.set_xlim([0.0, 1.0])
    roc_ax.set_ylim([0.0, 1.05])
    roc_ax.set_xlabel('False Positive Rate', fontsize=12)
    roc_ax.set_ylabel('True Positive Rate', fontsize=12)
    roc_ax.set_title(f'Curva ROC - {best_model_name}', fontsize=14, fontweight='bold')
    roc_ax.legend(loc="lower right")
    roc_ax.grid(alpha=0.3)

    # Precision-recall curve and the area under it.
    precision_curve, recall_curve, thresholds_pr = precision_recall_curve(y_test, y_pred_proba_best)
    pr_auc = auc(recall_curve, precision_curve)

    pr_ax.plot(recall_curve, precision_curve, color='green', lw=2,
               label=f'PR curve (AUC = {pr_auc:.4f})')
    pr_ax.set_xlim([0.0, 1.0])
    pr_ax.set_ylim([0.0, 1.05])
    pr_ax.set_xlabel('Recall', fontsize=12)
    pr_ax.set_ylabel('Precision', fontsize=12)
    pr_ax.set_title(f'Curva Precision-Recall - {best_model_name}', fontsize=14, fontweight='bold')
    pr_ax.legend(loc="lower left")
    pr_ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

    print(f"\n‚úì AUC-ROC: {roc_auc:.4f}")
    print(f"‚úì AUC-PR:  {pr_auc:.4f}")

## 💾 9. GUARDAR MODELO Y ARTEFACTOS

In [None]:
# Persist the selected model and everything needed to interpret it: the
# fitted scaler, the label encoders, a JSON metadata file and the model
# comparison table. All files are written to the current working directory.
print("="*70)
print("GUARDANDO MODELO Y ARTEFACTOS")
print("="*70)

# 1. Fitted estimator.
joblib.dump(best_model, 'best_model_pago_solar.pkl')
print("\n‚úì Modelo guardado: best_model_pago_solar.pkl")

# 2. Fitted StandardScaler.
joblib.dump(scaler, 'scaler_pago_solar.pkl')
print("‚úì Scaler guardado: scaler_pago_solar.pkl")

# 3. Dict of fitted LabelEncoders, keyed by column name.
joblib.dump(label_encoders, 'label_encoders_pago_solar.pkl')
print("‚úì Label encoders guardados: label_encoders_pago_solar.pkl")

# 4. Metadata and metrics. Values are cast to float so NumPy scalars
#    serialize as plain JSON numbers.
best_metrics = results[best_model_name]
# Compare against None explicitly: a truthiness test would turn a legitimate
# AUC of 0.0 into null.
best_auc_roc = best_metrics['auc_roc']
model_info = {
    'modelo': best_model_name,
    'fecha_entrenamiento': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'metricas': {
        'accuracy_train': float(best_metrics['train_accuracy']),
        'accuracy_test': float(best_metrics['test_accuracy']),
        'precision': float(best_metrics['precision']),
        'recall': float(best_metrics['recall']),
        'f1_score': float(best_metrics['f1_score']),
        'auc_roc': float(best_auc_roc) if best_auc_roc is not None else None,
        'cv_mean': float(best_metrics['cv_mean']),
        'cv_std': float(best_metrics['cv_std'])
    },
    'clases': ['No', 'S√≠'],
    'variables_predictoras': list(X.columns),
    'total_registros': len(df_model),
    'train_size': len(X_train),
    'test_size': len(X_test)
}

# ensure_ascii=False keeps accented characters readable in the JSON file.
with open('model_info_pago_solar.json', 'w', encoding='utf-8') as f:
    json.dump(model_info, f, indent=2, ensure_ascii=False)
print("‚úì Informaci√≥n del modelo guardada: model_info_pago_solar.json")

# 5. Comparison table of all models.
comparison_df.to_excel('comparacion_modelos_pago_solar.xlsx', index=False)
print("‚úì Comparaci√≥n de modelos guardada: comparacion_modelos_pago_solar.xlsx")

print("\n" + "="*70)
print("‚úì TODOS LOS ARTEFACTOS GUARDADOS CORRECTAMENTE")
print("="*70)

## 📊 10. EXPORTAR PREDICCIONES A EXCEL

In [None]:
# Export the test-set predictions of the selected model to a multi-sheet
# Excel workbook: per-row predictions, a metrics summary, the confusion
# matrix and the comparison of all models.

# y_test keeps the row labels it had before the split, while the feature
# table below is re-indexed to 0..n-1. pd.concat(axis=1) aligns on the index,
# so both sides must share the same positional index; otherwise the result is
# an outer join with misaligned rows and NaN-filled columns.
y_test_positional = y_test.reset_index(drop=True)

predictions_df = pd.DataFrame({
    'Real': y_test_positional.map({0: 'No', 1: 'S√≠'}),
    'Prediccion': [('S√≠' if p == 1 else 'No') for p in y_pred_best],
    'Correcto': y_test_positional == y_pred_best
})

# Class probabilities, when the model provides them.
if y_pred_proba_best is not None:
    predictions_df['Probabilidad_Si'] = y_pred_proba_best
    predictions_df['Probabilidad_No'] = 1 - y_pred_proba_best

# Append the test features (the unscaled X_test values), row-aligned with the
# predictions above.
X_test_original = X_test.reset_index(drop=True)
predictions_df = pd.concat([predictions_df, X_test_original], axis=1)

best_metrics = results[best_model_name]
# Compare against None explicitly so an AUC of 0.0 is still reported.
if best_metrics['auc_roc'] is not None:
    auc_roc_text = f"{best_metrics['auc_roc']:.4f}"
else:
    auc_roc_text = 'N/A'

with pd.ExcelWriter('predicciones_pago_solar.xlsx', engine='openpyxl') as writer:
    # Sheet 1: one row per test sample.
    predictions_df.to_excel(writer, sheet_name='Predicciones', index=False)

    # Sheet 2: metrics of the selected model plus hit/miss counts.
    metrics_summary = pd.DataFrame([
        {'M√©trica': 'Modelo', 'Valor': best_model_name},
        {'M√©trica': 'Accuracy', 'Valor': f"{best_metrics['test_accuracy']:.4f}"},
        {'M√©trica': 'Precision', 'Valor': f"{best_metrics['precision']:.4f}"},
        {'M√©trica': 'Recall', 'Valor': f"{best_metrics['recall']:.4f}"},
        {'M√©trica': 'F1-Score', 'Valor': f"{best_metrics['f1_score']:.4f}"},
        {'M√©trica': 'AUC-ROC', 'Valor': auc_roc_text},
        {'M√©trica': 'Total Predicciones', 'Valor': len(predictions_df)},
        {'M√©trica': 'Predicciones Correctas', 'Valor': int(predictions_df['Correcto'].sum())},
        {'M√©trica': 'Predicciones Incorrectas', 'Valor': int((~predictions_df['Correcto']).sum())}
    ])
    metrics_summary.to_excel(writer, sheet_name='Resumen M√©tricas', index=False)

    # Sheet 3: confusion matrix with labelled rows and columns.
    cm_df = pd.DataFrame(cm,
                         columns=['Predicho: No', 'Predicho: S√≠'],
                         index=['Real: No', 'Real: S√≠'])
    cm_df.to_excel(writer, sheet_name='Matriz Confusi√≥n')

    # Sheet 4: comparison of all trained models.
    comparison_df.to_excel(writer, sheet_name='Comparaci√≥n Modelos', index=False)

print("="*70)
print("PREDICCIONES EXPORTADAS A EXCEL")
print("="*70)
print("\n‚úì Archivo creado: predicciones_pago_solar.xlsx")
print("\nHojas incluidas:")
print("  1. Predicciones - Detalle de cada predicci√≥n")
print("  2. Resumen M√©tricas - M√©tricas del mejor modelo")
print("  3. Matriz Confusi√≥n - Matriz de confusi√≥n")
print("  4. Comparaci√≥n Modelos - Comparaci√≥n de todos los modelos")

# Preview; the trailing expression is rendered by the notebook.
print("\nPreview de predicciones:")
predictions_df.head(10)

## 🎯 11. RESUMEN FINAL Y CONCLUSIONES

In [None]:
# Final text report: dataset size, models trained, metrics of the selected
# model, generated files and a qualitative verdict based on fixed thresholds.
print("="*70)
print("RESUMEN FINAL DEL PROYECTO")
print("="*70)

print("\nüìä DATOS:")
print(f"  - Total de registros: {len(df)}")
print(f"  - Variables predictoras: {len(X.columns)}")
print(f"  - Train/Test split: {len(X_train)}/{len(X_test)} (80%/20%)")
# y is 0/1, so its sum is the positive count and its mean the positive rate.
print(f"  - Distribuci√≥n de clases: S√≠={y.sum()} ({y.mean()*100:.1f}%), No={len(y)-y.sum()} ({(1-y.mean())*100:.1f}%)")

print("\nü§ñ MODELOS ENTRENADOS:")
print(f"  - Total de modelos: {len(models)}")
print(f"  - Mejor modelo: {best_model_name}")

best_metrics = results[best_model_name]

print("\nüìà M√âTRICAS DEL MEJOR MODELO:")
print(f"  - Accuracy (Test):  {best_metrics['test_accuracy']:.4f}")
print(f"  - Precision:        {best_metrics['precision']:.4f}")
print(f"  - Recall:           {best_metrics['recall']:.4f}")
print(f"  - F1-Score:         {best_metrics['f1_score']:.4f}")
# Compare against None explicitly: a truthiness test would skip an AUC of 0.0.
if best_metrics['auc_roc'] is not None:
    print(f"  - AUC-ROC:          {best_metrics['auc_roc']:.4f}")
print(f"  - CV Accuracy:      {best_metrics['cv_mean']:.4f} (+/- {best_metrics['cv_std']:.4f})")

print("\nüíæ ARCHIVOS GENERADOS:")
print("  ‚úì best_model_pago_solar.pkl")
print("  ‚úì scaler_pago_solar.pkl")
print("  ‚úì label_encoders_pago_solar.pkl")
print("  ‚úì model_info_pago_solar.json")
print("  ‚úì comparacion_modelos_pago_solar.xlsx")
print("  ‚úì predicciones_pago_solar.xlsx")

# Verdict on test accuracy: bands at 0.90 / 0.80 / 0.70.
print("\n‚úÖ CONCLUSIONES:")
accuracy = best_metrics['test_accuracy']
if accuracy >= 0.90:
    print("  - El modelo tiene un EXCELENTE desempe√±o (‚â•90% accuracy)")
elif accuracy >= 0.80:
    print("  - El modelo tiene un BUEN desempe√±o (‚â•80% accuracy)")
elif accuracy >= 0.70:
    print("  - El modelo tiene un desempe√±o ACEPTABLE (‚â•70% accuracy)")
else:
    print("  - El modelo tiene un desempe√±o BAJO (<70% accuracy)")
    print("  - Se recomienda revisar features o probar otros modelos")

# Verdict on the precision/recall balance: bands at 0.85 / 0.75.
f1 = best_metrics['f1_score']
if f1 >= 0.85:
    print("  - El balance Precision/Recall es EXCELENTE (F1 ‚â•0.85)")
elif f1 >= 0.75:
    print("  - El balance Precision/Recall es BUENO (F1 ‚â•0.75)")
else:
    print("  - El balance Precision/Recall puede mejorarse (F1 <0.75)")

print("\n" + "="*70)
print("üéâ PROYECTO COMPLETADO EXITOSAMENTE")
print("="*70)