In [7]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras import layers, models, callbacks, backend as K
from tensorflow.keras.losses import LogCosh

CONFIGURACI√ìN DE COLUMNAS

In [2]:
# Lista de variables a ELIMINAR basada en el an√°lisis de Permutation Importance (< 0.05% de influencia)
NOISE_COLS = ['Tecnologia', 'Tur', 'categoria_producto', 'semana_anio', 'g_art_id']

# Las listas originales (del paso anterior)
USER_NUMERICAL = [
    'semana_anio', 'Tur', 'planta_id', 'seccion_id', 'maq_id', 'Pas', 
    'producto_id', 'estilo_id', 'Tal', 'Col', 'Tal_Fert', 'Col_Fert', 
    'Componentes', 'g_art_id', 'mp_id', 'Rechazo_comp', 'rechazo_flag', 
    'Tipo_2a_encoded'
]

USER_CATEGORICAL = [
    'Tipo_TEJ', 'Tecnologia', 'C', 'categoria_producto', 'MP', 'mp_categoria'
]
LEAKAGE_COLS = ['Rechazo_comp', 'rechazo_flag', 'Tipo_2a_encoded']



Logica de Reclasificacion

In [3]:
def reorganize_features_final(df):
    """
    Funci√≥n de reorganizaci√≥n de features que ahora incluye la eliminaci√≥n de ruido.
    """
    df = df.copy()
    
    # 1. Eliminar Data Leakage y Ruido
    cols_to_drop = [c for c in (LEAKAGE_COLS + NOISE_COLS) if c in df.columns]
    if cols_to_drop:
        print(f"‚úÖ Eliminando ruido y leakage: {cols_to_drop}")
        df = df.drop(columns=cols_to_drop)

    # 2. Re-clasificaci√≥n Inteligente
    potential_ids = ['planta_id', 'seccion_id', 'maq_id', 'producto_id', 'estilo_id', 'mp_id']
    
    final_embeddings = list(USER_CATEGORICAL)
    final_numerics = []

    for col in USER_NUMERICAL:
        if col in cols_to_drop or col not in df.columns:
            continue
            
        if col in potential_ids:
            final_embeddings.append(col)
        else:
            final_numerics.append(col)
            
    # Filtro final de listas
    final_embeddings = [c for c in final_embeddings if c in df.columns]
    final_numerics = [c for c in final_numerics if c in df.columns]
    
    # Quitar duplicados entre las listas (solo por seguridad)
    final_embeddings = list(set(final_embeddings))
    final_numerics = list(set([c for c in final_numerics if c not in final_embeddings]))

    print(f"\n---> Total de Variables Finales: {len(final_embeddings) + len(final_numerics)}")
    
    return df, final_embeddings, final_numerics

Preprocesamiento Robusto

In [4]:
def preprocess_data(X_train, X_test, embed_cols, num_cols):
    input_train = {}
    input_test = {}
    encoders = {}
    
    # A. Procesar Embeddings (IDs y Categor√≠as)
    for col in embed_cols:
        # Convertir a string para tratar igual IDs num√©ricos y texto
        X_train[col] = X_train[col].astype(str)
        X_test[col] = X_test[col].astype(str)
        
        le = LabelEncoder()
        # Ajustamos con train y manejamos desconocidos en test
        train_vals = list(X_train[col].unique())
        # Truco: a√±adimos una clase 'UNKNOWN' para valores nuevos en el futuro
        train_vals.append('<UNK>') 
        le.fit(train_vals)
        encoders[col] = le
        
        # Transformar Train
        input_train[f"in_{col}"] = le.transform(X_train[col])
        
        # Transformar Test (Manejo seguro de valores no vistos)
        # Si el valor no est√° en el encoder, asignamos el √≠ndice de <UNK> (el √∫ltimo)
        unk_idx = len(le.classes_) - 1
        input_test[f"in_{col}"] = X_test[col].map(lambda x: le.transform([x])[0] if x in le.classes_ else unk_idx).values

    # B. Procesar Num√©ricas (Escalamiento)
    if num_cols:
        scaler = StandardScaler()
        # Ajustar solo en train
        X_train_num = scaler.fit_transform(X_train[num_cols])
        X_test_num = scaler.transform(X_test[num_cols])
        
        input_train["in_numerics"] = X_train_num
        input_test["in_numerics"] = X_test_num
        
    return input_train, input_test, encoders, len(num_cols)

MODELO: Zero-Inflated Neural Network

In [5]:
def build_dynamic_model_tuned(embed_cols, encoders, n_numeric_features, learning_rate=0.0005):
    inputs = []
    embeddings = []
    
    # --- 1. Capas de Embedding ---
    for col in embed_cols:
        n_vocab = len(encoders[col].classes_)
        embed_dim = min(60, int(np.log2(n_vocab) * 2.5) + 1) # Aumento leve del tama√±o
        
        in_layer = layers.Input(shape=(1,), name=f"in_{col}")
        inputs.append(in_layer)
        
        emb = layers.Embedding(input_dim=n_vocab, output_dim=embed_dim)(in_layer)
        emb = layers.Flatten()(emb)
        embeddings.append(emb)
        
    # --- 2. Entrada Num√©rica ---
    if n_numeric_features > 0:
        num_in = layers.Input(shape=(n_numeric_features,), name="in_numerics")
        inputs.append(num_in)
        embeddings.append(num_in)
        
    # --- 3. Concatenaci√≥n y Cuerpo Denso ---
    x = layers.Concatenate()(embeddings)
    
    # Aumento de capacidad y ajuste de Dropout
    x = layers.Dense(256, activation='relu')(x) # Aumento a 256
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.4)(x) # Aumento de Dropout
    
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    
    x = layers.Dense(64, activation='relu')(x)
    
    # --- 4. Salida Especializada ---
    # Usamos inicializador de bias negativo para la tendencia a cero
    output = layers.Dense(1, activation='sigmoid', bias_initializer=tf.keras.initializers.Constant(-2.5))(x)
    
    model = models.Model(inputs=inputs, outputs=output)
    
    # ¬°USAMOS LOG-COSH LOSS!
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), 
                  loss=LogCosh, 
                  metrics=['mae', tf.keras.metrics.RootMeanSquaredError(name='rmse')])
    
    return model

Ejecucion

In [8]:
df = pd.read_csv('../../data/processed/dataset_cleaned.csv')
# 1. Target y Separaci√≥n
target_col = 'Und_2a_percentage'
# ... (aseg√∫rate de que df y y est√°n bien definidos)
y = df[target_col].values
X = df.drop(columns=[target_col])

# 2. Reorganizaci√≥n Final
X_clean, embed_cols, final_num_cols = reorganize_features_final(X)

# 3. Split, Preprocesamiento y Reentrenamiento
# (Usar las funciones preprocess_data y build_dynamic_model_tuned previamente definidas)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_clean, y, test_size=0.2, random_state=42)
train_inputs, test_inputs, encoders, n_nums = preprocess_data(X_train_raw, X_test_raw, embed_cols, final_num_cols)

model_final = build_dynamic_model_tuned(embed_cols, encoders, n_nums, learning_rate=0.0003) # LR ligeramente m√°s bajo



# --- PASO E: Construir y Entrenar ---
model = build_dynamic_model_tuned(embed_cols, encoders, n_nums)

print("\nEntrenando Modelo...")
history = model.fit(
    train_inputs, y_train,
    validation_data=(test_inputs, y_test),
    epochs=50,
    batch_size=32,
    callbacks=[
        callbacks.EarlyStopping(patience=8, restore_best_weights=True),
        callbacks.ReduceLROnPlateau(patience=4)
    ],
    verbose=1
)

# --- Predicci√≥n ---
predictions = model.predict(test_inputs)
# Mostrar las primeras 5 predicciones vs reales
print("\n--- Validaci√≥n (Real vs Predicho) ---")
for i in range(5):
    print(f"Real: {y_test[i]:.4f} | Pred: {predictions[i][0]:.4f}")

‚úÖ Eliminando ruido y leakage: ['Rechazo_comp', 'rechazo_flag', 'Tipo_2a_encoded', 'Tecnologia', 'Tur', 'categoria_producto', 'semana_anio', 'g_art_id']

---> Total de Variables Finales: 16

Entrenando Modelo...
Epoch 1/50
[1m9121/9121[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m54s[0m 6ms/step - loss: 0.0124 - mae: 0.0685 - rmse: 0.1655 - val_loss: 0.0113 - val_mae: 0.0682 - val_rmse: 0.1571 - learning_rate: 5.0000e-04
Epoch 2/50
[1m9121/9121[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m56s[0m 6ms/step - loss: 0.0113 - mae: 0.0665 - rmse: 0.1570 - val_loss: 0.0110 - val_mae: 0.0657 - val_rmse: 0.1547 - learning_rate: 5.0000e-04
Epoch 3/50
[1m9121/9121[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m57s[0m 6ms/step - loss: 0.0110 - mae: 0.0655 - rmse: 0.1548 - val_loss: 0.0109 - val_mae: 0.0653 - val_rmse: 0.1542 - learning_rate: 5.0000e-04
Epoch 4/50
[1m9

In [12]:
from sklearn.metrics import r2_score, mean_squared_error

# Calculate R-squared
r2 = r2_score(y_test, predictions)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print(f"R-squared: {r2:.4f}")
print(f"Mean Squared Error: {mse:.4f}")
print(f"Root Mean Squared Error: {rmse:.4f}")

R-squared: 0.7844
Mean Squared Error: 0.0235
Root Mean Squared Error: 0.1532


In [14]:
# ==========================================
# C√ìDIGO PARA CALCULAR IMPORTANCIA
# ==========================================

def calculate_permutation_importance(model, X_dict, y_true, metric_to_monitor, sample_size=5000):
    """
    Calcula la Importancia por Permutaci√≥n usando la m√©trica del modelo (RMSE).
    
    Args:
        model: El modelo Keras ya entrenado (model_tuned).
        X_dict: Diccionario de inputs de prueba (test_inputs).
        y_true: Valores reales de y_test.
        metric_to_monitor: √çndice de la m√©trica a usar (0=loss, 1=mae, 2=rmse).
        sample_size: N√∫mero de filas a usar para el c√°lculo.
    """
    
    # 1. Seleccionar una muestra (para velocidad)
    indices = np.random.choice(len(y_true), min(sample_size, len(y_true)), replace=False)
    inputs_sample = {k: v[indices] for k, v in X_dict.items()}
    y_sample = y_true[indices]
    
    # 2. Evaluaci√≥n BASE (sin permutaci√≥n)
    # Evaluamos en el set de prueba y tomamos el valor de la m√©trica (ej. RMSE)
    baseline_score = model.evaluate(inputs_sample, y_sample, verbose=0)[metric_to_monitor] 
    
    importances = {}
    
    # 3. Iterar y Permutar
    for key in inputs_sample.keys():
        save_col = inputs_sample[key].copy()
        
        # PERMUTAR: Romper la relaci√≥n de la columna con el objetivo
        np.random.shuffle(inputs_sample[key])
        
        # Evaluar score con la columna rota
        shuff_score = model.evaluate(inputs_sample, y_sample, verbose=0)[metric_to_monitor]
        
        # Calcular el aumento de error
        importances[key] = shuff_score - baseline_score
        
        # RESTAURAR la columna para el siguiente ciclo
        inputs_sample[key] = save_col
        
    return importances

# --- EJECUCI√ìN (Corre esto con tu modelo ya entrenado) ---

# El RMSE es la m√©trica con √≠ndice 2 en la lista de m√©tricas de Keras ('mae', 'rmse')
# Si usaste ['mae', 'rmse'], el √≠ndice es 2. Si usaste ['mae'], el √≠ndice es 1.
# Revisa el resumen de tu modelo.
RMSE_INDEX = 2 

print("\nüöÄ Calculando Importancia por Permutaci√≥n...")
imps = calculate_permutation_importance(
    model=build_dynamic_model_tuned, 
    X_dict=test_inputs, 
    y_true=y_test, 
    metric_to_monitor=RMSE_INDEX, 
    sample_size=10000 # Usa el m√°ximo que puedas para estabilidad
)
sorted_imps = sorted(imps.items(), key=lambda x: x[1], reverse=True)

print("\n--- Top 10 Variables m√°s Influyentes (Causan el mayor aumento de error) ---")
for name, imp in sorted_imps[:10]:
    print(f"{name.replace('in_', '')}: {imp*100:.3f}% (Aumento en el RMSE)")


üöÄ Calculando Importancia por Permutaci√≥n...


AttributeError: 'function' object has no attribute 'evaluate'