# 05 – Experimentos con NN tabular (embeddings + zero-inflated)

En este notebook:
- Evaluamos una arquitectura de red neuronal para predecir `Und_2a_percentage`.
- Validamos el uso de embeddings vs numéricas.
- Comparamos desempeño y analizamos importancia de variables.
- Dejamos documentado el modelo elegido para producción.


In [1]:
from pathlib import Path
import sys

cwd = Path().resolve()
PROJECT_ROOT = None

for parent in [cwd, *cwd.parents]:
    if (parent / "src").is_dir():
        PROJECT_ROOT = parent
        break

if PROJECT_ROOT is None:
    raise RuntimeError("No se encontró carpeta 'src'.")

if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from src.config.settings import TARGET_COL, RANDOM_STATE, MODELS_DIR
from src.config.nn_config import NN_MODEL_SUBDIR, NN_KERAS_NAME, NN_PIPELINE_PKL
from src.data.load_data import load_clean_dataset

print("PROJECT_ROOT:", PROJECT_ROOT)
print("TARGET_COL  :", TARGET_COL)


PROJECT_ROOT: D:\Users\dhcertug\OneDrive - Crystal S.A.S\Documentos\HOME\00_PERSONAL\02_CURSOS\PROYECTO\Proyecto_analisis_intermedio_udea
TARGET_COL  : Und_2a_percentage


In [2]:
import pandas as pd
import numpy as np

from src.features.nn_features import reorganize_features_final

df = load_clean_dataset()
y = df[TARGET_COL].values
X = df.drop(columns=[TARGET_COL])

X_clean, embed_cols, num_cols = reorganize_features_final(X)

print("Shape X_clean:", X_clean.shape)
print("Embeddings:", embed_cols)
print("Numéricas :", num_cols)


# Observación:
# En este punto congelamos qué columnas entran al modelo. 
# Cambios posteriores deben justificarse con nueva evidencia (perm importance, negocio, drift)

Eliminando ruido/leakage: ['Rechazo_comp', 'rechazo_flag', 'Tecnologia', 'Tur', 'categoria_producto', 'semana_anio', 'g_art_id']
Variables finales: 16
Shape X_clean: (364832, 19)
Embeddings: ['mp_categoria', 'mp_id', 'Tipo_TEJ', 'planta_id', 'seccion_id', 'producto_id', 'MP', 'maq_id', 'estilo_id', 'C']
Numéricas : ['Col', 'Tal', 'Pas', 'Tal_Fert', 'Col_Fert', 'Componentes']


### Split y preprocesamiento

In [3]:
from sklearn.model_selection import train_test_split
from src.models.nn_preprocessing import preprocess_data

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_clean,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

train_inputs, test_inputs, encoders, n_nums, scaler = preprocess_data(
    X_train_raw,
    X_test_raw,
    embed_cols,
    num_cols,
)

print("Train inputs keys:", train_inputs.keys())


Train inputs keys: dict_keys(['in_mp_categoria', 'in_mp_id', 'in_Tipo_TEJ', 'in_planta_id', 'in_seccion_id', 'in_producto_id', 'in_MP', 'in_maq_id', 'in_estilo_id', 'in_C', 'in_numerics'])


## Construcción y entrenamiento de la NN

In [4]:
import tensorflow as tf
from tensorflow.keras import callbacks
from src.models.nn_zero_inflated import build_dynamic_model_tuned

model = build_dynamic_model_tuned(
    embed_cols=embed_cols,
    encoders=encoders,
    n_numeric_features=n_nums,
    learning_rate=3e-4,
)

cb = [
    callbacks.EarlyStopping(patience=8, restore_best_weights=True),
    callbacks.ReduceLROnPlateau(patience=4),
]

history = model.fit(
    train_inputs,
    y_train,
    validation_data=(test_inputs, y_test),
    epochs=50,
    batch_size=32,
    callbacks=cb,
    verbose=1,
)

# Observacion:
# “EarlyStopping" y "ReduceLROnPlateau" reducen el riesgo de sobreentrenar y estabilizan el entrenamiento sin necesidad de grid-search manual de épocas.

Epoch 1/50
[1m9121/9121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 3ms/step - loss: 0.0128 - mae: 0.0697 - rmse: 0.1677 - val_loss: 0.0114 - val_mae: 0.0665 - val_rmse: 0.1577 - learning_rate: 3.0000e-04
Epoch 2/50
[1m9121/9121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 3ms/step - loss: 0.0114 - mae: 0.0665 - rmse: 0.1578 - val_loss: 0.0110 - val_mae: 0.0659 - val_rmse: 0.1551 - learning_rate: 3.0000e-04
Epoch 3/50
[1m9121/9121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 3ms/step - loss: 0.0111 - mae: 0.0657 - rmse: 0.1555 - val_loss: 0.0110 - val_mae: 0.0657 - val_rmse: 0.1550 - learning_rate: 3.0000e-04
Epoch 4/50
[1m9121/9121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 3ms/step - loss: 0.0109 - mae: 0.0651 - rmse: 0.1539 - val_loss: 0.0108 - val_mae: 0.0655 - val_rmse: 0.1534 - learning_rate: 3.0000e-04
Epoch 5/50
[1m9121/9121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 3ms/step - loss: 0.0107 - mae: 0.0644 - rmse: 0.1528

## Evaluación

In [5]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

preds = model.predict(test_inputs).reshape(-1)

mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, preds)
r2 = r2_score(y_test, preds)

print(f"R2   : {r2:.4f}")
print(f"MSE  : {mse:.6f}")
print(f"RMSE : {rmse:.6f}")
print(f"MAE  : {mae:.6f}")


[1m2281/2281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
R2   : 0.7849
MSE  : 0.023415
RMSE : 0.153021
MAE  : 0.066329


In [None]:
import joblib

model_dir = MODELS_DIR / NN_MODEL_SUBDIR
model_dir.mkdir(parents=True, exist_ok=True)

keras_path = model_dir / NN_KERAS_NAME
pipe_path = model_dir / NN_PIPELINE_PKL

model.save(keras_path)

pipeline_artefactos = {
    "keras_model_path": keras_path,
    "encoders": encoders,
    "scaler": scaler,
    "embed_cols": embed_cols,
    "num_cols": num_cols,
}

joblib.dump(pipeline_artefactos, pipe_path)

print("Modelo guardado en:", keras_path)
print("Pipeline guardado en:", pipe_path)


### Importancia por permutación

In [6]:
from src.models.importance import calculate_permutation_importance

# Métricas de Keras: [loss, mae, rmse] -> rmse está en índice 2
RMSE_INDEX = 2

imps = calculate_permutation_importance(
    model=model,
    X_dict=test_inputs,
    y_true=y_test,
    metric_index=RMSE_INDEX,
    sample_size=10000,
)

sorted_imps = sorted(imps.items(), key=lambda x: x[1], reverse=True)

for name, imp in sorted_imps[:10]:
    print(f"{name}: ΔRMSE = {imp:.6f}")
    

# Observacion:
# Permutation importance a nivel de input dict permite validar si la arquitectura está usando las señales correctas y justificar exclusiones futuras. 
# No es una explicación local estilo SHAP, pero es suficientemente estable para decisiones de feature selection.


in_seccion_id: ΔRMSE = 0.267317
in_numerics: ΔRMSE = 0.056576
in_mp_id: ΔRMSE = 0.012455
in_producto_id: ΔRMSE = 0.011733
in_Tipo_TEJ: ΔRMSE = 0.000534
in_maq_id: ΔRMSE = 0.000317
in_estilo_id: ΔRMSE = 0.000010
in_mp_categoria: ΔRMSE = 0.000002
in_planta_id: ΔRMSE = 0.000000
in_MP: ΔRMSE = -0.000008
