In [2]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import joblib

# ================================
# 1. Load the dataset
# ================================
df = pd.read_csv('../../data/processed/dataset_cleaned.csv')

TARGET = "Und_2a_percentage"

# ================================
# 2. Strict temporal ordering
# ================================
# Sort rows by ascending "semana_anio" before splitting.
df = df.sort_values(by="semana_anio")

# Cut-off: the 0.8 quantile of the "semana_anio" values. Rows at or below the
# cut-off form the training set; rows strictly above it form the test set, so
# a given week value never lands on both sides of the split.
split_week = df["semana_anio"].quantile(0.8)

train_df = df.loc[df["semana_anio"].le(split_week)]
test_df = df.loc[df["semana_anio"].gt(split_week)]

# Separate the features (every column except TARGET) from the target column.
X_train, y_train = train_df.drop(columns=[TARGET]), train_df[TARGET]
X_test, y_test = test_df.drop(columns=[TARGET]), test_df[TARGET]

# ================================
# 3. Remove extreme outliers
# ================================
# Bounds are the 1st and 99th percentiles of the training target; only the
# training split is filtered, the test split is left untouched.
q_low, q_high = y_train.quantile(0.01), y_train.quantile(0.99)

# Inclusive on both ends, i.e. q_low <= y <= q_high.
mask = y_train.between(q_low, q_high)

# Apply the same row mask to features and target so they stay aligned.
X_train, y_train = X_train.loc[mask], y_train.loc[mask]

# ================================
# 4. Numeric and categorical columns
# ================================
# Select numeric columns by dtype family rather than by the exact names
# "int64"/"float64". ColumnTransformer drops every column that is not listed
# in one of its transformers (remainder="drop" is the default), so a column
# stored as int32, float32, an unsigned/nullable integer or bool would
# otherwise be silently removed from the model input.
num_features = X_train.select_dtypes(include=["number", "bool"]).columns
cat_features = X_train.select_dtypes(include=["object", "category"]).columns

# ================================
# 5. Preprocessing
# ================================
# - numeric columns are standardised with StandardScaler;
# - categorical columns are one-hot encoded into a dense array
#   (sparse_output=False); categories not seen during fit are encoded as
#   all zeros instead of raising (handle_unknown="ignore").
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_features)
    ]
)

# ================================
# 6. XGBoost regressor
# ================================
# Hyperparameters grouped by role and passed to the estimator in one go.
xgb_params = dict(
    # Objective and reproducibility
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1,
    # Ensemble size and step size
    n_estimators=700,
    learning_rate=0.03,
    # Tree shape and row/column subsampling
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    # L1 / L2 regularisation
    reg_alpha=0.3,
    reg_lambda=1.5,
)
xgb_model = XGBRegressor(**xgb_params)

# ================================
# 7. Final pipeline
# ================================
# Preprocessing and model are chained so that fit/predict apply both steps.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", xgb_model),
])

# ================================
# 8. Train
# ================================
pipeline.fit(X_train, y_train)

# ================================
# 9. Evaluate
# ================================
# Score the fitted pipeline on the held-out test split.
y_pred = pipeline.predict(X_test)

mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# NOTE(review): the recorded run of this cell reports R2 ~ 0.99998, which is
# unusually high; presumably worth verifying that no feature column is
# derived from the target -- TODO confirm.
print("\nRESULTADOS FINALES DEL MODELO")
for label, score in (("MAE :", mae), ("RMSE:", rmse), ("R2  :", r2)):
    print(label, score)

# ================================
# 10. Save the model for production
# ================================
# Persists the whole pipeline (preprocessing + model) to the current
# working directory.
joblib.dump(pipeline, "modelo_xgb_und2a.pkl")

print("\nModelo guardado como modelo_xgb_und2a.pkl ✅")



RESULTADOS FINALES DEL MODELO
MAE : 0.00039840717606497037
RMSE: 0.0012467120226756678
R2  : 0.9999852916113089

Modelo guardado como modelo_xgb_und2a.pkl ✅
