In [25]:
from src.forecast.model_xgboost import entrenar_modelo_por_segmento

# AMEX
resultados_amex = entrenar_modelo_por_segmento(
    df=df,
    cliente_alias=["AMEX"],
    idiomas=df["idioma"].unique()
)

# AIRBNB + COLUMBUS
resultados_otros = entrenar_modelo_por_segmento(
    df=df,
    cliente_alias=["AIRBNB", "COLUMBUS"],
    idiomas=df["idioma"].unique()
)

✅ Modelo xgb_AMEX_DE.pkl | MAE: 13.82 | RMSE: 17.99 | MAPE: 133.29% | Corte: 2025-03-31
✅ Modelo xgb_AMEX_EN.pkl | MAE: 44.26 | RMSE: 60.36 | MAPE: 111.26% | Corte: 2025-03-31
✅ Modelo xgb_AMEX_ES.pkl | MAE: 7.82 | RMSE: 9.30 | MAPE: 33.79% | Corte: 2025-03-31
✅ Modelo xgb_AMEX_FR.pkl | MAE: 24.37 | RMSE: 43.87 | MAPE: 32.70% | Corte: 2025-03-31
✅ Modelo xgb_AMEX_IT.pkl | MAE: 24.45 | RMSE: 31.57 | MAPE: 45.25% | Corte: 2025-03-31
✅ Modelo xgb_AMEX_NL.pkl | MAE: 12.49 | RMSE: 16.74 | MAPE: 50.49% | Corte: 2025-03-31
⚠️ Sin datos para cliente ['AMEX'] e idioma PT
✅ Modelo xgb_AMEX_FI.pkl | MAE: 3.12 | RMSE: 3.64 | MAPE: 127.91% | Corte: 2025-03-30
✅ Modelo xgb_AMEX_NO.pkl | MAE: 2.48 | RMSE: 3.02 | MAPE: 80.56% | Corte: 2025-03-31
⚠️ No hay suficientes datos para ['AMEX'] - Other
⚠️ No hay suficientes datos para ['AMEX'] - PO
✅ Modelo xgb_AMEX_SE.pkl | MAE: 12.72 | RMSE: 15.87 | MAPE: 97.06% | Corte: 2025-03-31
⚠️ Sin datos para cliente ['AMEX'] e idioma DK
✅ Modelo xgb_AIRBNB_COLUMBUS_

In [31]:
resultados_amex = entrenar_modelo_por_segmento(
    df=df,
    cliente_alias=["AMEX"],
    idiomas=["EN"]  # o todos los que quieras
)


✅ Modelo xgb_AMEX_EN.pkl | MAE: 44.26 | RMSE: 60.36 | MAPE: 111.26% | Corte: 2025-03-31


In [32]:
from pathlib import Path

models_path = Path().resolve().parent / "models"  # ← ahora sí debe estar bien
print("🧾 Modelos encontrados:")
for f in models_path.glob("*.pkl"):
    print("✅", f.name)


🧾 Modelos encontrados:


In [33]:
from pathlib import Path
import joblib

modelo_path = Path().resolve() / "models" / "xgb_AMEX_EN.pkl"
modelo = joblib.load(modelo_path)


In [34]:
from pathlib import Path

models_path = Path().resolve() / "models"
print("📦 Modelos disponibles:")
for f in models_path.glob("*.pkl"):
    print("✅", f.name)


📦 Modelos disponibles:
✅ xgb_AMEX_EN.pkl


In [27]:
import pandas as pd
from pathlib import Path

# Unir métricas
df_metricas = pd.concat([resultados_amex, resultados_otros], ignore_index=True)

# Guardar
root_dir = Path().resolve().parent
metricas_path = root_dir / "data" / "processed" / "metricas_modelo.csv"
metricas_path.parent.mkdir(parents=True, exist_ok=True)
df_metricas.to_csv(metricas_path, index=False)

print(f"✅ Métricas guardadas en {metricas_path}")


✅ Métricas guardadas en C:\Users\gcasc\proyectos\proyecto_final\data\processed\metricas_modelo.csv


In [49]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt

# Leer dataset completo
df_total = pd.read_csv("../data/processed/llamadas_diarias.csv", parse_dates=["date"])
df_total = df_total[df_total["cliente"].isin(["COLUMBUS", "AIRBNB"])]
df_total = df_total[df_total["date"].dt.dayofweek < 5]  # Solo días hábiles

# Idiomas disponibles
idiomas = df_total["idioma"].unique()

# Lista para métricas por idioma
metricas = []

# Loop por idioma
for idioma in idiomas:
    df_idioma = df_total[df_total["idioma"] == idioma].copy()
    
    # Crear serie temporal
    serie = df_idioma.set_index("date")["y"].resample("D").sum().fillna(0)
    
    # Suavizado simple (media móvil 3 días centrada)
    serie_suavizada = serie.rolling(window=3, center=True).mean().dropna()

    # 1. Dataset base
    serie_ultimos_3m = serie_suavizada[serie_suavizada.index >= '2025-01-01']
    df_features = serie_ultimos_3m.reset_index()
    df_features.columns = ['date', 'y']
    df_features['dayofweek'] = df_features['date'].dt.dayofweek
    df_features['is_month_end'] = df_features['date'].dt.is_month_end.astype(int)

    # 2. Crear lags
    for lag in [1, 2, 3, 4, 5]:
        df_features[f'lag_{lag}'] = df_features['y'].shift(lag)

    df_features.dropna(inplace=True)
    df_features.set_index('date', inplace=True)

    # 3. Split
    X = df_features.drop(columns='y')
    y = df_features['y']
    X_train = X[X.index <= '2025-03-31']
    y_train = y[y.index <= '2025-03-31']
    X_test = X[(X.index >= '2025-04-01') & (X.index <= '2025-04-30')]
    y_test = y[(y.index >= '2025-04-01') & (y.index <= '2025-04-30')]

    if len(X_test) == 0 or len(X_train) == 0:
        continue

    # 4. Limpieza de outlier específico
    fecha_outlier = pd.Timestamp('2025-04-21')
    if fecha_outlier in y_test.index:
        entorno = y_test.loc['2025-04-16':'2025-04-25'].drop(index=fecha_outlier)
        y_test.loc[fecha_outlier] = entorno.mean()

    # 5. Modelo
    model_xgb = XGBRegressor(n_estimators=50, learning_rate=0.1, random_state=42)
    model_xgb.fit(X_train, y_train)

    # 6. Predicción y evaluación
    pred_xgb = pd.Series(model_xgb.predict(X_test), index=X_test.index)
    mae = mean_absolute_error(y_test, pred_xgb)
    rmse = np.sqrt(mean_squared_error(y_test, pred_xgb))
    mape = np.mean(np.abs((y_test - pred_xgb) / y_test.replace(0, np.nan))) * 100

    # 7. IC 95%
    residuos = y_test - pred_xgb
    std_error = residuos.std()
    ci_upper = pred_xgb + 1.96 * std_error
    ci_lower = np.clip(pred_xgb - 1.96 * std_error, a_min=0, a_max=None)

    # 8. Guardar métricas
    metricas.append({
        "idioma": idioma,
        "MAE": round(mae, 2),
        "RMSE": round(rmse, 2),
        "MAPE (%)": round(mape, 2)
    })

# Mostrar tabla final
df_metricas = pd.DataFrame(metricas).sort_values(by="MAPE (%)", ascending=False)

print(df_metricas)


   idioma   MAE  RMSE  MAPE (%)
8      NO  0.11  0.16     62.95
9      PO  0.02  0.06     62.16
7      DK  0.40  0.53     47.63
10     SE  0.29  0.37     46.51
4      IT  2.62  3.14     44.28
6      PT  0.37  0.50     38.64
5      NL  1.12  1.38     30.60
2      ES  2.15  2.61     27.01
1      EN  1.53  2.02     26.00
0      DE  1.59  2.10     25.12
3      FR  3.68  5.63     14.57


In [50]:
import os

# Crear carpeta si no existe
output_dir = "../data/processed/"
os.makedirs(output_dir, exist_ok=True)

# Loop nuevamente para guardar predicciones + IC por idioma
for idioma in idiomas:
    df_idioma = df_total[df_total["idioma"] == idioma].copy()
    
    # Serie suavizada
    serie = df_idioma.set_index("date")["y"].resample("D").sum().fillna(0)
    serie_suavizada = serie.rolling(window=3, center=True).mean().dropna()
    serie_ultimos_3m = serie_suavizada[serie_suavizada.index >= '2025-01-01']
    
    df_features = serie_ultimos_3m.reset_index()
    df_features.columns = ['date', 'y']
    df_features['dayofweek'] = df_features['date'].dt.dayofweek
    df_features['is_month_end'] = df_features['date'].dt.is_month_end.astype(int)

    for lag in [1, 2, 3, 4, 5]:
        df_features[f'lag_{lag}'] = df_features['y'].shift(lag)

    df_features.dropna(inplace=True)
    df_features.set_index('date', inplace=True)

    X = df_features.drop(columns='y')
    y = df_features['y']
    X_train = X[X.index <= '2025-03-31']
    y_train = y[y.index <= '2025-03-31']
    X_test = X[(X.index >= '2025-04-01') & (X.index <= '2025-04-30')]
    y_test = y[(y.index >= '2025-04-01') & (y.index <= '2025-04-30')]

    if len(X_test) == 0 or len(X_train) == 0:
        continue

    # Outlier
    fecha_outlier = pd.Timestamp('2025-04-21')
    if fecha_outlier in y_test.index:
        entorno = y_test.loc['2025-04-16':'2025-04-25'].drop(index=fecha_outlier)
        y_test.loc[fecha_outlier] = entorno.mean()

    # Modelo
    model_xgb = XGBRegressor(n_estimators=50, learning_rate=0.1, random_state=42)
    model_xgb.fit(X_train, y_train)

    # Predicción + IC
    pred_xgb = pd.Series(model_xgb.predict(X_test), index=X_test.index)
    residuos = y_test - pred_xgb
    std_error = residuos.std()
    ci_upper = pred_xgb + 1.96 * std_error
    ci_lower = np.clip(pred_xgb - 1.96 * std_error, a_min=0, a_max=None)

    # Guardar predicciones
    df_out = pd.DataFrame({
        "date": X_test.index,
        "cliente": "COLUMBUS+AIRBNB",
        "idioma": idioma,
        "real": y_test.values,
        "pred": pred_xgb.values,
        "ic_95_inf": ci_lower,
        "ic_95_sup": ci_upper
    })

    df_out.to_csv(os.path.join(output_dir, f"pred_columbus_airbnb_{idioma}.csv"), index=False)
