In [95]:
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
import numpy as np

df_occupancy = pd.read_csv('daily_occupancy.csv')
df_occupancy['fecha'] = pd.to_datetime(df_occupancy['fecha'])
df_occupancy = df_occupancy.sort_values('fecha')
#df_occupancy.set_index('fecha', inplace=True)
# Variable objetivo
y = df_occupancy['ocupacion']
# Variables exógenas
# X = df.drop(columns=['ocupacion'])
exog_cols = ['dia_festivo', 'lag_1', 'lag_2', 'lag_4']
X = df_occupancy[exog_cols]

y = y.loc[X.index]

order = (0, 0, 2) # p, d, q
seasonal_order = (1, 0, 1, 7)

modelo_exog = SARIMAX(
y,
exog=X,
order=order,
seasonal_order=seasonal_order,
enforce_stationarity=False,
enforce_invertibility=False
)

modelo_exog_fit = modelo_exog.fit(disp=False)



In [96]:
import holidays
from pandas.tseries.offsets import DateOffset

h = 30
last_date = df_occupancy['fecha'].iloc[-1]

# 1. Rango de fechas futuro
future_dates = pd.date_range(
    start=last_date + DateOffset(days=1),
    periods=h,
    freq='D'
)

last_year  = last_date.year
next_year  = (last_date + DateOffset(days=h)).year
mx_holidays = holidays.Mexico(years=range(last_year, next_year + 1))
future_holidays = future_dates.normalize().isin(mx_holidays).astype(int)

  future_holidays = future_dates.normalize().isin(mx_holidays).astype(int)


In [97]:
exog_future = pd.DataFrame(index=future_dates, columns=exog_cols, dtype=float)
exog_future['dia_festivo'] = future_holidays

In [98]:
X.tail(10)

Unnamed: 0,dia_festivo,lag_1,lag_2,lag_4
385,0,901.0,909.0,1094.0
386,0,839.0,901.0,845.0
387,0,879.0,839.0,909.0
388,0,1253.0,879.0,901.0
389,0,1311.0,1253.0,839.0
390,0,1022.0,1311.0,879.0
391,0,955.0,1022.0,1253.0
392,0,929.0,955.0,1311.0
393,0,762.0,929.0,1022.0
394,0,785.0,762.0,955.0


In [99]:
y_history = y.copy()
preds, lag_trace = [], []

for idx in future_dates:
    lag_1 = y_history.iloc[-1]
    lag_2 = y_history.iloc[-2] if len(y_history) >= 2 else lag_1
    lag_4 = y_history.iloc[-4] if len(y_history) >= 4 else lag_1

    row_now = {
        "dia_festivo": int(idx.normalize() in mx_holidays),
        "lag_1": lag_1,
        "lag_2": lag_2,
        "lag_4": lag_4,
    }
    row_df = pd.DataFrame(row_now, index=[idx])

    pred = modelo_exog_fit.forecast(steps=1, exog=row_df).iloc[0]
    preds.append(np.floor(pred))
    lag_trace.append({**row_now, "pred": pred, "fecha": idx})

    y_history.loc[idx] = pred


y_pred_30 = pd.DataFrame(
    {"fecha": future_dates, "ocupacion_pred": preds}
).set_index("fecha") 

y_pred_30.head(30)


Unnamed: 0_level_0,ocupacion_pred
fecha,Unnamed: 1_level_1
2020-03-14,1202.0
2020-03-15,1204.0
2020-03-16,1098.0
2020-03-17,1119.0
2020-03-18,1167.0
2020-03-19,1212.0
2020-03-20,1239.0
2020-03-21,1259.0
2020-03-22,1280.0
2020-03-23,1301.0


In [100]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from lightgbm import LGBMRegressor


df_demand = pd.read_csv('daily_demand.csv')

df_demand['dia_festivo'].astype('category')
df_demand['dia_semana'].astype('category')
df_demand.drop('monto_total', axis=1, inplace=True)

stats = []
for pid, g in df_demand.groupby('platillo_id'):
    s = g.sort_values('fecha')['cantidad']
    stats.append({
        'platillo_id':    pid,
        'mean_sales':     s.mean(),
        'std_sales':      s.std(),
        'var_sales':      s.var(),
        'cv_sales':       s.std()/s.mean() if s.mean() else 0,
        'spike_count':    (s > s.mean() + 2*s.std()).sum(),
        'zero_days_ratio':(s == 0).mean(),
    })
stats_df = pd.DataFrame(stats)

num_cols = ['mean_sales','std_sales','var_sales','cv_sales','spike_count','zero_days_ratio']
scaler = StandardScaler().fit(stats_df[num_cols].fillna(0))
X_train_stats = scaler.transform(stats_df[num_cols].fillna(0))

pca_dummy = PCA().fit(X_train_stats)
cum_var = np.cumsum(pca_dummy.explained_variance_ratio_)
k = np.searchsorted(cum_var, 0.95) + 1
pca = PCA(n_components=k, random_state=42).fit(X_train_stats)
embedddings = pca.transform(X_train_stats)

emb_df = pd.DataFrame(
    embedddings,
    columns=[f'pca_emb_{i}' for i in range(k)]
)
emb_df['platillo_id'] = stats_df['platillo_id']

df_demand = df_demand.merge(emb_df, on='platillo_id')

feature_cols = ['platillo_id','lag_1','lag_7','ocupacion','dia_semana','dia_festivo'] + [f'pca_emb_{i}' for i in range(k)]

X_demand = df_demand[feature_cols]
y_demand = df_demand['cantidad']

In [101]:
params = {'objective':'tweedie','metric':'rmse','verbosity':-1, "bagging_fraction": 0.8,
            "feature_fraction": 0.8,
            "lambda_l1": 1,
            "lambda_l2": 0,
            "learning_rate": 0.05,
            "max_depth": -1,
            "num_leaves": 31,
            }

model = LGBMRegressor(**params)
model.fit(
    X_demand,
    y_demand,
    eval_metric='rmse'
)

In [102]:
df_demand.tail(7)

Unnamed: 0,fecha,cantidad,platillo_id,ocupacion,dia_festivo,dia_semana,lag_1,lag_7,pca_emb_0,pca_emb_1,pca_emb_2
36728,2020-03-07,0,715,1311,0,5,0.0,0.0,-1.774596,4.235547,-1.745116
36729,2020-03-08,2,715,1022,0,6,0.0,0.0,-1.774596,4.235547,-1.745116
36730,2020-03-09,18,715,955,0,0,2.0,0.0,-1.774596,4.235547,-1.745116
36731,2020-03-10,2,715,929,0,1,18.0,0.0,-1.774596,4.235547,-1.745116
36732,2020-03-11,10,715,762,0,2,2.0,0.0,-1.774596,4.235547,-1.745116
36733,2020-03-12,0,715,785,0,3,10.0,0.0,-1.774596,4.235547,-1.745116
36734,2020-03-13,0,715,1124,0,4,0.0,0.0,-1.774596,4.235547,-1.745116


In [103]:
import pandas as pd
from collections import deque
import holidays


platillos      = df_demand["platillo_id"].unique()
emb_cols       = [f"pca_emb_{i}" for i in range(k)]
emb_lookup     = emb_df.set_index("platillo_id")[emb_cols]

buffers = {}
for pid, g in df_demand.sort_values("fecha").groupby("platillo_id"):
    last7 = g["cantidad"].tail(7).tolist()
    buffers[pid] = deque(last7, maxlen=7)

mx_holidays = holidays.Mexico(
    years=range(future_dates[0].year, future_dates[-1].year + 1)

)

pred_rows = []


for fecha in future_dates:                          
    festivo     = int(fecha.normalize() in mx_holidays)
    dia_semana  = fecha.weekday()
    ocupacion_d = y_pred_30.loc[fecha]              

    for pid in platillos:                           
        dq = buffers[pid]
        lag_1 = dq[-1]
        lag_7 = dq[0]

        row_dict = {
            "platillo_id": pid,
            "lag_1":       lag_1,
            "lag_7":       lag_7,
            "ocupacion":   ocupacion_d,
            "dia_semana":  dia_semana,
            "dia_festivo": festivo,
            **emb_lookup.loc[pid].to_dict(),
        }

        y_hat = model.predict(pd.DataFrame(row_dict, index=[0]))[0]
        pred_rows.append({"fecha": fecha, "platillo_id": pid, "cantidad_pred": np.floor(y_hat)})

        dq.append(y_hat)

df_pred_demand = (
    pd.DataFrame(pred_rows)
      .sort_values(["platillo_id", "fecha"]) 
      .reset_index(drop=True)
)
mae_df = pd.read_csv("mae_per_plt.csv")       
df_pred_demand = (
    df_pred_demand
      .merge(mae_df, on="platillo_id", how="left")      
      .assign(
          lower=lambda d: (d["cantidad_pred"] - d["mae_per_plt"]).clip(lower=0),
          upper=lambda d: d["cantidad_pred"] + d["mae_per_plt"]
      )
      .drop(columns="mae_per_plt")
        .set_index("fecha")             
)

print(df_pred_demand.head())


deque([14, 25, 16, 15, 21, 26, 28], maxlen=7)
deque([20, 16, 3, 7, 8, 5, 18], maxlen=7)
deque([2, 2, 6, 10, 2, 6, 4], maxlen=7)
deque([26, 10, 2, 8, 0, 8, 4], maxlen=7)
deque([10, 6, 8, 4, 16, 12, 12], maxlen=7)
deque([22, 12, 8, 8, 12, 16, 8], maxlen=7)
deque([2, 0, 0, 2, 6, 2, 0], maxlen=7)
deque([38, 18, 26, 22, 19, 16, 45], maxlen=7)
deque([0, 0, 0, 0, 0, 0, 0], maxlen=7)
deque([0, 8, 2, 0, 0, 0, 0], maxlen=7)
deque([0, 8, 2, 0, 0, 0, 1], maxlen=7)
deque([14, 4, 0, 0, 0, 0, 4], maxlen=7)
deque([11, 20, 35, 10, 21, 41, 27], maxlen=7)
deque([18, 21, 16, 2, 4, 6, 6], maxlen=7)
deque([8, 2, 4, 10, 10, 6, 12], maxlen=7)
deque([15, 16, 4, 10, 7, 8, 20], maxlen=7)
deque([8, 10, 0, 4, 4, 10, 14], maxlen=7)
deque([10, 14, 4, 2, 0, 7, 24], maxlen=7)
deque([1, 0, 0, 5, 6, 8, 21], maxlen=7)
deque([8, 2, 4, 4, 4, 2, 8], maxlen=7)
deque([8, 4, 2, 4, 2, 6, 22], maxlen=7)
deque([4, 12, 3, 2, 0, 0, 16], maxlen=7)
deque([14, 2, 0, 0, 2, 0, 15], maxlen=7)
deque([8, 6, 6, 6, 8, 12, 15], maxlen=7)
dequ

In [104]:
df_pred_demand.to_csv('predicted_demand.csv', index=True)
y_pred_30.to_csv('predicted_occupancy.csv', index=True)