In [1]:
#Importa librerias
import pandas as pd
from statsforecast import StatsForecast
import os

  from tqdm.autonotebook import tqdm


In [2]:
#Leer los datos
df = pd.read_csv(os.path.join("refined", "top_aerolinea.csv"))

In [3]:
#Convierte el campo fecha a tipo datetime
df["Fecha"] = pd.to_datetime(df["Fecha"])
df.dtypes

Nombre_Empresa_clean            object
Fecha                   datetime64[ns]
pasajeros                        int64
dtype: object

In [4]:
#Ordena el dataframe por fecha
df = df.sort_values("Fecha")

In [5]:
#Se filtran las empresas que NO tengan datos hasta el 2022
ind_empresas = df.loc[df.Fecha.dt.year == 2022, "Nombre_Empresa_clean"].unique()
df = df[df.Nombre_Empresa_clean.isin(ind_empresas)].copy()

In [6]:
# Se filtran solo los ultimos 10 años 
ind_year = df.Fecha.dt.year >= 2013
df = df[ind_year].copy()

In [7]:
#Se toman las top 10 empresas con mayor trafico aereo
top_10 = list(df.groupby("Nombre_Empresa_clean")["pasajeros"].sum().sort_values(ascending=False)[:10].index)
ind = df.Nombre_Empresa_clean.isin(top_10)
df10= df[ind].copy()

In [8]:
#Se crea un nuevo dataframe para el entrenamiento
Y_df = pd.DataFrame()
Y_df["ds"] = pd.to_datetime(df10["Fecha"])
Y_df["unique_id"] = df10["Nombre_Empresa_clean"]
Y_df["y"] = df10["pasajeros"]

In [9]:
#Se visualizan las series de tiempo
StatsForecast.plot(Y_df)

In [10]:
#Se importan diferentes modelos de series de tiempo
from statsforecast.models import (
    AutoARIMA,
    HoltWinters,
    CrostonClassic as Croston, 
    HistoricAverage,
    DynamicOptimizedTheta as DOT,
    SeasonalNaive
)

models = [
    AutoARIMA(season_length=12),
    HoltWinters(),
    Croston(),
    SeasonalNaive(season_length=12),
    HistoricAverage(),
    DOT(season_length=12)
]

In [11]:
# Se instancian los modelos
sf = StatsForecast(
    df=Y_df, 
    models=models,
    freq='M', 
    n_jobs=1,
    fallback_model = SeasonalNaive(season_length=12)
)

In [21]:
#Se genera el forecast / entrenan los modelos
forecasts_df = sf.forecast(h=12, level=[95])
forecasts_df

Unnamed: 0_level_0,ds,AutoARIMA,AutoARIMA-lo-95,AutoARIMA-hi-95,HoltWinters,HoltWinters-lo-95,HoltWinters-hi-95,CrostonClassic,SeasonalNaive,SeasonalNaive-lo-95,SeasonalNaive-hi-95,HistoricAverage,HistoricAverage-lo-95,HistoricAverage-hi-95,DynamicOptimizedTheta,DynamicOptimizedTheta-lo-95,DynamicOptimizedTheta-hi-95
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
AEROGAL,2022-12-31,46821.562500,9202.462891,84440.664062,30891.0,-56149.304688,1.179313e+05,39906.039062,30891.0,-56149.304688,1.179313e+05,62721.191406,-7785.442383,133227.828125,47797.339844,15440.447266,8.633378e+04
AEROGAL,2023-01-31,45642.824219,-5694.137207,96979.781250,29794.0,-57246.304688,1.168343e+05,39906.039062,29794.0,-57246.304688,1.168343e+05,62721.191406,-7785.442383,133227.828125,47797.339844,-7235.407227,9.935030e+04
AEROGAL,2023-02-28,44964.824219,-12972.956055,102902.601562,37840.0,-49200.304688,1.248803e+05,39906.039062,37840.0,-49200.304688,1.248803e+05,62721.191406,-7785.442383,133227.828125,47797.339844,-24414.248047,1.117612e+05
AEROGAL,2023-03-31,44673.289062,-17134.396484,106480.976562,35354.0,-51686.304688,1.223943e+05,39906.039062,35354.0,-51686.304688,1.223943e+05,62721.191406,-7785.442383,133227.828125,47797.339844,-27099.808594,1.222141e+05
AEROGAL,2023-04-30,44575.941406,-20097.908203,109249.789062,44152.0,-42888.304688,1.311923e+05,39906.039062,44152.0,-42888.304688,1.311923e+05,62721.191406,-7785.442383,133227.828125,47797.339844,-41855.851562,1.320748e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VIVA AIR,2023-07-31,585619.812500,204647.656250,966591.937500,651658.0,194256.718750,1.109059e+06,592246.437500,651658.0,194256.718750,1.109059e+06,364677.406250,-64214.449219,793569.250000,569123.562500,-108020.718750,1.212792e+06
VIVA AIR,2023-08-31,585584.000000,201785.359375,969382.625000,595739.0,138337.718750,1.053140e+06,592246.437500,595739.0,138337.718750,1.053140e+06,364677.406250,-64214.449219,793569.250000,599939.437500,-124700.109375,1.303923e+06
VIVA AIR,2023-09-30,585570.312500,198982.671875,972157.937500,633899.0,176497.718750,1.091300e+06,592246.437500,633899.0,176497.718750,1.091300e+06,364677.406250,-64214.449219,793569.250000,542284.500000,-131149.531250,1.199388e+06
VIVA AIR,2023-10-31,585565.062500,196214.781250,974915.312500,609495.0,152093.718750,1.066896e+06,592246.437500,609495.0,152093.718750,1.066896e+06,364677.406250,-64214.449219,793569.250000,598869.000000,-187385.281250,1.269678e+06


In [13]:
#Se plotean las series de tiempo con un forecast de cada uno de los modelos propuestos
sf.plot(Y_df,forecasts_df)

In [14]:
#Se realiza crossvalidation para evaluar el desempeño de los modelos
crossvaldation_df = sf.cross_validation(
    df=Y_df,
    h=12,
    step_size=12,
    n_windows=5
  )

In [15]:
#crossvaldation_df.head()

In [16]:
#Se define una funcion para la evaluacion de los modelos
from datasetsforecast.losses import mse, mae, rmse,mape

def evaluate_cross_validation(df, metric):
    models = df.drop(columns=['ds', 'cutoff', 'y']).columns.tolist()
    evals = []
    for model in models:
        eval_ = df.groupby(['unique_id', 'cutoff']).apply(lambda x: metric(x['y'].values, x[model].values)).to_frame() # Calculate loss for every unique_id, model and cutoff.
        eval_.columns = [model]
        evals.append(eval_)
    evals = pd.concat(evals, axis=1)
    evals = evals.groupby(['unique_id']).mean(numeric_only=True) # Averages the error metrics for all cutoffs for every combination of model and unique_id
    evals['best_model'] = evals.idxmin(axis=1)
    return evals

In [22]:
#Se imprime el desempeño de los disintos modelos
evaluation_df = evaluate_cross_validation(crossvaldation_df, mape)
evaluation_df

Unnamed: 0_level_0,AutoARIMA,HoltWinters,CrostonClassic,SeasonalNaive,HistoricAverage,DynamicOptimizedTheta,best_model
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AEROGAL,71.288271,160.934871,120.648552,160.934871,171.388528,62.512193,DynamicOptimizedTheta
AEROREPUBLICA,23.027534,56.774101,13.074549,56.774101,30.418438,37.910457,CrostonClassic
AMERICAN,18.73974,44.781168,26.209886,44.781168,46.39017,33.316885,AutoARIMA
AVIANCA,23.230197,54.884807,17.724168,54.884807,24.175236,26.870928,CrostonClassic
COPA,77.584284,47.413006,46.089025,47.413006,80.355185,29.552886,DynamicOptimizedTheta
EASYFLY S.A,11.987834,53.153604,9.515591,53.153604,31.113854,29.935896,CrostonClassic
LATAM,16.564485,42.747921,19.338194,42.747921,30.182965,27.547769,AutoARIMA
SATENA,19.959158,62.345549,22.281215,62.345549,17.899894,43.773457,HistoricAverage
SPIRIT AIRLINES,23.810297,45.057934,22.385416,45.057934,56.64387,56.163021,CrostonClassic
VIVA AIR,15.914517,41.856524,19.699499,41.856524,46.298268,36.12387,AutoARIMA


In [18]:
#Se define una variable para tomar el mejor modelo para cada serie de tiempo
def get_best_model_forecast(forecasts_df, evaluation_df):
    df = forecasts_df.set_index('ds', append=True).stack().to_frame().reset_index(level=2) # Wide to long 
    df.columns = ['model', 'best_model_forecast'] 
    df = df.join(evaluation_df[['best_model']])
    df = df.query('model.str.replace("-lo-90|-hi-90", "", regex=True) == best_model').copy()
    df.loc[:, 'model'] = [model.replace(bm, 'best_model') for model, bm in zip(df['model'], df['best_model'])]
    df = df.drop(columns='best_model').set_index('model', append=True).unstack()
    df.columns = df.columns.droplevel()
    df = df.reset_index(level=1)
    return df

In [19]:
#Se genera inferencia de cada serie de tiempo con su mejor modelo correspondiente
prod_forecasts_df = get_best_model_forecast(forecasts_df, evaluation_df)
prod_forecasts_df.head()

model,ds,best_model
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
AEROGAL,2022-12-31,47797.339844
AEROGAL,2023-01-31,47797.339844
AEROGAL,2023-02-28,47797.339844
AEROGAL,2023-03-31,47797.339844
AEROGAL,2023-04-30,47797.339844


In [20]:
#Se plotea cada serie de tiempo con su inferencia correspondiente
sf.plot(Y_df, prod_forecasts_df, level=[95])