In [1]:
# Make the project root the working directory so that the `src.*` imports in
# the following cell resolve. The move is guarded: re-running this cell must
# not climb one more directory each time (a bare `cd ../` does, and it also
# relies on IPython automagic, which only works in single-line cells).
import os

if not os.path.isdir('src'):
    os.chdir('..')
print(os.getcwd())

c:\Users\eduar\Documents\usp\AMST\ml_timeseries_usp


In [2]:
import os
from src.data.make_dataset import DatasetCreator
from src.data.make_hierarchical_dataset import DatasetHierarchicalAggregator
from src.data.refine_dataset import HierarchicalTimeSeriesOutlierRemover
from src.data.split import hierarchical_train_test_split
from src.features.build_features import FeaturesBuilder
from src.config import load_config
from src.models.tuning import ModelTuning
from src.models.train_model import CreateCanditateModel
from src.evaluation.backtest_evaluation import ModelEvaluate,CompareRecMethods
from src.data.back_to_origin import DatasetReconciliator
import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# can hide deprecation or convergence messages; consider a narrower filter.
warnings.filterwarnings("ignore")
config = load_config()

if config['cache']:
    # Cached path: load the previously saved intermediary frame and the
    # processed hierarchy artifacts instead of rebuilding them.
    print('loading data...')
    df = DatasetCreator(config).load_intermediary()
    Hagg = DatasetHierarchicalAggregator(config, df)
    Y_df = Hagg.load_processed(filename='dataset.parquet')
    S_df = Hagg.load_processed(filename='structure.parquet')
    tags = Hagg.load_tags(filename='tags.joblib')
else:
    # Full path: build the dataset, optionally add exogenous features, then
    # aggregate it into the hierarchical representation.
    df = DatasetCreator(config).run()

    # Accept both a boolean `true` and the string 'true' (any casing): YAML
    # loaders normally parse an unquoted `true` as a bool, in which case a
    # plain `== 'true'` comparison would never enable the feature builder.
    exogen_enabled = str(config['exogen_features']).strip().lower() == 'true'
    if exogen_enabled:
        df = FeaturesBuilder(df, config).run()

    Y_df, S_df, tags = DatasetHierarchicalAggregator(config, df).run()

# Hold out the most recent periods for the backtest.
train, test = hierarchical_train_test_split(Y_df)

2025-12-02 21:16:45,499 - INFO - Config carregado de c:\Users\eduar\Documents\usp\AMST\ml_timeseries_usp\config.yaml
2025-12-02 21:16:46,322 - INFO - Config carregado de c:\Users\eduar\Documents\usp\AMST\ml_timeseries_usp\config.yaml


loading data...
Train: 2022-01-01 → 2025-01-01 (37 meses)
Test (backtest): 2025-02-01 → 2025-07-01 (6 meses)


In [5]:
# --- Settings read from the config ------------------------------------------
models = config['models']
cv_config = config['modeling']['cv_config']
# Needed by every model type, so it is read once here rather than only inside
# the mlforecast branch (the other branch previously hit an undefined or
# stale name).
compare_metrics = config['modeling']['compare_metrics']
methods = config['reconciliation']['methods']
min_trace_methods = config['reconciliation']['min_trace_methods']
mid_level = config['reconciliation']['middle_level']
evaluation_path = config['paths']['evaluation']['evaluation_path']

# Backtest settings shared by every ModelEvaluate call below.
N_MONTHS_TEST = 6
VALIDATION_METRIC = 'rmsse'

# Start from a clean metrics file so rows from earlier runs do not mix with
# this run. NOTE(review): presumably ModelEvaluate appends to this file; confirm.
metrics_summary_path = os.path.join(evaluation_path, 'metrics_summary.csv')
if os.path.exists(metrics_summary_path):
    os.remove(metrics_summary_path)

candidate_info = {}
candidate_performances = {}
for model in models:
    model_cfg = models[model]
    # Skip models that are switched off in the YAML file.
    if not model_cfg['enabled']:
        continue

    type_model = model_cfg['type']
    if type_model == 'mlforecast':
        # Collect the tuning inputs for this model.
        model_name = model_cfg['regressor']
        fixed_params = model_cfg['fixed_params']
        param_space = config['parameter_space'][model]
        mlforecast_params = config['modeling']['mlforecast']
        training_metric = config['modeling']['training_metric']

        # Hyper-parameter search on the training split.
        best_value, best_model_params, best_mlforecast_params, mlf_fit_params = ModelTuning(
            df=train,
            config=config,
            model_name=model_name,
            fixed_params=fixed_params,
            param_space=param_space,
            cv_config=cv_config,
            mlforecast_params=mlforecast_params,
            tuning_metric=training_metric,
        ).run()

        # Build the candidate with the best parameters found by the search.
        candidate, fitted_values, metric, results_metrics = CreateCanditateModel(
            df=train,
            config=config,
            cv_config=cv_config,
            type_model=type_model,
            model_name=model_name,
            metric=best_value,
            cv_metric=training_metric,
            compare_metrics=compare_metrics,
            model_params=best_model_params,
            mlf_params=best_mlforecast_params,
            mlf_fit_params=mlf_fit_params,
        ).run()
    else:
        # No tuning for the other model types.
        # NOTE(review): assumes each model entry is a dict; entries without a
        # 'regressor' key fall back to the config key as the model name.
        # Confirm this is the name CreateCanditateModel expects.
        model_name = model_cfg.get('regressor', model)
        candidate, fitted_values, metric, results_metrics = CreateCanditateModel(
            df=train,
            config=config,
            cv_config=cv_config,
            type_model=type_model,
            compare_metrics=compare_metrics,
            model_name=model_name,
        ).run()

    # NOTE(review): candidates are keyed by their metric value, so two models
    # with an identical metric would overwrite each other; confirm intent.
    candidate_info[metric] = candidate
    candidate_performances[model] = results_metrics

    # Evaluate the base (unreconciled) model on the backtest window.
    prediction, validation_metric, results_metrics = ModelEvaluate(
        candidate_model=candidate,
        model_name=model_name,
        fitted_values=fitted_values,
        train=train,
        test=test,
        n_months_test=N_MONTHS_TEST,
        validation_metric=VALIDATION_METRIC,
        compare_metrics=compare_metrics,
        config=config,
    ).run()

    # Hierarchical reconciliation. The returned list is stored under its own
    # name so the configured `methods` is not overwritten and every model in
    # the loop receives the original configuration.
    df_post_processed, rec_methods = DatasetReconciliator(
        Y_df=Y_df,
        S_df=S_df,
        tags=tags,
        model_name=model_name,
        methods=methods,
        mid_level=mid_level,
        min_trace_methods=min_trace_methods,
        fitted_values=fitted_values,
        prediction=prediction,
    ).run()

    CompareRecMethods(df_post_processed, rec_methods, fitted_values, test, config).save_plot()

    # Evaluate each reconciled forecast with the same backtest settings.
    for method in rec_methods:
        df_rec_method = df_post_processed[['unique_id', 'ds', method]]
        prediction, validation_metric, results_metrics = ModelEvaluate(
            model_name=method,
            fitted_values=fitted_values,
            train=train,
            test=test,
            n_months_test=N_MONTHS_TEST,
            validation_metric=VALIDATION_METRIC,
            compare_metrics=compare_metrics,
            config=config,
            prediction=df_rec_method,
        ).run()
        
        

[I 2025-12-02 21:17:55,583] A new study created in memory with name: no-name-56c19a81-220e-456b-b5a4-c9078abd6e79


{'objective': 'poisson', 'verbosity': -1, 'n_jobs': -1}


[I 2025-12-02 21:17:58,095] Trial 0 finished with value: 0.7283619200727317 and parameters: {'learning_rate': 0.03574712922600244, 'feature_fraction': 0.9852142919229748, 'bagging_fraction': 0.9195981825434215, 'lambda_l1': 0.0006155564318973012, 'lambda_l2': 1.77071686435378e-07, 'num_leaves': 16, 'max_depth': 3, 'min_data_in_leaf': 18, 'n_estimators': 1923}. Best is trial 0 with value: 0.7283619200727317.
[I 2025-12-02 21:18:01,240] Trial 1 finished with value: 0.7806167286366525 and parameters: {'learning_rate': 0.11114989443094977, 'feature_fraction': 0.7061753482887407, 'bagging_fraction': 0.9909729556485982, 'lambda_l1': 0.04566054873446119, 'lambda_l2': 4.997040685255803e-07, 'num_leaves': 18, 'max_depth': 4, 'min_data_in_leaf': 9, 'n_estimators': 1717}. Best is trial 0 with value: 0.7283619200727317.
[I 2025-12-02 21:18:03,420] Trial 2 finished with value: 0.7122282927919427 and parameters: {'learning_rate': 0.04345454109729477, 'feature_fraction': 0.7873687420594125, 'bagging_

plots/time_series_plot_rec.html


In [7]:
import pandas as pd

# Read the summary from the configured evaluation folder (the same location
# the training cell clears before a run) instead of a hard-coded relative path.
metrics_summary_path = os.path.join(
    config['paths']['evaluation']['evaluation_path'], 'metrics_summary.csv'
)
# The first CSV column is the saved row index; loading it as the index avoids
# the stray "Unnamed: 0" column in the displayed table.
pd.read_csv(metrics_summary_path, index_col=0)

Unnamed: 0,model,wrmsse,smape,mase,rmsse,rmse,notes
0,LGBMRegressor,234.863823,0.731466,0.772472,0.304561,0.0,cv_results
1,LGBMRegressor,235.046468,1.071921,1.194544,0.289716,0.0,final_model
2,LGBMRegressor/TopDown_method-forecast_proportions,86.599236,0.61133,0.652742,0.238016,0.0,final_model
3,LGBMRegressor/MiddleOut_middle_level-total/pro...,267.077531,0.8318,0.915773,0.282179,0.0,final_model
4,LGBMRegressor/MinTrace_method-wls_var,503.152315,1.059725,1.178678,0.289347,0.0,final_model
5,LGBMRegressor/MinTrace_method-wls_struct,265.995224,1.335695,1.572913,0.487975,0.0,final_model
6,LGBMRegressor/MinTrace_method-mint_shrink,433.077938,2.029097,2.355311,0.450594,0.0,final_model
