# Checking Baseline with AutoML

In [1]:
# Ask the inline matplotlib backend for high-resolution ("retina") figures.
%config InlineBackend.figure_format='retina'
import logging
from ekorpkit import eKonf

# Show INFO-level log records (the ekorpkit pipeline messages) in cell outputs.
logging.basicConfig(level=logging.INFO)
# Record which ekorpkit build produced the outputs in this notebook.
print(eKonf.__version__)

0.1.31+10.g751d414.dirty


In [2]:
# Folder holding the FOMC data files used by the cells below.
data_dir = "../data/fomc"

# Dataframe I/O helpers created from ekorpkit's `_func_` config group, both
# pointed at `data_dir`. Note the differing keyword names: the saver takes
# `output_dir`, the loader takes `data_dir`.
# NOTE(review): presumably eKonf.partial returns callables with these keyword
# arguments already bound -- confirm against the ekorpkit documentation.
save_dataframe = eKonf.partial(
    config_group="_func_/save_dataframe",
    output_dir=data_dir,
)
load_dataframe = eKonf.partial(
    config_group="_func_/load_dataframe",
    data_dir=data_dir,
)

## Build and load feature set

In [3]:
# Input (X) columns of the small feature set; the label column is set below.
x_columns = [
    "prev_decision",
    "GDP_diff_prev",
    "PMI",
    "EMP_diff_prev",
    "RSALES_diff_year",
    "UNEMP_diff_prev",
    "HSALES_diff_year",
    "Inertia_diff",
    "Balanced_diff",
]

# Start from ekorpkit's feature-dataset config and override what this
# notebook needs.
cfg = eKonf.compose(config_group="dataset=feature")

# Dataset identity and the source parquet file it is built from.
cfg.name = "fomc_features_small"
cfg.data_dir = data_dir
cfg.data_file = "econ_train_small.parquet"
# Request a rebuild; the log below shows the load/reset_index/split_sampling
# pipeline running and the split files being written again.
cfg.force_rebuild = True

# Column roles: the reset_index pipe names the former index column "date".
cfg.pipeline.reset_index.index_column_name = "date"
cfg.column_info.columns.index = "date"
cfg.column_info.columns.id = "index"
cfg.column_info.columns.x = x_columns
cfg.column_info.columns.y = "target"

cfg.verbose = False

# Build the dataset object, then persist it.
# NOTE(review): presumably persist() is what writes the train/test parquet
# files seen in the log -- confirm against ekorpkit.
fomc_features_small = eKonf.instantiate(cfg)
fomc_features_small.persist()

INFO:ekorpkit.datasets.feature:Loaded info file: ../data/fomc/fomc_features_small/info-fomc_features_small.yaml
INFO:ekorpkit.pipelines.pipe:Applying pipeline: OrderedDict([('load_dataframe', 'load_dataframe'), ('reset_index', 'reset_index'), ('split_sampling', 'split_sampling')])
INFO:ekorpkit.ekonf:Applying pipe: functools.partial(<function load_dataframe at 0x7fb69009b670>)
INFO:ekorpkit.io.file:Processing [1] files from [['econ_train_small.parquet']]
INFO:ekorpkit.pipelines.pipe:Loading 1 dataframes from ['../data/fomc/econ_train_small.parquet']
INFO:ekorpkit.io.file:Loading data from ../data/fomc/econ_train_small.parquet
INFO:ekorpkit.ekonf:Applying pipe: functools.partial(<function reset_index at 0x7fb743ec6430>)
INFO:ekorpkit.ekonf:Applying pipe: functools.partial(<function split_sampling at 0x7fb743ec6ca0>)
INFO:ekorpkit.io.file:Saving dataframe as ../data/fomc/fomc_features_small/fomc_features_small-train.parquet
INFO:ekorpkit.io.file:Saving dataframe as ../data/fomc/fomc_feat

In [4]:
# Compose a fresh config with only the dataset name and data location set
# (no force_rebuild, no column overrides) and instantiate it again. The log
# below shows the saved info file and the train/test parquet files being
# loaded rather than the build pipeline running.
# Note: this rebinds both `cfg` and `fomc_features_small` from the previous cell.
cfg = eKonf.compose(config_group="dataset=feature")
cfg.name = "fomc_features_small"
cfg.data_dir = data_dir
cfg.data_file = "econ_train_small.parquet"
fomc_features_small = eKonf.instantiate(cfg)

INFO:ekorpkit.datasets.feature:Loaded info file: ../data/fomc/fomc_features_small/info-fomc_features_small.yaml
INFO:ekorpkit.io.file:Loading data from ../data/fomc/fomc_features_small/fomc_features_small-train.parquet
INFO:ekorpkit.info.feature:Added a column [split] with value [train]
INFO:ekorpkit.io.file:Loading data from ../data/fomc/fomc_features_small/fomc_features_small-test.parquet
INFO:ekorpkit.info.feature:Added a column [split] with value [test]


In [5]:
# Display the dataset's info record: per-split files and sizes, plus the
# column roles (index / id / x / y / split).
fomc_features_small.INFO

{'splits': {'train': {'data_file': 'fomc_features_small-train.parquet',
   'name': 'train',
   'num_examples': 332,
   'num_bytes': 49800,
   'human_bytes': '48.63 KiB'},
  'test': {'data_file': 'fomc_features_small-test.parquet',
   'name': 'test',
   'num_examples': 83,
   'num_bytes': 12367,
   'human_bytes': '12.08 KiB'}},
 'num_examples': 415,
 'size_in_bytes': 62167,
 'size_in_human_bytes': '60.71 KiB',
 'data_files': {'train': 'fomc_features_small-train.parquet',
  'test': 'fomc_features_small-test.parquet'},
 'meta_files': {},
 'data_files_modified': '2022-06-04 08:41:26',
 'info_updated': '2022-06-04 08:41:26',
 'column_info': {'keys': {'index': 'date',
   'id': 'id',
   'x': 'x',
   'y': 'y',
   'split': 'split'},
  'columns': {'index': 'date',
   'id': ['date', 'split'],
   'x': ['prev_decision',
    'GDP_diff_prev',
    'PMI',
    'EMP_diff_prev',
    'RSALES_diff_year',
    'UNEMP_diff_prev',
    'HSALES_diff_year',
    'Inertia_diff',
    'Balanced_diff'],
   'y': 'target

In [9]:
# Training labels as exposed by the dataset object.
y_train = fomc_features_small.y_train
# Name of the label column (printed as "target" below).
print(fomc_features_small.FEATURE.Y)
# Show only the first five labels rather than the whole array.
y_train[:5]

target


array([0, 0, 1, 0, 1])

In [10]:
# Training feature matrix as exposed by the dataset object.
X_train = fomc_features_small.X_train
# Names of the feature columns (nine names; each row below has nine values --
# presumably in the same order, confirm against ekorpkit).
print(fomc_features_small.FEATURE.X)
# Show only the first five rows rather than the whole matrix.
X_train[:5]

['prev_decision', 'GDP_diff_prev', 'PMI', 'EMP_diff_prev', 'RSALES_diff_year', 'UNEMP_diff_prev', 'HSALES_diff_year', 'Inertia_diff', 'Balanced_diff']


array([[  0.        ,   7.54753464,  57.7       ,   0.23404719,
          3.05527559,  -2.89855072,  22.62247839,   0.        ,
          0.        ],
       [  0.        ,   0.8437117 ,  51.9       ,   0.05891895,
          0.19675412,  -2.17391304, -21.9604147 ,   0.        ,
          0.        ],
       [  1.        ,   0.36444736,  54.7       ,   0.22189124,
          4.21392244,  -5.        ,  -8.38779956,   0.        ,
          0.        ],
       [ -1.        ,   0.74332734,  51.2       ,   0.12473506,
          2.09425599,  -4.28571429,  13.64341085,   0.        ,
          0.        ],
       [  0.        ,   1.28412891,  56.1       ,   0.2218919 ,
          3.23995931,  -3.27868852,  16.62531017,   0.        ,
          0.        ]])

## Auto ML