In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp augmentation

# Data augmentation

> Functions to augment the user's dataset with information from official sources.

In [None]:
#hide
from nbdev.showdoc import *

## Data augmentation as a transformer

The package `scikit-learn` makes use of the concept of transformer, an object in python that adapts the transformation at hand to the training data, and then deploys it on the testing data. These transformers can be used as part of a pipeline.

In order to ensure maximum compatibility with widely used packages, `gingado`'s data augmentation functions are implemented as objects that are functional on their own, but that also work as transformers that can be part of pipelines.

In [None]:
#export
import pandas as pd
import pandasdmx as sdmx
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_selection import VarianceThreshold

class ggdAugmentSDMX(BaseEstimator, TransformerMixin):
    """scikit-learn transformer that augments a time-indexed DataFrame with SDMX data.

    `fit` derives the query settings (frequency and start year) from the index
    of the training data. `transform` then tries every dataflow of every source,
    drops the downloaded series whose variance does not exceed the threshold,
    and inner-joins what is left to `X` on the index.

    Arguments:
      sources: list of SDMX source codes to query (a single code given as a string is also accepted)
      variance_threshold: threshold passed to `VarianceThreshold`; None uses the scikit-learn default
      backend: cache backend passed on to `sdmx.Request`

    Attributes set by `fit`: `X_`, `y_`, `data_freq_`, `keys_`, `params_`.
    Attributes set by `transform`: `features_stay`, `features_removed`, `augm_df_`.
    """

    def __init__(self, sources=['BIS', 'ECB', 'IMF', 'UNSD', 'WB'], variance_threshold=None, backend='memory'):
        # Constructor arguments are stored untouched (and never mutated) so that
        # `get_params`, `set_params` and `clone` behave as scikit-learn expects.
        self.sources = sources
        self.variance_threshold = variance_threshold
        self.backend = backend

    def fit(self, X, y=None):
        """Record the frequency and start year of `X`'s index for the SDMX queries.

        Arguments:
          X: a pandas object whose index holds the time stamps
          y: ignored apart from being stored; present for scikit-learn compatibility

        Returns the fitted transformer itself.
        """
        self.X_ = X
        self.y_ = y

        # Resolution of the smallest gap between consecutive time stamps.
        self.data_freq_ = X.index.to_series().diff().min().resolution_string
        self.keys_ = {'FREQ': self.data_freq_}

        self.params_ = {"startPeriod": min(X.index).year}

        return self

    def _download_sdmx_data(self):
        """Download every available dataflow from each source, keyed by '<source>__<dataflow>'."""
        # A single source passed as a string would otherwise be iterated character by character.
        sources = [self.sources] if isinstance(self.sources, str) else self.sources

        data_sdmx = {}
        for source in sources:
            src_conn = sdmx.Request(source, backend=self.backend, expire_after=1800)
            src_dflows = src_conn.dataflow()
            for dflow, _ in src_dflows.dataflow.items():
                print(f"Trying to download data from {source}'s dataflow {dflow}...")
                try:
                    data = sdmx.to_pandas(
                        src_conn.data(dflow, key=self.keys_, params=self.params_),
                        datetime='TIME_PERIOD',
                    )
                except Exception:
                    # Best effort: skip dataflows that cannot be retrieved, but let
                    # KeyboardInterrupt/SystemExit through so long downloads can be aborted.
                    print("this dataflow does not have data in the desired frequency and time period.")
                    continue
                # Flatten multi-level column labels; single-level labels are kept whole.
                data.columns = [
                    '__'.join(col) if isinstance(col, tuple) else str(col)
                    for col in data.columns.to_flat_index()
                ]
                data_sdmx[source+"__"+dflow] = data
        return data_sdmx

    def transform(self, X):
        """Return `X` joined with the downloaded SDMX series that pass the variance filter.

        If nothing could be downloaded, `X` is returned unchanged. Otherwise the
        result is an inner join on the index, so only rows whose index value is
        present in both `X` and the downloaded data are kept.
        """
        check_is_fitted(self)

        data_sdmx = self._download_sdmx_data()

        # Nothing was downloaded: there is nothing to concatenate or to select from.
        if not data_sdmx:
            return X

        df = pd.concat(data_sdmx, axis=1)
        df.columns = ['_'.join(col) for col in df.columns.to_flat_index()]

        feat_sel = VarianceThreshold() if self.variance_threshold is None else VarianceThreshold(threshold=self.variance_threshold)
        feat_sel.fit(df)
        support = feat_sel.get_support()

        self.features_stay = df.columns[support]
        self.features_removed = df.columns[~support]

        # Non-inplace calls avoid mutating a slice of `df`.
        self.augm_df_ = (
            df.iloc[:, support]
            .dropna(axis=0, how='all')
            .dropna(axis=1, how='all')
        )

        return pd.concat([X, self.augm_df_], axis=1, join='inner')

In [None]:
class ggdAugmentSDMX2(BaseEstimator, TransformerMixin):
    """scikit-learn transformer that augments a time-indexed DataFrame with SDMX data.

    `fit` derives the query settings (frequency, start and end period) from the
    index of the training data. `transform` then tries every dataflow of every
    source, drops the downloaded series whose variance does not exceed the
    threshold, and inner-joins what is left to `X` on the index.

    Arguments:
      sources: list of SDMX source codes to query (a single code given as a string is also accepted)
      variance_threshold: threshold passed to `VarianceThreshold`; None uses the scikit-learn default
      backend: cache backend passed on to `sdmx.Request`

    Attributes set by `fit`: `index_`, `data_freq_`, `keys_`, `params_`.
    Attributes set by `transform`: `features_stay`, `features_removed`, `augm_df_`.
    """

    def __init__(self, sources=['BIS', 'ECB', 'IMF', 'UNSD', 'WB'], variance_threshold=None, backend='memory'):
        # Constructor arguments are stored untouched (and never mutated) so that
        # `get_params`, `set_params` and `clone` behave as scikit-learn expects.
        self.sources = sources
        self.variance_threshold = variance_threshold
        self.backend = backend

    def fit(self, X, y=None):
        """Record the frequency and the time span of `X`'s index for the SDMX queries.

        Arguments:
          X: a pandas object whose index holds the time stamps
          y: ignored; present for scikit-learn compatibility

        Returns the fitted transformer itself.
        Raises TypeError if `X` has no `index` attribute.
        """
        if not hasattr(X, "index"):
            # Without an index there is no time information to build the query from.
            raise TypeError("X must be a pandas object with a datetime index")
        self.index_ = X.index

        # Resolution of the smallest gap between consecutive time stamps.
        # NOTE(review): `resolution_string` reports the finest unit of that gap, so
        # monthly or annual indices may also come out as 'D' - confirm.
        self.data_freq_ = self.index_.to_series().diff().min().resolution_string
        format_string = {'D': "%Y-%m-%d", 'M': "%Y-%m"}.get(self.data_freq_, "%Y")

        # Called for its validation side effects; the returned array is not needed.
        # NOTE(review): private scikit-learn helper - confirm it exists in the pinned version.
        self._validate_data(X)

        self.keys_ = {'FREQ': self.data_freq_}

        self.params_ = {
            "startPeriod": min(self.index_).strftime(format_string),
            "endPeriod": max(self.index_).strftime(format_string),
        }

        return self

    def _download_sdmx_data(self):
        """Download every available dataflow from each source, keyed by '<source>__<dataflow>'."""
        # A single source passed as a string would otherwise be iterated character by character.
        sources = [self.sources] if isinstance(self.sources, str) else self.sources

        data_sdmx = {}
        for source in sources:
            src_conn = sdmx.Request(source, backend=self.backend, expire_after=1800)
            src_dflows = src_conn.dataflow()
            for dflow, _ in src_dflows.dataflow.items():
                print(f"Trying to download data from {source}'s dataflow {dflow}...")
                try:
                    data = sdmx.to_pandas(
                        src_conn.data(dflow, key=self.keys_, params=self.params_),
                        datetime='TIME_PERIOD',
                    )
                except Exception:
                    # Best effort: skip dataflows that cannot be retrieved, but let
                    # KeyboardInterrupt/SystemExit through so long downloads can be aborted.
                    print("this dataflow does not have data in the desired frequency and time period.")
                    continue
                # Flatten multi-level column labels; single-level labels are kept whole.
                data.columns = [
                    '__'.join(col) if isinstance(col, tuple) else str(col)
                    for col in data.columns.to_flat_index()
                ]
                data_sdmx[source+"__"+dflow] = data
        return data_sdmx

    def transform(self, X):
        """Return `X` joined with the downloaded SDMX series that pass the variance filter.

        If nothing could be downloaded, `X` is returned unchanged. Otherwise the
        result is an inner join on the index, so only rows whose index value is
        present in both `X` and the downloaded data are kept.
        """
        check_is_fitted(self)

        data_sdmx = self._download_sdmx_data()

        # Nothing was downloaded: there is nothing to concatenate or to select from.
        if not data_sdmx:
            return X

        df = pd.concat(data_sdmx, axis=1)
        df.columns = ['_'.join(col) for col in df.columns.to_flat_index()]

        feat_sel = VarianceThreshold() if self.variance_threshold is None else VarianceThreshold(threshold=self.variance_threshold)
        feat_sel.fit(df)
        support = feat_sel.get_support()

        self.features_stay = df.columns[support]
        self.features_removed = df.columns[~support]

        # Non-inplace calls avoid mutating a slice of `df`.
        self.augm_df_ = (
            df.iloc[:, support]
            .dropna(axis=0, how='all')
            .dropna(axis=1, how='all')
        )

        return pd.concat([X, self.augm_df_], axis=1, join='inner')

In [None]:
show_doc(ggdAugmentSDMX)

As mentioned above, `gingado`'s transformers are built to be compatible with `scikit-learn`. The code below checks whether this is achieved:

In [None]:
from gingado.utils import load_EURFX_data

# Load the example dataset returned by `load_EURFX_data`.
X = load_EURFX_data()
# Use the 'BRL' column as the target; `pop` also removes it from X.
y = X.pop('BRL')

In [None]:
# `fit` only derives the query settings from X's index; data is downloaded by `transform`.
ggd2 = ggdAugmentSDMX2(sources=['ECB']).fit(X=X)

{'startPeriod': '2003-01-02', 'endPeriod': '2022-05-20'}
Trying to download data from ECB's dataflow AME...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow BKN...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow BLS...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow BNT...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow BOP...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow BSI...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow BSP...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow CBD...
this dataf

2022-05-23 06:04:54,856 pandasdmx.reader.sdmxml - INFO: Use supplied dsd=… argument for non–structure-specific message


Trying to download data from ECB's dataflow CLIFS...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow CPP...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow DCM...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow DD...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow EON...


2022-05-23 06:05:33,653 pandasdmx.reader.sdmxml - INFO: Use supplied dsd=… argument for non–structure-specific message


Trying to download data from ECB's dataflow ESA...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow ESB...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow EST...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow EXR...


2022-05-23 06:06:12,601 pandasdmx.reader.sdmxml - INFO: Use supplied dsd=… argument for non–structure-specific message


Trying to download data from ECB's dataflow FM...


2022-05-23 06:09:55,295 pandasdmx.reader.sdmxml - INFO: Use supplied dsd=… argument for non–structure-specific message


Trying to download data from ECB's dataflow FVC...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow FXI...


2022-05-23 06:10:08,027 pandasdmx.reader.sdmxml - INFO: Use supplied dsd=… argument for non–structure-specific message


Trying to download data from ECB's dataflow GST...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow ICB...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow ICO...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow ICP...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow IFI...


2022-05-23 06:10:57,771 pandasdmx.reader.sdmxml - INFO: Use supplied dsd=… argument for non–structure-specific message


Trying to download data from ECB's dataflow ILM...


2022-05-23 06:10:58,693 pandasdmx.reader.sdmxml - INFO: Use supplied dsd=… argument for non–structure-specific message


Trying to download data from ECB's dataflow IRS...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow IVF...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow KRI...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow LIG...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow MFI...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow MIR...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow MMS...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow MMSR...
this dataflow does not have data in the desired frequency and time

2022-05-23 06:11:23,622 pandasdmx.reader.sdmxml - INFO: Use supplied dsd=… argument for non–structure-specific message


Trying to download data from ECB's dataflow RDF...


2022-05-23 06:11:36,976 pandasdmx.reader.sdmxml - INFO: Use supplied dsd=… argument for non–structure-specific message


Trying to download data from ECB's dataflow RESC...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow RESH...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow RESR...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow RESV...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow RIR...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow RPP...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow RPV...
this dataflow does not have data in the desired frequency and time period.
Trying to download data from ECB's dataflow RTD...
this dataflow does not have data in the desired frequency and t

Unnamed: 0_level_0,AUD,CAD,CHF,GBP,JPY,SGD,USD,ECB__CISS_D__AT__Z0Z__4F__EC__SS_CIN__IDX,ECB__CISS_D__BE__Z0Z__4F__EC__SS_CIN__IDX,ECB__CISS_D__CN__Z0Z__4F__EC__SS_CIN__IDX,...,ECB__RDE_D__D0__Z0Z__DE__EC__SRCB_COVAR__95P,ECB__RDE_D__D0__Z0Z__DE__EC__SRCB_COVAR__AVE,ECB__RDE_D__D0__Z0Z__DE__EC__SRCI_COVAR__5P,ECB__RDE_D__D0__Z0Z__DE__EC__SRCI_COVAR__95P,ECB__RDE_D__D0__Z0Z__DE__EC__SRCI_COVAR__AVE,ECB__RDF_D__D0__Z0Z__4F__EC__DFTLB__PR,ECB__RDF_D__U2__Z0Z__4F__EC__U2_CEB__HST,ECB__RDF_D__U2__Z0Z__4F__EC__U2_CI__HST,ECB__RDF_D__U2__Z0Z__4F__EC__U2_GRAI__HST,ECB__RDF_D__U2__Z0Z__4F__EC__U2_MM__HST
TIME_PERIOD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2003-01-02,1.8554,1.6422,1.4528,0.65200,124.40,1.8188,1.0446,0.023427,0.047823,,...,,,,,,,-0.50,-0.42,1.17,0.08
2003-01-03,1.8440,1.6264,1.4555,0.65000,124.56,1.8132,1.0392,0.021899,0.043292,,...,-0.008989,-0.078866,-0.178504,-0.04966,-0.112854,,-0.48,-0.40,1.04,0.08
2003-01-06,1.8281,1.6383,1.4563,0.64950,124.40,1.8210,1.0488,0.020801,0.039924,,...,,,,,,,-0.47,-0.36,0.78,0.11
2003-01-07,1.8160,1.6257,1.4565,0.64960,124.82,1.8155,1.0425,0.019738,0.038084,,...,,,,,,,-0.45,-0.34,0.61,0.11
2003-01-08,1.8132,1.6231,1.4586,0.64950,124.90,1.8102,1.0377,0.019947,0.040338,,...,,,,,,,-0.43,-0.30,0.90,0.13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-16,1.5057,1.3473,1.0479,0.85045,135.01,1.4531,1.0422,0.284438,0.158944,,...,,,,,,,,,,
2022-05-17,1.4993,1.3517,1.0457,0.84400,136.32,1.4589,1.0541,0.303949,0.173353,,...,,,,,,,,,,
2022-05-18,1.4980,1.3488,1.0486,0.84670,135.76,1.4598,1.0523,0.307806,0.176587,,...,,,,,,,,,,
2022-05-19,1.5036,1.3490,1.0265,0.84728,134.46,1.4576,1.0525,0.303497,0.192594,,...,,,,,,,,,,


DatetimeIndex(['2003-01-02', '2003-01-03', '2003-01-06', '2003-01-07',
               '2003-01-08', '2003-01-09', '2003-01-10', '2003-01-13',
               '2003-01-14', '2003-01-15',
               ...
               '2022-05-09', '2022-05-10', '2022-05-11', '2022-05-12',
               '2022-05-13', '2022-05-16', '2022-05-17', '2022-05-18',
               '2022-05-19', '2022-05-20'],
              dtype='datetime64[ns]', name='TIME_PERIOD', length=4965, freq=None)

In [None]:
# Augment the features loaded above (`X`); `EUR_FX` is not defined anywhere in this notebook.
augm_df = ggdAugmentSDMX(sources=['ECB']).fit_transform(X)

It can also be part of a `Pipeline` object, which enables the use of grid search and other parameter search techniques:

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from gingado.augmentation import ggdAugmentSDMX

# Chain augmentation, scaling and regression so they can be tuned together.
pipeline = Pipeline([
    ('augmentation', ggdAugmentSDMX(sources=['ECB'])),
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor())
])

# Candidate values for the grid search. `n_estimators` must be integers:
# RandomForestRegressor rejects the strings '100' and '500'.
parameters = {
    'augmentation__sources': (['ECB'], ['BIS']),
    'regressor__n_estimators': (100, 500)
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(X=X, y=y)

# Report the selected value of each hyperparameter that was searched over.
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
# Report the selected value of each hyperparameter that was searched over.
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

## Overall scheme:
* Since sklearn `Pipeline` objects transform data into numpy arrays (and thus do not allow for things like looking at the index or title, etc), `gingado` will create a "workflow" object that:
  * includes the data augmentation (useful to have pandas available!)
  * the sklearn pipeline is a component that lies in the "middle" of the pipeline
  * the documentation step benefits from the workflow retaining things like main dates, etc, so that it can also be used in the documentation.

## Other functions

Most users will probably find it helpful to use the `ggdAugmentSDMX` transformer, which is compatible with `scikit-learn`, rather than the underlying functions. However, these functions are documented below in case their use might address some specific user need.

## Sources of data

`gingado` only lists official data sources by choice. This is meant to provide users with the trust that their dataset will be complemented by reliable sources. Unfortunately, it is not possible at this stage to include *all* official sources - let alone all reliable sources - because that requires substantial manual and maintenance work. `gingado` leverages the existence of the [Statistical Data and Metadata eXchange (SDMX)](https://sdmx.org), an organisation of official data sources that establishes common data and metadata formats, to download data that is relevant (and hopefully also useful) to users.

The function below from the package [simpledmx](https://github.com/dkgaraujo/simpledmx) returns a list of codes corresponding to the data sources available to provide `gingado` users with data through SDMX.

In [None]:
#export
from simpledmx import *

In [None]:
list_sdmx_sources()

In [None]:
#hide
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

In [None]:
#export
def augm_with_sdmx(df, freq, sources, variance_threshold=None, start_date=None, end_date=None, time_col=None):
    """Downloads relevant data from SDMX sources to complement the original dataset

    Arguments:
      df: a pandas DataFrame, or None to return only the downloaded SDMX data
      freq: the frequency of the desired data from SDMX; for example, 'A' is annual
      sources: the list of SDMX sources or None; a list of possible sources can be obtained by running the function list_sdmx_sources()
      variance_threshold: a value larger than or equal to 0 or None, where 0 will lead to the removal of all data that does not vary across the dataset and None uses the scikit-learn default
      start_date: first period to download; if None it is taken from the time values of `df`
      end_date: last period to download; if None it is taken from the time values of `df`
      time_col: name of the column of `df` holding the time values; if None the index of `df` is used

    Returns:
      `df` left-merged with the downloaded SDMX data, or the SDMX data alone
      (indexed by 'TIME_PERIOD') when `df` is None.

    Raises:
      ValueError: if `df` is None and `start_date` or `end_date` is missing.
    """
    if df is None:
        # There is no dataset to infer the time span from, so it must be explicit.
        if start_date is None or end_date is None:
            raise ValueError("`start_date` and `end_date` must be provided when `df` is None")
    else:
        time_values = df.index if time_col is None else df[time_col]
        if start_date is None:
            start_date = min(time_values)
        if end_date is None:
            end_date = max(time_values)

    # NOTE(review): the date types accepted by `get_sdmx_data` are not visible here - confirm.
    sdmx_data = get_sdmx_data(
        start_date=start_date,
        end_date=end_date,
        freq=freq,
        sources=sources
        )

    # Keep only fully populated series and index them by a proper datetime.
    sdmx_data = sdmx_data.dropna(axis=1).sort_index().reset_index()
    sdmx_data['TIME_PERIOD'] = pd.to_datetime(sdmx_data['TIME_PERIOD'])
    sdmx_data = sdmx_data.set_index('TIME_PERIOD')

    # VarianceThreshold cannot be fitted on a frame without columns.
    if sdmx_data.shape[1] > 0:
        feat_sel = VarianceThreshold() if variance_threshold is None else VarianceThreshold(threshold=variance_threshold)
        feat_sel.fit(sdmx_data)

        # TODO: log which features were not kept and why
        sdmx_data = sdmx_data.iloc[:, feat_sel.get_support()]

    if df is None:
        return sdmx_data
    if time_col is None:
        # The time values live in the index of `df`, so join index to index.
        return df.merge(sdmx_data, how='left', left_index=True, right_index=True)
    return df.merge(sdmx_data, how='left', left_on=time_col, right_on='TIME_PERIOD')

In [None]:
show_doc(augm_with_sdmx)

## Using `gingado` to jumpstart a dataset

Since `gingado` downloads data from official sources through SDMX, users may want to use this functionality to gather the dataset of interest instead of augmenting some previously existing data. In these cases, the argument `df` must be set to `None`, like so:

In [None]:
new_data = augm_with_sdmx(df=None, start_date='2018', end_date='2020', freq='A', time_col=None, sources='BIS')

In [None]:
new_data

The code above uses a greedy SDMX downloader that is not too concerned about selecting datasets in advance; rather, it downloads all the data it possibly can from those official sources for the time period and frequency in question. It then filters out those series that do not vary throughout the period, avoiding the use of memory to store data that does not contribute to the predictive power of the model. The dataset is then ready to be used.

Two things are important to highlight. First, choosing even one source (the [BIS](https://www.bis.org) in this example) leads to the download of hundreds of variables. Some of them might represent the same underlying concepts, but for different jurisdictions. The second thing to bear in mind is that downloading and, in particular, parsing the SDMX data can take some time depending on your local setting.

To use `gingado` to augment your dataset instead of creating a completely new one as done above, simply pass the original DataFrame as the argument `df` and name the corresponding column with the time values in the argument `time_col`.