In [None]:
#hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# default_exp augmentation

# Data augmentation

> Functions to augment the user's dataset with information from official sources.

In [None]:
#hide
from nbdev.showdoc import *

## Data augmentation as a transformer

The package `scikit-learn` makes use of the concept of transformer, an object in python that adapts the transformation at hand to the training data, and then deploys it on the testing data. These transformers can be used as part of a pipeline.

In order to ensure maximum compatibility with packages that are widely used, `gingado`'s data augmentation functions are implemented as objects that are functional on their own.

The `AugmentSDMX` transformer defined below follows this approach.

In [None]:
#export
import pandas as pd
import pandasdmx as sdmx
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_selection import VarianceThreshold

class AugmentSDMX(BaseEstimator, TransformerMixin):
    """Transformer that augments a dataset with data downloaded from SDMX sources.

    `fit` learns the time span and frequency of the data from the index of `X`;
    `transform` queries the selected SDMX dataflows for that span and frequency,
    removes low-variance series and joins the remainder to `X`.

    Parameters:
      sources: a dictionary whose keys are SDMX source codes and whose values are
        either the string 'all' (every dataflow of that source), a list of
        dataflow codes, or a single dataflow code
      variance_threshold: threshold passed to `VarianceThreshold`; None uses the
        scikit-learn default
      backend: value passed as `backend` to `sdmx.Request`
      verbose: whether to print progress information while querying the sources
    """

    def __init__(self, sources={'BIS': 'all'}, variance_threshold=None, backend='memory', verbose=True):
        # The dict default is kept for interface compatibility; it is only read, never mutated.
        self.sources = sources
        self.variance_threshold = variance_threshold
        self.backend = backend
        self.verbose = verbose

    def fit(self, X, y=None):
        """Learns the frequency and the start/end periods from the index of `X`.

        Arguments:
          X: a dataset with a datetime-like `index` attribute (e.g. a pandas DataFrame)
          y: ignored; present for compatibility with the scikit-learn API

        Returns:
          The fitted transformer itself.

        Raises:
          TypeError: if `X` does not have an `index` attribute.
        """
        if not hasattr(X, "index"):
            # Fail early with a clear message instead of an AttributeError on None below.
            raise TypeError("AugmentSDMX requires `X` to have a datetime-like index, such as a pandas DataFrame.")

        self.index_ = X.index
        # NOTE(review): the frequency is taken from the resolution of the smallest gap
        # between consecutive index values - confirm this yields the intended SDMX
        # frequency code for data that is not daily.
        self.data_freq_ = self.index_.to_series().diff().min().resolution_string
        if self.data_freq_ == 'D':
            format_string = "%Y-%m-%d"
        elif self.data_freq_ == 'M':
            format_string = "%Y-%m"
        else:
            format_string = "%Y"

        # Called for its validation and bookkeeping; the returned array is not needed here.
        self._validate_data(X)

        self.keys_ = {'FREQ': self.data_freq_}

        self.params_ = {
            "startPeriod": min(self.index_).strftime(format_string),
            "endPeriod": max(self.index_).strftime(format_string),
        }

        return self

    def transform(self, X):
        """Downloads the SDMX data and joins it to `X`.

        Arguments:
          X: the dataset to be augmented

        Returns:
          `X` concatenated column-wise (inner join on the index) with the retained
          SDMX series, or `X` unchanged if no dataflow returned any data.
        """
        check_is_fitted(self)

        data_sdmx = {}
        for source, wanted in self.sources.items():
            src_conn = sdmx.Request(source, backend=self.backend, expire_after=1800)
            src_dflows = src_conn.dataflow()
            if isinstance(wanted, str) and wanted == 'all':
                dflows = dict(src_dflows.dataflow.items())
            else:
                # A single dataflow code given as a string must be matched exactly,
                # not as a substring test against that string.
                if isinstance(wanted, str):
                    wanted = [wanted]
                dflows = {k: v for k, v in src_dflows.dataflow.items() if k in wanted}
            for dflow, dflow_info in dflows.items():
                if self.verbose: print(f"Querying data from {source}'s dataflow '{dflow}' - {dflow_info.dict()['name']}...")
                try:
                    data = sdmx.to_pandas(src_conn.data(dflow, key=self.keys_, params=self.params_), datetime='TIME_PERIOD')
                except Exception:
                    # Best effort: skip dataflows that cannot be retrieved, but let
                    # KeyboardInterrupt / SystemExit propagate.
                    if self.verbose: print("this dataflow does not have data in the desired frequency and time period.")
                    continue
                data.columns = ['__'.join(col) for col in data.columns.to_flat_index()]
                data_sdmx[source+"__"+dflow] = data

        if not data_sdmx:
            # Nothing was downloaded, so there is nothing to concatenate.
            return X

        df = pd.concat(data_sdmx, axis=1)
        df.columns = ['_'.join(col) for col in df.columns.to_flat_index()]

        feat_sel = VarianceThreshold() if self.variance_threshold is None else VarianceThreshold(threshold=self.variance_threshold)
        feat_sel.fit(df)
        support = feat_sel.get_support()

        self.features_stay = df.columns[support]
        self.features_removed = df.columns[~support]

        # Keep the selected series, then drop rows and columns that are entirely missing.
        self.augm_df_ = (
            df.iloc[:, support]
            .dropna(axis=0, how='all')
            .dropna(axis=1, how='all')
        )

        return pd.concat([X, self.augm_df_], axis=1, join='inner')

As mentioned above, `gingado`'s transformers are built to be compatible with `scikit-learn`. The code below checks whether this is achieved.

First, we create the example dataset comprising the foreign exchange rate of a number of currencies to the Euro. The Brazilian Real is chosen for this example as the dependent variable.

In [None]:
from gingado.utils import load_EURFX_data

# Load the example dataset of foreign exchange rates against the Euro.
X = load_EURFX_data()
# Use the Brazilian Real as the dependent variable; `pop` also removes it from `X`.
y = X.pop('BRL')

Next, the data augmentation object provided by `gingado` adds more data. In this case, for brevity only one dataflow from one source is listed. If users want to add more SDMX sources, simply add more keys to the dictionary. And if users want data from all dataflows from a given source (provided the keys and parameters such as frequency and dates are matched), the value should be set to `'all'`, as in `{'ECB': ['CISS'], 'BIS': 'all'}`.

In [None]:
# Fit and apply the augmentation in one step, requesting only the 'CISS' dataflow from the ECB.
X_augm = AugmentSDMX(sources={'ECB': ['CISS']}).fit_transform(X=X)

2022-05-24 02:06:48,523 pandasdmx.reader.sdmxml - INFO: Use supplied dsd=… argument for non–structure-specific message


Querying data from ECB's dataflow 'CISS' - Composite Indicator of Systemic Stress...


In [None]:
# Compare the number of columns before and after the augmentation and preview the first rows.
print(f"\nThe number of features just went from {X.shape[1]} to {X_augm.shape[1]} with one dataflow!\n\n" , X_augm.head())


The number of features just went from 7 to 29 with one dataflow!

                 AUD     CAD     CHF     GBP     JPY     SGD     USD  \
TIME_PERIOD                                                           
2003-01-02   1.8554  1.6422  1.4528  0.6520  124.40  1.8188  1.0446   
2003-01-03   1.8440  1.6264  1.4555  0.6500  124.56  1.8132  1.0392   
2003-01-06   1.8281  1.6383  1.4563  0.6495  124.40  1.8210  1.0488   
2003-01-07   1.8160  1.6257  1.4565  0.6496  124.82  1.8155  1.0425   
2003-01-08   1.8132  1.6231  1.4586  0.6495  124.90  1.8102  1.0377   

             ECB__CISS_D__AT__Z0Z__4F__EC__SS_CIN__IDX  \
TIME_PERIOD                                              
2003-01-02                                    0.023427   
2003-01-03                                    0.021899   
2003-01-06                                    0.020801   
2003-01-07                                    0.019738   
2003-01-08                                    0.019947   

             ECB__CISS_D__B

It can also be part of a `Pipeline` object, which enables the use of grid search and other parameter search techniques:

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from gingado.augmentation import AugmentSDMX

# NOTE(review): `lags` and `jump` are not used anywhere in this cell - confirm whether they are still needed.
lags=2
jump=5

# Pipeline that augments the data and then imputes missing values; the regressor step is currently disabled.
pipeline = Pipeline([
    ('augmentation', AugmentSDMX(verbose=False)),
    ('imputer', KNNImputer()),
    #('regressor', RandomForestRegressor())
])

pipeline.fit_transform(X)
# NOTE(review): the grid search below is disabled; `parameters` and `grid_search`
# are therefore not defined by this cell.
# parameters = {
#     'augmentation': ('passthrough', AugmentSDMX(sources={'BIS': 'WS_CBPOL_D'})),
#     'regressor__n_estimators': ('10', '20')
# }

# grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
# grid_search.fit(X=X, y=y)


# best_parameters = grid_search.best_estimator_
# for param_name in sorted(parameters.keys()):
#     print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
# The grid search in the previous cell is currently commented out, so `grid_search`
# and `parameters` may not exist; only report the best parameters when they do,
# instead of failing with a NameError on a fresh "Restart & Run All".
if "grid_search" in globals() and "parameters" in globals():
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
else:
    print("Grid search has not been run; enable it in the previous cell to see the best parameters.")

## Overall scheme:
* Since sklearn `Pipeline` objects transform data into numpy arrays (and thus do not allow for things like looking at the index or title, etc), `gingado` will create a "workflow" object that:
  * includes the data augmentation (useful to have pandas available!)
  * the sklearn pipeline is a component that lies in the "middle" of the pipeline
  * the documentation step benefits from the workflow retaining things like main dates, etc, so that it can also be used in the documentation.

## Other functions

Most users will probably find it helpful to use the `AugmentSDMX` transformer, which is compatible with `scikit-learn`, rather than the underlying functions. However, they are documented below in case their use might address some specific user need.

## Sources of data

`gingado` only lists official data sources by choice. This is meant to provide users with the trust that their dataset will be complemented by reliable sources. Unfortunately, it is not possible at this stage to include *all* official sources - let alone all reliable sources - because that requires substantial manual and maintenance work. `gingado` leverages the existence of the [Statistical Data and Metadata eXchange (SDMX)](https://sdmx.org), an organisation of official data sources that establishes common data and metadata formats, to download data that is relevant (and hopefully also useful) to users.

The function below from the package [simpledmx](https://github.com/dkgaraujo/simpledmx) returns a list of codes corresponding to the data sources available to provide `gingado` users with data through SDMX.

In [None]:
#export
from simpledmx import *

In [None]:
# Show the codes of the SDMX data sources that are available.
list_sdmx_sources()

In [None]:
#hide
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

In [None]:
#export
def augm_with_sdmx(df, freq, sources, variance_threshold=None):
    """Downloads relevant data from SDMX sources to complement the original dataset

    Arguments:
      df: a pandas DataFrame; the earliest and latest values of its index define the period of the SDMX data requested
      freq: the frequency of the desired data from SDMX; for example, 'A' is annual
      sources: the list of SDMX sources or None; a list of possible sources can be obtained by running the function list_sdmx_sources()
      variance_threshold: a value larger than or equal to 0 or None, where 0 will lead to the removal of all data that does not vary across the dataset and None uses the scikit-learn default

    Returns:
      `df` left-joined on its index with the SDMX series that passed the variance filter

    Raises:
      ValueError: if `df` is None, since the requested period is derived from its index
    """
    if df is None:
        # The start and end dates come from `df.index`, so there is nothing to query without it.
        raise ValueError("`df` must be a pandas DataFrame with a date index; it is needed to define the period of the SDMX data.")

    start_date, end_date = min(df.index), max(df.index)

    sdmx_data = get_sdmx_data(
        start_date=start_date,
        end_date=end_date,
        freq=freq,
        sources=sources
        )
    # Keep only the series without missing values and index them by period as datetimes.
    sdmx_data = sdmx_data.dropna(axis=1).sort_index()
    sdmx_data = sdmx_data.reset_index()
    sdmx_data['TIME_PERIOD'] = pd.to_datetime(sdmx_data['TIME_PERIOD'])
    sdmx_data = sdmx_data.set_index('TIME_PERIOD')

    feat_sel = VarianceThreshold() if variance_threshold is None else VarianceThreshold(threshold=variance_threshold)
    feat_sel.fit(sdmx_data)

    # TODO: log which features were not kept and why
    sdmx_data = sdmx_data.iloc[:, feat_sel.get_support()]

    # Both frames are indexed by date, so join on the indexes and keep every row of `df`.
    return df.merge(sdmx_data, how='left', left_index=True, right_index=True)