In [None]:
#| include: false
#skip
! [ -e /content ] && pip install -Uqq gingado nbdev # install or upgrade gingado on colab

In [None]:
#| default_exp utils
#| include: false
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| include: false
#| echo: false

# Code below included to ensure compatibility with scikit-learn v1.1.x
from sklearn import set_config
set_config(display='text')

In [None]:
#| include: false
from nbdev.showdoc import show_doc

## Support for model documentation

In [None]:
#| include: false
#| export
import datetime
import os

In [None]:
#| include: false
#| export
def get_datetime():
    """Returns the time now

    The result is a string formatted as ``YYYY-MM-DD HH:MM:SS TZ``, where
    ``TZ`` is the name of the local timezone. If the platform cannot provide
    a timezone name, the string simply ends after the seconds.
    """
    # `datetime.now()` on its own is timezone-naive, which makes `%Z` expand to
    # an empty string and leaves a dangling trailing space in the result.
    # `.astimezone()` attaches the local timezone so `%Z` has something to print.
    now = datetime.datetime.now().astimezone()
    # `.strip()` guards against platforms where the timezone name is still empty
    return now.strftime("%Y-%m-%d %H:%M:%S %Z").strip()

In [None]:
show_doc(get_datetime)

---

[source](https://github.com/dkgaraujo/gingado/blob/main/gingado/utils.py#L13){target="_blank" style="float:right; font-size:smaller"}

### get_datetime

>      get_datetime ()

Returns the time now

In [None]:
# The timestamp must come back as a non-empty string
d = get_datetime()
assert isinstance(d, str) and len(d) > 0

In [None]:
#| include: false
#| export
def read_attr(
    obj # Object from which attributes will be read
):
    """Read object type and values of attributes from fitted object

    Generator yielding one single-item dict ``{attribute_name: value}`` for
    `_estimator_type` and for every public attribute whose name ends in a
    single trailing underscore. Attributes that raise when accessed are skipped.
    """
    for name in dir(obj):
        # keep only `_estimator_type` and names such as `coef_`: trailing underscore,
        # no leading underscore, and not ending in a double underscore
        is_fitted_attr = name.endswith("_") and not name.startswith("_") and not name.endswith("__")
        if name != '_estimator_type' and not is_fitted_attr:
            continue
        try:
            value = getattr(obj, name)
        except Exception:
            # best effort: some attributes are properties that raise on access
            continue
        # The `yield` is deliberately outside the `try`: if it were inside a
        # catch-all handler, closing a partially consumed generator would have its
        # GeneratorExit swallowed and Python would raise
        # "RuntimeError: generator ignored GeneratorExit".
        yield {name: value}

In [None]:
show_doc(read_attr)

---

### read_attr

>      read_attr (obj)

Read object type and values of attributes from fitted object

|    | **Details** |
| -- | ----------- |
| obj | Object from which to attributes will be read |

Function `read_attr` helps gingado Documenters to read the object behind the scenes.

It collects the type of estimator, and any attributes resulting from fitting an object (i.e., those that end in "_" but not in a double underscore).

For example, the attributes of an untrained and a trained random forest are, in sequence:

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Two identically configured forests: one left untrained, one fitted on a tiny toy dataset
rf_unfit = RandomForestRegressor(n_estimators=3)
rf_fit = RandomForestRegressor(n_estimators=3)\
    .fit([[1, 0], [0, 1]], [0.5, 0.5]) # toy data; the 1-D target avoids sklearn's column-vector warning
list(read_attr(rf_unfit)), list(read_attr(rf_fit))

  .fit([[1, 0], [0, 1]], [[0.5], [0.5]]) # random numbers


([{'_estimator_type': 'regressor'}],
 [{'_estimator_type': 'regressor'},
  {'base_estimator_': DecisionTreeRegressor()},
  {'estimators_': [DecisionTreeRegressor(max_features=1.0, random_state=1632148864),
    DecisionTreeRegressor(max_features=1.0, random_state=1616501356),
    DecisionTreeRegressor(max_features=1.0, random_state=2109419996)]},
  {'feature_importances_': array([0., 0.])},
  {'n_features_': 2},
  {'n_features_in_': 2},
  {'n_outputs_': 1}])

## Support for time series

Objects of the class `Lag` are similar to `scikit-learn`'s transformers.

In [None]:
#| include: false
#| export

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

In [None]:
#| include: false
#| export

class Lag(BaseEstimator, TransformerMixin):
    "A transformer that lags variables"
    def __init__(self, lags=1, jump=0, keep_contemporaneous_X=False):
        # Constructor arguments are stored untouched, as in scikit-learn estimators
        self.lags = lags
        self.jump = jump
        self.keep_contemporaneous_X = keep_contemporaneous_X

    @staticmethod
    def _pandas_index(data):
        "Return `data.index` when it is a pandas index, otherwise `None`"
        # Plain Python lists also expose an `index` attribute (the `list.index`
        # method), so the mere presence of the attribute is not enough.
        idx = getattr(data, "index", None)
        return idx if isinstance(idx, pd.Index) else None

    def fit(
        self, 
        X:np.ndarray, # Array-like data of shape (n_samples, n_features)
        y=None # Array-like data of shape (n_samples,) or (n_samples, n_targets) or None
    ): # A fitted version of the `Lag` instance
        "Fit the `Lag` transformer"
        # Remember the row labels seen during fit, preferring those of `X` over those of `y`
        self.index = self._pandas_index(X)
        if self.index is None and y is not None:
            self.index = self._pandas_index(y)
        self._validate_data(X)

        # Number of leading rows that `transform` drops
        self.effective_lags_ = self.lags + self.jump
        return self

    def transform(
        self, 
        X:np.ndarray, # Array-like data of shape (n_samples, n_features)
    ): # A lagged version of `X`
        "Lag the dataset `X`"
        # Check first: `_validate_data` itself sets fitted-looking attributes, which
        # would otherwise let an unfitted instance slip through this check.
        check_is_fitted(self, "effective_lags_")

        row_index = self._pandas_index(X)
        X_values = self._validate_data(X)
        if row_index is None:
            # Fall back to the row labels stored by `fit`, but only when they
            # actually describe the data being transformed
            fitted_index = getattr(self, "index", None)
            if fitted_index is not None and len(fitted_index) == X_values.shape[0]:
                row_index = fitted_index

        # Column names are only available when the validated input carried them
        feature_names = getattr(self, "feature_names_in_", None)
        names = None if feature_names is None else list(feature_names)

        blocks, colnames = [], []
        if self.keep_contemporaneous_X:
            blocks.append(X_values)
            if names is not None:
                colnames.extend(names)
        # The first `jump` lags are skipped; lags jump+1 .. effective_lags_ are kept
        for lag_count in range(self.jump + 1, self.effective_lags_ + 1):
            blocks.append(np.roll(X_values, lag_count, axis=0))
            if names is not None:
                colnames.extend(f"{col}_lag_{lag_count}" for col in names)
        if not blocks:
            raise ValueError("`Lag` produced no columns: `lags` must be at least 1 unless `keep_contemporaneous_X` is True")

        # `np.roll` wraps the last rows around to the top; those leading rows have
        # no genuine lagged values, so they are discarded
        lagged = np.concatenate(blocks, axis=1)[self.effective_lags_:, :]
        if row_index is not None:
            row_index = row_index[self.effective_lags_:]
        return pd.DataFrame(lagged, index=row_index, columns=colnames if names is not None else None)

In [None]:
show_doc(Lag)

---

[source](https://github.com/dkgaraujo/gingado/blob/main/gingado/utils.py#L44){target="_blank" style="float:right; font-size:smaller"}

### Lag

>      Lag (lags=1, jump=0, keep_contemporaneous_X=False)

A transformer that lags variables

In [None]:
show_doc(Lag.fit)

---

[source](https://github.com/dkgaraujo/gingado/blob/main/gingado/utils.py#L51){target="_blank" style="float:right; font-size:smaller"}

### Lag.fit

>      Lag.fit (X:numpy.ndarray, y=None)

Fit the `Lag` transformer

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| X | ndarray |  | Array-like data of shape (n_samples, n_features) |
| y | NoneType | None | Array-like data of shape (n_samples,) or (n_samples, n_targets) or None |

In [None]:
show_doc(Lag.transform)

---

[source](https://github.com/dkgaraujo/gingado/blob/main/gingado/utils.py#L68){target="_blank" style="float:right; font-size:smaller"}

### Lag.transform

>      Lag.transform (X:numpy.ndarray)

Lag the dataset `X`

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| X | ndarray | Array-like data of shape (n_samples, n_features) |

In [None]:
show_doc(Lag.fit_transform)

---

### TransformerMixin.fit_transform

>      TransformerMixin.fit_transform (X, y=None, **fit_params)

Fit to data, then transform it.

Fits transformer to `X` and `y` with optional parameters `fit_params`
and returns a transformed version of `X`.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| X | array-like of shape (n_samples, n_features) |  | Input samples. |
| y | NoneType | None | Target values (None for unsupervised transformations). |
| fit_params |  |  |  |
| **Returns** | **ndarray array of shape (n_samples, n_features_new)** |  | **Transformed array.** |

The code below demonstrates how `Lag` works in practice. Note in particular that, because `Lag` is a transformer, it can be used as part of a `scikit-learn` `Pipeline`.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Seeded generator so that the example yields the same numbers on every run
rng = np.random.default_rng(42)
randomX = rng.random((15, 2))
randomY = rng.random(15)

lags = 3
jump = 2

# Note: `pipe` holds the transformed data returned by `fit_transform`, not the pipeline object
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('lagger', Lag(lags=lags, jump=jump, keep_contemporaneous_X=False))
]).fit_transform(randomX, randomY)

Below we confirm that the lagger removes the correct number of rows corresponding to the lagged observations:

In [None]:
# The output is shorter than the input by `lags + jump` rows
assert pipe.shape[0] == randomX.shape[0] - lags - jump

And because `Lag` is a transformer, its parameters (`lags` and `jump`) can be calibrated using hyperparameter tuning to achieve the best performance for a model.

## Support for data augmentation with SDMX

:::{.callout-note}

Please note that working with SDMX may take some minutes depending on the amount of information you are downloading.

:::

In [None]:
#| include: false
#| export
import pandasdmx as sdmx

def list_SDMX_sources(): # The list of codes representing the SDMX sources available for data download
    """Fetch the list of SDMX sources

    Thin wrapper that forwards to `sdmx.list_sources()` and returns its result unchanged.
    """
    available_sources = sdmx.list_sources()
    return available_sources

In [None]:
show_doc(list_SDMX_sources)

---

[source](https://github.com/dkgaraujo/gingado/blob/main/gingado/utils.py#L102){target="_blank" style="float:right; font-size:smaller"}

### list_SDMX_sources

>      list_SDMX_sources ()

Fetch the list of SDMX sources

In [None]:
sources = list_SDMX_sources()
print(sources)

assert len(sources) > 0
# every source code must be a string
assert all(isinstance(src, str) for src in sources)

['ABS', 'ABS_XML', 'BBK', 'BIS', 'CD2030', 'ECB', 'ESTAT', 'ILO', 'IMF', 'INEGI', 'INSEE', 'ISTAT', 'LSD', 'NB', 'NBB', 'OECD', 'SGR', 'SPC', 'STAT_EE', 'UNICEF', 'UNSD', 'WB', 'WB_WDI']


In [None]:
#| include: false
#| export
import pandas as pd
import pandasdmx as sdmx

def list_all_dataflows(
    codes_only:bool=False, # Whether to return only the dataflow codes
    return_pandas:bool=True # Whether to return the result in a pandas DataFrame format
): # All available dataflows for all SDMX sources used by gingado
    """List all SDMX dataflows. Note: When using as a parameter to an `AugmentSDMX` object or to the `load_SDMX_data` function, set `codes_only=True`

    Every source returned by `sdmx.list_sources()` is queried in turn; sources
    whose dataflow listing cannot be retrieved or converted are skipped.

    With `return_pandas=False` the result is a dict keyed by source, holding the
    index of dataflow codes when `codes_only=True` and otherwise the object
    returned by `sdmx.to_pandas`. With `return_pandas=True` the per-source
    results are concatenated into a single Series named 'dataflow'.
    """
    sources = sdmx.list_sources()
    dflows = {}
    for src in sources:
        try:
            src_dflows = sdmx.to_pandas(sdmx.Request(src).dataflow().dataflow)
        except Exception:
            # best effort: a source that fails to respond should not abort the listing.
            # `Exception` (not a bare `except`) keeps Ctrl-C working on this slow download.
            continue
        # Previously the non-`codes_only` branch called `.index.reset_index()`, a
        # method pandas indexes do not have; the error was silently swallowed and
        # the `to_pandas` result stayed in place. That outcome is now explicit.
        dflows[src] = src_dflows.index if codes_only else src_dflows
    if return_pandas:
        if not dflows:
            # nothing could be retrieved: return an empty result instead of letting `pd.concat` fail
            return pd.Series(name='dataflow', dtype=object)
        dflows = pd.concat({
            src: pd.DataFrame.from_dict(src_dflows)
            for src, src_dflows in dflows.items()
            })[0].rename('dataflow')
    return dflows

In [None]:
show_doc(list_all_dataflows)

---

[source](https://github.com/dkgaraujo/gingado/blob/main/gingado/utils.py#L111){target="_blank" style="float:right; font-size:smaller"}

### list_all_dataflows

>      list_all_dataflows (codes_only:bool=False, return_pandas:bool=True)

List all SDMX dataflows. Note: When using as a parameter to an `AugmentSDMX` object or to the `load_SDMX_data` function, set `codes_only=True`

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| codes_only | bool | False | Whether to return only the dataflow codes |
| return_pandas | bool | True | Whether to return the result in a pandas DataFrame format |

In [None]:
dflows = list_all_dataflows(return_pandas=False)

assert isinstance(dflows, dict)
all_sources = list_SDMX_sources()
# every key of the result must be one of the known SDMX sources
assert all(src in all_sources for src in dflows)

2023-09-16 00:49:48,202 pandasdmx.reader.sdmxml - DEBUG: Truncate sub-microsecond time in <Prepared>
2023-09-16 00:50:09,352 pandasdmx.reader.sdmxml - DEBUG: Truncate sub-microsecond time in <Prepared>
2023-09-16 00:50:10,173 pandasdmx.reader.sdmxml - DEBUG: Truncate sub-microsecond time in <Prepared>
2023-09-16 00:50:19,614 pandasdmx.reader.sdmxml - DEBUG: Truncate sub-microsecond time in <Prepared>
2023-09-16 00:50:20,660 pandasdmx.reader.sdmxml - DEBUG: Truncate sub-microsecond time in <Prepared>


`list_all_dataflows` returns by default a pandas Series, facilitating data discovery by users like so:

In [None]:
dflows = list_all_dataflows(return_pandas=True)
# the default return type is exactly a pandas Series
assert type(dflows) is pd.Series

dflows

2023-09-16 00:50:44,400 pandasdmx.reader.sdmxml - DEBUG: Truncate sub-microsecond time in <Prepared>
2023-09-16 00:51:09,450 pandasdmx.reader.sdmxml - DEBUG: Truncate sub-microsecond time in <Prepared>
2023-09-16 00:51:10,058 pandasdmx.reader.sdmxml - DEBUG: Truncate sub-microsecond time in <Prepared>
2023-09-16 00:51:14,175 pandasdmx.reader.sdmxml - DEBUG: Truncate sub-microsecond time in <Prepared>
2023-09-16 00:51:19,057 pandasdmx.reader.sdmxml - DEBUG: Truncate sub-microsecond time in <Prepared>


ABS_XML  ABORIGINAL_POP_PROJ                 Projected population, Aboriginal and Torres St...
         ABORIGINAL_POP_PROJ_REMOTE          Projected population, Aboriginal and Torres St...
         ABS_ABORIGINAL_POPPROJ_INDREGION    Projected population, Aboriginal and Torres St...
         ABS_ACLD_LFSTATUS                   Australian Census Longitudinal Dataset (ACLD):...
         ABS_ACLD_TENURE                     Australian Census Longitudinal Dataset (ACLD):...
                                                                   ...                        
UNSD     DF_UNData_UNFCC                                                       SDMX_GHG_UNDATA
WB       DF_WITS_Tariff_TRAINS                                WITS - UNCTAD TRAINS Tariff Data
         DF_WITS_TradeStats_Development                             WITS TradeStats Devlopment
         DF_WITS_TradeStats_Tariff                                      WITS TradeStats Tariff
         DF_WITS_TradeStats_Trade                 

This format makes it easier to search `dflows` by source:

In [None]:
# Only the dataflow codes; note that this call queries every SDMX source again
list_all_dataflows(codes_only=True, return_pandas=True)

2023-09-16 00:51:51,419 pandasdmx.reader.sdmxml - DEBUG: Truncate sub-microsecond time in <Prepared>
2023-09-16 00:51:57,339 pandasdmx.reader.sdmxml - DEBUG: Truncate sub-microsecond time in <Prepared>
2023-09-16 00:52:15,569 pandasdmx.reader.sdmxml - DEBUG: Truncate sub-microsecond time in <Prepared>
2023-09-16 00:52:16,277 pandasdmx.reader.sdmxml - DEBUG: Truncate sub-microsecond time in <Prepared>
2023-09-16 00:52:18,956 pandasdmx.reader.sdmxml - DEBUG: Truncate sub-microsecond time in <Prepared>


ABS_XML  0                 ABORIGINAL_POP_PROJ
         1          ABORIGINAL_POP_PROJ_REMOTE
         2    ABS_ABORIGINAL_POPPROJ_INDREGION
         3                   ABS_ACLD_LFSTATUS
         4                     ABS_ACLD_TENURE
                            ...               
UNSD     5                     DF_UNData_UNFCC
WB       0               DF_WITS_Tariff_TRAINS
         1      DF_WITS_TradeStats_Development
         2           DF_WITS_TradeStats_Tariff
         3            DF_WITS_TradeStats_Trade
Name: dataflow, Length: 3290, dtype: object

In [None]:
# Select the dataflows of a single source through the outer level of the index
dflows['BIS']

WS_CBPOL_D                                    Policy rates daily
WS_CBPOL_M                                  Policy rates monthly
WS_CBS_PUB                              BIS consolidated banking
WS_CPMI_CASHLESS                   CPMI cashless payments (T5-6)
WS_CPMI_CT1                       CPMI comparative tables type 1
WS_CPMI_CT2                       CPMI comparative tables type 2
WS_CPMI_DEVICES                             CPMI payment devices
WS_CPMI_INSTITUTIONS                           CPMI institutions
WS_CPMI_MACRO                                         CPMI Macro
WS_CPMI_PARTICIPANTS                           CPMI participants
WS_CPMI_SYSTEMS         CPMI systems (T8-9-11-13-14-16-17-18-19)
WS_CREDIT_GAP                             BIS credit-to-GDP gaps
WS_DEBT_SEC2_PUB                             BIS debt securities
WS_DER_OTC_TOV                          OTC derivatives turnover
WS_DSR                                    BIS debt service ratio
WS_EER_D                 

Or the user can search dataflows by their human-readable name instead of their code. For example, this is one way to see if any dataflow has information on interest rates:

In [None]:
# Case-insensitive match against the values of `dflows`
dflows[dflows.str.contains('Interest rates', case=False)]

BBK  BBSDI       Discount interest rates pursuant to section 25...
ECB  RIR                                     Retail Interest Rates
IMF  6SR         M&B: Interest Rates and Share Prices (6SR) for...
     INR                                            Interest rates
     INR_NSTD                          Interest rates_Non-Standard
Name: dataflow, dtype: object

The function `load_SDMX_data` is a convenience function that downloads data from SDMX sources (and any specific dataflows passed as arguments) if they match the key and parameters set by the user.

In [None]:
#| include: false
#| export
import pandasdmx as sdmx

def load_SDMX_data(
    sources:dict, # A dictionary with the sources and dataflows per source
    keys:dict, # The keys to be used in the SDMX query
    params:dict, # The parameters to be used in the SDMX query
    verbose:bool=True # Whether to communicate download steps to the user
    ): # A pandas DataFrame with data from SDMX or None if no data matches the sources, keys and parameters
    """Loads datasets from SDMX.

    For every source in `sources`, the value is either the string 'all' (query
    every dataflow of that source), a single dataflow code, or a collection of
    dataflow codes. Dataflows whose query fails are skipped. The successful
    downloads are concatenated column-wise into one DataFrame; `None` is
    returned when nothing could be downloaded.
    """
    data_sdmx = {}
    for source, wanted in sources.items():
        src_conn = sdmx.Request(source)
        src_dflows = src_conn.dataflow()
        # The `isinstance` guard keeps the comparison scalar: comparing an
        # array-like collection of codes with 'all' is not a plain boolean.
        if isinstance(wanted, str) and wanted == 'all':
            dflows = dict(src_dflows.dataflow.items())
        else:
            # A bare string is one dataflow code; wrap it so that `in` tests
            # membership rather than substring containment.
            wanted_codes = [wanted] if isinstance(wanted, str) else wanted
            dflows = {k: v for k, v in src_dflows.dataflow.items() if k in wanted_codes}
        for dflow, dflow_info in dflows.items():
            if verbose: print(f"Querying data from {source}'s dataflow '{dflow}' - {dflow_info.dict()['name']}...")
            try:
                data = sdmx.to_pandas(src_conn.data(dflow, key=keys, params=params), datetime='TIME_PERIOD')
            except Exception:
                # best effort: skip this dataflow; `Exception` keeps Ctrl-C working
                if verbose: print("this dataflow does not have data in the desired frequency and time period.")
                continue
            # flatten the column labels into single strings
            data.columns = ['__'.join(col) for col in data.columns.to_flat_index()]
            data_sdmx[source+"__"+dflow] = data

    # The previous check, `len(...) is None`, could never be true, so an empty
    # result crashed in `pd.concat` instead of returning None as documented.
    if not data_sdmx:
        return None

    df = pd.concat(data_sdmx, axis=1)
    df.columns = ['_'.join(col) for col in df.columns.to_flat_index()]
    return df

In [None]:
show_doc(load_SDMX_data)

---

[source](https://github.com/dkgaraujo/gingado/blob/main/gingado/utils.py#L135){target="_blank" style="float:right; font-size:smaller"}

### load_SDMX_data

>      load_SDMX_data (sources:dict, keys:dict, params:dict, verbose:bool=True)

Loads datasets from SDMX.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| sources | dict |  | A dictionary with the sources and dataflows per source |
| keys | dict |  | The keys to be used in the SDMX query |
| params | dict |  | The parameters to be used in the SDMX query |
| verbose | bool | True | Whether to communicate download steps to the user |

In [None]:
df = load_SDMX_data(sources={'ECB': 'CISS', 'BIS': 'WS_CBPOL_D'}, keys={'FREQ': 'D'}, params={'startPeriod': 2003})

# the download must yield exactly a DataFrame with at least one row and one column
assert type(df) is pd.DataFrame
n_rows, n_cols = df.shape[0], df.shape[1]
assert n_rows > 0
assert n_cols > 0

Querying data from ECB's dataflow 'CISS' - Composite Indicator of Systemic Stress...


2023-09-16 00:52:42,940 pandasdmx.reader.sdmxml - INFO: Use supplied dsd=… argument for non–structure-specific message


Querying data from BIS's dataflow 'WS_CBPOL_D' - Policy rates daily...


In [None]:
#| echo: false
import nbdev; nbdev.nbdev_export()