In [None]:
#default_exp utils
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#hide
#export
from nbdev.showdoc import *

# Utils
> Functions to support the use of `gingado`

In [None]:
#hide
#export
import datetime
import os
import pwd

In [None]:
#hide
#export
def get_username():
    """Returns the active username in the computer.

    The name is read from the password database entry that matches the
    user id of the running process.
    """
    current_uid = os.getuid()
    passwd_entry = pwd.getpwuid(current_uid)
    return passwd_entry.pw_name

In [None]:
show_doc(get_username)

<h4 id="get_username" class="doc_header"><code>get_username</code><a href="__main__.py#L3" class="source_link" style="float:right">[source]</a></h4>

> <code>get_username</code>()

Returns the active username in the computer

In [None]:
# Smoke test: the username must come back as a non-empty string.
u = get_username()
assert isinstance(u, str)
assert u != ""

In [None]:
#hide
#export
def get_datetime():
    """Returns the time now, as a `YYYY-MM-DD HH:MM:SS TZ` string in local time.

    The trailing `TZ` part is the name of the local timezone (e.g. `UTC`).
    """
    # A bare `now()` is timezone-naive, in which case `%Z` renders as an empty
    # string and the result ends with a dangling space. `astimezone()` attaches
    # the system's local timezone so that `%Z` has something to print.
    now_local = datetime.datetime.now().astimezone()
    return now_local.strftime("%Y-%m-%d %H:%M:%S %Z")

In [None]:
show_doc(get_datetime)

<h4 id="get_datetime" class="doc_header"><code>get_datetime</code><a href="__main__.py#L3" class="source_link" style="float:right">[source]</a></h4>

> <code>get_datetime</code>()

Returns the time now

In [None]:
# Smoke test: the timestamp must come back as a non-empty string.
d = get_datetime()
assert isinstance(d, str)
assert d != ""

In [None]:
#hide
#export
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

class Lag(BaseEstimator, TransformerMixin):
    """Transformer that builds lagged copies of the columns of `X`.

    For every lag `k` from `jump + 1` to `jump + lags`, each output row holds
    the values that `X` had `k` rows earlier. The first `lags + jump` rows,
    for which not every lagged value exists, are dropped from the output.

    Parameters
    ----------
    lags : int, default=1
        Number of consecutive lags to create.
    jump : int, default=0
        Number of initial lags to skip, so that the first lag created is
        `jump + 1`.
    keep_contemporaneous_X : bool, default=False
        If True, the unlagged columns of `X` are kept to the left of the
        lagged ones.

    Attributes
    ----------
    index : pandas.Index or None
        Row index of the pandas object seen in `fit` (`X` if it has one,
        otherwise `y`), or None.
    effective_lags_ : int
        `lags + jump`, i.e. the number of leading rows dropped by `transform`.
    """
    def __init__(self, lags=1, jump=0, keep_contemporaneous_X=False):
        self.lags = lags
        self.jump = jump
        self.keep_contemporaneous_X = keep_contemporaneous_X

    @staticmethod
    def _pandas_index(obj):
        "Returns the row index of a pandas object, or None for anything else."
        # An explicit type check is needed: lists and tuples also expose an
        # attribute called `index` (the search method), which is not a row index.
        if isinstance(obj, (pd.DataFrame, pd.Series)):
            return obj.index
        return None

    def fit(self, X, y=None):
        """Validates `X` and records the information `transform` relies on.

        Returns
        -------
        self
        """
        # Prefer the index of X; fall back to the one of y when X has none.
        self.index = self._pandas_index(X)
        if self.index is None:
            self.index = self._pandas_index(y)
        self._validate_data(X)

        self.effective_lags_ = self.lags + self.jump
        return self

    def transform(self, X):
        """Returns a `pandas.DataFrame` with the lagged version of `X`.

        Columns are named `<column>_lag_<k>` when feature names were seen
        during `fit`. The row index is taken from `X` when it is a pandas
        object, otherwise from the index stored in `fit` if its length matches.
        """
        check_is_fitted(self)
        row_index = self._pandas_index(X)
        # reset=False: check X against what was seen in `fit` instead of
        # overwriting the fitted `n_features_in_` / `feature_names_in_`.
        X_values = self._validate_data(X, reset=False)
        n_rows = X_values.shape[0]

        fitted_index = getattr(self, "index", None)
        if row_index is None and fitted_index is not None and len(fitted_index) == n_rows:
            row_index = fitted_index

        # Lags actually created: jump+1, ..., jump+lags.
        first_lag = max(self.jump, 0) + 1
        shifts = range(first_lag, self.effective_lags_ + 1)

        blocks = [X_values] if self.keep_contemporaneous_X else []
        # np.roll moves every row `shift` positions down; the rows that wrap
        # around to the top are discarded below.
        blocks.extend(np.roll(X_values, shift, axis=0) for shift in shifts)
        if blocks:
            lagged = np.concatenate(blocks, axis=1)
        else:
            # Nothing requested (lags=0 without contemporaneous columns).
            lagged = np.empty((n_rows, 0))
        lagged = lagged[self.effective_lags_:, :]

        columns = None
        if hasattr(self, "feature_names_in_"):
            names = list(self.feature_names_in_)
            columns = list(names) if self.keep_contemporaneous_X else []
            for shift in shifts:
                columns.extend(f"{col}_lag_{shift}" for col in names)

        if row_index is not None:
            row_index = row_index[self.effective_lags_:]
        return pd.DataFrame(lagged, index=row_index, columns=columns)

In [None]:
show_doc(Lag)

<h2 id="Lag" class="doc_header"><code>class</code> <code>Lag</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>Lag</code>(**`lags`**=*`1`*, **`jump`**=*`0`*, **`keep_contemporaneous_X`**=*`False`*) :: `BaseEstimator`

Base class for all estimators in scikit-learn.

Notes
-----
All estimators should specify all the parameters that can be set
at the class level in their ``__init__`` as explicit keyword
arguments (no ``*args`` or ``**kwargs``).

The code below demonstrates how `Lag` works in practice.

In [None]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Seeded generator so that the demo produces the same numbers on every run.
rng = np.random.default_rng(42)
randomX = rng.random((15, 2))
randomY = rng.random(15)

lags = 3
jump = 2

# Note: `pipe` holds the *output* of the pipeline (the lagged data),
# not the pipeline object itself.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('lagger', Lag(lags=lags, jump=jump, keep_contemporaneous_X=False))
]).fit_transform(randomX, randomY)

Below we confirm that the lagger object drops exactly `lags + jump` rows, that is, the initial observations for which the lagged values are not available:

In [None]:
# The transformed data must be shorter than the input by `lags + jump` rows.
expected_rows = randomX.shape[0] - (lags + jump)
assert pipe.shape[0] == expected_rows

## Real-life example dataset

The function `load_EURFX_data` is a helper that downloads a test dataset containing real-life data. This dataset was chosen on the assumption that most users have at least an intuitive understanding of what a foreign exchange rate is: the price of exchanging one currency for another. Using this dataset does not imply that it is more or less relevant than any other; it serves only pedagogical purposes and provides example source code showing how such data can be downloaded and prepared, including with `gingado`'s `Lag` class.

In [None]:
#hide
#export
import pandasdmx as sdmx

def load_EURFX_data(startYear=2003, lags=1, jump=0, keep_contemporaneous_X=True):
    """Loads a real-life dataset for testing use cases.

    Requests daily (`FREQ='D'`) series from the ECB `EXR` dataflow for a fixed
    list of currencies, starting at `startYear`, and returns them as a
    `pandas.DataFrame` with one column per currency. Requires network access.

    Parameters
    ----------
    startYear : int, default=2003
        Passed to the ECB service as the `startPeriod` of the query.
    lags : int, default=1
        Number of lags to add with `Lag`. With `lags=0` the data is returned
        without any lagging (and `jump` has no effect).
    jump : int, default=0
        Number of initial lags skipped by `Lag`.
    keep_contemporaneous_X : bool, default=True
        Whether the unlagged columns are kept alongside the lagged ones.
    """
    ecb = sdmx.Request('ECB')
    exr_msg = ecb.dataflow('EXR')
    exr_flow = exr_msg.dataflow.EXR
    dsd = exr_flow.structure
    key = {
        "CURRENCY": ['EUR', 'AUD', 'BRL', 'CAD', 'CHF', 'GBP', 'JPY', 'SGD', 'USD'],
        "FREQ": 'D',
    }
    params = {"startPeriod": startYear}
    data_msg = ecb.data('EXR', key=key, params=params, dsd=dsd)
    df = sdmx.to_pandas(data_msg.data[0], datetime='TIME_PERIOD')
    # Keep only the currency level in the column labels and drop the dates
    # on which no rate at all is available.
    df = df.droplevel(['FREQ', 'CURRENCY_DENOM', 'EXR_TYPE', 'EXR_SUFFIX'], axis=1).dropna(how='all')

    # `jump` only moves the point where the lags start; with `lags=0` there is
    # nothing to build, so `Lag` must not be invoked on `jump` alone.
    if lags:
        df = Lag(lags=lags, jump=jump, keep_contemporaneous_X=keep_contemporaneous_X).fit_transform(df)
    return df

In [None]:
show_doc(load_EURFX_data)

<h4 id="load_EURFX_data" class="doc_header"><code>load_EURFX_data</code><a href="__main__.py#L5" class="source_link" style="float:right">[source]</a></h4>

> <code>load_EURFX_data</code>(**`startYear`**=*`2003`*, **`lags`**=*`1`*, **`jump`**=*`0`*, **`keep_contemporaneous_X`**=*`True`*)

Loads a real-life dataset for testing use cases.

In [None]:
EUR_FX = load_EURFX_data()

# isinstance is the idiomatic type check and also accepts DataFrame subclasses.
assert isinstance(EUR_FX, pd.DataFrame)
assert EUR_FX.shape[0] > 0
assert EUR_FX.shape[1] > 0

EUR_FX

Unnamed: 0_level_0,AUD,BRL,CAD,CHF,GBP,JPY,SGD,USD,AUD_lag_1,BRL_lag_1,CAD_lag_1,CHF_lag_1,GBP_lag_1,JPY_lag_1,SGD_lag_1,USD_lag_1
TIME_PERIOD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2003-01-03,1.8440,3.6112,1.6264,1.4555,0.65000,124.56,1.8132,1.0392,1.8554,3.6770,1.6422,1.4528,0.65200,124.40,1.8188,1.0446
2003-01-06,1.8281,3.5145,1.6383,1.4563,0.64950,124.40,1.8210,1.0488,1.8440,3.6112,1.6264,1.4555,0.65000,124.56,1.8132,1.0392
2003-01-07,1.8160,3.5139,1.6257,1.4565,0.64960,124.82,1.8155,1.0425,1.8281,3.5145,1.6383,1.4563,0.64950,124.40,1.8210,1.0488
2003-01-08,1.8132,3.4405,1.6231,1.4586,0.64950,124.90,1.8102,1.0377,1.8160,3.5139,1.6257,1.4565,0.64960,124.82,1.8155,1.0425
2003-01-09,1.8172,3.4915,1.6371,1.4597,0.65300,125.16,1.8244,1.0507,1.8132,3.4405,1.6231,1.4586,0.64950,124.90,1.8102,1.0377
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-23,1.4982,5.1623,1.3626,1.0310,0.84783,136.05,1.4639,1.0659,1.4980,5.1989,1.3526,1.0280,0.84820,135.34,1.4588,1.0577
2022-05-24,1.5152,5.1793,1.3714,1.0334,0.85750,136.49,1.4722,1.0720,1.4982,5.1623,1.3626,1.0310,0.84783,136.05,1.4639,1.0659
2022-05-25,1.5126,5.1736,1.3720,1.0269,0.85295,135.34,1.4676,1.0656,1.5152,5.1793,1.3714,1.0334,0.85750,136.49,1.4722,1.0720
2022-05-26,1.5110,5.1741,1.3715,1.0283,0.85073,135.95,1.4709,1.0697,1.5126,5.1736,1.3720,1.0269,0.85295,135.34,1.4676,1.0656
