# Imports

In [1]:
%load_ext watermark
%watermark

%load_ext autoreload
%autoreload 2

# import standard libs
import warnings

warnings.filterwarnings("ignore")
from IPython.display import display
from IPython.core.debugger import set_trace as debug
from pathlib import Path
import itertools
from collections import namedtuple
import pickle

# import python scientific stack
import pandas as pd

pd.set_option("display.max_rows", 100)
import dask.dataframe as dd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
import numba as nb

# import ffn
import yfinance as yf
import bottleneck as bk
import mlxtend as mlx

# import mlfinlab as ml
from boruta import BorutaPy as bp
from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import (
    RandomForestClassifier,
    BaggingClassifier,
    VotingClassifier,
    # StackingClassifier,
)
from mlxtend.classifier import StackingClassifier  # <-- works w/o errors
from mlxtend.plotting import plot_confusion_matrix

from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA

from sklearn.dummy import DummyClassifier
from sklearn.metrics import (
    roc_curve,
    precision_recall_curve,
    auc,
    make_scorer,
    recall_score,
    accuracy_score,
    precision_score,
    confusion_matrix,
    matthews_corrcoef,
    classification_report,
)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from typing import Callable

from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.base import ClassifierMixin
from sklearn.model_selection import BaseCrossValidator

import numpy_ext as npx
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb


"""from mlfinlab.feature_importance import (
    mean_decrease_impurity,
    mean_decrease_accuracy,
    single_feature_importance,
    plot_feature_importance,
)
from mlfinlab.feature_importance import ClassificationModelFingerprint
from mlfinlab.ensemble import SequentiallyBootstrappedBaggingClassifier"""
import shap

# import visual tools
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

%matplotlib inline
import seaborn as sns

# import util libs
from tqdm import tqdm, tqdm_notebook
import missingno as msno

from algo_dev.CONSTANTS import REPO_NAME
from algo_dev.tools.plots import *
from algo_dev.tools.utils import *

# ---------------------------------------------------
# THESE ARE VARIABLES FOR EASILY ACCESSING DIFFERENT
# DIRECTORIES FOR ACCESSING AND SAVING DATA AND IMAGES
# IF NECESSARY. CHANGE THEM TO MATCH YOUR DIRECTORY
# STRUCTURE.
# ---------------------------------------------------

# REPO_NAME = "blackarbs_algo_strategy_dev"
# REPO_NAME is imported from algo_dev.CONSTANTS above; print it so the run log
# records which repository root the paths below were resolved against.
print("\n", REPO_NAME)
# get_relative_project_dir is provided by the star-imported project helpers.
# presumably it returns a pathlib.Path to the repository root -- the `/`
# joins below rely on that; verify against algo_dev.tools.utils.
project_dir = get_relative_project_dir(REPO_NAME)
data_dir = project_dir / "data"
external = data_dir / "external"  # raw vendor files (read in the import cell)
processed = data_dir / "processed"  # derived feature store is written here
viz = project_dir / "viz"

print()
%watermark -v -m -p numpy,pandas,scipy,sklearn,mlfinlab,seaborn,matplotlib -g

2021-03-29T20:50:46-03:00

CPython 3.8.6
IPython 7.19.0

compiler   : MSC v.1916 64 bit (AMD64)
system     : Windows
release    : 10
machine    : AMD64
processor  : Intel64 Family 6 Model 158 Stepping 10, GenuineIntel
CPU cores  : 12
interpreter: 64bit

 blackarbs_algo_strategy_dev

CPython 3.8.6
IPython 7.19.0

numpy 1.19.4
pandas 1.2.1
scipy 1.5.3
sklearn 0.23.2
mlfinlab not installed
seaborn 0.11.0
matplotlib 3.3.3

compiler   : MSC v.1916 64 bit (AMD64)
system     : Windows
release    : 10
machine    : AMD64
processor  : Intel64 Family 6 Model 158 Stepping 10, GenuineIntel
CPU cores  : 12
interpreter: 64bit
Git hash   : 8bad65bc7a7f597f230fc8c0154640db2a5a0ecd


In [2]:
%load_ext nb_black

# Matplotlib rc overrides shared by the seaborn style and context calls below.
sns_params = {
    "xtick.major.size": 2,
    "ytick.major.size": 2,
    "font.size": 12,
    "font.weight": "medium",
    "figure.figsize": (10, 7),
    "font.family": "Ubuntu Mono",
}

# The same dict is handed to both calls: once as the rc overrides of the
# "white" style and once as the context argument.
sns.set_style("white", sns_params)
sns.set_context(sns_params)
# Keyword arguments intended for figure-saving calls elsewhere in the notebook.
# NOTE(review): `frameon` was deprecated as a savefig keyword in Matplotlib 3.1
# -- confirm it is still accepted by the installed version before relying on it.
savefig_kwds = dict(dpi=90, bbox_inches="tight", frameon=True, format="png")


# NOTE(review): this import lives outside the notebook's import cell.
from jupyterthemes import jtplot

# jtplot.style is applied after the seaborn calls above, so where the two
# overlap the theme settings chosen here are the ones applied last.
jtplot.style(
    theme="grade3",  # "oceans16",  # "monokai",
    context="talk",
    fscale=1.4,
    ticks=True,
    spines=False,
    grid=True,
    gridlines="--",
)

<IPython.core.display.Javascript object>

# Import Data

In [3]:
def read_tradestation_futures_data(fn: Path) -> pd.DataFrame:
    """Read a TradeStation CSV export into a datetime-indexed bar frame.

    Column names are lower-cased. The ``date`` and ``time`` text columns are
    joined with a space and parsed into a ``datetime`` index, and a ``volume``
    column is added as ``up + down``.

    :param fn: path to the CSV file; it must contain ``date``, ``time``,
        ``up`` and ``down`` columns (case-insensitive).
    :return: frame indexed by ``datetime`` with ``date``/``time`` removed and
        ``volume`` appended as the last column.
    """
    raw = pd.read_csv(fn).rename(str.lower, axis="columns")
    timestamps = pd.to_datetime(
        raw["date"] + " " + raw["time"], infer_datetime_format=True
    )
    bars = raw.assign(datetime=timestamps, volume=raw["up"] + raw["down"])
    return bars.drop(["date", "time"], axis=1).set_index("datetime")


# Load the SPY export from the external TradeStation folder.
spy = read_tradestation_futures_data(external / "tradestation" / "SPY.txt")
# cprint comes from the star-imported project utilities; the captured output
# shows it printing the frame's tail followed by its .info() summary.
cprint(spy)

-------------------------------------------------------------------------------
dataframe information
-------------------------------------------------------------------------------
                       open    high     low   close      up     down   volume
datetime                                                                     
2019-05-14 12:56:00  283.77  283.86  283.69  283.74  313916   322548   636464
2019-05-14 12:57:00  283.74  283.74  283.42  283.44  154415   244134   398549
2019-05-14 12:58:00  283.43  283.60  283.40  283.57  240404   316421   556825
2019-05-14 12:59:00  283.58  283.59  283.42  283.42  308214   493246   801460
2019-05-14 13:00:00  283.42  283.49  283.17  283.32  933738  1283851  2217589
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1468155 entries, 2004-05-14 06:31:00 to 2019-05-14 13:00:00
Data columns (total 7 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    ----- 

<IPython.core.display.Javascript object>

# Data Resampling

In [4]:
def time_consolidator(df, period):
    """Resample intraday bars into ``period``-minute OHLC bars.

    Prices follow the usual open/high/low/close rules, while ``up``, ``down``
    and ``volume`` are summed. Bins that contain no source rows end up with
    NaN prices and are dropped.

    :param df: datetime-indexed frame with open, high, low, close, up, down
        and volume columns.
    :param period: bar length in minutes.
    :return: the consolidated bars.
    """
    price_rules = dict(open="first", high="max", low="min", close="last")
    flow_rules = {name: "sum" for name in ("up", "down", "volume")}
    resampled = df.resample(f"{period}Min").agg({**price_rules, **flow_rules})
    return resampled.dropna()


def add_basic_features(df):
    """Return a copy of ``df`` with a ``bar_return`` column (close / open - 1).

    The input frame is left untouched.
    """
    return df.copy().assign(bar_return=lambda bars: bars.close / bars.open - 1)


# NOTE(review): this empty dict is replaced by the literal built at the end of
# the cell, so the assignment has no lasting effect.
data = dict()
df = spy.copy()  # work on a copy so the raw bars stay untouched
# Consolidate the raw bars to each modelling frequency (periods in minutes).
bars_5m = time_consolidator(df, 5)
bars_30m = time_consolidator(df, 30)
bars_1H = time_consolidator(df, 60)
bars_1D = time_consolidator(df, 60 * 24)
bars_1W = time_consolidator(df, 60 * 24 * 7)

# The string literal below is disabled code kept as a bare expression; it is
# evaluated and discarded, so add_basic_features is never applied here.
"""bars_5m = add_basic_features(bars_5m)
bars_1H = add_basic_features(bars_1H)
bars_1D = add_basic_features(bars_1D)
bars_1W = add_basic_features(bars_1W)"""

# Lookup of consolidated bars keyed by frequency label; the feature cells
# below select their input with data[data_frequency].
data = {
    "5m": bars_5m,
    "30m": bars_30m,
    "1H": bars_1H,
    "1D": bars_1D,
    "1W": bars_1W,
}

<IPython.core.display.Javascript object>

# Feature Functions and Code

In [5]:
#################


def add_log_returns(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Add ``log_{column}_return``: the first difference of ``log(df[column])``.

    The column is written onto ``df`` itself, which is also returned so the
    function can be used with ``DataFrame.pipe``.
    """
    log_levels = np.log(df[column])
    df[f"log_{column}_return"] = log_levels.diff()
    return df


#################
def internal_bar_strength(df: pd.DataFrame) -> pd.Series:
    """Position of the close within each bar's high-low range.

    Computed row-wise as ``(close - low) / (high - low)``.

    :param df: frame with ``high``, ``low`` and ``close`` columns.
    :return: one value per row (a Series, not a scalar). Bars whose high
        equals their low divide by zero, so they do not yield a finite value
        (0/0 gives NaN when the close also equals the low).
    """
    bar_range = df.high - df.low
    return (df.close - df.low) / bar_range


def add_internal_bar_strength(df: pd.DataFrame) -> pd.DataFrame:
    """Write the internal bar strength into an ``ibs`` column of ``df``.

    ``df`` is modified in place and returned for use with ``DataFrame.pipe``.
    """
    ibs_values = internal_bar_strength(df)
    df["ibs"] = ibs_values
    return df


#################
# @nb.jit
def aqr_momentum(array: np.ndarray) -> float:
    """Trend score of a price window: annualized return slope times R^2.

    Log returns of ``array`` are regressed on their position 0..n-1; the
    slope is compounded over 252 periods and multiplied by the squared
    correlation coefficient of the fit.

    :param array: price levels for one window.
    :return: ``(1 + slope) ** 252 * rvalue ** 2``.
    """
    log_returns = np.diff(np.log(array))
    fit = stats.linregress(np.arange(len(log_returns)), log_returns)
    annualized_slope = (1 + fit.slope) ** 252
    return annualized_slope * (fit.rvalue ** 2)


@nb.njit
def aqr_momo_numba(array: np.ndarray) -> float:
    """Numba-compiled counterpart of ``aqr_momentum``.

    Fits ``log-return = slope * position + intercept`` by least squares and
    returns the slope compounded over 252 periods, multiplied by an R^2
    derived from the residual sum of squares.

    :param array: price levels for one window.
    :return: ``(1 + slope) ** 252 * r2`` as a scalar.
    """
    y = np.diff(np.log(array))  # log returns of the window
    x = np.arange(y.shape[0])
    # Design matrix [position, 1] so the fit has a slope and an intercept.
    A = np.column_stack((x, np.ones(x.shape[0])))
    # First two lstsq outputs: coefficients and the residual sum of squares.
    model, resid = np.linalg.lstsq(A, y)[:2]
    # R^2 = 1 - SSR / SST, where y.size * y.var() is the total sum of squares.
    r2 = 1 - resid / (y.size * y.var())
    # resid is array-valued, so the product is an array; take its only entry.
    # NOTE(review): presumably resid can be empty for very short windows,
    # which would make this index invalid -- confirm for window sizes of 2-3.
    return (((1 + model[0]) ** 252) * r2)[0]


def add_aqr_momentum(df: pd.DataFrame, column: str, window: int) -> pd.DataFrame:
    """Add ``aqr_momo_{column}_{window}``: ``aqr_momentum`` over a rolling window.

    The SciPy-based scorer is applied through ``npx.rolling_apply`` with 10
    parallel jobs. ``df`` is modified in place and returned.
    """
    prices = df[column].values
    feature_name = f"aqr_momo_{column}_{window}"
    df[feature_name] = npx.rolling_apply(aqr_momentum, window, prices, n_jobs=10)
    return df


def add_aqr_momentum_numba(df: pd.DataFrame, column: str, window: int) -> pd.DataFrame:
    """Add ``aqr_momo_{column}_{window}`` using the numba-compiled scorer.

    Same output column as ``add_aqr_momentum``; only the per-window function
    differs. ``df`` is modified in place and returned.
    """
    prices = df[column].values
    feature_name = f"aqr_momo_{column}_{window}"
    df[feature_name] = npx.rolling_apply(aqr_momo_numba, window, prices, n_jobs=10)
    return df


#################


def get_slope(array: np.ndarray) -> float:
    """Slope of a linear fit of the window's log returns against position.

    :param array: level values for one window (must be positive for the log).
    :return: the regression slope from ``scipy.stats.linregress``.
    """
    log_returns = np.diff(np.log(array))
    positions = np.arange(len(log_returns))
    return stats.linregress(positions, log_returns).slope


@nb.njit
def get_slope_numba(array: np.ndarray) -> float:
    """Numba-compiled counterpart of ``get_slope``.

    Least-squares fit of the window's log returns against their position;
    only the slope coefficient is returned.

    :param array: level values for one window.
    :return: slope of the fitted line.
    """
    y = np.diff(np.log(array))  # log returns of the window
    # y = y[~np.isnan(y)]
    x = np.arange(y.shape[0])
    # Design matrix [position, 1]: slope and intercept.
    A = np.column_stack((x, np.ones(x.shape[0])))
    # resid is unpacked but not used; only the slope is needed here.
    model, resid = np.linalg.lstsq(A, y)[:2]
    return model[0]


def add_slope_column(df: pd.DataFrame, column: str, window: int) -> pd.DataFrame:
    """Add ``slope_{column}_{window}``: rolling ``get_slope`` of ``column``.

    Runs the SciPy-based slope through ``npx.rolling_apply`` with 10 parallel
    jobs. ``df`` is modified in place and returned.
    """
    levels = df[column].values
    feature_name = f"slope_{column}_{window}"
    df[feature_name] = npx.rolling_apply(get_slope, window, levels, n_jobs=10)
    return df


def add_slope_column_numba(df: pd.DataFrame, column: str, window: int) -> pd.DataFrame:
    """Add ``slope_{column}_{window}`` using the numba-compiled slope.

    Unlike ``add_slope_column`` this runs with a single job. ``df`` is
    modified in place and returned.
    """
    levels = df[column].values
    feature_name = f"slope_{column}_{window}"
    df[feature_name] = npx.rolling_apply(get_slope_numba, window, levels, n_jobs=1)
    return df


#################
def add_average_price(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``average_price``: the mean of high, low, close and open per bar.

    ``df`` is modified in place and returned.
    """
    ohlc_total = df["high"] + df["low"] + df["close"] + df["open"]
    df["average_price"] = ohlc_total / 4
    return df


#################
def add_rolling_min(df: pd.DataFrame, column: str, window: int) -> pd.DataFrame:
    """Add ``rmin_{column}_{window}``: the trailing ``window``-row minimum.

    Uses the vectorized pandas rolling minimum rather than dispatching
    ``np.min`` once per window across worker processes. The values are the
    same: the first ``window - 1`` rows are NaN, and any window containing a
    NaN yields NaN.

    :param df: frame to extend; it is modified in place and returned.
    :param column: name of the column to scan.
    :param window: number of rows per window, including the current row.
    :return: ``df`` with the new column.
    """
    rolling_min = df[column].rolling(window).min()
    # Assign the raw values so the result is positional, as with an ndarray.
    df[f"rmin_{column}_{window}"] = rolling_min.to_numpy()
    return df


def add_rolling_max(df: pd.DataFrame, column: str, window: int) -> pd.DataFrame:
    """Add ``rmax_{column}_{window}``: the trailing ``window``-row maximum.

    Uses the vectorized pandas rolling maximum rather than dispatching
    ``np.max`` once per window across worker processes. The values are the
    same: the first ``window - 1`` rows are NaN, and any window containing a
    NaN yields NaN.

    :param df: frame to extend; it is modified in place and returned.
    :param column: name of the column to scan.
    :param window: number of rows per window, including the current row.
    :return: ``df`` with the new column.
    """
    rolling_max = df[column].rolling(window).max()
    # Assign the raw values so the result is positional, as with an ndarray.
    df[f"rmax_{column}_{window}"] = rolling_max.to_numpy()
    return df


#################

# njit raised ZeroDivisionError here where the NumPy version did not.
# NOTE(review): likely because np.sum(tmp_v) is a scalar and numba's default
# error model raises on scalar division by zero, whereas NumPy returns
# nan/inf with a warning -- confirm (numba offers error_model="numpy").
@nb.njit
def numba_vwap(
    avg: np.ndarray, v: np.ndarray, idx: np.ndarray, len_df: int, window: int
) -> np.ndarray:
    """Compiled rolling volume-weighted average of ``avg`` weighted by ``v``.

    :param avg: values to average.
    :param v: weights (volume), aligned with ``avg``.
    :param idx: per-row identifiers, aligned with ``avg``.
    :param len_df: number of rows to process.
    :param window: number of rows per window.
    :return: array of shape ``(len_df - window, 2)``; column 0 holds the
        identifier of the row immediately after each window and column 1
        holds that window's weighted average.
    """
    n = np.shape(np.arange(len_df - window))[0]
    A = np.empty((n, 2))
    for i in np.arange(len_df - window):
        tmp_avg = avg[i : i + window]
        tmp_v = v[i : i + window]
        aa = np.sum(tmp_v * tmp_avg) / np.sum(tmp_v)
        # The window covers rows i .. i+window-1 but is stamped with row
        # i+window, so each value summarizes only rows before its stamp.
        jj = idx[i + window]
        A[i, 0] = jj
        A[i, 1] = aa
    return A


def numpy_vwap(
    avg: np.ndarray, v: np.ndarray, idx: np.ndarray, len_df: int, window: int
) -> np.ndarray:
    """Rolling volume-weighted average of ``avg`` weighted by ``v``.

    Pure-Python loop with a progress bar.

    :param avg: values to average.
    :param v: weights (volume), aligned with ``avg``.
    :param idx: per-row identifiers, aligned with ``avg``.
    :param len_df: number of rows to process.
    :param window: number of rows per window.
    :return: float array of shape ``(len_df - window, 2)``; column 0 is the
        identifier of the row right after each window, column 1 the weighted
        average of that window. Each value therefore summarizes only rows
        before its stamp.
    """
    starts = np.arange(len_df - window)
    out = np.empty((starts.shape[0], 2))
    for start in tqdm(starts):
        stop = start + window
        weights = v[start:stop]
        weighted_mean = np.sum(weights * avg[start:stop]) / np.sum(weights)
        out[start, 0] = idx[stop]
        out[start, 1] = weighted_mean
    return out


def add_rolling_vwap(df: pd.DataFrame, column: str, window: int) -> pd.DataFrame:
    """Join a rolling volume-weighted average of ``column`` onto ``df``.

    The new column is named ``rvwap_{window}`` (the source column name is not
    part of it). Unlike the other ``add_*`` helpers this returns a new frame
    produced by a left join; rows without a computed value are NaN.

    :param df: datetime-indexed frame with a ``volume`` column.
    :param column: column whose values are averaged.
    :param window: number of rows per window.
    :return: ``df`` left-joined with the VWAP column.
    """
    # numba_vwap takes the same arguments and could be swapped in here.
    stamped = numpy_vwap(
        df[column].values, df.volume.values, df.index.asi8, len(df), window
    )
    vwap = pd.DataFrame(stamped, columns=["index", f"rvwap_{window}"])
    # Column 0 carries the nanosecond timestamps; turn it back into an index.
    vwap["datetime"] = pd.to_datetime(vwap["index"], unit="ns")
    vwap = vwap.drop("index", axis=1).set_index("datetime")
    return df.join(vwap, how="left")


#################
def add_rolling_bands(
    df: pd.DataFrame, column: str, dist: int, window: int
) -> pd.DataFrame:
    """Add bands at ``dist`` rolling standard deviations around ``column``.

    Writes ``upper_band_{column}`` and ``lower_band_{column}`` onto ``df``
    (modified in place and returned). The bands are centered on the column's
    own value, not on a rolling mean.

    :param dist: band half-width, as a multiple of the rolling std.
    :param window: number of rows in the rolling std.
    """
    center = df[column]
    half_width = dist * center.rolling(window).std()
    df[f"upper_band_{column}"] = center + half_width
    df[f"lower_band_{column}"] = center - half_width
    return df


#################
def add_acceleration(
    df: pd.DataFrame, column: str = "close", window: int = 10
) -> pd.DataFrame:
    """Add ``racc_{column}_{window}``: rolling std of the change in returns.

    The change in percentage returns (a second difference of the price) is
    taken as "acceleration"; the feature is its rolling standard deviation.
    ``df`` is modified in place and returned.
    """
    return_change = df[column].pct_change().diff()
    feature_name = f"racc_{column}_{window}"
    df[feature_name] = return_change.rolling(window).std()
    return df


def roll_rank_bk(array):
    """Descending rank of the last element of ``array``, as a fraction of its length.

    ``bk.rankdata`` supplies the rank of the last element; it is flipped with
    ``size + 1 - rank`` and divided by the number of elements.
    """
    descending_rank = array.size + 1 - bk.rankdata(array)[-1]
    n_obs = array.shape[0]
    return descending_rank / n_obs


#################
def add_volatility(
    df: pd.DataFrame, column: str = "close", window: int = 10
) -> pd.DataFrame:
    """Add ``rvol_{column}_{window}``: rolling std of percentage returns.

    ``df`` is modified in place and returned.
    """
    rolling_std = df[column].pct_change().rolling(window).std()
    df[f"rvol_{column}_{window}"] = rolling_std
    return df


def relative_strength_index(df: pd.DataFrame, n: int) -> pd.Series:
    """
    Calculate the "RSI" indicator used by this notebook.
    https://github.com/Crypto-toolbox/pandas-technical-indicators/blob/master/technical_indicators.py

    The indicator is built from bar-to-bar moves of the ``high`` and ``low``
    columns, not from close-to-close changes. For each bar the up-move
    (``high[t] - high[t-1]``) or the down-move (``low[t-1] - low[t]``) is
    kept only if it is positive and the larger of the two. Both series are
    smoothed with an exponentially weighted mean, and the result is
    ``round(100 * up / (up + down))``.

    The moves are computed with vectorized NumPy operations on positions, so
    the frame may carry any index, and an empty frame yields an empty Series.

    :param df: pandas.DataFrame with ``high`` and ``low`` columns
    :param n: span (and minimum number of periods) of the exponential mean
    :return: pandas.Series named ``RSI_{n}`` on a 0..len(df)-1 integer index;
        the first ``n - 1`` values are NaN
    """
    name = "RSI_" + str(n)
    if len(df) == 0:
        return pd.Series([], dtype="float64", name=name)

    high = df["high"].to_numpy(dtype="float64")
    low = df["low"].to_numpy(dtype="float64")
    up_move = np.diff(high)  # high[t] - high[t-1]
    down_move = -np.diff(low)  # low[t-1] - low[t]

    # Keep only the dominant, strictly positive move of each bar. Comparisons
    # involving NaN are False, so such bars contribute 0.
    up = np.where((up_move > down_move) & (up_move > 0), up_move, 0.0)
    down = np.where((down_move > up_move) & (down_move > 0), down_move, 0.0)

    # The first bar has no predecessor, so both moves start at 0.
    up_series = pd.Series(np.concatenate(([0.0], up)))
    down_series = pd.Series(np.concatenate(([0.0], down)))

    pos_di = up_series.ewm(span=n, min_periods=n).mean()
    neg_di = down_series.ewm(span=n, min_periods=n).mean()
    return (pos_di * 100.0 / (pos_di + neg_di)).round().rename(name)


def add_rsi(df: pd.DataFrame, column: str = "close", window: int = 14) -> pd.DataFrame:
    """Add ``rsi_{column}_{window}`` computed by ``relative_strength_index``.

    ``df`` is modified in place and returned.

    NOTE(review): ``column`` is used only in the output column name; the
    values come from the frame's ``high``/``low`` columns, so calls that
    differ only in ``column`` produce identical features -- confirm intent.
    """
    positional = df.reset_index()  # the indicator expects a 0..n-1 index
    rsi_values = relative_strength_index(positional, window).values
    df[f"rsi_{column}_{window}"] = pd.Series(data=rsi_values, index=df.index)
    return df


#################


def np_racorr(array: np.ndarray, window: int, lag: int) -> np.ndarray:
    """
    Rolling autocorrelation of ``array`` at a single ``lag``.

    Each window is passed to ``sm.tsa.acf`` (FFT-based) and the coefficient
    at ``lag`` is kept; windows are processed with 10 parallel jobs.
    """

    def _acf_at_lag(window_values: np.ndarray, lag: int) -> float:
        return sm.tsa.acf(window_values, nlags=lag, fft=True)[lag]

    return npx.rolling_apply(_acf_at_lag, window, array, lag=lag, n_jobs=10)


def add_rolling_autocorr(
    df: pd.DataFrame, column: str, window: int, lag: int
) -> pd.DataFrame:
    """Add ``racorr_{column}_{window}``: rolling autocorrelation of log changes.

    The autocorrelation is taken at ``lag`` over the first difference of
    ``log(df[column])``. The lag is not part of the column name. ``df`` is
    modified in place and returned.
    """
    log_changes = np.log(df[column]).diff().values
    feature_name = f"racorr_{column}_{window}"
    df[feature_name] = np_racorr(log_changes, window, lag)
    return df


#################
@nb.njit
def custom_percentile(array: np.ndarray) -> float:
    """Fraction of earlier values in the window that exceed its last value.

    Returns NaN for a one-element window, which has no earlier values.
    """
    n_prior = array.shape[0] - 1
    if n_prior == 0:
        return np.nan
    n_above = (array[:-1] > array[-1]).sum()
    return n_above / n_prior


def add_custom_percentile(df: pd.DataFrame, column: str, window: int) -> pd.DataFrame:
    """Add ``rank_{column}_{window}``: rolling ``custom_percentile`` of ``column``.

    Evaluated through ``npx.rolling_apply`` with 5 parallel jobs. ``df`` is
    modified in place and returned.
    """
    series_values = df[column].values
    feature_name = f"rank_{column}_{window}"
    df[feature_name] = npx.rolling_apply(
        custom_percentile, window, series_values, n_jobs=5
    )
    return df

<IPython.core.display.Javascript object>

In [6]:
# create data store for features
# Opening in append mode creates the file if it does not exist yet; the
# handle is closed straight away, so nothing is written by this cell.
# NOTE(review): the feature cells below reopen the file without the
# complevel/complib arguments -- confirm whether the compression settings
# chosen here still apply to the tables they write.

store = pd.HDFStore(
    processed / "spy_features.h5", mode="a", complevel=1, complib="blosc:lz4"
)
store.close()

<IPython.core.display.Javascript object>

In [7]:
from pprint import pprint

<IPython.core.display.Javascript object>

## 5 min multi day features

In [8]:
# Window lengths in 5-minute bars, assuming a 24-hour day (1440 minutes).
# NOTE(review): the progress output of this notebook suggests roughly 79
# five-minute bars per trading day, so a "1_day" window of 288 bars would
# span several sessions -- confirm the labels match the intended horizons.
one_day_in_minutes = 1440
one_day = int(one_day_in_minutes // 5)
two_days = int(one_day * 2)
three_days = int(one_day * 3)
five_days = int(one_day * 5)
ten_days = int(one_day * 10)
one_month = int(one_day * 21)

period_labels = ["1_day", "2_day", "3_day", "5_day", "10_day", "21_day"]
periods = dict(
    zip(period_labels, [one_day, two_days, three_days, five_days, ten_days, one_month])
)
log_errors = []  # one dict(window=..., error=...) per failed window
pprint(periods)

data_frequency = "5m"
df = data[data_frequency].copy()

for key, window in tqdm(periods.items()):
    tqdm._instances.clear()  # reset tqdm's bar registry before the nested bars
    try:
        # Each add_* step was previously applied up to twice with identical
        # arguments (add_acceleration on close and average_price); the repeat
        # only overwrote the same column, so each step now runs once.
        tmp_df = (
            df.pipe(add_average_price)
            .pipe(add_rolling_vwap, column="average_price", window=window)
            .pipe(add_rolling_bands, column=f"rvwap_{window}", dist=2, window=window)
            .pipe(add_internal_bar_strength)
            .pipe(add_rolling_min, column="low", window=window)
            .pipe(add_rolling_max, column="high", window=window)
            # Drop warm-up rows so the slope windows below see no NaNs.
            .dropna()
            .pipe(
                add_slope_column_numba,
                column=f"lower_band_rvwap_{window}",
                window=window,
            )
            .pipe(
                add_slope_column_numba,
                column=f"upper_band_rvwap_{window}",
                window=window,
            )
            .pipe(add_slope_column_numba, column=f"rmin_low_{window}", window=window)
            .pipe(add_slope_column_numba, column=f"rmax_high_{window}", window=window)
            .pipe(add_acceleration, column="close", window=window)
            .pipe(add_aqr_momentum_numba, column="close", window=window)
            .pipe(add_acceleration, column="average_price", window=window)
            .pipe(add_aqr_momentum_numba, column="average_price", window=window)
            .pipe(add_acceleration, column=f"rvwap_{window}", window=window)
            .pipe(add_aqr_momentum_numba, column=f"rvwap_{window}", window=window)
            .pipe(add_volatility, column="close", window=window)
            .pipe(add_volatility, column="average_price", window=window)
            .pipe(add_rsi, column="close", window=window)
            .pipe(add_rsi, column="average_price", window=window)
            .pipe(add_rolling_autocorr, column="close", window=window, lag=1)
            .pipe(add_rolling_autocorr, column="average_price", window=window, lag=1)
        )
        # Rank every column from "up" onwards (flows and engineered features).
        columns_to_rank = tmp_df.columns[tmp_df.columns.get_loc("up") :]
        for column in tqdm(columns_to_rank):
            tmp_df = add_custom_percentile(tmp_df, column, window=int(window // 10))
            # ^^ use a smaller window since this is effectively a double lag

        # write to store iteratively in case of problems
        with pd.HDFStore(processed / "spy_features.h5") as store:
            # Key built from data_frequency, matching the other feature cells.
            store.put(value=tmp_df, key=f"spy/{data_frequency}/{key}", format="table")

    except Exception as error:
        # Best effort: record the failure and continue with the next window.
        log_error = dict(window=window, error=error)
        log_errors.append(log_error)
        pprint(log_error)

{'10_day': 2880,
 '1_day': 288,
 '21_day': 6048,
 '2_day': 576,
 '3_day': 864,
 '5_day': 1440}


100%|███████████████████████████████████████████████████████████████████████| 297285/297285 [00:10<00:00, 28894.21it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [04:57<00:00, 11.45s/it]
100%|███████████████████████████████████████████████████████████████████████| 296997/296997 [00:07<00:00, 37962.76it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [04:45<00:00, 10.99s/it]
100%|███████████████████████████████████████████████████████████████████████| 296709/296709 [00:04<00:00, 59917.35it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [04:09<00:00,  9.59s/it]
100%|███████████████████████████████████████████████████████████████████████| 296133/296133 [00:05<00:00, 54950.79it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [04:16<00:00,  9.87s/it]
100%|███████████████████████████████████

<IPython.core.display.Javascript object>

## 30 min multi day features

In [9]:
from pprint import pprint

<IPython.core.display.Javascript object>

In [10]:
# Window lengths in 30-minute bars, assuming a 24-hour day (1440 minutes).
# NOTE(review): if the data only covers the trading session, a "1_day"
# window of 48 bars spans more than one session -- confirm the labels.
one_day_in_minutes = 1440
one_day = int(one_day_in_minutes // 30)
two_days = int(one_day * 2)
three_days = int(one_day * 3)
five_days = int(one_day * 5)
ten_days = int(one_day * 10)
one_month = int(one_day * 21)

period_labels = ["1_day", "2_day", "3_day", "5_day", "10_day", "21_day"]
# label -> window length in bars; iterated by the feature loop in the next cell
periods = dict(
    zip(period_labels, [one_day, two_days, three_days, five_days, ten_days, one_month])
)
log_errors = []  # reset the error log for this frequency
pprint(periods)

data_frequency = "30m"
df = data[data_frequency].copy()  # copy so the add_* steps do not touch data[...]
cprint(df)

{'10_day': 480,
 '1_day': 48,
 '21_day': 1008,
 '2_day': 96,
 '3_day': 144,
 '5_day': 240}
-------------------------------------------------------------------------------
dataframe information
-------------------------------------------------------------------------------
                       open    high     low   close       up     down   volume
datetime                                                                      
2019-05-14 11:00:00  284.90  285.05  284.30  285.03  1982877  1488027  3470904
2019-05-14 11:30:00  285.03  285.05  284.03  284.32  1803471  2137726  3941197
2019-05-14 12:00:00  284.32  284.77  284.14  284.49  1676140  1627154  3303294
2019-05-14 12:30:00  284.49  284.52  283.40  283.42  4500359  5440864  9941223
2019-05-14 13:00:00  283.42  283.49  283.17  283.32   933738  1283851  2217589
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 52754 entries, 2004-05-14 06:30:00 to 2019-05-14 13:00:00
Data columns

<IPython.core.display.Javascript object>

In [11]:
# Build and store the feature table for every window length in `periods`
# (defined, together with `df` and `data_frequency`, in the previous cell).
for key, window in tqdm(periods.items()):
    tqdm._instances.clear()  # reset tqdm's bar registry before the nested bars
    try:
        # The repeated add_acceleration calls on close and average_price only
        # overwrote the same columns, so each step now runs once.
        tmp_df = (
            df.pipe(add_average_price)
            .pipe(add_rolling_vwap, column="average_price", window=window)
            .pipe(add_rolling_bands, column=f"rvwap_{window}", dist=2, window=window)
            .pipe(add_internal_bar_strength)
            .pipe(add_rolling_min, column="low", window=window)
            .pipe(add_rolling_max, column="high", window=window)
            # Drop warm-up rows so the slope windows below see no NaNs.
            .dropna()
            .pipe(
                add_slope_column_numba,
                column=f"lower_band_rvwap_{window}",
                window=window,
            )
            .pipe(
                add_slope_column_numba,
                column=f"upper_band_rvwap_{window}",
                window=window,
            )
            .pipe(add_slope_column_numba, column=f"rmin_low_{window}", window=window)
            .pipe(add_slope_column_numba, column=f"rmax_high_{window}", window=window)
            .pipe(add_acceleration, column="close", window=window)
            .pipe(add_aqr_momentum_numba, column="close", window=window)
            .pipe(add_acceleration, column="average_price", window=window)
            .pipe(add_aqr_momentum_numba, column="average_price", window=window)
            .pipe(add_acceleration, column=f"rvwap_{window}", window=window)
            .pipe(add_aqr_momentum_numba, column=f"rvwap_{window}", window=window)
            .pipe(add_volatility, column="close", window=window)
            .pipe(add_volatility, column="average_price", window=window)
            .pipe(add_rsi, column="close", window=window)
            .pipe(add_rsi, column="average_price", window=window)
            .pipe(add_rolling_autocorr, column="close", window=window, lag=1)
            .pipe(add_rolling_autocorr, column="average_price", window=window, lag=1)
        )
        # Rank every column from "up" onwards (flows and engineered features).
        columns_to_rank = tmp_df.columns[tmp_df.columns.get_loc("up") :]
        for column in tqdm(columns_to_rank):
            tmp_df = add_custom_percentile(tmp_df, column, window=int(window // 10))
            # ^^ use a smaller window since this is effectively a double lag

        # write to store iteratively in case of problems
        with pd.HDFStore(processed / "spy_features.h5") as store:
            store.put(value=tmp_df, key=f"spy/{data_frequency}/{key}", format="table")

    except Exception as error:
        # Best effort: record the failure and continue with the next window.
        log_error = dict(window=window, error=error)
        log_errors.append(log_error)
        pprint(log_error)

100%|█████████████████████████████████████████████████████████████████████████| 52706/52706 [00:00<00:00, 57143.82it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:54<00:00,  2.10s/it]
100%|█████████████████████████████████████████████████████████████████████████| 52658/52658 [00:00<00:00, 66394.81it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:55<00:00,  2.12s/it]
100%|█████████████████████████████████████████████████████████████████████████| 52610/52610 [00:00<00:00, 63735.60it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:54<00:00,  2.09s/it]
100%|█████████████████████████████████████████████████████████████████████████| 52514/52514 [00:00<00:00, 59397.94it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:55<00:00,  2.12s/it]
100%|███████████████████████████████████

<IPython.core.display.Javascript object>

## 1 Hour (60 min) multi day features

In [12]:
# Window lengths in 60-minute bars, assuming a 24-hour day (1440 minutes).
# NOTE(review): if the data only covers the trading session, a "1_day"
# window of 24 bars spans more than one session -- confirm the labels.
one_day_in_minutes = 1440
one_day = int(one_day_in_minutes // 60)
two_days = int(one_day * 2)
three_days = int(one_day * 3)
five_days = int(one_day * 5)
ten_days = int(one_day * 10)
one_month = int(one_day * 21)
three_month = int(one_month * 3)

period_labels = ["1_day", "2_day", "3_day", "5_day", "10_day", "21_day", "63_day"]
periods = dict(
    zip(
        period_labels,
        [one_day, two_days, three_days, five_days, ten_days, one_month, three_month],
    )
)
log_errors = []  # one dict(window=..., error=...) per failed window
pprint(periods)

data_frequency = "1H"
df = data[data_frequency].copy()

for key, window in tqdm(periods.items()):
    tqdm._instances.clear()  # reset tqdm's bar registry before the nested bars
    try:
        # The repeated add_acceleration calls on close and average_price only
        # overwrote the same columns, so each step now runs once.
        tmp_df = (
            df.pipe(add_average_price)
            .pipe(add_rolling_vwap, column="average_price", window=window)
            .pipe(add_rolling_bands, column=f"rvwap_{window}", dist=2, window=window)
            .pipe(add_internal_bar_strength)
            .pipe(add_rolling_min, column="low", window=window)
            .pipe(add_rolling_max, column="high", window=window)
            # Drop warm-up rows so the slope windows below see no NaNs.
            .dropna()
            .pipe(
                add_slope_column_numba,
                column=f"lower_band_rvwap_{window}",
                window=window,
            )
            .pipe(
                add_slope_column_numba,
                column=f"upper_band_rvwap_{window}",
                window=window,
            )
            .pipe(add_slope_column_numba, column=f"rmin_low_{window}", window=window)
            .pipe(add_slope_column_numba, column=f"rmax_high_{window}", window=window)
            .pipe(add_acceleration, column="close", window=window)
            .pipe(add_aqr_momentum_numba, column="close", window=window)
            .pipe(add_acceleration, column="average_price", window=window)
            .pipe(add_aqr_momentum_numba, column="average_price", window=window)
            .pipe(add_acceleration, column=f"rvwap_{window}", window=window)
            .pipe(add_aqr_momentum_numba, column=f"rvwap_{window}", window=window)
            .pipe(add_volatility, column="close", window=window)
            .pipe(add_volatility, column="average_price", window=window)
            .pipe(add_rsi, column="close", window=window)
            .pipe(add_rsi, column="average_price", window=window)
            .pipe(add_rolling_autocorr, column="close", window=window, lag=1)
            .pipe(add_rolling_autocorr, column="average_price", window=window, lag=1)
        )
        # Rank every column from "up" onwards (flows and engineered features).
        columns_to_rank = tmp_df.columns[tmp_df.columns.get_loc("up") :]
        for column in tqdm(columns_to_rank):
            tmp_df = add_custom_percentile(tmp_df, column, window=int(window // 10))
            # ^^ use a smaller window since this is effectively a double lag

        # write to store iteratively in case of problems
        with pd.HDFStore(processed / "spy_features.h5") as store:
            store.put(value=tmp_df, key=f"spy/{data_frequency}/{key}", format="table")

    except Exception as error:
        # Best effort: record the failure and continue with the next window.
        log_error = dict(window=window, error=error)
        log_errors.append(log_error)
        pprint(log_error)

{'10_day': 240,
 '1_day': 24,
 '21_day': 504,
 '2_day': 48,
 '3_day': 72,
 '5_day': 120,
 '63_day': 1512}


100%|█████████████████████████████████████████████████████████████████████████| 30125/30125 [00:00<00:00, 63750.39it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:34<00:00,  1.34s/it]
100%|█████████████████████████████████████████████████████████████████████████| 30101/30101 [00:00<00:00, 68800.22it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:33<00:00,  1.30s/it]
100%|█████████████████████████████████████████████████████████████████████████| 30077/30077 [00:00<00:00, 65984.33it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:32<00:00,  1.25s/it]
100%|█████████████████████████████████████████████████████████████████████████| 30029/30029 [00:00<00:00, 65281.51it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:33<00:00,  1.28s/it]
100%|███████████████████████████████████

<IPython.core.display.Javascript object>

## daily features

In [13]:
# Window lengths in daily bars (21 bars per month).
one_day = 1
two_days = int(one_day * 2)
three_days = int(one_day * 3)
five_days = int(one_day * 5)
ten_days = int(one_day * 10)
one_month = int(one_day * 21)
three_month = int(one_month * 3)
six_month = int(one_month * 6)
twelve_month = int(one_month * 12)

# The 1-bar window is left out of both lists below.
period_labels = [  # "1_day",
    "2_day",
    "3_day",
    "5_day",
    "10_day",
    "21_day",
    "63_day",
    "126_day",
    "252_day",
]
periods = dict(
    zip(
        period_labels,
        [  # one_day,
            two_days,
            three_days,
            five_days,
            ten_days,
            one_month,
            three_month,
            six_month,
            twelve_month,
        ],
    )
)
log_errors = []  # one dict(window=..., error=...) per failed window
pprint(periods)

data_frequency = "1D"
df = data[data_frequency].copy()

for key, window in tqdm(periods.items()):
    tqdm._instances.clear()  # reset tqdm's bar registry before the nested bars
    try:
        # The repeated add_acceleration calls on close and average_price only
        # overwrote the same columns, so each step now runs once.
        tmp_df = (
            df.pipe(add_average_price)
            .pipe(add_rolling_vwap, column="average_price", window=window)
            .pipe(add_rolling_bands, column=f"rvwap_{window}", dist=2, window=window)
            .pipe(add_internal_bar_strength)
            .pipe(add_rolling_min, column="low", window=window)
            .pipe(add_rolling_max, column="high", window=window)
            # Drop warm-up rows so the slope windows below see no NaNs.
            .dropna()
            .pipe(
                add_slope_column_numba,
                column=f"lower_band_rvwap_{window}",
                window=window,
            )
            .pipe(
                add_slope_column_numba,
                column=f"upper_band_rvwap_{window}",
                window=window,
            )
            .pipe(add_slope_column_numba, column=f"rmin_low_{window}", window=window)
            .pipe(add_slope_column_numba, column=f"rmax_high_{window}", window=window)
            .pipe(add_acceleration, column="close", window=window)
            .pipe(add_aqr_momentum_numba, column="close", window=window)
            .pipe(add_acceleration, column="average_price", window=window)
            .pipe(add_aqr_momentum_numba, column="average_price", window=window)
            .pipe(add_acceleration, column=f"rvwap_{window}", window=window)
            .pipe(add_aqr_momentum_numba, column=f"rvwap_{window}", window=window)
            .pipe(add_volatility, column="close", window=window)
            .pipe(add_volatility, column="average_price", window=window)
            .pipe(add_rsi, column="close", window=window)
            .pipe(add_rsi, column="average_price", window=window)
            .pipe(add_rolling_autocorr, column="close", window=window, lag=1)
            .pipe(add_rolling_autocorr, column="average_price", window=window, lag=1)
        )
        # Rank every column from "up" onwards (flows and engineered features).
        columns_to_rank = tmp_df.columns[tmp_df.columns.get_loc("up") :]
        for column in tqdm(columns_to_rank):
            # Half the feature window here, with a floor of 2 bars.
            tmp_df = add_custom_percentile(
                tmp_df, column, window=max(int(window // 2), 2)
            )
            # ^^ use a smaller window since this is effectively a double lag

        # write to store iteratively in case of problems
        with pd.HDFStore(processed / "spy_features.h5") as store:
            store.put(value=tmp_df, key=f"spy/{data_frequency}/{key}", format="table")

    except Exception as error:
        # Best effort: record the failure and continue with the next window.
        log_error = dict(window=window, error=error)
        log_errors.append(log_error)
        pprint(log_error)

{'10_day': 10,
 '126_day': 126,
 '21_day': 21,
 '252_day': 252,
 '2_day': 2,
 '3_day': 3,
 '5_day': 5,
 '63_day': 63}


100%|███████████████████████████████████████████████████████████████████████████| 3772/3772 [00:00<00:00, 56449.04it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:10<00:00,  2.44it/s]
100%|███████████████████████████████████████████████████████████████████████████| 3771/3771 [00:00<00:00, 68694.28it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:10<00:00,  2.44it/s]
100%|███████████████████████████████████████████████████████████████████████████| 3769/3769 [00:00<00:00, 59090.38it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:10<00:00,  2.36it/s]
100%|███████████████████████████████████████████████████████████████████████████| 3764/3764 [00:00<00:00, 58240.68it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:11<00:00,  2.31it/s]
100%|███████████████████████████████████

<IPython.core.display.Javascript object>

## weekly features

In [14]:
# Window lengths expressed in weekly bars.
# NOTE(review): a month is approximated as 4 weeks, so "12_month" resolves to
# 48 bars rather than 52 -- confirm this approximation is intended.
one_week = 1
two_weeks = 2 * one_week
three_weeks = 3 * one_week
one_month = 4 * one_week
three_month = 3 * one_month
six_month = 6 * one_month
twelve_month = 12 * one_month

# label -> window length; the 1-week window is deliberately disabled
periods = {
    "2_week": two_weeks,
    "3_week": three_weeks,
    "1_month": one_month,
    "3_month": three_month,
    "6_month": six_month,
    "12_month": twelve_month,
}
period_labels = list(periods)
log_errors = []
pprint(periods)

data_frequency = "1W"
# work on a copy so the frame held in `data` is not the object being piped
df = data[data_frequency].copy()

# Build one feature table per look-back period and persist each one as soon as
# it is ready. A failure for one period is recorded in `log_errors` and the
# loop moves on to the next period.
for key, window in tqdm(periods.items()):
    # NOTE(review): clears tqdm's private instance registry; presumably here to
    # stop stale nested bars from piling up in the notebook output -- confirm.
    tqdm._instances.clear()
    try:
        tmp_df = (
            df.pipe(add_average_price)
            .pipe(add_rolling_vwap, column="average_price", window=window)
            .pipe(add_rolling_bands, column=f"rvwap_{window}", dist=2, window=window)
            .pipe(add_internal_bar_strength)
            .pipe(add_rolling_min, column="low", window=window)
            .pipe(add_rolling_max, column="high", window=window)
            # rows containing NaN are removed before the remaining steps run
            .dropna()
            .pipe(
                add_slope_column_numba,
                column=f"lower_band_rvwap_{window}",
                window=window,
            )
            .pipe(
                add_slope_column_numba,
                column=f"upper_band_rvwap_{window}",
                window=window,
            )
            .pipe(add_slope_column_numba, column=f"rmin_low_{window}", window=window)
            .pipe(add_slope_column_numba, column=f"rmax_high_{window}", window=window)
            # NOTE(review): each acceleration feature is computed exactly once.
            # This assumes add_acceleration names its output from
            # (column, window) only, so a repeated call would merely overwrite
            # the same column -- confirm against the helper.
            .pipe(add_acceleration, column="close", window=window)
            .pipe(add_aqr_momentum_numba, column="close", window=window)
            .pipe(add_acceleration, column="average_price", window=window)
            .pipe(add_aqr_momentum_numba, column="average_price", window=window)
            .pipe(add_acceleration, column=f"rvwap_{window}", window=window)
            .pipe(add_aqr_momentum_numba, column=f"rvwap_{window}", window=window)
            .pipe(add_volatility, column="close", window=window)
            .pipe(add_volatility, column="average_price", window=window)
            .pipe(add_rsi, column="close", window=window)
            .pipe(add_rsi, column="average_price", window=window)
            .pipe(add_rolling_autocorr, column="close", window=window, lag=1)
            .pipe(add_rolling_autocorr, column="average_price", window=window, lag=1)
        )

        # Rank every column from "up" through the last column of the frame.
        # The ranking window is half the feature window (never below 2) because
        # ranking an already-rolling feature is basically a double lag.
        rank_window = max(int(window // 2), 2)
        columns_to_rank = tmp_df.columns[tmp_df.columns.get_loc("up") :]
        for column in tqdm(columns_to_rank):
            tmp_df = add_custom_percentile(tmp_df, column, window=rank_window)

        # write to store iteratively in case of problems
        with pd.HDFStore(processed / "spy_features.h5") as store:
            store.put(value=tmp_df, key=f"spy/{data_frequency}/{key}", format="table")

    except Exception as error:
        # keep going with the other periods, but record which one failed;
        # `key` identifies the store entry that was not written
        log_error = dict(key=key, window=window, error=error)
        log_errors.append(log_error)
        pprint(log_error)

{'12_month': 48,
 '1_month': 4,
 '2_week': 2,
 '3_month': 12,
 '3_week': 3,
 '6_month': 24}


100%|█████████████████████████████████████████████████████████████████████████████| 781/781 [00:00<00:00, 39157.40it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:08<00:00,  3.20it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 780/780 [00:00<00:00, 77847.88it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:08<00:00,  3.14it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 779/779 [00:00<00:00, 64904.61it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:08<00:00,  3.17it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 771/771 [00:00<00:00, 51537.26it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:07<00:00,  3.27it/s]
100%|███████████████████████████████████

<IPython.core.display.Javascript object>

In [15]:
# Show the per-period failures collected by the feature loop above;
# an empty list means no exception was caught for any window.
pprint(log_errors)

[]


<IPython.core.display.Javascript object>