# Generating trading signals with LightGBM

## Imports & Settings

In [122]:
import warnings

warnings.filterwarnings("ignore")

In [123]:
#!pip install lightgbm==3.3.2
#!pip install catboost==1.0.5

In [124]:
%matplotlib inline

import os
import sys
from collections import defaultdict
from itertools import product
from pathlib import Path
from time import time

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from alphalens.tears import create_full_tear_sheet, create_summary_tear_sheet
from alphalens.utils import get_clean_factor_and_forward_returns
from catboost import CatBoostRegressor, Pool
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression

In [125]:
np.random.seed(42)

In [126]:
sys.path.insert(1, os.path.join(sys.path[0], ".."))
from utils import MultipleTimeSeriesCV, format_time

In [127]:
sns.set_style("whitegrid")

## Algunas declaraciones iniciales

In [128]:
# Flag for weekly data: a value of 1 makes the YEAR constant below 52 periods
datos_semanales = 1

In [129]:
# Number of validation years; together with YEAR and the test length it sets
# the number of cross-validation splits in the tuning loops
years_val = 12

In [130]:
# Periods per year: 52 when weekly data is selected, otherwise 12
# (presumably monthly; a daily setup would use 252).
YEAR = 52 if datos_semanales == 1 else 12

# shorthand for slicing the (ticker, date) MultiIndex
idx = pd.IndexSlice

In [131]:
# Set to True to delete previously stored results before tuning; otherwise set borrado = False
borrado = True

## Get Data (for train & validation period)

We select the train and validation sets, and identify labels and features:

In [132]:
DATA_STORE = "../data/assets.h5"

In [133]:
with pd.HDFStore(DATA_STORE) as store:
    data = (
        store["engineered_features"].sort_index().loc[idx[:, :"2018"], :]
    )  # train & validation period

In [134]:
# Directory that receives the tuning result stores.
results_path = Path("..", "data", "results", "us_stocks")

# exist_ok=True makes the cell idempotent and avoids the check-then-create race
results_path.mkdir(parents=True, exist_ok=True)

In [135]:
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 8937 entries, ('XLB', Timestamp('1999-12-26 00:00:00')) to ('XLY', Timestamp('2018-12-30 00:00:00'))
Data columns (total 89 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   return_1w                  8469 non-null   float64
 1   return_2w                  8469 non-null   float64
 2   return_3w                  8469 non-null   float64
 3   return_6w                  8469 non-null   float64
 4   return_12w                 8469 non-null   float64
 5   return_52w                 8469 non-null   float64
 6   const                      7875 non-null   float64
 7   Mkt-RF                     7875 non-null   float64
 8   SMB                        7875 non-null   float64
 9   HML                        7875 non-null   float64
 10  RMW                        7875 non-null   float64
 11  CMA                        7875 non-null   float64
 12  const_diff                 

In [136]:
data.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,return_1w,return_2w,return_3w,return_6w,return_12w,return_52w,const,Mkt-RF,SMB,HML,...,weekjobclaims_diff,weekjobclaims_chg,retail_sales_percent_diff,retail_sales_percent_chg,retail_sales_diff,retail_sales_chg,us_asset_balance_diff,us_asset_balance_chg,1y_yield_diff,1y_yield_chg
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
XLB,1999-12-26,,,,,,,,,,,...,-19000.0,-0.066202,0.8,0.571429,5164.0,0.022028,5.0,8e-06,0.03,0.005059
XLB,2000-01-02,,,,,,,,,,,...,18000.0,0.067164,-3.1,-1.409091,-2048.0,-0.008548,5.0,8e-06,0.02,0.003356
XLB,2000-01-09,,,,,,,,,,,...,12000.0,0.041958,-3.1,-1.409091,-2048.0,-0.008548,5.0,8e-06,0.02,0.003344
XLB,2000-01-16,,,,,,,,,,,...,-9000.0,-0.030201,-3.1,-1.409091,-2048.0,-0.008548,5.0,8e-06,0.13,0.021667
XLB,2000-01-23,,,,,,,,,,,...,-5000.0,-0.017301,-3.1,-1.409091,-2048.0,-0.008548,5.0,8e-06,-0.01,-0.001631
XLB,2000-01-30,,,,,,,,,,,...,1000.0,0.003521,-3.1,-1.409091,-2048.0,-0.008548,5.0,8e-06,0.13,0.021242
XLB,2000-02-06,,,,,,,,,,,...,27000.0,0.094737,2.6,-2.888889,4000.0,0.016839,5.0,8e-06,-0.06,-0.0096
XLB,2000-02-13,,,,,,,,,,,...,-12000.0,-0.038462,2.6,-2.888889,4000.0,0.016839,5.0,8e-06,-0.02,-0.003231
XLB,2000-02-20,,,,,,,,,,,...,-17000.0,-0.056667,2.6,-2.888889,4000.0,0.016839,5.0,8e-06,0.09,0.014587
XLB,2000-02-27,,,,,,,,,,,...,-3000.0,-0.010601,2.6,-2.888889,4000.0,0.016839,5.0,8e-06,-0.08,-0.01278


In [137]:
# target columns are those whose name contains "target_"; every other column is a feature
labels = sorted(data.filter(like="target_").columns)
features = data.columns.difference(labels).tolist()
# target column used by the tuning loops below
label = "target_1w"

## Model Selection: Lookback, lookahead and roll-forward periods

In [138]:
tickers = data.index.get_level_values("ticker").unique()

In [139]:
tickers

Index(['XLB', 'XLE', 'XLF', 'XLI', 'XLK', 'XLP', 'XLU', 'XLV', 'XLY'], dtype='object', name='ticker')

In [140]:
lookaheads = [1]

In [141]:
categoricals = ["month", "sector"]

We use training periods of 216 and 52 weeks (about 4.5 years and one year); test periods are 1 and 12 weeks long. Since we validate on `years_val = 12` years of weekly data (`YEAR = 52`), a one-week test period implies 52 x 12 = 624 folds and a 12-week test period implies 52 folds.

In [142]:
train_lengths = [int(4.5 * 12 * 4), 52]  # 216 periods (4.5 years at 12 * 4 = 48 per year) and 52 periods (one year of weekly data)
test_lengths = [1, 12]

In [143]:
test_params = list(product(lookaheads, train_lengths, test_lengths))

In [144]:
test_params

[(1, 216, 1), (1, 216, 12), (1, 52, 1), (1, 52, 12)]

## LightGBM Model Tuning

The notebook example iterates over many configurations, optionally using random samples to speed up model selection using a diverse subset. The goal is to identify the most impactful parameters without trying every possible combination.

In [145]:
def get_fi(model):
    """Return each feature's share of the model's total gain importance.

    Args:
        model: object exposing ``feature_importance(importance_type=...)``
            and ``feature_name()`` (e.g. a trained LightGBM booster).

    Returns:
        pd.Series indexed by feature name whose values are the gain
        importances divided by their sum.
    """
    gain = model.feature_importance(importance_type="gain")
    total_gain = gain.sum()
    return pd.Series(gain / total_gain, index=model.feature_name())

### Hyperparameter Options

The base_params are not affected by cross-validation:

In [146]:
base_params = dict(boosting="gbdt", objective="regression", verbose=-1)

We choose the following parameters and values to select our best model (see book chapter for detail):

In [147]:
learning_rate_ops = [0.01, 0.1, 0.3]
max_depths = [2, 3, 5, 7]
num_leaves_opts = [2**i for i in max_depths]
min_data_in_leaf_opts = [int(250 / 5), int(500 / 5), int(1000 / 5)]
feature_fraction_opts = [0.3, 0.6, 0.95]

In [148]:
print("learning_rate_ops:", learning_rate_ops)
print("max_depths:", max_depths)
print("num_leaves_opts:", num_leaves_opts)
print("min_data_in_leaf_opts:", min_data_in_leaf_opts)
print("feature_fraction_opts:", feature_fraction_opts)

learning_rate_ops: [0.01, 0.1, 0.3]
max_depths: [2, 3, 5, 7]
num_leaves_opts: [4, 8, 32, 128]
min_data_in_leaf_opts: [50, 100, 200]
feature_fraction_opts: [0.3, 0.6, 0.95]


In [149]:
param_names = ["learning_rate", "num_leaves", "feature_fraction", "min_data_in_leaf"]

In [150]:
cv_params = list(
    product(
        learning_rate_ops, num_leaves_opts, feature_fraction_opts, min_data_in_leaf_opts
    )
)
n_params = len(cv_params)
print(f"# Parameters: {n_params}")

# Parameters: 108


### Train/Test Period Lengths

In [151]:
# every (lookahead, train_length, test_length) combination
test_params = list(product(lookaheads, train_lengths, test_lengths))
n = len(test_params)
# n of n indices are drawn without replacement, so this only randomizes the order
test_param_sample = np.random.choice(list(range(n)), size=int(n), replace=False)
test_params = [test_params[i] for i in test_param_sample]
print("Train configs:", len(test_params))

Train configs: 4


In [152]:
test_params

[(1, 216, 12), (1, 52, 12), (1, 216, 1), (1, 52, 1)]

### Categorical Variables

In [153]:
# columns passed to LightGBM as categorical features
categoricals = ["month", "sector"]
# replace each categorical column in `data` with integer codes (sort=True: codes
# follow the sorted unique values); this overwrites the columns in place
for feature in categoricals:
    data[feature] = pd.factorize(data[feature], sort=True)[0]

### Custom Loss Function: Information Coefficient

In [154]:
def ic_lgbm(preds, train_data):
    """Information coefficient as a custom LightGBM evaluation metric.

    Args:
        preds: predicted values.
        train_data: dataset object whose ``get_label()`` returns the targets.

    Returns:
        Tuple ``("ic", value, True)`` where ``value`` is the Spearman rank
        correlation between predictions and labels and the final flag marks
        the metric as higher-is-better.
    """
    rank_corr = spearmanr(preds, train_data.get_label())[0]
    return "ic", rank_corr, True

### Run Cross-Validation

In [155]:
# HDF5 store that receives the LightGBM tuning results; bound unconditionally
# so the name exists whether or not old results are deleted.
lgb_store = results_path / "tuning_lgb.h5"

# When `borrado` is set, remove the results of earlier runs so they are not
# mixed with the new ones.
if borrado:
    if lgb_store.exists():
        lgb_store.unlink()
        print(f"El archivo {lgb_store} ha sido eliminado.")
    else:
        print(f"No se encontró el archivo {lgb_store}.")

No se encontró el archivo ../data/results/us_stocks/tuning_lgb.h5.


In [156]:
lgb_store = Path(results_path / "tuning_lgb.h5")

In [157]:
labels = sorted(data.filter(like="target").columns)
features = data.columns.difference(labels).tolist()

In [158]:
labels

['target_12w', 'target_1w', 'target_2w', 'target_3w', 'target_6w']

In [159]:
features

['1y_yield',
 '1y_yield_chg',
 '1y_yield_diff',
 'CMA',
 'CMA_diff',
 'HML',
 'HML_diff',
 'Mkt-RF',
 'Mkt-RF_diff',
 'RMW',
 'RMW_diff',
 'SMB',
 'SMB_diff',
 'const',
 'const_diff',
 'corp_oas',
 'corp_oas_chg',
 'corp_oas_diff',
 'empleo_chg',
 'empleo_diff',
 'eu_hy_oas',
 'eu_hy_oas_chg',
 'eu_hy_oas_diff',
 'hy_oas',
 'hy_oas_chg',
 'hy_oas_diff',
 'inflacion',
 'inflacion_chg',
 'inflacion_diff',
 'leading',
 'leading_chg',
 'leading_diff',
 'momentum_12',
 'momentum_2',
 'momentum_3',
 'momentum_3_12',
 'momentum_52',
 'momentum_6',
 'month',
 'oil',
 'oil_chg',
 'oil_diff',
 'real_gdp',
 'real_gdp_chg',
 'real_gdp_diff',
 'recession',
 'recession_chg',
 'recession_diff',
 'retail_sales',
 'retail_sales_chg',
 'retail_sales_diff',
 'retail_sales_percent',
 'retail_sales_percent_chg',
 'retail_sales_percent_diff',
 'return_12w',
 'return_1w',
 'return_1w_t-1',
 'return_1w_t-2',
 'return_1w_t-3',
 'return_1w_t-4',
 'return_1w_t-5',
 'return_1w_t-6',
 'return_2w',
 'return_3w',
 '

In [160]:
num_iterations = [50, 100]
num_boost_round = num_iterations[-1]

In [161]:
metric_cols = (
    param_names
    + ["t", "daily_ic_mean", "daily_ic_mean_n", "daily_ic_median", "daily_ic_median_n"]
    + [str(n) for n in num_iterations]
)

In [162]:
metric_cols

['learning_rate',
 'num_leaves',
 'feature_fraction',
 'min_data_in_leaf',
 't',
 'daily_ic_mean',
 'daily_ic_mean_n',
 'daily_ic_median',
 'daily_ic_median_n',
 '50',
 '100']

We iterate over our four CV configurations and collect the resulting metrics:

In [None]:
# Tune LightGBM: for every (lookahead, train_length, test_length) configuration,
# cross-validate a random half of the hyperparameter grid and persist metrics,
# daily ICs, feature importances and predictions to `lgb_store`.
for lookahead, train_length, test_length in test_params:
    # randomized grid search: evaluate a random half of the parameter grid
    cvp = np.random.choice(list(range(n_params)), size=int(n_params / 2), replace=False)
    cv_params_ = [cv_params[i] for i in cvp]

    # number of folds: years_val validation years of YEAR periods each,
    # divided into test windows of test_length periods
    n_splits = int(YEAR * years_val / test_length)
    print(
        f"Lookahead: {lookahead:2.0f} | "
        f"Train: {train_length:3.0f} | "
        f"Test: {test_length:2.0f} | "
        f"Params: {len(cv_params_):3.0f} | "
        f"Train configs: {len(test_params)}"
    )

    # time-series cross-validation
    cv = MultipleTimeSeriesCV(
        n_splits=n_splits,
        lookahead=lookahead,
        test_period_length=test_length,
        train_period_length=train_length,
    )

    # the target is the module-level `label` (no per-lookahead lookup);
    # rows with a missing feature or target value are dropped
    outcome_data = data.loc[:, features + [label]].dropna()

    # binary dataset; free_raw_data=False keeps the raw data so that
    # per-fold subsets can be built from it
    lgb_data = lgb.Dataset(
        data=outcome_data.drop(label, axis=1),
        label=outcome_data[label],
        categorical_feature=categoricals,
        free_raw_data=False,
    )
    T = 0  # cumulative run time in seconds for this configuration
    # one prediction frame per hyperparameter combination; not read again in
    # this cell, retained so the frames stay available in the kernel
    predictions = []

    # iterate over (shuffled) hyperparameter combinations
    for p, param_vals in enumerate(cv_params_):
        # HDF5 key identifying the CV configuration and parameter values
        key = f"{lookahead}/{train_length}/{test_length}/" + "/".join(
            str(val) for val in param_vals
        )
        params = dict(zip(param_names, param_vals))
        params.update(base_params)

        start = time()
        fold_preds = []

        # iterate over folds
        for i, (train_idx, test_idx) in enumerate(cv.split(X=outcome_data)):

            # select train subset
            lgb_train = lgb_data.subset(
                used_indices=train_idx.tolist(), params=params
            ).construct()

            # train model for num_boost_round
            model = lgb.train(
                params=params, train_set=lgb_train, num_boost_round=num_boost_round
            )
            # log feature importance: one column per fold
            if i == 0:
                fi = get_fi(model).to_frame()
            else:
                fi[i] = get_fi(model)

            # capture predictions for each number of boosting iterations
            test_set = outcome_data.iloc[test_idx, :]
            X_test = test_set.loc[:, model.feature_name()]
            y_test = test_set.loc[:, label]
            y_pred = {
                str(n): model.predict(X_test, num_iteration=n) for n in num_iterations
            }

            # record predictions for each fold
            fold_preds.append(y_test.to_frame("y_test").assign(**y_pred).assign(i=i))

        # combine fold results and tag them with the parameter values
        cv_preds = pd.concat(fold_preds).assign(**params)
        predictions.append(cv_preds)

        # compute IC per date (the "daily_*" names are kept as-is)
        by_day = cv_preds.groupby(level="date")
        ic_by_day = pd.concat(
            [
                by_day.apply(lambda x: spearmanr(x.y_test, x[str(n)])[0]).to_frame(n)
                for n in num_iterations
            ],
            axis=1,
        )
        daily_ic_mean = ic_by_day.mean()
        daily_ic_mean_n = daily_ic_mean.idxmax()
        daily_ic_median = ic_by_day.median()
        daily_ic_median_n = daily_ic_median.idxmax()

        # compute IC across all predictions
        ic = [spearmanr(cv_preds.y_test, cv_preds[str(n)])[0] for n in num_iterations]
        t = time() - start
        T += t

        # collect metrics in the order given by metric_cols
        metrics = pd.Series(
            list(param_vals)
            + [
                t,
                daily_ic_mean.max(),
                daily_ic_mean_n,
                daily_ic_median.max(),
                daily_ic_median_n,
            ]
            + ic,
            index=metric_cols,
        )
        msg = f'\t{p:3.0f} | {format_time(T)} ({t:3.0f}) | {params["learning_rate"]:5.2f} | '
        msg += f'{params["num_leaves"]:3.0f} | {params["feature_fraction"]:3.0%} | {params["min_data_in_leaf"]:4.0f} | '
        msg += f" {max(ic):6.2%} | {ic_by_day.mean().max(): 6.2%} | {daily_ic_mean_n: 4.0f} | {ic_by_day.median().max(): 6.2%} | {daily_ic_median_n: 4.0f}"
        print(msg)

        # persist results for given CV run and hyperparameter combination;
        # `key` is passed by keyword because positional use is deprecated in
        # recent pandas releases
        metrics.to_hdf(lgb_store, key="metrics/" + key)
        ic_by_day.assign(**params).to_hdf(lgb_store, key="daily_ic/" + key)
        fi.T.describe().T.assign(**params).to_hdf(lgb_store, key="fi/" + key)
        cv_preds.to_hdf(lgb_store, key="predictions/" + key)

Lookahead:  1 | Train: 216 | Test: 12 | Params:  54 | Train configs: 4
	  0 | 00:00:04 (  4) |  0.10 |   4 | 60% |  100 |   2.70% |  0.25% |  100 |  1.67% |  100
	  1 | 00:00:10 (  6) |  0.10 | 128 | 30% |  200 |   3.38% |  0.23% |  100 |  1.67% |   50
	  2 | 00:00:14 (  4) |  0.01 |   4 | 60% |  100 |   0.83% | -1.88% |  100 |  0.00% |   50
	  3 | 00:00:20 (  6) |  0.10 |   8 | 30% |  200 |   3.35% |  0.73% |  100 |  3.33% |  100
	  4 | 00:00:24 (  4) |  0.10 |   4 | 95% |   50 |   2.44% |  0.86% |  100 |  0.00% |   50
	  5 | 00:00:46 ( 21) |  0.10 | 128 | 95% |   50 |   0.83% |  1.48% |  100 |  2.50% |  100
	  6 | 00:00:52 (  6) |  0.01 |  32 | 95% |  200 |   0.95% | -0.37% |   50 |  0.00% |   50
	  7 | 00:00:58 (  6) |  0.01 |   8 | 30% |  200 |   4.80% | -2.17% |   50 | -0.89% |   50
	  8 | 00:01:04 (  7) |  0.01 |   8 | 30% |  100 |   3.50% | -2.57% |  100 | -1.69% |  100
	  9 | 00:01:08 (  4) |  0.10 |   4 | 95% |  200 |   3.72% |  1.49% |  100 |  1.67% |  100
	 10 | 00:01:12 (  

In [None]:
[(1, 216, 12), (1, 52, 12), (1, 216, 1), (1, 52, 1)]

In [164]:
# To look at the data itself, convert the dataset's data to a pandas DataFrame.
# Relies on `lgb_train` left over from the last fold of the CV loop above.
df = pd.DataFrame(lgb_train.data)
print(df.tail(100))

                   1y_yield  1y_yield_chg  1y_yield_diff       CMA  CMA_diff  \
ticker date                                                                    
XLV    2006-02-19      4.68     -0.004255          -0.02  0.300588 -0.428706   
       2006-02-26      4.73      0.010684           0.05  0.300588 -0.428706   
       2006-03-05      4.75      0.004228           0.02  0.300588 -0.428706   
       2006-03-12      4.77      0.004211           0.02  0.300588 -0.428706   
       2006-03-19      4.74     -0.006289          -0.03  0.300588 -0.428706   
...                     ...           ...            ...       ...       ...   
XLY    2006-12-17      4.96      0.002020           0.01  0.409240  0.451581   
       2006-12-24      4.96      0.002020           0.01  0.409240  0.451581   
       2006-12-31      5.00      0.008065           0.04  0.841837  0.432597   
       2007-01-07      4.98     -0.004000          -0.02  0.841837  0.432597   
       2007-01-14      5.06      0.01606

## LightGBM Random Forest Model Tuning

Helper function to obtain the LightGBM feature importance metrics:

The notebook example iterates over many configurations, optionally using random samples to speed up model selection using a diverse subset. The goal is to identify the most impactful parameters without trying every possible combination.

In [165]:
def get_fi(model):
    """Normalized gain-based feature importance of a trained model.

    Args:
        model: object providing ``feature_importance(importance_type=...)``
            and ``feature_name()``.

    Returns:
        pd.Series, indexed by feature name, holding each feature's gain
        importance divided by the sum over all features.
    """
    raw_gain = model.feature_importance(importance_type="gain")
    names = model.feature_name()
    normalized = raw_gain / raw_gain.sum()
    return pd.Series(normalized, index=names)

### Hyperparameter Options

The base_params are not affected by cross-validation:

In [166]:
base_params = dict(
    boosting_type="rf", objective="regression", bagging_freq=1, verbose=-1
)

In [167]:
# Row count of `data`, the full data set
n_samples = data.shape[0]

# Set min_data_in_leaf as a percentage of the total number of samples.
# NOTE(review): the floor is sized from the whole panel; the training folds
# are presumably much smaller (train_length periods per ticker) — confirm
# that this value still allows splits in the shortest training window.
min_data_in_leaf_op0 = max(
    1, int(n_samples * 0.01)
)  # 1% of all samples, at least 1

In [168]:
min_data_in_leaf_op0

89

We choose the following parameters and values to select our best model (see book chapter for detail):

In [169]:
# Random-forest search grid.
bagging_fraction_opts = [0.75, 0.95]
feature_fraction_opts = [0.75, 0.95]
# minimum leaf sizes: the data-driven value computed above plus two fixed sizes
# (an earlier assignment of this name was immediately overwritten and is removed)
min_data_in_leaf_opts = [min_data_in_leaf_op0, int(500 / 5), int(1000 / 5)]
# -1 means no depth limit in LightGBM
max_depth_opts = [5, 10, -1]

In [170]:
# Imprime los valores de los hiperparámetros utilizados para Random Forest-LightGBM
print("bagging_fraction_opts:", bagging_fraction_opts)
print("feature_fraction_opts:", feature_fraction_opts)
print("min_data_in_leaf_opts:", min_data_in_leaf_opts)
print("max_depth_opts:", max_depth_opts)

bagging_fraction_opts: [0.75, 0.95]
feature_fraction_opts: [0.75, 0.95]
min_data_in_leaf_opts: [89, 100, 200]
max_depth_opts: [5, 10, -1]


In [171]:
param_names = ["bagging_fraction", "feature_fraction", "min_data_in_leaf", "max_depth"]

In [172]:
cv_params = list(
    product(
        bagging_fraction_opts,
        feature_fraction_opts,
        min_data_in_leaf_opts,
        max_depth_opts,
    )
)
n_params = len(cv_params)
print(f"# Parameters: {n_params}")

# Parameters: 36


### Train/Test Period Lengths

In [173]:
# every (lookahead, train_length, test_length) combination
test_params = list(product(lookaheads, train_lengths, test_lengths))
n = len(test_params)
# n of n indices are drawn without replacement, so this only randomizes the order
test_param_sample = np.random.choice(list(range(n)), size=int(n), replace=False)
test_params = [test_params[i] for i in test_param_sample]
print("Train configs:", len(test_params))

Train configs: 4


### Categorical Variables

In [174]:
# columns passed to LightGBM as categorical features
categoricals = ["month", "sector"]
# integer-encode the categorical columns of `data` in place (sort=True: codes
# follow the sorted unique values).
# NOTE(review): the same encoding already ran in the LightGBM section; re-encoding
# presumably leaves the codes unchanged — confirm there are no missing values (-1 codes)
for feature in categoricals:
    data[feature] = pd.factorize(data[feature], sort=True)[0]

### Custom Loss Function: Information Coefficient

In [175]:
def ic_rf(preds, train_data):
    """Information coefficient as a custom LightGBM evaluation metric.

    Args:
        preds: predicted values.
        train_data: dataset object whose ``get_label()`` returns the targets.

    Returns:
        Tuple ``("ic", value, True)``: the metric name, the Spearman rank
        correlation of predictions and labels, and the higher-is-better flag.
    """
    labels_ = train_data.get_label()
    correlation = spearmanr(preds, labels_)[0]
    return "ic", correlation, True

### Run Cross-Validation

In [176]:
# HDF5 store that receives the random-forest tuning results.
rf_store = results_path / "tuning_rf.h5"

# Only wipe results of earlier runs when `borrado` asks for it; previously this
# cell deleted the store unconditionally, ignoring the flag that guards the
# LightGBM store.
if borrado:
    # check whether the file exists
    if rf_store.exists():
        # delete the file
        rf_store.unlink()
        print(f"El archivo {rf_store} ha sido eliminado.")
    else:
        print(f"No se encontró el archivo {rf_store}.")

No se encontró el archivo ../data/results/us_stocks/tuning_rf.h5.


In [177]:
rf_store = Path(results_path / "tuning_rf.h5")

In [178]:
labels = sorted(data.filter(like="target").columns)
features = data.columns.difference(labels).tolist()

In [179]:
labels

['target_12w', 'target_1w', 'target_2w', 'target_3w', 'target_6w']

In [180]:
features

['1y_yield',
 '1y_yield_chg',
 '1y_yield_diff',
 'CMA',
 'CMA_diff',
 'HML',
 'HML_diff',
 'Mkt-RF',
 'Mkt-RF_diff',
 'RMW',
 'RMW_diff',
 'SMB',
 'SMB_diff',
 'const',
 'const_diff',
 'corp_oas',
 'corp_oas_chg',
 'corp_oas_diff',
 'empleo_chg',
 'empleo_diff',
 'eu_hy_oas',
 'eu_hy_oas_chg',
 'eu_hy_oas_diff',
 'hy_oas',
 'hy_oas_chg',
 'hy_oas_diff',
 'inflacion',
 'inflacion_chg',
 'inflacion_diff',
 'leading',
 'leading_chg',
 'leading_diff',
 'momentum_12',
 'momentum_2',
 'momentum_3',
 'momentum_3_12',
 'momentum_52',
 'momentum_6',
 'month',
 'oil',
 'oil_chg',
 'oil_diff',
 'real_gdp',
 'real_gdp_chg',
 'real_gdp_diff',
 'recession',
 'recession_chg',
 'recession_diff',
 'retail_sales',
 'retail_sales_chg',
 'retail_sales_diff',
 'retail_sales_percent',
 'retail_sales_percent_chg',
 'retail_sales_percent_diff',
 'return_12w',
 'return_1w',
 'return_1w_t-1',
 'return_1w_t-2',
 'return_1w_t-3',
 'return_1w_t-4',
 'return_1w_t-5',
 'return_1w_t-6',
 'return_2w',
 'return_3w',
 '

In [181]:
num_iterations = [50, 100]
num_boost_round = num_iterations[-1]

In [182]:
num_iterations

[50, 100]

In [183]:
num_boost_round

100

In [184]:
metric_cols = (
    param_names
    + ["t", "daily_ic_mean", "daily_ic_mean_n", "daily_ic_median", "daily_ic_median_n"]
    + [str(n) for n in num_iterations]
)

In [185]:
metric_cols

['bagging_fraction',
 'feature_fraction',
 'min_data_in_leaf',
 'max_depth',
 't',
 'daily_ic_mean',
 'daily_ic_mean_n',
 'daily_ic_median',
 'daily_ic_median_n',
 '50',
 '100']

Now we take the following steps:
- we iterate over the prediction horizons and train/test period length,
- set up the `MultipleTimeSeriesCV` accordingly
- create the binary LightGBM dataset with the appropriate target, and
- iterate over the model hyperparameters to train and validate the model while capturing the relevant performance metrics:

In [186]:
# Tune the LightGBM random forest: for every (lookahead, train_length,
# test_length) configuration, cross-validate a random half of the
# hyperparameter grid and persist metrics, daily ICs, feature importances and
# predictions to `rf_store`.
for lookahead, train_length, test_length in test_params:
    # randomized grid search: evaluate a random half of the parameter grid
    cvp = np.random.choice(list(range(n_params)), size=int(n_params / 2), replace=False)
    cv_params_ = [cv_params[i] for i in cvp]

    # number of folds: years_val validation years of YEAR periods each,
    # divided into test windows of test_length periods
    n_splits = int(YEAR * years_val / test_length)
    print(
        f"Lookahead: {lookahead:2.0f} | "
        f"Train: {train_length:3.0f} | "
        f"Test: {test_length:2.0f} | "
        f"Params: {len(cv_params_):3.0f} | "
        f"Train configs: {len(test_params)}"
    )

    # time-series cross-validation
    cv = MultipleTimeSeriesCV(
        n_splits=n_splits,
        lookahead=lookahead,
        test_period_length=test_length,
        train_period_length=train_length,
    )

    # the target is the module-level `label` (no per-lookahead lookup);
    # rows with a missing feature or target value are dropped
    outcome_data = data.loc[:, features + [label]].dropna()

    # binary dataset; free_raw_data=False keeps the raw data so that
    # per-fold subsets can be built from it
    lgb_data = lgb.Dataset(
        data=outcome_data.drop(label, axis=1),
        label=outcome_data[label],
        categorical_feature=categoricals,
        free_raw_data=False,
    )
    T = 0  # cumulative run time in seconds for this configuration
    # one prediction frame per hyperparameter combination; not read again in
    # this cell, retained so the frames stay available in the kernel
    predictions = []

    # iterate over (shuffled) hyperparameter combinations
    for p, param_vals in enumerate(cv_params_):
        # HDF5 key identifying the CV configuration and parameter values
        key = f"{lookahead}/{train_length}/{test_length}/" + "/".join(
            str(val) for val in param_vals
        )
        params = dict(zip(param_names, param_vals))
        params.update(base_params)

        start = time()
        fold_preds = []

        # iterate over folds
        for i, (train_idx, test_idx) in enumerate(cv.split(X=outcome_data)):

            # select train subset
            lgb_train = lgb_data.subset(
                used_indices=train_idx.tolist(), params=params
            ).construct()

            # train model for num_boost_round
            model = lgb.train(
                params=params, train_set=lgb_train, num_boost_round=num_boost_round
            )
            # log feature importance: one column per fold
            if i == 0:
                fi = get_fi(model).to_frame()
            else:
                fi[i] = get_fi(model)

            # capture predictions for each number of boosting iterations
            test_set = outcome_data.iloc[test_idx, :]
            X_test = test_set.loc[:, model.feature_name()]
            y_test = test_set.loc[:, label]
            y_pred = {
                str(n): model.predict(X_test, num_iteration=n) for n in num_iterations
            }

            # record predictions for each fold
            fold_preds.append(y_test.to_frame("y_test").assign(**y_pred).assign(i=i))

        # combine fold results and tag them with the parameter values
        cv_preds = pd.concat(fold_preds).assign(**params)
        predictions.append(cv_preds)

        # compute IC per date (the "daily_*" names are kept as-is).
        # NOTE(review): spearmanr yields NaN for a date on which all predictions
        # are identical; mean()/median() below skip those dates.
        by_day = cv_preds.groupby(level="date")
        ic_by_day = pd.concat(
            [
                by_day.apply(lambda x: spearmanr(x.y_test, x[str(n)])[0]).to_frame(n)
                for n in num_iterations
            ],
            axis=1,
        )
        daily_ic_mean = ic_by_day.mean()
        daily_ic_mean_n = daily_ic_mean.idxmax()
        daily_ic_median = ic_by_day.median()
        daily_ic_median_n = daily_ic_median.idxmax()

        # compute IC across all predictions
        ic = [spearmanr(cv_preds.y_test, cv_preds[str(n)])[0] for n in num_iterations]
        t = time() - start
        T += t

        # collect metrics in the order given by metric_cols
        metrics = pd.Series(
            list(param_vals)
            + [
                t,
                daily_ic_mean.max(),
                daily_ic_mean_n,
                daily_ic_median.max(),
                daily_ic_median_n,
            ]
            + ic,
            index=metric_cols,
        )
        msg = f'\t{p:3.0f} | {format_time(T)} ({t:3.0f}) | {params["bagging_fraction"]:5.2f} | '
        msg += f' {params["feature_fraction"]:3.0%} | {params["min_data_in_leaf"]:4.0f} |  {params["max_depth"]:4.0f} |'
        msg += f" {max(ic):6.2%} | {ic_by_day.mean().max(): 6.2%} | {daily_ic_mean_n: 4.0f} | {ic_by_day.median().max(): 6.2%} | {daily_ic_median_n: 4.0f} |"

        print(msg)

        # persist results for given CV run and hyperparameter combination;
        # `key` is passed by keyword because positional use is deprecated in
        # recent pandas releases
        metrics.to_hdf(rf_store, key="metrics/" + key)
        ic_by_day.assign(**params).to_hdf(rf_store, key="daily_ic/" + key)
        fi.T.describe().T.assign(**params).to_hdf(rf_store, key="fi/" + key)
        cv_preds.to_hdf(rf_store, key="predictions/" + key)

Lookahead:  1 | Train: 216 | Test:  1 | Params:  18 | Train configs: 4
	  0 | 00:01:58 (118) |  0.75 |  95% |   89 |    10 | -2.61% |  1.02% |  100 |  0.85% |  100 |
	  1 | 00:03:13 ( 75) |  0.95 |  75% |  200 |     5 |  1.85% | -1.37% |   50 | -1.73% |   50 |
	  2 | 00:04:52 (100) |  0.75 |  95% |   89 |     5 | -2.62% | -0.11% |  100 | -2.28% |   50 |
	  3 | 00:06:40 (107) |  0.75 |  95% |  100 |    10 | -2.34% | -1.92% |   50 | -1.83% |  100 |
	  4 | 00:07:56 ( 77) |  0.95 |  95% |  200 |    10 |  2.68% | -1.69% |   50 |  0.00% |   50 |
	  5 | 00:09:13 ( 76) |  0.95 |  95% |  200 |     5 |  2.45% | -1.03% |   50 |  0.00% |   50 |
	  6 | 00:10:46 ( 94) |  0.75 |  75% |  100 |     5 | -1.21% |  0.83% |   50 |  2.28% |   50 |
	  7 | 00:12:43 (116) |  0.75 |  75% |   89 |    -1 | -1.25% |  1.13% |   50 |  1.76% |   50 |
	  8 | 00:14:53 (130) |  0.95 |  75% |  100 |    -1 | -1.91% | -1.15% |  100 |  0.00% |  100 |
	  9 | 00:16:38 (106) |  0.75 |  75% |  100 |    -1 | -1.24% |  0.97% |   

In [187]:
cv_preds.y_test

ticker  date      
XLB     2018-10-14   -0.532655
        2018-10-21   -1.828159
        2018-10-28    2.568585
        2018-11-04    0.747383
        2018-11-11    0.186265
                        ...   
XLY     2007-03-11   -1.242011
        2007-03-18    1.937212
        2007-03-25   -0.854511
        2007-04-01    1.120970
        2007-04-08    0.066323
Name: y_test, Length: 5616, dtype: float64

In [188]:
cv_preds[str(50)].tail(20)

ticker  date      
XLV     2007-02-18   -0.004185
        2007-02-25   -0.004185
        2007-03-04   -0.004185
        2007-03-11   -0.004185
        2007-03-18   -0.004185
        2007-03-25   -0.004185
        2007-04-01   -0.004185
        2007-04-08   -0.004185
XLY     2007-01-21   -0.004185
        2007-01-28   -0.004185
        2007-02-04   -0.004185
        2007-02-11   -0.004185
        2007-02-18   -0.004185
        2007-02-25   -0.004185
        2007-03-04   -0.004185
        2007-03-11   -0.004185
        2007-03-18   -0.004185
        2007-03-25   -0.004185
        2007-04-01   -0.004185
        2007-04-08   -0.004185
Name: 50, dtype: float64

In [189]:
y_pred

{'50': array([-0.00418532, -0.00418532, -0.00418532, -0.00418532, -0.00418532,
        -0.00418532, -0.00418532, -0.00418532, -0.00418532, -0.00418532,
        -0.00418532, -0.00418532, -0.00418532, -0.00418532, -0.00418532,
        -0.00418532, -0.00418532, -0.00418532, -0.00418532, -0.00418532,
        -0.00418532, -0.00418532, -0.00418532, -0.00418532, -0.00418532,
        -0.00418532, -0.00418532, -0.00418532, -0.00418532, -0.00418532,
        -0.00418532, -0.00418532, -0.00418532, -0.00418532, -0.00418532,
        -0.00418532, -0.00418532, -0.00418532, -0.00418532, -0.00418532,
        -0.00418532, -0.00418532, -0.00418532, -0.00418532, -0.00418532,
        -0.00418532, -0.00418532, -0.00418532, -0.00418532, -0.00418532,
        -0.00418532, -0.00418532, -0.00418532, -0.00418532, -0.00418532,
        -0.00418532, -0.00418532, -0.00418532, -0.00418532, -0.00418532,
        -0.00418532, -0.00418532, -0.00418532, -0.00418532, -0.00418532,
        -0.00418532, -0.00418532, -0.00418532

In [190]:
X_test

Unnamed: 0_level_0,Unnamed: 1_level_0,1y_yield,1y_yield_chg,1y_yield_diff,CMA,CMA_diff,HML,HML_diff,Mkt-RF,Mkt-RF_diff,RMW,...,vix_diff,vixoil,vixoil_chg,vixoil_diff,weekjobclaims,weekjobclaims_chg,weekjobclaims_diff,yield_curve,yield_curve_chg,yield_curve_diff
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
XLB,2007-01-21,5.09,0.005929,0.03,-0.458106,0.455410,-0.424672,-1.284505,1.201370,-0.539414,0.71626,...,0.25,26.41,-0.06437,-1.7,335000.0,0.131757,39000.0,-0.36,0.125000,-0.04
XLB,2007-01-28,5.12,0.005894,0.03,-0.458106,0.455410,-0.424672,-1.284505,1.201370,-0.539414,0.71626,...,0.73,26.41,-0.06437,-1.7,308000.0,-0.080597,-27000.0,-0.25,-0.305556,0.11
XLB,2007-02-04,5.08,-0.007812,-0.04,-0.458106,0.455410,-0.424672,-1.284505,1.201370,-0.539414,0.71626,...,-1.05,26.41,-0.06437,-1.7,310000.0,0.006494,2000.0,-0.31,0.240000,-0.06
XLB,2007-02-11,5.08,-0.007812,-0.04,-0.458106,0.455410,-0.424672,-1.284505,1.201370,-0.539414,0.71626,...,1.02,26.41,-0.06437,-1.7,338000.0,0.090323,28000.0,-0.36,0.161290,-0.05
XLB,2007-02-18,5.05,-0.005906,-0.03,-0.458106,0.455410,-0.424672,-1.284505,1.201370,-0.539414,0.71626,...,-1.08,26.41,-0.06437,-1.7,321000.0,-0.050296,-17000.0,-0.48,0.333333,-0.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XLY,2007-03-11,4.98,0.016327,0.08,0.841837,0.432597,-0.419036,0.174286,1.253142,0.252178,0.35957,...,-4.52,26.41,-0.06437,-1.7,308000.0,-0.037500,-12000.0,-0.51,-0.150000,0.09
XLY,2007-03-18,4.95,-0.006024,-0.03,0.841837,0.432597,-0.419036,0.174286,1.253142,0.252178,0.35957,...,2.70,26.41,-0.06437,-1.7,309000.0,0.003247,1000.0,-0.49,-0.039216,0.02
XLY,2007-03-25,4.93,-0.004040,-0.02,0.841837,0.432597,-0.419036,0.174286,1.253142,0.252178,0.35957,...,-3.84,26.41,-0.06437,-1.7,303000.0,-0.019417,-6000.0,-0.46,-0.061224,0.03
XLY,2007-04-01,4.90,-0.006085,-0.03,0.841837,0.432597,-0.419036,0.174286,1.253142,0.252178,0.35957,...,1.69,26.41,-0.06437,-1.7,307000.0,0.013201,4000.0,-0.39,-0.152174,0.07


In [191]:
lgb_train

<lightgbm.basic.Dataset at 0x1423288d0>

In [192]:
data.sector

ticker  date      
XLB     1999-12-26    0
        2000-01-02    0
        2000-01-09    0
        2000-01-16    0
        2000-01-23    0
                     ..
XLY     2018-12-02    8
        2018-12-09    8
        2018-12-16    8
        2018-12-23    8
        2018-12-30    8
Name: sector, Length: 8937, dtype: int64