# Generamos trading signals con LightGBM

## Imports & Settings

In [131]:
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

In [132]:
# !pip install lightgbm==3.3.2
# !pip install catboost==1.0.5

In [133]:
%matplotlib inline

from pathlib import Path
import sys, os
from time import time
from collections import defaultdict
from itertools import product

import numpy as np
import pandas as pd

import lightgbm as lgb
from catboost import Pool, CatBoostRegressor

from sklearn.linear_model import LinearRegression
from scipy.stats import spearmanr

from alphalens.tears import (create_summary_tear_sheet,
                             create_full_tear_sheet)

from alphalens.utils import get_clean_factor_and_forward_returns

import matplotlib.pyplot as plt
import seaborn as sns

In [134]:
# Seed NumPy's global RNG; the np.random.choice calls further down draw from it.
np.random.seed(42)

In [135]:
# Put the parent of sys.path[0] at position 1 of the import search path before
# importing new_utils (presumably the module lives there — TODO confirm).
sys.path.insert(1, os.path.join(sys.path[0], '..'))

from new_utils import MultipleTimeSeriesCV, format_time

In [136]:
# Use seaborn's 'whitegrid' style for plots.
sns.set_style('whitegrid')

## Algunas declaraciones iniciales

In [137]:
# Flag for weekly data: it is compared against 1 further down to select the
# weekly variants of YEAR and min_data_in_leaf.
datos_semanales=1

In [138]:
# Periods per year: 52 when weekly data is selected, otherwise 12.
# (A value of 252 is kept here for reference: YEAR = 252.)
YEAR = 52 if datos_semanales == 1 else 12

# Shorthand used to build MultiIndex slices.
idx = pd.IndexSlice

In [139]:
# When True, the stored LightGBM tuning file (tuning_lgb.h5) is deleted before
# the cross-validation run.
borrado=True

# Obtenemos la información

In [140]:
# HDF5 file from which the 'engineered_features' table is read.
DATA_STORE = '../data/assets.h5'


In [141]:
# Load the engineered features, sort the index, and keep the rows whose second
# index level lies within the slice up to '2019' (train & validation period).
with pd.HDFStore(DATA_STORE) as store:
    data = store['engineered_features']
    data = data.sort_index()
    data = data.loc[idx[:, :'2019'], :]

In [143]:
# Directory that receives the tuning results. It is created together with any
# missing parents; exist_ok=True makes this one idempotent call instead of a
# separate exists() check followed by mkdir().
results_path = Path('results', 'us_stocks')
results_path.mkdir(parents=True, exist_ok=True)

In [144]:
# # Calculamos la matriz de correlación
# corr_df = data.drop(columns='sector').corr()

# # Creamos una máscara para la diagonal inferior
# mask = np.tril(np.ones(corr_df.shape), k=-1).astype(bool)

# # Filtramos la matriz de correlación usando la máscara
# lower_triangle_corr = corr_df.where(mask)

# # Lista inicial de columnas
# column_list = corr_df.columns
# column_list_final = list(column_list)
# salvado = []

# # Iteramos sobre la diagonal inferior para eliminar columnas correlacionadas > 0.70
# for i, column in enumerate(column_list):
#     corr_cols = lower_triangle_corr[column][lower_triangle_corr[column] > 0.70].index
#     for elem in corr_cols:
#         if elem in column_list_final:
#             column_list_final.remove(elem)
#         else:
#             salvado.append(elem)

# # Resultado
# print(len(column_list_final))


In [145]:
# Target columns are those whose name contains 'target_'; everything else is
# treated as a feature.
target_columns = data.filter(like='target_').columns
labels = sorted(target_columns)
features = data.columns.difference(labels).tolist()

# Label used for model tuning (modified from the original setup).
label = 'target_1m'

## Model Selection: Lookback, lookahead and roll-forward periods

In [None]:
# Name of the first index level
level_name = data.index.names[0]
level_name

In [147]:
# Unique values of the 'ticker' index level.
tickers = data.index.get_level_values('ticker').unique()

In [148]:
# Forecast horizons; each value is passed as `lookahead` to the CV splitter.
#lookaheads = [1, 5, 21]
lookaheads = [1]

In [149]:
# Columns that are integer-encoded further down and passed to LightGBM as
# categorical features.
categoricals = [ 'month','sector', 'fase']

In [150]:

# Candidate training-window lengths (in periods); the first one evaluates to 216.
train_lengths = [int(4.5 * 12 * 4), 52]

# Candidate test-window lengths (in periods).
test_lengths = [1, 12]

In [None]:
# Every (lookahead, train_length, test_length) combination, in the same order
# itertools.product would generate them.
test_params = [(horizon, train_len, test_len)
               for horizon in lookaheads
               for train_len in train_lengths
               for test_len in test_lengths]
test_params

In [152]:
# Number of validation years; used to size n_splits in the CV loops.
years_val=14

## LightGBM Model Tuning

In [153]:
def get_fi(model):
    """Return each feature's share of the model's total gain importance.

    `model` must provide `feature_importance(importance_type=...)` returning
    an array and `feature_name()` returning the matching names. The result
    is a pandas Series indexed by feature name.
    """
    gain = model.feature_importance(importance_type='gain')
    shares = gain / gain.sum()
    return pd.Series(shares, index=model.feature_name())

### Hyperparameter Options

In [154]:
# Parameters shared by every LightGBM run in the gradient-boosting search.
base_params = {
    'boosting': 'gbdt',
    'objective': 'regression',
    'verbose': -1,
}

In [155]:
# Weight of each new tree in the ensemble.
learning_rate_ops = [.01, .1, .3]

# Constraints on the structure (depth) of each tree; num_leaves is 2 ** depth.
max_depths = [2, 3, 5, 7]
num_leaves_opts = [2 ** depth for depth in max_depths]

# Minimum samples per leaf: the reference sizes 250/500/1000 are divided by 5
# when weekly data is selected and by 22 otherwise.
# (Undivided alternative: min_data_in_leaf_opts = [250, 500, 1000])
if datos_semanales == 1:
    min_data_in_leaf_opts = [250 // 5, 500 // 5, 1000 // 5]
else:
    min_data_in_leaf_opts = [250 // 22, 500 // 22, 1000 // 22]

# Random feature selection.
feature_fraction_opts = [.3, .6, .95]


In [None]:
# Display the min_data_in_leaf candidates.
min_data_in_leaf_opts

In [157]:
# Names of the tuned hyperparameters, in the same order as the values of each
# tuple in cv_params.
param_names = [
    'learning_rate',
    'num_leaves',
    'feature_fraction',
    'min_data_in_leaf',
]

In [None]:
# Full Cartesian grid of hyperparameter values (order matches param_names).
hyperparameter_grids = (learning_rate_ops,
                        num_leaves_opts,
                        feature_fraction_opts,
                        min_data_in_leaf_opts)
cv_params = list(product(*hyperparameter_grids))
n_params = len(cv_params)
print(f'# Parameters: {n_params}')

In [None]:
# Display the candidate training-window lengths.
train_lengths

In [None]:
# Display the forecast horizons.
lookaheads

In [None]:
# Build every (lookahead, train_length, test_length) configuration and put
# them in random order: all n indices are drawn without replacement from
# NumPy's global RNG, so no configuration is dropped.
test_params = list(product(lookaheads, train_lengths, test_lengths))
n = len(test_params)
test_param_sample = np.random.choice(list(range(n)), size=int(n), replace=False)
test_params = [test_params[i] for i in test_param_sample]
print('Train configs:', len(test_params))

In [None]:
# Display the shuffled train/test configurations.
test_params

### Categorical Variables

In [163]:
# Replace every categorical column by integer codes assigned in sorted
# category order.
for feature in categoricals:
    codes, _ = pd.factorize(data[feature], sort=True)
    data[feature] = codes

### Custom Evaluation Metric: Information Coefficient

In [164]:
def ic_lgbm(preds, train_data):
    """Information-coefficient evaluation metric for LightGBM.

    Computes the Spearman rank correlation between `preds` and the labels
    returned by `train_data.get_label()`.

    Returns a tuple `('ic', correlation, True)`; the final element marks the
    metric as higher-is-better.
    """
    rank_corr = spearmanr(preds, train_data.get_label())[0]
    return 'ic', rank_corr, True

### Run Cross-Validation

In [None]:
# Delete 'tuning_lgb.h5' first so that a new run does not partially overwrite
# earlier results. This only happens when previous data should be discarded;
# set borrado = False to keep it.
if borrado == True:
    lgb_store = results_path / 'tuning_lgb.h5'

    if not lgb_store.exists():
        print(f"No se encontró el archivo {lgb_store}.")
    else:
        # The file is present: remove it and report.
        os.remove(lgb_store)
        print(f"El archivo {lgb_store} ha sido eliminado.")

In [166]:
# HDF5 file that receives the LightGBM tuning results.
lgb_store = Path(results_path / 'tuning_lgb.h5')

In [167]:
# Columns whose name contains 'target' are labels; the rest are features.
labels = sorted(data.filter(like='target'))
non_label_columns = data.columns.difference(labels)
features = non_label_columns.tolist()

In [168]:
# Iteration counts at which predictions are evaluated; models are trained for
# the last count in the list.
num_iterations = [50, 100] #+ list(range(100, 501, 50))
num_boost_round = num_iterations[-1]

In [169]:
# Index labels of the per-run metrics Series: the hyperparameter names, timing
# and daily-IC summary fields, then one entry per evaluated iteration count.
metric_cols = [
    *param_names,
    't',
    'daily_ic_mean',
    'daily_ic_mean_n',
    'daily_ic_median',
    'daily_ic_median_n',
    *map(str, num_iterations),
]

In [None]:
# Display the metric column names.
metric_cols

In [None]:
# Display the number of periods per year in use.
YEAR

In [None]:
# Display the size of the hyperparameter grid.
n_params

In [None]:
# First date per ticker among the rows of `data` that have no missing values.
# Because dropna() runs first, every remaining row is fully non-null, so the
# notnull()/any() mask is True for all of them.
primera_fecha_no_nula = (
    data.dropna().notnull()                                      # flag non-null values
    .any(axis=1)                                     # True for rows with at least one non-null value
    .groupby(level='ticker')                         # group by the 'ticker' index level
    .apply(lambda x: x[x].index.get_level_values('date').min())  # earliest date in each group
)
primera_fecha_no_nula

In [None]:
# Randomized hyperparameter search for LightGBM. For every
# (lookahead, train_length, test_length) configuration in `test_params`, half
# of the parameter grid is sampled; each sampled combination is trained and
# scored on every CV fold, and the results are written to `lgb_store`.
for lookahead, train_length, test_length in test_params:
    # randomized grid search: sample half of the grid without replacement
    cvp = np.random.choice(list(range(n_params)),
                           size=int(n_params / 2),
                           replace=False)
    cv_params_ = [cv_params[i] for i in cvp]

    # set up cross-validation; years_val is the number of validation years
    n_splits = int(YEAR * years_val/ test_length)# modified
    print(f'Lookahead: {lookahead:2.0f} | '
          f'Train: {train_length:3.0f} | '
          f'Test: {test_length:2.0f} | '
          f'Params: {len(cv_params_):3.0f} | '
          f'Train configs: {len(test_params)}')

    # time-series cross-validation
    cv = MultipleTimeSeriesCV(n_splits=n_splits,
                              lookahead=lookahead,
                              test_period_length=test_length,
                              train_period_length=train_length)

    # label = label_dict[lookahead]  -- disabled; the module-level `label` is used instead
    label = label  # no-op self-assignment
    # keep only rows where every feature and the label are present
    outcome_data = data.loc[:, features + [label]].dropna()
    
    # binary dataset; raw data is retained (free_raw_data=False)
    lgb_data = lgb.Dataset(data=outcome_data.drop(label, axis=1),
                           label=outcome_data[label],
                           categorical_feature=categoricals,
                           free_raw_data=False)
    # T accumulates the run time over all parameter combinations of this configuration
    T = 0
    # NOTE: `metrics` is rebound to a Series further down; `feature_importance`
    # and `daily_ic` are not used again in the visible code.
    predictions, metrics, feature_importance, daily_ic = [], [], [], []
    
    # iterate over (shuffled) hyperparameter combinations
    for p, param_vals in enumerate(cv_params_):
        # HDF key for this run; the comprehension's `p` is local to the
        # comprehension and does not affect the loop index `p`
        key = f'{lookahead}/{train_length}/{test_length}/' + '/'.join([str(p) for p in param_vals])
        params = dict(zip(param_names, param_vals))
        params.update(base_params)

        start = time()
        # `nrounds` and `ic_cv` are not used again in the visible code
        cv_preds, nrounds = [], []
        ic_cv = defaultdict(list)
        
        # iterate over folds
        for i, (train_idx, test_idx) in enumerate(cv.split(X=outcome_data)):
            
            # select train subset
            lgb_train = lgb_data.subset(used_indices=train_idx.tolist(),
                                       params=params).construct()
            
            # train model for num_boost_round
            model = lgb.train(params=params,
                              train_set=lgb_train,
                              num_boost_round=num_boost_round,
                              verbose_eval=False)
            # log feature importance: one column per fold
            if i == 0:
                fi = get_fi(model).to_frame()
            else:
                fi[i] = get_fi(model)

            # capture predictions, one column per iteration count in num_iterations
            test_set = outcome_data.iloc[test_idx, :]
            X_test = test_set.loc[:, model.feature_name()]
            y_test = test_set.loc[:, label]
            y_pred = {str(n): model.predict(X_test, num_iteration=n) for n in num_iterations}
            
            # record predictions for each fold, tagged with the fold number
            cv_preds.append(y_test.to_frame('y_test').assign(**y_pred).assign(i=i))
        
        # combine fold results and attach the parameter values as columns
        cv_preds = pd.concat(cv_preds).assign(**params)
        predictions.append(cv_preds)
        
        # compute IC per date; the columns of ic_by_day are labelled by the
        # integer iteration count, so idxmax() returns the best count
        by_day = cv_preds.groupby(level='date')
        ic_by_day = pd.concat([by_day.apply(lambda x: spearmanr(x.y_test, x[str(n)])[0]).to_frame(n)
                               for n in num_iterations], axis=1)
        daily_ic_mean = ic_by_day.mean()
        daily_ic_mean_n = daily_ic_mean.idxmax()
        daily_ic_median = ic_by_day.median()
        daily_ic_median_n = daily_ic_median.idxmax()
        
        # compute IC across all predictions
        ic = [spearmanr(cv_preds.y_test, cv_preds[str(n)])[0] for n in num_iterations]
        t = time() - start
        T += t
        
        # collect metrics in the order given by metric_cols
        metrics = pd.Series(list(param_vals) +
                            [t, daily_ic_mean.max(), daily_ic_mean_n, daily_ic_median.max(), daily_ic_median_n] + ic,
                            index=metric_cols)
        msg = f'\t{p:3.0f} | {format_time(T)} ({t:3.0f}) | {params["learning_rate"]:5.2f} | '
        msg += f'{params["num_leaves"]:3.0f} | {params["feature_fraction"]:3.0%} | {params["min_data_in_leaf"]:4.0f} | '
        msg += f' {max(ic):6.2%} | {ic_by_day.mean().max(): 6.2%} | {daily_ic_mean_n: 4.0f} | {ic_by_day.median().max(): 6.2%} | {daily_ic_median_n: 4.0f}'
        print(msg)

        # persist results for given CV run and hyperparameter combination;
        # feature importance is stored as summary statistics across folds
        metrics.to_hdf(lgb_store, 'metrics/' + key)
        ic_by_day.assign(**params).to_hdf(lgb_store, 'daily_ic/' + key)
        fi.T.describe().T.assign(**params).to_hdf(lgb_store, 'fi/' + key)
        cv_preds.to_hdf(lgb_store, 'predictions/' + key)
        

In [None]:


# Inspect the raw rows behind the last training subset built by the CV loop.
# A Dataset created through `subset()` only references its parent and does not
# populate its own `.data` attribute, so reading `.data` directly yields an
# empty frame. `get_data()` slices the parent's raw data by the subset's
# indices; this works because the parent was built with free_raw_data=False.
df = pd.DataFrame(lgb_train.get_data())
print(df.tail(100))


In [None]:
# Display the overall ICs of the last evaluated parameter combination.
ic

In [None]:
# Display the parameters of the last evaluated combination.
params

In [None]:
# `stop` is not defined anywhere in the visible code, so evaluating it raises
# NameError — presumably a deliberate way to halt a "run all" before the
# random-forest section; TODO confirm.
stop

## LightGBM Random Forest Model Tuning

In [179]:
def get_fi(model):
    """Return the gain-based feature importance normalized to sum to one.

    `model` must provide `feature_importance(importance_type=...)` and
    `feature_name()`; the result is a pandas Series keyed by feature name.
    """
    names = model.feature_name()
    raw_gain = model.feature_importance(importance_type='gain')
    total_gain = raw_gain.sum()
    return pd.Series(raw_gain / total_gain, index=names)

### Hyperparameter Options

In [180]:
# Parameters shared by every LightGBM run in the random-forest search.
base_params = {
    'boosting_type': 'rf',
    'objective': 'regression',
    'bagging_freq': 1,
    'verbose': -1,
}

In [181]:
# Number of rows in `data`.
n_samples = len(data)

# min_data_in_leaf candidate: 1% of all rows, but never less than 1.
one_percent_of_rows = int(n_samples * 0.01)
min_data_in_leaf_op0 = max(1, one_percent_of_rows)

In [None]:
# Display the 1%-of-rows min_data_in_leaf candidate.
min_data_in_leaf_op0

In [183]:
# Row and column subsampling fractions.
# (Alternative grid kept for reference: bagging_fraction_opts = [.5, .75, .95])
bagging_fraction_opts = [.75, .95]
feature_fraction_opts = [.75, .95]

# Minimum samples per leaf. With weekly data the first candidate is the
# 1%-of-rows value and the others are 500 and 1000 divided by 5; otherwise
# 250/500/1000 are divided by 22.
# (Undivided alternative: min_data_in_leaf_opts = [250, 500, 1000])
if datos_semanales == 1:
    min_data_in_leaf_opts = [min_data_in_leaf_op0, 500 // 5, 1000 // 5]
else:
    min_data_in_leaf_opts = [250 // 22, 500 // 22, 1000 // 22]

# Maximum tree depths to try.
max_depth_opts = [5, 10, -1]

In [None]:
# Display the min_data_in_leaf candidates for the random-forest search.
min_data_in_leaf_opts

In [185]:
# Names of the tuned hyperparameters, in the same order as the values of each
# tuple in cv_params.
param_names = [
    'bagging_fraction',
    'feature_fraction',
    'min_data_in_leaf',
    'max_depth',
]

In [None]:
# Full Cartesian grid of hyperparameter values (order matches param_names).
hyperparameter_grids = (bagging_fraction_opts,
                        feature_fraction_opts,
                        min_data_in_leaf_opts,
                        max_depth_opts)
cv_params = list(product(*hyperparameter_grids))
n_params = len(cv_params)
print(f'# Parameters: {n_params}')

In [None]:
# Rebuild every (lookahead, train_length, test_length) configuration and put
# them in random order: all n indices are drawn without replacement from
# NumPy's global RNG, so no configuration is dropped.
test_params = list(product(lookaheads, train_lengths, test_lengths))
n = len(test_params)
test_param_sample = np.random.choice(list(range(n)), size=int(n), replace=False)
test_params = [test_params[i] for i in test_param_sample]
print('Train configs:', len(test_params))

### Categorical Variables

In [188]:
# Replace each categorical column with integer codes (sorted category order).
# NOTE(review): the same loop already appears in the LightGBM section; if both
# cells are executed, the columns are factorized a second time — confirm this
# is harmless for the data at hand (e.g. if missing values were encoded as -1).
for feature in categoricals:
    data[feature] = pd.factorize(data[feature], sort=True)[0]

### Custom Evaluation Metric: Information Coefficient

In [189]:
def ic_rf(preds, train_data):
    """Information-coefficient evaluation metric for LightGBM.

    Returns `('ic', value, True)`, where `value` is the Spearman rank
    correlation between `preds` and `train_data.get_label()` and the final
    element marks the metric as higher-is-better.
    """
    labels_true = train_data.get_label()
    correlation = spearmanr(preds, labels_true)[0]
    return 'ic', correlation, True

### Run Cross-Validation

In [None]:
# Delete 'tuning_rf.h5' first so that a new run does not partially overwrite
# earlier results. As in the LightGBM section, deletion only happens when the
# `borrado` flag is set; set borrado = False to keep earlier results.
rf_store = Path(results_path / 'tuning_rf.h5')

if borrado:
    # Check whether the file exists
    if rf_store.exists():
        # Remove the file
        os.remove(rf_store)
        print(f"El archivo {rf_store} ha sido eliminado.")
    else:
        print(f"No se encontró el archivo {rf_store}.")

In [191]:
# HDF5 file that receives the random-forest tuning results.
rf_store = Path(results_path / 'tuning_rf.h5')

In [192]:
# Columns whose name contains 'target' are labels; the rest are features.
target_columns = data.filter(like='target').columns
labels = sorted(target_columns)
features = data.columns.difference(labels).tolist()

In [None]:
# Display the label column names.
labels

In [None]:
# Display the number of feature columns.
len(features)

In [195]:
# Iteration counts at which predictions are evaluated; models are trained for
# the last count in the list.
num_iterations = [50, 100] #+ list(range(100, 501, 50))
num_boost_round = num_iterations[-1]

In [None]:
# Display the evaluated iteration counts.
num_iterations

In [None]:
# Display the number of boosting rounds used for training.
num_boost_round

In [198]:
# Index labels of the per-run metrics Series: the hyperparameter names, timing
# and daily-IC summary fields, then one entry per evaluated iteration count.
summary_fields = ['t', 'daily_ic_mean', 'daily_ic_mean_n',
                  'daily_ic_median', 'daily_ic_median_n']
iteration_fields = [str(n_iter) for n_iter in num_iterations]
metric_cols = [*param_names, *summary_fields, *iteration_fields]

In [None]:
# Randomized hyperparameter search for the LightGBM random-forest mode. For
# every (lookahead, train_length, test_length) configuration in `test_params`,
# half of the parameter grid is sampled; each sampled combination is trained
# and scored on every CV fold, and the results are written to `rf_store`.
for lookahead, train_length, test_length in test_params:
    # randomized grid search: sample half of the grid without replacement
    cvp = np.random.choice(list(range(n_params)),
                           size=int(n_params / 2),
                           replace=False)
    cv_params_ = [cv_params[i] for i in cvp]

    # set up cross-validation; years_val is the number of validation years
    n_splits = int(YEAR * years_val/ test_length)# modified
    print(f'Lookahead: {lookahead:2.0f} | '
          f'Train: {train_length:3.0f} | '
          f'Test: {test_length:2.0f} | '
          f'Params: {len(cv_params_):3.0f} | '
          f'Train configs: {len(test_params)}')

    # time-series cross-validation
    cv = MultipleTimeSeriesCV(n_splits=n_splits,
                              lookahead=lookahead,
                              test_period_length=test_length,
                              train_period_length=train_length)

    # label = label_dict[lookahead]  -- disabled; the module-level `label` is used instead
    label = label # no-op self-assignment (modified)
    # keep only rows where every feature and the label are present
    outcome_data = data.loc[:, features + [label]].dropna()
    
    # binary dataset; raw data is retained (free_raw_data=False)
    lgb_data = lgb.Dataset(data=outcome_data.drop(label, axis=1),
                           label=outcome_data[label],
                           categorical_feature=categoricals,
                           free_raw_data=False)
    # T accumulates the run time over all parameter combinations of this configuration
    T = 0
    # NOTE: `metrics` is rebound to a Series further down; `feature_importance`
    # and `daily_ic` are not used again in the visible code.
    predictions, metrics, feature_importance, daily_ic = [], [], [], []
    
    # iterate over (shuffled) hyperparameter combinations
    for p, param_vals in enumerate(cv_params_):
        # HDF key for this run; the comprehension's `p` is local to the
        # comprehension and does not affect the loop index `p`
        key = f'{lookahead}/{train_length}/{test_length}/' + '/'.join([str(p) for p in param_vals])
        params = dict(zip(param_names, param_vals))
        params.update(base_params)

        start = time()
        # `nrounds` and `ic_cv` are not used again in the visible code
        cv_preds, nrounds = [], []
        ic_cv = defaultdict(list)
        
        # iterate over folds
        for i, (train_idx, test_idx) in enumerate(cv.split(X=outcome_data)):
            
            # select train subset
            lgb_train = lgb_data.subset(used_indices=train_idx.tolist(),
                                       params=params).construct()
            
            # train model for num_boost_round
            model = lgb.train(params=params,
                              train_set=lgb_train,
                              num_boost_round=num_boost_round,
                              verbose_eval=False)
            # log feature importance: one column per fold
            if i == 0:
                fi = get_fi(model).to_frame()
            else:
                fi[i] = get_fi(model)

            # capture predictions, one column per iteration count in num_iterations
            test_set = outcome_data.iloc[test_idx, :]
            X_test = test_set.loc[:, model.feature_name()]
            y_test = test_set.loc[:, label]
            y_pred = {str(n): model.predict(X_test, num_iteration=n) for n in num_iterations}
            
            # record predictions for each fold, tagged with the fold number
            cv_preds.append(y_test.to_frame('y_test').assign(**y_pred).assign(i=i))
        
        # combine fold results and attach the parameter values as columns
        cv_preds = pd.concat(cv_preds).assign(**params)
        predictions.append(cv_preds)
        
        # compute IC per date; the columns of ic_by_day are labelled by the
        # integer iteration count, so idxmax() returns the best count
        by_day = cv_preds.groupby(level='date')
        ic_by_day = pd.concat([by_day.apply(lambda x: spearmanr(x.y_test, x[str(n)])[0]).to_frame(n)
                               for n in num_iterations], axis=1)
        daily_ic_mean = ic_by_day.mean()
        daily_ic_mean_n = daily_ic_mean.idxmax()
        daily_ic_median = ic_by_day.median()
        daily_ic_median_n = daily_ic_median.idxmax()
        
        # compute IC across all predictions
        ic = [spearmanr(cv_preds.y_test, cv_preds[str(n)])[0] for n in num_iterations]
        t = time() - start
        T += t
        
        # collect metrics in the order given by metric_cols
        metrics = pd.Series(list(param_vals) +
                            [t, daily_ic_mean.max(), daily_ic_mean_n, daily_ic_median.max(), daily_ic_median_n] + ic,
                            index=metric_cols)
        msg = f'\t{p:3.0f} | {format_time(T)} ({t:3.0f}) | {params["bagging_fraction"]:5.2f} | '
        msg += f' {params["feature_fraction"]:3.0%} | {params["min_data_in_leaf"]:4.0f} |  {params["max_depth"]:4.0f} |'
        msg += f' {max(ic):6.2%} | {ic_by_day.mean().max(): 6.2%} | {daily_ic_mean_n: 4.0f} | {ic_by_day.median().max(): 6.2%} | {daily_ic_median_n: 4.0f} |'
        
        print(msg)

        # persist results for given CV run and hyperparameter combination;
        # feature importance is stored as summary statistics across folds
        metrics.to_hdf(rf_store, 'metrics/' + key)
        ic_by_day.assign(**params).to_hdf(rf_store, 'daily_ic/' + key)
        fi.T.describe().T.assign(**params).to_hdf(rf_store, 'fi/' + key)
        cv_preds.to_hdf(rf_store, 'predictions/' + key)
        

In [None]:
# Display the true label values of the last evaluated combination.
cv_preds.y_test

In [None]:
# Display the last 20 predictions of the '50' iteration column.
cv_preds[str(50)].tail(20)

In [None]:
# Display the prediction dict of the last fold.
y_pred