# Generando predicciones out-of-sample

## Imports & Settings

In [179]:
import warnings
warnings.filterwarnings('ignore')

In [180]:
%matplotlib inline

from time import time
import sys, os
from pathlib import Path

import pandas as pd
from scipy.stats import spearmanr

import lightgbm as lgb
from catboost import Pool, CatBoostRegressor

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [181]:

np.random.seed(42)

In [182]:
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from new_utils import MultipleTimeSeriesCV

In [183]:
sns.set_style('whitegrid')

In [184]:
# Number of periods per year: 52 when the data is weekly, 12 otherwise.
datos_semanales = 1
YEAR = 52 if datos_semanales == 1 else 12

# Shorthand for slicing the MultiIndex.
idx = pd.IndexSlice

In [185]:
# Columns that describe the walk-forward setup of a tuning run.
scope_params = [
    'lookahead',
    'train_length',
    'test_length',
]

# Names of the per-period information-coefficient summary columns.
daily_ic_metrics = [
    'daily_ic_mean',
    'daily_ic_mean_n',
    'daily_ic_median',
    'daily_ic_median_n',
]

# Hyper-parameters read back from the tuning results for gradient boosting.
lgb_train_params = [
    'learning_rate',
    'num_leaves',
    'feature_fraction',
    'min_data_in_leaf',
]

# Hyper-parameters read back from the tuning results for the random forest.
rf_train_params = [
    'bagging_fraction',
    'feature_fraction',
    'min_data_in_leaf',
    'max_depth',
]

## Generate LightGBM predictions

### Model Configuration

In [186]:
# LightGBM settings shared by every configuration (gradient-boosted trees).
base_params = {
    'boosting': 'gbdt',
    'objective': 'regression',
    'random_state': 42,
    'verbose': -1,
}

# Columns passed to LightGBM as categorical features.
# Alternatives tried earlier: ['month', 'sector', 'fase'] and ['month'].
categoricals = ['month', 'sector']

In [187]:
#dos semanas
lookahead = 1
store = Path('../data/predictions.h5') #no lo borramos previamente pq ya lo ha hecho el paso 5

### Get Data

In [188]:
# Load the engineered feature table from the assets store and sort it by index.
data = pd.read_hdf('../data/assets.h5','engineered_features').sort_index()#modified

# Candidate feature subset (plus the label column). Not referenced anywhere
# else in the visible notebook.
configuración_neutral_f = ['1y_yield', 'sector', 'month', 'return_2m', 'retail_sales', 'return_6m',
       'CMA', 'momentum_6', 'return_52m', 'return_1m_t-5', 'sentiment',
       'return_3m', 'return_1m_t-1', 'RMW',
        'return_1m_t-6', 'Mkt-RF_diff', 'SMB', 'oil'] + ['target_1m']


# Second candidate feature subset (plus the label column). Also not referenced
# anywhere else in the visible notebook.
configuración_norm_f =  ['month', 'sentiment',  'weekjobclaims_chg',
       'corp_oas_chg', 'oil', 'hy_oas_chg', 'us_asset_balance_chg',
       '1y_yield_chg','yield_curve_diff',
       'curva_tipos_diff', 'vixoil_diff',
       'return_1m_t-6', 
       'vix', 'return_1m_t-3', 'sector', 'retail_sales_diff', 'momentum_3',
       ]  + ['target_1m']


In [193]:
# Column used as the prediction target.
label = 'target_1m'  # modified

# Every column whose name contains 'target' is a label; the rest are features.
target_columns = data.filter(like='target').columns
labels = sorted(target_columns)
features = data.columns.difference(labels).tolist()

In [None]:
# Encuentra las filas con al menos un valor NaN
nan_cols = data.loc[idx[:, '2024':], features + [label]].isna().any(axis=0)

print(nan_cols[nan_cols == True])


In [198]:
#completamos con los valores del periodo anterior, para evitar que el último dato apareza nan
# Fill gaps with the previous period's value so the most recent row is not
# dropped as NaN later on. The fill is done per ticker: a plain frame-wide
# ffill on the (ticker, date) index would carry the last values of one ticker
# into the first rows of the next. `fillna(method=...)` is also deprecated in
# recent pandas versions.
# Note: every column is filled, label columns included, so rows whose target is
# not yet known receive the previous period's target.
data = data.groupby(level='ticker').ffill()

In [199]:
#datos desde 2010
# Keep observations from 2010 onwards, restricted to the feature columns plus
# the chosen label, and drop every row that still has a missing value.
data = (
    data
    .loc[idx[:, '2010':], [*features, label]]
    .dropna(axis=0, how='any')
)

In [None]:
data.info()

In [202]:
# Replace each categorical column by integer codes (codes follow the sorted
# order of the distinct values).
for column in categoricals:
    codes, _ = pd.factorize(data[column], sort=True)
    data[column] = codes

In [203]:
# Wrap the full feature matrix and label in a LightGBM Dataset. The raw data is
# kept (free_raw_data=False) because the walk-forward loop later calls
# `lgb_data.subset(...)` for each training window.
lgb_data = lgb.Dataset(data=data[features],
                       label=data[label],
                       categorical_feature=categoricals,
                       free_raw_data=False)

### Generamos predicciones

In [204]:
#tomamos los IC almacenados
lgb_ic = pd.read_hdf('../data/model_tuning.h5', 'lgb/ic')
lgb_daily_ic = pd.read_hdf('../data/model_tuning.h5', 'lgb/daily_ic')

In [205]:
#función para tomar los mejores parametros que saliernon en entrenamiento para un lookahead determinado
def get_lgb_params(data, t=5, best=0):
    """Return the parameters of the `best`-th ranked LightGBM configuration.

    Rows of `data` whose `lookahead` equals `t` are ranked by their `ic`
    column in descending order; the row at position `best` (0 = highest IC)
    is selected.

    Parameters
    ----------
    data : DataFrame of tuning results with `lookahead`, `ic` and the
        parameter columns.
    t : lookahead to filter on.
    best : zero-based rank of the configuration to return.

    Returns
    -------
    Series with train/test length, the LightGBM training parameters and
    `boost_rounds`.

    Raises
    ------
    IndexError if fewer than `best + 1` rows match the lookahead.
    """
    wanted = [*scope_params[1:], *lgb_train_params, 'boost_rounds']
    ranked = data.loc[data.lookahead == t].sort_values('ic', ascending=False)
    chosen = ranked.iloc[best]
    return chosen.loc[wanted]

In [206]:
#para hacer más OOS que el 1 año definido inicialmente
years_OOS=5

In [None]:
#for par las 10 mejores configuracones de paramentros de las cuales almacenaremos sus predicciones
# Walk-forward out-of-sample predictions for the best LightGBM configurations
# found during tuning (up to 10, ranked by `ic` in `lgb_daily_ic`).
# Capping at the number of available configurations avoids an IndexError when
# the tuning results hold fewer than 10 rows for this lookahead.
n_candidates = int((lgb_daily_ic.lookahead == lookahead).sum())
n_best = min(10, n_candidates)
if n_best == 0:
    raise ValueError(
        f'No tuned LightGBM configurations found for lookahead={lookahead}')

for position in range(n_best):
    params = get_lgb_params(lgb_daily_ic,
                            t=lookahead,
                            best=position).to_dict()

    # Values come back from the results table as floats; LightGBM needs ints.
    for p in ['min_data_in_leaf', 'num_leaves']:
        params[p] = int(params[p])
    # These three entries steer the CV / training loop and are not LightGBM
    # parameters, so they are removed from the dict.
    train_length = int(params.pop('train_length'))
    test_length = int(params.pop('test_length'))
    num_boost_round = int(params.pop('boost_rounds'))
    params.update(base_params)

    print(f'\nPosition: {position:02}')

    # `years_OOS` years of out-of-sample data, split into consecutive test
    # windows of `test_length` periods; the model is retrained for each window.
    n_splits = int(YEAR * years_OOS / test_length)
    cv = MultipleTimeSeriesCV(n_splits=n_splits,
                              test_period_length=test_length,
                              lookahead=lookahead,
                              train_period_length=train_length)

    predictions = []
    for i, (train_idx, test_idx) in enumerate(cv.split(X=data), 1):
        print(i, end=' ', flush=True)

        # Training subset for this window.
        lgb_train = lgb_data.subset(used_indices=train_idx.tolist(),
                                    params=params).construct()
        # NOTE(review): `verbose_eval` was removed from `lgb.train` in
        # LightGBM 4.0; drop it or switch to callbacks if the library is upgraded.
        model = lgb.train(params=params,
                          train_set=lgb_train,
                          num_boost_round=num_boost_round,
                          verbose_eval=False)

        # Predict the test window, keeping the label next to the prediction.
        test_set = data.iloc[test_idx, :]
        y_test = test_set.loc[:, label].to_frame('y_test')
        y_pred = model.predict(test_set.loc[:, model.feature_name()])
        predictions.append(y_test.assign(prediction=y_pred))

    # One prediction column per configuration, named by its rank. Columns for
    # rank > 0 are aligned on the index produced by rank 0.
    if position == 0:
        test_predictions = (pd.concat(predictions)
                            .rename(columns={'prediction': position}))
    else:
        test_predictions[position] = pd.concat(predictions).prediction


def _period_ic(group, column):
    """Spearman rank correlation between the label and one prediction column."""
    return spearmanr(group.y_test, group[column])[0]


# Information coefficient per date for every stored configuration.
by_day = test_predictions.groupby(level='date')
ic_by_day = pd.DataFrame({
    position: by_day.apply(_period_ic, column=position)
    for position in range(n_best)
})
print(ic_by_day.describe())
test_predictions.to_hdf(store, f'lgb/test/{lookahead:02}')

## Verificamos que funciona el cross validation 

In [208]:
# Settings for a stand-alone check of the splitter.
lookahead = 1
train_period_length = 216
test_period_length = 12

# Original author's note: MultipleTimeSeriesCV always starts from the end, so
# the validation/test periods run backwards from the last date supplied,
# covering as many years as n_splits implies.
n_splits = int(YEAR * years_OOS / test_period_length)

cv = MultipleTimeSeriesCV(
    n_splits=n_splits,
    test_period_length=test_period_length,
    lookahead=lookahead,
    train_period_length=train_period_length,
)

n_splits

In [None]:
# Inspect (at most) the first 100 splits: confirm that train and test share no
# rows, then print the most common number of rows per ticker and the date range
# of each side.
for split_no, (train_idx, test_idx) in enumerate(cv.split(X=data), start=1):
    train = data.iloc[train_idx]
    test = data.iloc[test_idx]
    train_dates = train.index.get_level_values('date')
    test_dates = test.index.get_level_values('date')

    # A row present in both sets would disappear when duplicates are dropped.
    combined = pd.concat([train.reset_index(), test.reset_index()])
    assert len(combined) == len(combined.drop_duplicates()), \
        f'train and test overlap in split {split_no}'

    print(train.groupby(level='ticker').size().value_counts().index[0],
          train_dates.min().date(), train_dates.max().date(),
          test.groupby(level='ticker').size().value_counts().index[0],
          test_dates.min().date(), test_dates.max().date())

    if split_no == 100:
        break

n_splits

In [None]:
# `stop` is not defined anywhere in the visible notebook, so evaluating it
# raises NameError and halts a "Run All" before the RF section.
# NOTE(review): presumably a deliberate guard - confirm, and consider replacing
# it with an explicit flag or a descriptive exception.
stop

## Generar RF predictions

### Model Configuration

In [582]:
# LightGBM settings shared by every random-forest configuration.
# This rebinds `base_params`, replacing the gradient-boosting settings above.
base_params = {
    'boosting': 'rf',
    'objective': 'regression',
    'random_state': 42,
    'bagging_freq': 1,
    'verbose': -1,
}

# Alternative categoricals tried earlier: ['year', 'month', 'sector', 'weekday']

In [584]:

# HDF5 store that receives the RF predictions (same path as the LightGBM section).
store = Path('../data/predictions.h5')

### Get Data

In [None]:
data.loc[idx[:, '2024'],:]

In [589]:
# Replace each categorical column by integer codes (codes follow the sorted
# order of the distinct values).
# NOTE(review): the same encoding is applied earlier in the notebook; on a
# top-to-bottom run this pass presumably re-encodes codes to themselves - confirm.
for column in categoricals:
    codes, _ = pd.factorize(data[column], sort=True)
    data[column] = codes

In [590]:
# Rebuild the LightGBM Dataset over the full feature matrix and label. The raw
# data is kept (free_raw_data=False) because the walk-forward loop later calls
# `lgb_data.subset(...)` for each training window.
lgb_data = lgb.Dataset(data=data[features],
                       label=data[label],
                       categorical_feature=categoricals,
                       free_raw_data=False)

In [None]:
# `stop` is not defined anywhere in the visible notebook, so evaluating it
# raises NameError and halts a "Run All" before the RF predictions are generated.
# NOTE(review): presumably a deliberate guard - confirm, and consider replacing
# it with an explicit flag or a descriptive exception.
stop

### Generate predictions

In [591]:
#tomamos los IC almacenados
rf_ic = pd.read_hdf('../data/model_tuning.h5', 'rf/ic')
rf_daily_ic = pd.read_hdf('../data/model_tuning.h5', 'rf/daily_ic')

In [None]:
rf_daily_ic

In [593]:
# Overwrite `test_length` with 1 for every stored RF configuration, so the
# walk-forward loop below uses one-period test windows whatever value was tuned.
# NOTE(review): the reason for this override is not shown here - confirm it is intended.
rf_daily_ic['test_length']=1

In [594]:
#función para tomar los mejores parametros que saliernon en entrenamiento para un lookahead determinado
def get_rf_params(data, t=5, best=0):
    """Return the parameters of the `best`-th ranked random-forest configuration.

    Rows of `data` whose `lookahead` equals `t` are ranked by their `ic`
    column in descending order; the row at position `best` (0 = highest IC)
    is selected.

    Parameters
    ----------
    data : DataFrame of tuning results with `lookahead`, `ic` and the
        parameter columns.
    t : lookahead to filter on.
    best : zero-based rank of the configuration to return.

    Returns
    -------
    Series with train/test length, the random-forest training parameters and
    `boost_rounds`.

    Raises
    ------
    IndexError if fewer than `best + 1` rows match the lookahead.
    """
    wanted = [*scope_params[1:], *rf_train_params, 'boost_rounds']
    ranked = data.loc[data.lookahead == t].sort_values('ic', ascending=False)
    chosen = ranked.iloc[best]
    return chosen.loc[wanted]

In [595]:
#para hacer más OOS que el 1 año definido inicialmente
#years_OOS=1

In [None]:
#for par las 10 mejores configuracones de paramentros de las cuales almacenaremos sus predicciones
# Walk-forward out-of-sample predictions for the best random-forest
# configurations found during tuning (up to 10, ranked by `ic` in `rf_daily_ic`).
# Capping at the number of available configurations avoids an IndexError when
# the tuning results hold fewer than 10 rows for this lookahead.
n_candidates = int((rf_daily_ic.lookahead == lookahead).sum())
n_best = min(10, n_candidates)
if n_best == 0:
    raise ValueError(
        f'No tuned RF configurations found for lookahead={lookahead}')

for position in range(n_best):
    params = get_rf_params(rf_daily_ic,
                           t=lookahead,
                           best=position).to_dict()

    # Values come back from the results table as floats; LightGBM needs ints.
    for p in ['min_data_in_leaf', 'max_depth']:
        params[p] = int(params[p])
    # These three entries steer the CV / training loop and are not LightGBM
    # parameters, so they are removed from the dict.
    train_length = int(params.pop('train_length'))
    test_length = int(params.pop('test_length'))
    num_boost_round = int(params.pop('boost_rounds'))
    params.update(base_params)

    print(f'\nPosition: {position:02}')

    # `years_OOS` years of out-of-sample data, split into consecutive test
    # windows of `test_length` periods; the model is retrained for each window.
    n_splits = int(YEAR * years_OOS / test_length)
    cv = MultipleTimeSeriesCV(n_splits=n_splits,
                              test_period_length=test_length,
                              lookahead=lookahead,
                              train_period_length=train_length)

    predictions = []
    for i, (train_idx, test_idx) in enumerate(cv.split(X=data), 1):
        print(i, end=' ', flush=True)

        # Training subset for this window.
        lgb_train = lgb_data.subset(used_indices=train_idx.tolist(),
                                    params=params).construct()
        # NOTE(review): `verbose_eval` was removed from `lgb.train` in
        # LightGBM 4.0; drop it or switch to callbacks if the library is upgraded.
        model = lgb.train(params=params,
                          train_set=lgb_train,
                          num_boost_round=num_boost_round,
                          verbose_eval=False)

        # Predict the test window, keeping the label next to the prediction.
        test_set = data.iloc[test_idx, :]
        y_test = test_set.loc[:, label].to_frame('y_test')
        y_pred = model.predict(test_set.loc[:, model.feature_name()])
        predictions.append(y_test.assign(prediction=y_pred))

    # One prediction column per configuration, named by its rank. Columns for
    # rank > 0 are aligned on the index produced by rank 0.
    if position == 0:
        test_predictions = (pd.concat(predictions)
                            .rename(columns={'prediction': position}))
    else:
        test_predictions[position] = pd.concat(predictions).prediction


def _period_ic(group, column):
    """Spearman rank correlation between the label and one prediction column."""
    return spearmanr(group.y_test, group[column])[0]


# Information coefficient per date for every stored configuration.
by_day = test_predictions.groupby(level='date')
ic_by_day = pd.DataFrame({
    position: by_day.apply(_period_ic, column=position)
    for position in range(n_best)
})
print(ic_by_day.describe())
test_predictions.to_hdf(store, f'rf/test/{lookahead:02}')

In [597]:
 n_splits = int(YEAR * years_OOS / test_length)

In [None]:
test_length

In [None]:
ic_by_day

In [None]:
 f'rf/test/{lookahead:02}'

In [None]:
test_set.loc['XLU', model.feature_name()]

In [None]:
num_boost_round

In [None]:
params

In [None]:
 model.feature_name()

In [None]:
#for par las 10 mejores configuracones de paramentros de las cuales almacenaremos sus predicciones
for position in range(10):
    params = get_rf_params(rf_daily_ic,
                            t=lookahead,
                            best=position)
    print (params)

In [None]:
params

## Verificamos que funciona el cross validation 

In [546]:
# Settings for a stand-alone check of the splitter.
# Note: this rebinds the notebook-level `lookahead` to 2.
lookahead = 2
train_period_length = 216
test_period_length = 4

# Original author's note: MultipleTimeSeriesCV always starts from the end, so
# the validation/test periods run backwards from the last date supplied,
# covering as many years as n_splits implies.
n_splits = int(YEAR * years_OOS / test_period_length)

cv = MultipleTimeSeriesCV(
    n_splits=n_splits,
    test_period_length=test_period_length,
    lookahead=lookahead,
    train_period_length=train_period_length,
)

In [None]:
n_splits

In [None]:
# Inspect (at most) the first 100 splits: confirm that train and test share no
# rows, then print the most common number of rows per ticker and the date range
# of each side.
for split_no, (train_idx, test_idx) in enumerate(cv.split(X=data), start=1):
    train = data.iloc[train_idx]
    test = data.iloc[test_idx]
    train_dates = train.index.get_level_values('date')
    test_dates = test.index.get_level_values('date')

    # A row present in both sets would disappear when duplicates are dropped.
    combined = pd.concat([train.reset_index(), test.reset_index()])
    assert len(combined) == len(combined.drop_duplicates()), \
        f'train and test overlap in split {split_no}'

    print(train.groupby(level='ticker').size().value_counts().index[0],
          train_dates.min().date(), train_dates.max().date(),
          test.groupby(level='ticker').size().value_counts().index[0],
          test_dates.min().date(), test_dates.max().date())

    if split_no == 100:
        break

In [None]:
n_splits

In [None]:
years_OOS