In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from utils.ml_utils import *

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterSampler
from tqdm import tqdm

from data_prep import load_features_data



**Load the features data. These are popular trend-following features used in prior literature.**

In [13]:
# Load the feature panel and preview its last five rows.
feats = load_features_data()
feats.tail(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,ret,rVol,1d_ret,1wk_ret,1m_ret,1Q_ret,6M_ret,12M_ret,feature_1d_ra,feature_1wk_ra,...,lag5_feature_MACD_short,lag5_feature_MACD_medium,lag5_feature_MACD_long,lag5_feature_skew6m,lag5_feature_skew12m,lag5_feature_kurt6m,lag5_feature_kurt12m,fwd_ret1d,target,targetBin
date,cluster,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2024-03-15,15,0.052935,0.010646,-0.009848,-0.017287,0.002016,0.071679,0.201208,0.202884,-0.925114,-0.726237,...,0.118525,0.407592,0.856639,0.066425,0.395266,2.899242,4.029588,,,
2024-03-15,16,2.355712,0.010487,0.003585,0.054587,0.082955,0.095509,0.074083,0.173772,0.341819,2.327878,...,0.464299,0.56147,0.464375,0.05151,0.388949,2.898267,4.018791,,,
2024-03-15,17,6.049651,0.010363,0.005988,0.013662,0.00898,0.21159,0.52995,0.66319,0.577818,0.589546,...,0.115569,0.447172,0.922814,0.039734,0.385207,2.909077,4.015372,,,
2024-03-15,18,1.81813,0.010196,-0.001021,0.011817,0.038944,-0.010944,0.054041,0.345464,-0.10013,0.518333,...,-0.049638,-0.360213,-0.198844,0.021089,0.386226,2.867876,4.029249,,,
2024-03-15,19,1.01405,0.010568,-0.018105,-0.010729,0.088893,0.143403,-0.0368,-0.004099,-1.713252,-0.454027,...,0.627858,0.65085,0.180584,-0.006129,0.37694,2.834308,3.977828,,,


**Extract the predictive features and the target, which is the one-day-forward risk-adjusted return, then drop rows with missing values in this subset.**

In [14]:
# Column groups, selected purely by column-name prefix.
features = [col for col in feats.columns if col.startswith("feature")]
lag_feats = [col for col in feats.columns if col.startswith("lag")]
target = ["target"]

# Column order of the modelling frame: features, then target, then lagged features.
all_feats = features + target + lag_feats

# Drop rows with a missing value in any selected column and keep only those
# columns. This is done as one non-mutating chain (no `inplace=True`) so the
# frame object loaded by the previous cell is not altered behind the reader's back.
feats = feats.dropna(subset=all_feats)[all_feats]

print("Shape of features", feats.shape)

Shape of features (117040, 79)


**Break out X and y and set up cross-validation**

In [15]:
# Working copy of the modelling columns (features, target and lagged features together).
X = feats[all_feats].copy()

# Baseline forest configuration. It is not referenced by the sampler below.
# n_jobs=-3 follows the joblib convention for negative values (all CPUs but two).
baseRF = RandomForestRegressor(max_depth=5,
                               n_estimators=1000,
                               max_features=1,
                               n_jobs=-3)

# Fixed seed for the hyper-parameter sampler. Without it the sampler draws from
# NumPy's global random state, so the candidate set changes on every iteration
# and every notebook run, which makes the tuning results irreproducible.
SAMPLER_SEED = 42

# simple-grid
grid = {'n_estimators': np.arange(100, 1000, 100),        # 100, 200, ..., 900
        'max_depth': [3, 6, 9],
        'max_features': [1, 'sqrt'],
        'min_weight_fraction_leaf': np.arange(0.0, 0.05, 0.005)}

# 25 random candidates from the grid; with a seed, every pass over `params`
# yields the same 25 candidates.
params = ParameterSampler(param_distributions=grid,
                          n_iter=25,
                          random_state=SAMPLER_SEED)


**Form the training loop. Here we train on 3-year expanding windows, using 90% of each split for training and 10% for tuning hyper-parameters. We then use the same model to forecast the following 3 years before re-training.**

In [16]:
# Fixed seed for every forest so validation scores, the selected hyper-parameters
# and the out-of-sample predictions are reproducible across runs.
MODEL_SEED = 42

fold_predictions = []   # one Series of test-set forecasts per split
scores = []             # one test-set MSE per split
for train, test in tqdm(get_cv_splits(X, split_length=252*3)):
    # break out X and y train, test
    X_train, y_train = train[features], train[target]
    X_test, y_test = test[features], test[target]

    # hold out part of the training data for hyper-parameter selection
    X_train2, X_val, y_train2, y_val = train_val_split(X_train, y_train)
    print(X_train2.shape, X_val.shape)

    # inner loop for parameter tuning: fit each sampled candidate on the reduced
    # training set and score it on the validation slice
    gscv_scores = {'scores': [], 'grid': []}
    for k, p in enumerate(params):
        # set_params overrides n_jobs/random_state even if the candidate carries them
        model = RandomForestRegressor(**p).set_params(n_jobs=-1, random_state=MODEL_SEED)
        # y is a one-column frame; flatten it to the 1-D array the forest expects
        model.fit(X_train2, y_train2.values.ravel())
        _pred = model.predict(X_val)
        _score = mean_squared_error(y_val, _pred)
        gscv_scores['scores'].append(_score)
        gscv_scores['grid'].append(p)
        print(f'Iter: {k}: Score: {_score}')

    # now fit the best model: lowest validation MSE wins (first one on ties),
    # refit on the full training split
    best_params = gscv_scores['grid'][int(np.argmin(gscv_scores['scores']))]
    print(best_params)
    best_model = RandomForestRegressor(**best_params).set_params(n_jobs=-1, random_state=MODEL_SEED)
    best_model.fit(X_train, y_train.values.ravel())
    preds = best_model.predict(X_test)

    # append the predictions, keeping the test index so they can be joined back later
    fold_predictions.append(pd.Series(index=y_test.index, data=preds))

    # score
    scores.append(mean_squared_error(y_test, preds))

# predictions: stack the per-split forecasts into a single-column frame
predictions = pd.concat(fold_predictions).to_frame("predictions")

0it [00:00, ?it/s]

7
(13626, 13) (1500, 13)
Iter: 0: Score: 1.3397313566930171
Iter: 1: Score: 1.345525274633928
Iter: 2: Score: 1.3365385044086682
Iter: 3: Score: 1.3369046968004257
Iter: 4: Score: 1.3369205424596684
Iter: 5: Score: 1.338609732396373
Iter: 6: Score: 1.347388426348643
Iter: 7: Score: 1.3369017352714256
Iter: 8: Score: 1.3376600332812947
Iter: 9: Score: 1.3361202229643547
Iter: 10: Score: 1.3374012750602722
Iter: 11: Score: 1.3496487757544722
Iter: 12: Score: 1.3375422779642598
Iter: 13: Score: 1.3387454043328562
Iter: 14: Score: 1.3365181370161672
Iter: 15: Score: 1.3369728200207018
Iter: 16: Score: 1.3362784894960376
Iter: 17: Score: 1.338226450065164
Iter: 18: Score: 1.3362800042660212
Iter: 19: Score: 1.33907438992572
Iter: 20: Score: 1.3381081790002205
Iter: 21: Score: 1.3380336643316648
Iter: 22: Score: 1.338157449163515
Iter: 23: Score: 1.3359969748498701
Iter: 24: Score: 1.3366596936366995
{'n_estimators': 800, 'min_weight_fraction_leaf': 0.01, 'max_features': 1, 'max_depth': 3}


1it [00:25, 25.54s/it]

(27252, 13) (3020, 13)
Iter: 0: Score: 1.443540647589011
Iter: 1: Score: 1.4429349189080458
Iter: 2: Score: 1.445924156148313
Iter: 3: Score: 1.4428475822893514
Iter: 4: Score: 1.4431436464479817
Iter: 5: Score: 1.443599920929328
Iter: 6: Score: 1.4431155251252523
Iter: 7: Score: 1.4431147866468095
Iter: 8: Score: 1.443912480843022
Iter: 9: Score: 1.4428546191092047
Iter: 10: Score: 1.4484132886470036
Iter: 11: Score: 1.4428105372211835
Iter: 12: Score: 1.4437952190390737
Iter: 13: Score: 1.4425370222422258
Iter: 14: Score: 1.4425874496586506
Iter: 15: Score: 1.4443828938571692
Iter: 16: Score: 1.4427482623568788
Iter: 17: Score: 1.4434206176452844
Iter: 18: Score: 1.4439928202828232
Iter: 19: Score: 1.4430660637690778
Iter: 20: Score: 1.4625343033204263
Iter: 21: Score: 1.4468367992527802
Iter: 22: Score: 1.4432645499078882
Iter: 23: Score: 1.4439796153445787
Iter: 24: Score: 1.4449680801279583
{'n_estimators': 600, 'min_weight_fraction_leaf': 0.045, 'max_features': 1, 'max_depth': 9}

2it [01:02, 32.50s/it]

(40860, 13) (4540, 13)
Iter: 0: Score: 1.3530363963627157
Iter: 1: Score: 1.3536065370414125
Iter: 2: Score: 1.353151170385124
Iter: 3: Score: 1.3546694566588493
Iter: 4: Score: 1.3536861431220026
Iter: 5: Score: 1.3522372528415323
Iter: 6: Score: 1.353984418588146
Iter: 7: Score: 1.3537224827660914
Iter: 8: Score: 1.3546709244906012
Iter: 9: Score: 1.3536000663295504
Iter: 10: Score: 1.353431265659952
Iter: 11: Score: 1.3523421569588645
Iter: 12: Score: 1.3538487508651396
Iter: 13: Score: 1.3534915461521937
Iter: 14: Score: 1.353497420081964
Iter: 15: Score: 1.3538229113246942
Iter: 16: Score: 1.3538382313662436
Iter: 17: Score: 1.3538743689400177
Iter: 18: Score: 1.3535383300403554
Iter: 19: Score: 1.3536615883051288
Iter: 20: Score: 1.3530149487653582
Iter: 21: Score: 1.354187962435328
Iter: 22: Score: 1.353804526928305
Iter: 23: Score: 1.3545064275398757
Iter: 24: Score: 1.3532688315831198
{'n_estimators': 300, 'min_weight_fraction_leaf': 0.005, 'max_features': 1, 'max_depth': 3}


3it [01:55, 41.76s/it]

(54468, 13) (6040, 13)
Iter: 0: Score: 1.3278544938645371
Iter: 1: Score: 1.3274615349169894
Iter: 2: Score: 1.3276735210630264
Iter: 3: Score: 1.328030542437902
Iter: 4: Score: 1.3275579229850525
Iter: 5: Score: 1.3279782255539576
Iter: 6: Score: 1.327733686457855
Iter: 7: Score: 1.327857776944277
Iter: 8: Score: 1.3283022585356332
Iter: 9: Score: 1.327714948840924
Iter: 10: Score: 1.3289712503859676
Iter: 11: Score: 1.3278807332452562
Iter: 12: Score: 1.3298225342933385
Iter: 13: Score: 1.3276261455084144
Iter: 14: Score: 1.3284008634069613
Iter: 15: Score: 1.3281702858366455
Iter: 16: Score: 1.3288429391427625
Iter: 17: Score: 1.327609172658639
Iter: 18: Score: 1.3320252053603128
Iter: 19: Score: 1.3276537143925513
Iter: 20: Score: 1.327758058652162
Iter: 21: Score: 1.3273451574385307
Iter: 22: Score: 1.3302876838687328
Iter: 23: Score: 1.3282726265816336
Iter: 24: Score: 1.3286144649263092
{'n_estimators': 100, 'min_weight_fraction_leaf': 0.04, 'max_features': 1, 'max_depth': 6}


4it [03:19, 58.48s/it]

(68076, 13) (7560, 13)
Iter: 0: Score: 1.301976780898697
Iter: 1: Score: 1.3019398985026815
Iter: 2: Score: 1.3017229487088977
Iter: 3: Score: 1.3021210457777437
Iter: 4: Score: 1.3020607800711408
Iter: 5: Score: 1.3018661757577146
Iter: 6: Score: 1.3020640459613042
Iter: 7: Score: 1.3017148435046082
Iter: 8: Score: 1.3019087403498055
Iter: 9: Score: 1.30223230723133
Iter: 10: Score: 1.3016559970064567
Iter: 11: Score: 1.3018301097834077
Iter: 12: Score: 1.3021042293122485
Iter: 13: Score: 1.3017692084572203
Iter: 14: Score: 1.3017173174997057
Iter: 15: Score: 1.3017812713838528
Iter: 16: Score: 1.3014511997230256
Iter: 17: Score: 1.3018780816759716
Iter: 18: Score: 1.3018941879694241
Iter: 19: Score: 1.3016964055708873
Iter: 20: Score: 1.3017062674829667
Iter: 21: Score: 1.3019902470303937
Iter: 22: Score: 1.303181000991906
Iter: 23: Score: 1.3017463944762875
Iter: 24: Score: 1.3018646305423307
{'n_estimators': 200, 'min_weight_fraction_leaf': 0.01, 'max_features': 'sqrt', 'max_depth'

5it [04:19, 58.84s/it]

(81684, 13) (9060, 13)
Iter: 0: Score: 1.3419159872864066
Iter: 1: Score: 1.341994957534767
Iter: 2: Score: 1.342276999449561
Iter: 3: Score: 1.3421013103837451
Iter: 4: Score: 1.3421179453026537
Iter: 5: Score: 1.3421366642028054
Iter: 6: Score: 1.3422656064540384
Iter: 7: Score: 1.3431303112527373
Iter: 8: Score: 1.3434249942584398
Iter: 9: Score: 1.3425004226618333
Iter: 10: Score: 1.342284581090663
Iter: 11: Score: 1.3422364986604964
Iter: 12: Score: 1.3427270853471847
Iter: 13: Score: 1.342733320540898
Iter: 14: Score: 1.3422420815510656
Iter: 15: Score: 1.3421743221294884
Iter: 16: Score: 1.3419755577852146
Iter: 17: Score: 1.3423722814458972
Iter: 18: Score: 1.3421461338643188
Iter: 19: Score: 1.342591735406409
Iter: 20: Score: 1.3419262132691203
Iter: 21: Score: 1.3418827312348398
Iter: 22: Score: 1.3419525857463248
Iter: 23: Score: 1.3436988342876024
Iter: 24: Score: 1.3421160874305798
{'n_estimators': 100, 'min_weight_fraction_leaf': 0.005, 'max_features': 1, 'max_depth': 3}


6it [05:52, 70.37s/it]

(95292, 13) (10580, 13)
Iter: 0: Score: 1.305711155937452
Iter: 1: Score: 1.3057110930031801
Iter: 2: Score: 1.3059276140234957
Iter: 3: Score: 1.3060095520304322
Iter: 4: Score: 1.305607765448201
Iter: 5: Score: 1.306398197770949
Iter: 6: Score: 1.3056074680999092
Iter: 7: Score: 1.3092310560078653
Iter: 8: Score: 1.3057712477593493
Iter: 9: Score: 1.3057797846987753
Iter: 10: Score: 1.305574238472711
Iter: 11: Score: 1.3055935614363827
Iter: 12: Score: 1.3055162881384295
Iter: 13: Score: 1.3056847287230702
Iter: 14: Score: 1.305824100611432
Iter: 15: Score: 1.3060469704651052
Iter: 16: Score: 1.3118334514664716
Iter: 17: Score: 1.3055941625643053
Iter: 18: Score: 1.3058402046902113
Iter: 19: Score: 1.3055445358112618
Iter: 20: Score: 1.3061756761081125
Iter: 21: Score: 1.3057385560238517
Iter: 22: Score: 1.3076844132634111
Iter: 23: Score: 1.3104996172742118
Iter: 24: Score: 1.3055953513471776
{'n_estimators': 300, 'min_weight_fraction_leaf': 0.02, 'max_features': 'sqrt', 'max_depth'

7it [08:00, 68.70s/it]


**Now we join the forecasted returns back into the returns DataFrame. From these we can create a simple portfolio-construction strategy that equal-weights the top half or the top quartile of the clusters. We can also create a long-short strategy that goes long the top quartile of clusters by forecasted return and short the bottom quartile, giving a market-neutral portfolio. The cell below plots cumulative returns by forecast quintile.**

In [17]:
# Reload the full panel to recover '1d_ret', which was dropped from `feats` when
# it was reduced to the modelling columns. A separate name is used on purpose:
# `all_feats` already holds the list of modelling column names, and rebinding it
# to a DataFrame would break any re-run of the earlier cells.
returns_panel = load_features_data()

# Select the forecast column before joining so that re-running this cell does not
# fail on an already-present '1d_ret' column.
predictions = predictions[['predictions']].join(returns_panel[['1d_ret']])

# Per-date quintile of the forecast (0 = lowest bin, 4 = highest bin).
# `transform` returns a result aligned to the original index, so no index-level
# dropping is needed and the result does not depend on pandas' group_keys handling.
predictions['signal_quintile'] = (
    predictions.groupby(by='date')['predictions']
    .transform(lambda x: pd.qcut(x, 5, labels=False))
)

# Lag the quintile by one row within each cluster -- presumably so each day's
# return is attributed to the signal formed on the previous date (assumes rows
# are in date order within a cluster; TODO confirm).
predictions['signal_quintiles_shift'] = predictions.groupby(by='cluster')['signal_quintile'].shift(1)

# Mean '1d_ret' per (date, lagged quintile), one column per quintile, compounded over time.
quintile_returns = predictions.groupby(by=['date', 'signal_quintiles_shift'])['1d_ret'].mean().unstack()
(1 + quintile_returns).cumprod().plot(title="Forecasted by Cluster Quintile")