In [1]:
import pandas as pd
import numpy as np
from data_mani.utils import merge_market_and_gtrends
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
from prediction.util import new_r2, add_shift
from prediction.util import get_selected_features
from prediction.models import RandomForestWrapper


from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import make_scorer
from IPython.display import display, HTML
from time import time
import os
from glob import glob

In [2]:
# Equity identifiers this notebook can choose from; a later cell selects one
# of them by position.
tickers = [
    "0910150D US Equity",
    "1288652D US Equity",
    "1831877D US Equity",
    "ANDV US Equity",
    "BCR US Equity",
    "GR US Equity",
    "HAR US Equity",
    "HAS US Equity",
    "HPC US Equity",
    "MAT US Equity",
    "NBL US Equity",
    "TSS US Equity",
    "TWX US Equity",
    "TXU US Equity",
    "WAMUQ US Equity",
]

## Variables

In [3]:
# Search-budget constants. NOTE(review): the search calls visible further down
# pass literal n_iter / n_splits / n_jobs values and never read these names --
# confirm whether they are meant to be wired in.
N_ITER_SEARCH = 5  # presumably the number of sampled hyperparameter candidates
N_SPLITS_SEARCH = 5  # presumably the number of cross-validation splits
N_JOBS = 2  # presumably the number of parallel workers

In [4]:
# Wall-clock timestamp taken at the start of the run; none of the visible
# cells reads it back.
init = time()

## Get selected features 

In [18]:
# Pick the ticker and the feature-selection run whose saved feature list
# should be loaded.
out_folder = "spx"
fs_method = "fake"
ticker_name = tickers[3]

select = get_selected_features(
    ticker_name=ticker_name,
    out_folder=out_folder,
    fs_method=fs_method,
)

print("number of selected features:", len(select))

number of selected features: 2219


## Get merged dataframe 

In [21]:
# Path of the per-ticker CSV, relative to the notebook's working directory.
ticker_path = "data/index/{}/{}.csv".format(out_folder,ticker_name)
# Load and split the data; test_size=0.5 yields equally sized halves
# (see the shapes displayed below).
train, test = merge_market_and_gtrends(ticker_path, test_size=0.5)
# Show both shapes as the cell output.
train.shape, test.shape

((1856, 183), (1856, 183))

## Add shift + Preprocessing

In [22]:
# Columns to lag: every column of the raw training frame except the target.
# NOTE: this cell rebinds `train` and `test`, so it is not idempotent --
# re-running it would derive `words` from the already-reduced frame.
words = train.drop(columns="target_return").columns.to_list()


def _lag_select_fill(frame, words, select, max_lag=20):
    """
    Add lagged columns to 'frame', keep the target plus the selected
    features, and replace missing values with 0.0.

    'add_shift' is called for its effect on 'frame' (its return value is
    not used), so the caller's frame is modified.

    :param frame: market + trends data containing "target_return"
    :type frame: pd.DataFrame
    :param words: names of the columns to lag
    :type words: [str]
    :param select: names of the feature columns to keep
    :type select: [str]
    :param max_lag: largest lag passed to 'add_shift'
    :type max_lag: int
    :return: frame restricted to target + selected features, NaN-free
    :rtype: pd.DataFrame
    """
    add_shift(merged_df=frame, words=words, max_lag=max_lag)
    return frame[["target_return"] + select].fillna(0.0)


# The same word list (taken from train) is applied to both halves.
train = _lag_select_fill(train, words, select)
test = _lag_select_fill(test, words, select)

add shift: 100%|██████████| 182/182 [00:48<00:00,  3.78it/s]
add shift: 100%|██████████| 182/182 [00:40<00:00,  4.51it/s]


In [23]:
# Shapes after lagging and feature selection: the column count should be
# len(select) + 1 (the target column).
train.shape, test.shape

((1856, 2220), (1856, 2220))

In [24]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Wrapper object that provides the estimator (`ModelClass`) and the search
# space (`param_grid`) consumed by hyper_params_search below.
# NOTE(review): a later cell creates a fresh RandomForestWrapper again before
# the first search, so this instance looks redundant.
rf_wrapper = RandomForestWrapper()

# Earlier inline draft of the search space, left commented out.
# model_name = "random_forest"
# rf_params =  {"max_features":['auto', 'sqrt', 'log2'],
#               "min_samples_split":sp_randint(2, 31),
#               "n_estimators": sp_randint(2, 301),
#               "max_depth": sp_randint(2, 20)}

# RandomForestRegressor()

## Hyperparameter search

In [25]:
def hyper_params_search(df,
                        wrapper,
                        n_iter,
                        n_splits,
                        n_jobs,
                        target_name="target_return",
                        random_state=None):
    """
    Use the dataframe 'df' to search for the best
    params for the model 'wrapper'.

    The candidates are sampled with RandomizedSearchCV and scored with
    'new_r2'. The CV split is performed using the TimeSeriesSplit
    class, so the row order of 'df' is taken as the time order.

    Reference cited by the original author:
    "Empirical Asset Pricing via Machine Learning"

    :param df: train data
    :type df: pd.DataFrame
    :param wrapper: predictive model; must expose 'ModelClass'
                    (the estimator handed to the search) and
                    'param_grid' (the parameter distributions)
    :type wrapper: sklearn model wrapper
    :param n_iter: number of hyperparameter searches
    :type n_iter: int
    :param n_splits: number of cross-validation splits
    :type n_splits: int
    :param n_jobs: number of parallel jobs used by the search
    :type n_jobs: int
    :param target_name: name of the target column in 'df'
    :type target_name: str
    :param random_state: seed for the candidate sampling; the default
                         None keeps the previous unseeded behaviour.
                         It does not seed the estimator itself.
    :type random_state: int or None
    :return: fitted search object (exposes 'best_estimator_',
             'best_params_', ...)
    :rtype: RandomizedSearchCV
    """

    # 'columns=' replaces the positional axis argument, which recent
    # pandas versions no longer accept.
    X = df.drop(columns=target_name).values
    y = df[target_name].values

    time_split = TimeSeriesSplit(n_splits=n_splits)
    r2_scorer = make_scorer(new_r2)

    model_search = RandomizedSearchCV(estimator=wrapper.ModelClass,
                                      param_distributions=wrapper.param_grid,
                                      n_iter=n_iter,
                                      cv=time_split,
                                      verbose=1,
                                      n_jobs=n_jobs,
                                      scoring=r2_scorer,
                                      random_state=random_state)

    # fit() returns the search object itself
    return model_search.fit(X, y)



In [26]:
# Randomized search over the whole training frame with a small budget:
# 3 sampled candidates, 2 time-ordered folds, 2 workers.
rf_wrapper = RandomForestWrapper()
model_search = hyper_params_search(
    df=train,
    wrapper=rf_wrapper,
    n_iter=3,
    n_splits=2,
    n_jobs=2,
)

Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:   59.0s finished


In [39]:
# Display the preprocessed training frame (pandas truncates the rendering).
train

Unnamed: 0_level_0,target_return,home_20,money_14,dow_jones_2,tourism_20,carolina_14,sell_20,return_13,present_17,loss_19,...,virginia_8,community_6,cash_4,return_12,BUY_AND_HOLD_2,companies_11,elections_4,democratic_1,gold_17,greed_14
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-01-02,0.016472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2004-01-05,0.039838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2004-01-06,-0.014935,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2004-01-07,-0.017798,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,-2.0,0.0,0.0
2004-01-08,-0.026846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2011-05-10,0.000388,-1.0,-1.0,0.0,0.0,-2.0,0.0,-1.0,0.0,0.0,...,1.0,4.0,0.0,0.0,0.0,2.0,-1.0,0.0,0.0,0.0
2011-05-11,-0.047287,-2.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,...,-2.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2011-05-12,-0.031733,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,-1.0,...,1.0,0.0,0.0,0.0,0.0,-1.0,-1.0,1.0,1.0,0.0
2011-05-13,-0.015546,-1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,-2.0,1.0,0.0,0.0,0.0,1.0,-1.0,0.0,0.0


In [36]:
# Repeat the search on the rows dated 2006 only, reusing the existing
# `rf_wrapper`. Rows are selected with `.loc`: a bare `train["2006"]` is a
# column lookup in recent pandas and no longer selects rows by date string.
# This rebinds `model_search`, replacing the search fitted on the full frame.
model_search = hyper_params_search(
    df=train.loc["2006"],
    wrapper=rf_wrapper,
    n_jobs=2,
    n_iter=3,
    n_splits=2,
)

Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:    7.7s finished


In [37]:
# Show the configuration of the most recently fitted search object.
model_search

RandomizedSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=2),
                   estimator=RandomForestRegressor(), n_iter=3, n_jobs=2,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff2e82353a0>,
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff2a3497880>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff2a3498c10>},
                   scoring=make_scorer(new_r2), verbose=1)

In [27]:
# Out-of-sample score of the best estimator found by the current search.
# NOTE(review): this cell's execution count is lower than that of the cells
# placed above it, so the recorded score may come from a different
# `model_search` than the one a top-to-bottom run would produce.
# `columns=` replaces the positional axis argument, which recent pandas
# versions no longer accept.
X_test = test.drop(columns="target_return").values
y_test = test.target_return.values

test_pred = model_search.best_estimator_.predict(X_test)

r2_test = new_r2(y_test, test_pred)
r2_test

-0.06225914585345893

In [43]:
# In-sample score of the same best estimator, for comparison with the
# out-of-sample score. `columns=` replaces the positional axis argument,
# which recent pandas versions no longer accept.
X_train = train.drop(columns="target_return").values
y_train = train.target_return.values
pred = model_search.best_estimator_.predict(X_train)

r2_ = new_r2(y_train, pred)
r2_

0.023994168654803527

In [29]:
# Hyperparameters of the best candidate found by the current search.
model_search.best_params_

{'max_depth': 5,
 'max_features': 'auto',
 'min_samples_split': 16,
 'n_estimators': 252}

In [44]:
# Stack train on top of test into one frame; the year-by-year evaluation
# below slices this frame by date.
complete = pd.concat([train,test])

In [None]:
# model_search.best_params_

In [70]:
from tqdm import tqdm

# Walk-forward evaluation: for each year, search/fit on all data up to and
# including that year, then predict the following year.
all_preds = []

# Calendar years present in the combined frame, ascending.
years = sorted({stamp.year for stamp in complete.index})
years = years[:-1]  # the last year has no following year to predict
years = years[:4]   # only the first four refits are run here

for year in tqdm(years):
    # `.loc` is required for date-string row selection: a bare
    # `complete["2005"]` is a column lookup in recent pandas.
    train_ys = complete.loc[:str(year)]
    test_ys = complete.loc[str(year + 1)]

    rf_wrapper = RandomForestWrapper()
    model_search = hyper_params_search(df=train_ys,
                                       wrapper=rf_wrapper,
                                       n_jobs=2,
                                       n_iter=1,
                                       n_splits=2)

    # `columns=` replaces the positional axis argument, which recent pandas
    # versions no longer accept.
    X_test = test_ys.drop(columns="target_return").values
    y_test = test_ys.target_return.values
    test_pred = model_search.best_estimator_.predict(X_test)

    # One frame per predicted year; they are concatenated after the loop.
    result = pd.DataFrame({"date": test_ys.index,
                           "return": y_test,
                           "prediction": test_pred})
    all_preds.append(result)


  0%|          | 0/4 [00:00<?, ?it/s]

Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    3.7s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    3.7s finished
 25%|██▌       | 1/4 [00:07<00:23,  7.68s/it][Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   17.5s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   17.5s finished
 50%|█████     | 2/4 [00:49<00:35, 17.93s/it][Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    4.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    4.2s finished
 75%|███████▌  | 3/4 [00:59<00:15, 15.41s/it][Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   46.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   46.2s finished
100%|██████████| 4/4 [02:57<00:00, 44.27s/it]


In [77]:
# Pool the per-year prediction frames and score all predictions together.
df = pd.concat(all_preds)
r2_ = new_r2(df["return"].values, df["prediction"].values)
r2_

-0.008211028150391986

In [79]:
# Echo the ticker these results belong to.
ticker_name

'ANDV US Equity'