# Parameter Tuning for Stock Selection Models

In [1]:
"""Import Modules"""
# Numerical Computation
import numpy as np
import pandas as pd
import random
# Data Processing
from data_pipeline import *
from sklearn import preprocessing
# Training
from train import ModelClass, MonteCarlo
from sklearn.model_selection import ParameterGrid
import time 

In [12]:
"""**************Hyperparameters************"""
# Location of the raw adjusted daily data. In the visible cells it is only
# referenced by the disabled cache-refresh lines (load_data(path, 'Open')).
path = r'../../data/adj_daily'
# Date range handed to bulk_process and calc_datepoints.
start = '2007-12-31'
end = '2019-8-5' #'2019-11-5'
# Date passed to calc_datepoints together with start/end;
# presumably the train/test boundary -- confirm against data_pipeline.
split = '2014-1-1'
# Window lengths passed to MonteCarlo, written as years * 12 * 20
# (the factors suggest 20 data points per month -- TODO confirm).
nTrain = 8*12*20 # 8 years
nVal = 1*12*20 # 1 year
# Labelling threshold: entries of Y strictly greater than thr get label 1.
thr = 0.005

"""**************Loading Data*****************"""
print("- Loading Data")

# Disabled one-off cache refresh: build price_df through
# load_data(path, 'Open') and write it to the CSV that is read below.
#price_df = load_data(path,'Open')
#price_df.to_csv('__datacache__/price_open_df.csv',index=False)

# Stock prices: read the cached CSV and parse `datetime` into timestamps.
price_df = pd.read_csv('__datacache__/price_open_df.csv').assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'])
)

# Benchmark prices: keep only date and open price, parse the date, and
# rename it to `datetime` so the column name matches price_df.
benchmark_df = (
    pd.read_csv('__datacache__/SPY_adj.csv')[['date', 'Open']]
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .rename(columns={'date': 'datetime'})
)
# Constant ticker label goes in as the first column.
benchmark_df.insert(0, 'ticker', 'SPY')


"""**************Data Preprocessing**************"""
print("- Data Preprocessing")
# Turn the open-price table into inputs X, targets Y and a lookup frame dt_map.
# bulk_process is a data_pipeline helper; the shapes and row alignment of its
# outputs are not visible here -- TODO confirm.
X, Y, dt_map= bulk_process(price_df, 'Open', start, end)
# Binary target: 1 where Y is strictly above thr, 0 otherwise.
Y_label = (Y>thr).astype(int)
# Share of positive labels, printed as a quick class-balance check.
print("ratio of true labels:",Y_label.sum()/len(Y))

# Presumably the number of date points on each side of `split` -- confirm
# against calc_datepoints in data_pipeline.
NTrain, NTest = calc_datepoints(dt_map, start, split, end)

# train-test split
# NOTE(review): train_test_split is not imported explicitly, so it presumably
# comes from the `from data_pipeline import *` line; its signature is not
# scikit-learn's.
idx = train_test_split(NTrain, NTest, dt_map, start=0, window = 'single')
# Every iteration rebinds the same names, so only the last (train, test) pair
# survives the loop; presumably window='single' yields exactly one pair --
# confirm. The counter `d` is not used in the body.
for d, (idx_train, idx_test) in enumerate(idx):
    X_train, X_test = X[idx_train],X[idx_test]
    Y_train_label, Y_test_label = Y_label[idx_train],Y_label[idx_test]
    # Matching rows of dt_map for each side, re-indexed from 0.
    train_map = dt_map.iloc[idx_train,:].reset_index(drop=True)
    test_map = dt_map.iloc[idx_test,:].reset_index(drop=True)

# Optional standardization (disabled): fit the scaler on the training
# features only, then apply the same transform to the test features.
# scaler = preprocessing.StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)


- Loading Data
- Data Preprocessing
ratio of true labels: 0.3724094546709313


## Random Forest

* Grid search over `n_estimators`, `max_depth` and `min_samples_split`

In [10]:
print("- Parameter Tuning")

# Search space: 3 x 8 x 10 = 240 parameter sets are evaluated below.
n_estimators = range(120,170,20)
max_depth = range(18,3,-2)
min_samples_split = range(5,200,20)

param_grid={'n_estimators':n_estimators, 'max_depth':max_depth,'min_samples_split':min_samples_split}
# Expand the grid into a list of dicts, one dict per parameter set.
param_grid = list(ParameterGrid(param_grid))

window = 'single'
w = 40
nReps = 1

# Column names for the six values returned by MonteCarlo.experiment; the loop
# below reads position 0 as accuracy and position 1 as precision.
metric_columns = ['Accuracy','Precision','Recall', 'F1', 'TPR', 'FPR']

# One single-row frame per parameter set, concatenated once after the loop.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call. If the loop is interrupted, the partial results are here.
score_frames = []
scores_rf = pd.DataFrame()

best_scores_prec= 0
best_scores_acc = 0
# Bound up front so the summary prints cannot raise NameError when no score
# beats the initial 0 (for example NaN scores or an empty grid).
best_param_acc = None
best_param_prec = None
start_time = time.time()
for j, param_set in enumerate(param_grid):
    print('----------------Start Parameter Set: n_estimators=',param_set.get('n_estimators'),
          'max_depth=',param_set.get('max_depth'),
          'min_samples_split=',param_set.get('min_samples_split'),
          '----------------')
    # NOTE(review): if ModelClass forwards these keyword arguments to
    # scikit-learn's RandomForestClassifier, max_features="auto" was deprecated
    # in scikit-learn 1.1 and removed in 1.3 ("sqrt" is the equivalent for
    # classifiers) -- confirm before upgrading.
    rf = ModelClass(model_type='rf',
                       n_estimators=param_set.get('n_estimators'),
                       oob_score=True,
                       max_depth=param_set.get('max_depth'),
                       min_samples_split=param_set.get('min_samples_split'),
                       min_samples_leaf=5,
                       max_features = "auto",
                       class_weight = {0:1,1:1.25},
                       n_jobs = -1)
    # NOTE(review): seed=j gives every parameter set a different seed, so score
    # differences may mix parameter effects with seed effects -- confirm intent.
    rf_mc = MonteCarlo(rf, X_train, Y_train_label, nTrain, nVal, nReps = nReps, window=window, verbose = 0, seed =j)
    cur_score = rf_mc.experiment(train_map)

    # Single-row record: the six metrics followed by the parameters used.
    tmp = pd.DataFrame(cur_score.reshape(1,6), columns = metric_columns)
    tmp['n_estimators']=param_set.get('n_estimators')
    tmp['max_depth']=param_set.get('max_depth')
    tmp['min_samples_split']=param_set.get('min_samples_split')
    score_frames.append(tmp)

    # Track the best parameter set separately for accuracy and for precision.
    if best_scores_acc < cur_score[0]:
        best_scores_acc = cur_score[0]
        best_param_acc = param_set

    if best_scores_prec < cur_score[1]:
        best_scores_prec = cur_score[1]
        best_param_prec = param_set
    print("******** average accuracy:",cur_score[0],"average precision:",cur_score[1]," ********")

# Same rows, columns and index as the former row-by-row append produced.
# pd.concat rejects an empty list, so keep the empty frame in that case.
if score_frames:
    scores_rf = pd.concat(score_frames)

print("best parameter set is %s, with Accuracy is %.4f" % (best_param_acc, best_scores_acc))
print("best parameter set is %s, with Precision is %.4f" % (best_param_prec, best_scores_prec))
print("finsied in %s seconds" %(time.time()-start_time))

- Parameter Tuning
----------------Start Parameter Set: n_estimators= 140 max_depth= 10 min_samples_split= 40 ----------------
----Now processing Repeat: 1----
---- Now processing Window 1/1 -----
----Rep 1 finished in 164.93 seconds----
 train accuracy: 0.6340, train precision: 0.5722, val accuracy: 0.5620, val precision: 0.4557, true signal: 0.4131
******** average accuracy: 0.5620057148124251 average precision: 0.45569814295725297  ********
best parameter set is {'max_depth': 10, 'min_samples_split': 40, 'n_estimators': 140}, with Accuracy is 0.5620
best parameter set is {'max_depth': 10, 'min_samples_split': 40, 'n_estimators': 140}, with Precision is 0.4557
finsied in 166.45820832252502 seconds


In [8]:
# Show the feature_importances_ array of the model wrapped by `rf`. `rf` is
# rebound on every iteration of the tuning loop, so this reflects whichever
# parameter set was built last in the kernel's history.
# NOTE(review): presumably scikit-learn's fitted attribute -- confirm via ModelClass.
rf.model.feature_importances_

array([0.06745959, 0.0752548 , 0.06292007, 0.06173221, 0.036046  ,
       0.03473335, 0.02735881, 0.04030383, 0.03338088, 0.03233428,
       0.02782534, 0.09174652, 0.10473973, 0.17001224, 0.00583387,
       0.01232227, 0.01352696, 0.02230979, 0.01254725, 0.00560555,
       0.00705185, 0.00231655, 0.00745654, 0.01162194, 0.00518772,
       0.00817213, 0.00753443, 0.00969897, 0.00296654])

In [9]:
# Largest single value in the feature_importances_ array of rf's wrapped model.
rf.model.feature_importances_.max()

0.17001223513721134

In [11]:
# Same expression as the earlier importance cell. The execution counts suggest
# it was evaluated again after the tuning cell was re-run, which would explain
# the slightly different values -- NOTE(review): consider keeping only one.
rf.model.feature_importances_

array([0.06727838, 0.07473722, 0.06176634, 0.06066314, 0.03507718,
       0.03509208, 0.02805362, 0.03882384, 0.03524727, 0.0318958 ,
       0.02741389, 0.08390091, 0.11132468, 0.17185005, 0.00694068,
       0.01261885, 0.01374641, 0.0232095 , 0.01321333, 0.00534401,
       0.00711293, 0.00231876, 0.00733214, 0.01161545, 0.0048838 ,
       0.00814273, 0.00716571, 0.01022682, 0.00300448])