In [15]:
import pandas as pd
import numpy as np
from data_mani.utils import merge_market_and_gtrends
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
from prediction.util import new_r2, add_shift
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import make_scorer
from IPython.display import display, HTML
from glob import glob

In [18]:
names = glob("data/index/spx/*.csv")
names = [n for n in names if n.find("(1)")<0]
names.sort()

In [19]:
names

['data/index/spx/0111145D UN Equity.csv',
 'data/index/spx/0202445Q UN Equity.csv',
 'data/index/spx/0203524D UN Equity.csv',
 'data/index/spx/0226226D UN Equity.csv',
 'data/index/spx/0544749D UN Equity.csv',
 'data/index/spx/0574018D UN Equity.csv',
 'data/index/spx/0772031D UN Equity.csv',
 'data/index/spx/0848680D UN Equity.csv',
 'data/index/spx/0867887D UN Equity.csv',
 'data/index/spx/0867887D US Equity.csv',
 'data/index/spx/0910150D US Equity.csv',
 'data/index/spx/0948669D UN Equity.csv',
 'data/index/spx/0961514D UN Equity.csv',
 'data/index/spx/0964591D UQ Equity.csv',
 'data/index/spx/0964591D UW Equity.csv',
 'data/index/spx/1028411Q UN Equity.csv',
 'data/index/spx/1086832D UN Equity.csv',
 'data/index/spx/1255173D UN Equity.csv',
 'data/index/spx/1255459D UW Equity.csv',
 'data/index/spx/1280712D UQ Equity.csv',
 'data/index/spx/1280712D UW Equity.csv',
 'data/index/spx/1281683D UN Equity.csv',
 'data/index/spx/1284849D UN Equity.csv',
 'data/index/spx/1288453D UW Equit

In [13]:
for n in names:
    name1 = n 
    name2 = n.replace("(1)","")
    a = pd.read_csv(name1)
    b = pd.read_csv(name2)
    print(n,"DIFF ->", np.sum(a.values != b.values))

data/index/spx/1448062D UW Equity(1).csv DIFF -> 1
data/index/spx/2999130Q UN Equity(1).csv DIFF -> 1
data/index/spx/ABKFQ UN Equity(1).csv DIFF -> 1
data/index/spx/AN UN Equity(1).csv DIFF -> 1
data/index/spx/AZO UN Equity(1).csv DIFF -> 1
data/index/spx/COO UN Equity(1).csv DIFF -> 1
data/index/spx/DPHIQ UN Equity(1).csv DIFF -> 1
data/index/spx/QCP UN Equity(1).csv DIFF -> 1
data/index/spx/TER UN Equity(1).csv DIFF -> 1


## Variables

In [11]:
N_ITER = 500
N_SPLITS = 5
N_JOBS = 2

## Get selected features 

In [3]:
ticker_name =  "AMZN US Equity"
method = "sfi"
out_folder = "nasdaq"
score_path = "results/{}/{}/{}.csv".format(method,out_folder,ticker_name)
scores =  pd.read_csv(score_path)
cut = scores.feature_score.mean()
scores = (scores.loc[scores.feature_score > cut]).feature.to_list()

## Get merged dataframe 

In [4]:
ticker_path = "data/crsp/{}/{}.csv".format(out_folder,ticker_name)
train, test = merge_market_and_gtrends(ticker_path, test_size=0.5)

In [5]:
train.shape, test.shape

((2087, 183), (2087, 183))

In [6]:
words = train.drop("target_return",1).columns.to_list()

add_shift(merged_df=train, words=words, max_lag=20)
train = train[["target_return"] + scores]
train = train.fillna(0)

add_shift(merged_df=test, words=words, max_lag=20)
test = test[["target_return"] + scores]
test = test.fillna(0)

add shift: 100%|██████████| 182/182 [01:01<00:00,  2.96it/s]
add shift: 100%|██████████| 182/182 [00:55<00:00,  3.26it/s]


In [7]:
train.shape, test.shape

((2087, 2006), (2087, 2006))

In [8]:
results = {"train":[],
           "test":[],
           "model":[]}

In [9]:
X_train = train.drop("target_return",1).values
y_train = train.target_return.values

X_test = test.drop("target_return",1).values
y_test = test.target_return.values

rf = RandomForestRegressor(n_estimators=100, n_jobs=2)
rf = rf.fit(X_train, y_train)

pred_train = rf.predict(X_train)
pred_test = rf.predict(X_test)

r2_train = new_r2(y_train, pred_train)
r2_test = new_r2(y_test, pred_test)

results["train"].append(r2_train)
results["test"].append(r2_test)
results["model"].append("Default Random Forest")

result = pd.DataFrame(results).set_index("model").sort_values("test", ascending=False).to_html()
display(HTML(result))

Unnamed: 0_level_0,train,test
model,Unnamed: 1_level_1,Unnamed: 2_level_1
Default Random Forest,0.850046,-0.0589


## Random Forest

In [12]:
time_split = TimeSeriesSplit(n_splits=N_SPLITS)
r2_scorer = make_scorer(new_r2)
n_columns = X_train.shape[1]
rf_params =  {"max_features":list(range(1, int(np.sqrt(n_columns)+1))),
              "n_estimators": list(range(2, 60)),
              "max_depth": list(range(2, 21))}

rf_search = RandomizedSearchCV(estimator=RandomForestRegressor(),
                               param_distributions=rf_params,
                               n_iter=N_ITER,
                               cv=time_split,
                               verbose=1,
                               n_jobs=N_JOBS,
                               scoring=r2_scorer)

rf_search = rf_search.fit(X_train,y_train)
rf_train_pred  = rf_search.best_estimator_.predict(X_train)
rf_test_pred  = rf_search.best_estimator_.predict(X_test)

r2_train = new_r2(y_train, rf_train_pred)
r2_test = new_r2(y_test, rf_test_pred)

results["train"].append(r2_train)
results["test"].append(r2_test)
results["model"].append("Tunned Random Forest")

result = pd.DataFrame(results).set_index("model").sort_values("test", ascending=False).to_html()
display(HTML(result))
print(rf_search.best_params_)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  88 tasks      | elapsed:    8.5s
[Parallel(n_jobs=2)]: Done 380 tasks      | elapsed:   49.3s
[Parallel(n_jobs=2)]: Done 672 tasks      | elapsed:  1.4min
[Parallel(n_jobs=2)]: Done 1336 tasks      | elapsed:  2.8min
[Parallel(n_jobs=2)]: Done 1844 tasks      | elapsed:  4.3min
[Parallel(n_jobs=2)]: Done 2432 tasks      | elapsed:  6.0min
[Parallel(n_jobs=2)]: Done 2500 out of 2500 | elapsed:  6.1min finished


Unnamed: 0_level_0,train,test
model,Unnamed: 1_level_1,Unnamed: 2_level_1
Tunned Random Forest,0.491416,-0.006191
Tunned Random Forest,0.375408,-0.009447
Default Random Forest,0.850046,-0.0589


{'n_estimators': 42, 'max_features': 33, 'max_depth': 10}
