In [1]:
import pandas as pd
import numpy as np
from data_mani.utils import merge_market_and_gtrends
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
from prediction.util import new_r2, add_shift
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import make_scorer
from IPython.display import display, HTML
from time import time
import os

## Variables

In [2]:
# Search budget and parallelism for the hyper-parameter tuning further down.
N_JOBS = 2    # worker count handed to RandomizedSearchCV
N_SPLITS = 5  # number of folds built by TimeSeriesSplit
N_ITER = 5    # parameter settings sampled by RandomizedSearchCV

In [3]:
# Wall-clock start mark; the timing cell near the end subtracts it from time().
init = time()

## Get selected features 

In [4]:
# Identify the run whose feature-selection scores are loaded.
ticker_name = "AMZN US Equity"
fs_method = "sfi"
model_name = "random_forest"
out_folder = "nasdaq"

# One CSV per ticker, nested under the feature-selection method and folder.
score_path = "results/feature_selection/{}/{}/{}.csv".format(fs_method, out_folder, ticker_name)
score_table = pd.read_csv(score_path)

# Keep the names of the features whose score is strictly above the mean score.
cut = score_table["feature_score"].mean()
above_cut = score_table["feature_score"] > cut
scores = score_table.loc[above_cut, "feature"].to_list()

## Get merged dataframe 

In [5]:
# Path to this ticker's CSV under data/crsp/<out_folder>/.
ticker_path = "data/crsp/{}/{}.csv".format(out_folder,ticker_name)
# Project helper returning a (train, test) pair; test_size=0.5 asks for half the rows in each.
# NOTE(review): presumably joins market returns with Google Trends series (inferred from
# the helper's name only) - confirm against data_mani.utils.
train, test = merge_market_and_gtrends(ticker_path, test_size=0.5)

In [6]:
# Sanity check: (rows, columns) of both splits right after loading.
train.shape, test.shape

((2087, 183), (2087, 183))

In [7]:
# Lag depth forwarded to add_shift as max_lag.
MAX_LAG = 20

# Every column except the target is a candidate series to lag.
# `columns=` replaces the positional axis argument (`drop("target_return", 1)`),
# which pandas deprecated in 1.3 and removed in 2.0.
words = train.drop(columns="target_return").columns.to_list()


def _lag_and_select(frame):
    """Add lagged columns to `frame`, then keep the target plus the selected features.

    add_shift is called for its in-place effect on `frame` (its return value is
    not used); the columns named in `scores` are only available after that call.
    Remaining missing values are replaced with 0.
    NOTE(review): the NaNs presumably come from the shifting itself - confirm.
    """
    add_shift(merged_df=frame, words=words, max_lag=MAX_LAG)
    selected = frame[["target_return"] + scores]
    return selected.fillna(0)


# Same pipeline for both splits; `words` is computed once from the train columns.
train = _lag_and_select(train)
test = _lag_and_select(test)

add shift: 100%|██████████| 182/182 [01:09<00:00,  2.64it/s]
add shift: 100%|██████████| 182/182 [01:06<00:00,  2.73it/s]


In [8]:
# Sanity check: (rows, columns) of both splits after lagging and feature selection.
train.shape, test.shape

((2087, 2006), (2087, 2006))

In [9]:
# Accumulator for the model comparison table: each modelling cell appends one
# entry to every list (train score, test score, model label).
results = {column: [] for column in ("train", "test", "model")}

In [10]:
# Split each frame into a feature matrix and a target vector.
# `columns=` replaces the positional axis argument, which pandas removed in 2.0.
X_train = train.drop(columns="target_return").values
y_train = train["target_return"].values

X_test = test.drop(columns="target_return").values
y_test = test["target_return"].values

# Baseline forest: 100 trees, everything else left at scikit-learn defaults.
# random_state pins the bootstrap/feature sampling so the scores below are
# reproducible across kernel restarts; N_JOBS keeps parallelism in the config cell.
rf = RandomForestRegressor(n_estimators=100, n_jobs=N_JOBS, random_state=42)
rf = rf.fit(X_train, y_train)

pred_train = rf.predict(X_train)
pred_test = rf.predict(X_test)

# new_r2 is the project's scoring helper; it is called as (y_true, y_pred).
r2_train = new_r2(y_train, pred_train)
r2_test = new_r2(y_test, pred_test)

# NOTE: re-running this cell appends a second "Default Random Forest" row,
# because `results` is only reset by the cell that creates it.
results["train"].append(r2_train)
results["test"].append(r2_test)
results["model"].append("Default Random Forest")

# Comparison table, best test score first.
result = pd.DataFrame(results).set_index("model").sort_values("test", ascending=False).to_html()
display(HTML(result))

Unnamed: 0_level_0,train,test
model,Unnamed: 1_level_1,Unnamed: 2_level_1
Default Random Forest,0.852084,-0.067037


## Random Forest

In [11]:
# Ordered cross-validation splitter with N_SPLITS folds.
time_split = TimeSeriesSplit(n_splits=N_SPLITS)
# Wrap the project's new_r2 metric so the search can use it as its scoring function.
r2_scorer = make_scorer(new_r2)
n_columns = X_train.shape[1]

# Search space:
#   max_features: 1 .. floor(sqrt(number of features))
#   n_estimators: 2 .. 59
#   max_depth:    2 .. 20
rf_params =  {"max_features":list(range(1, int(np.sqrt(n_columns)+1))),
              "n_estimators": list(range(2, 60)),
              "max_depth": list(range(2, 21))}

# Both sources of randomness are seeded: the search's random_state fixes which
# N_ITER candidates are sampled, the estimator's random_state fixes how each
# forest is grown. Without them the selected parameters and scores change on
# every run.
rf_search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=42),
                               param_distributions=rf_params,
                               n_iter=N_ITER,
                               cv=time_split,
                               verbose=1,
                               n_jobs=N_JOBS,
                               scoring=r2_scorer,
                               random_state=42)

rf_search = rf_search.fit(X_train,y_train)
# best_estimator_ is the winning candidate exposed by the fitted search.
rf_train_pred  = rf_search.best_estimator_.predict(X_train)
rf_test_pred  = rf_search.best_estimator_.predict(X_test)

# These overwrite the baseline's r2_train / r2_test from the previous cell.
r2_train = new_r2(y_train, rf_train_pred)
r2_test = new_r2(y_test, rf_test_pred)

results["train"].append(r2_train)
results["test"].append(r2_test)
results["model"].append("Tunned Random Forest")

# Comparison table, best test score first, followed by the chosen parameters.
result = pd.DataFrame(results).set_index("model").sort_values("test", ascending=False).to_html()
display(HTML(result))
print(rf_search.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  25 out of  25 | elapsed:    4.9s finished


Unnamed: 0_level_0,train,test
model,Unnamed: 1_level_1,Unnamed: 2_level_1
Tunned Random Forest,0.38954,-0.008931
Default Random Forest,0.852084,-0.067037


{'n_estimators': 19, 'max_features': 17, 'max_depth': 11}


In [12]:
# Attach the tuned model's predictions to each split; a later cell reads this column.
# NOTE: after this cell `train`/`test` contain a "prediction" column, so re-running
# the modelling cells without rebuilding the frames would feed it in as a feature.
train.loc[:, "prediction"] = rf_train_pred
test.loc[:, "prediction"] = rf_test_pred

# Stack train on top of test, keeping only the realised and predicted values.
both = [train[["target_return", "prediction"]], test[["target_return", "prediction"]]]
complete_forecast = pd.concat(both)

# results/forecast/<fs_method>/<model_name>/<out_folder>/<ticker_name>
# NOTE(review): the file is written without a ".csv" extension, unlike the input
# paths used earlier - left unchanged in case downstream readers rely on it; confirm.
out_path_l = ["results", "forecast",fs_method,model_name,out_folder,ticker_name ]
out_path = os.path.join(*out_path_l)

# to_csv does not create missing parent folders, so create them first.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
complete_forecast.to_csv(out_path)

In [13]:
# Minutes elapsed since `init` was captured near the top of the notebook.
elapsed_seconds = time() - init
tempo = elapsed_seconds / 60
print(np.round(tempo, 2), "min")

3.92 min


In [22]:
# Build the labelled forecast table without writing a "type" column into
# `train`/`test` themselves: a string column left on those frames would end up
# in the feature matrix if the modelling cells were run again.
forecast_columns = ["target_return", "prediction"]

both = [train[forecast_columns].assign(type="train"),
        test[forecast_columns].assign(type="test")]

# Train rows first, then test rows; columns are target_return, prediction, type.
complete_forecast = pd.concat(both)

In [27]:
# Recompute the out-of-sample score from the rows labelled "test" in the
# concatenated forecast table.
is_test_row = complete_forecast["type"] == "test"
t = complete_forecast.loc[is_test_row]
tt = new_r2(t["target_return"].values, t["prediction"].values)

In [29]:
# Consistency check: the score recomputed from the forecast table next to the
# r2_test produced by the tuning cell (the recorded output shows them equal).
tt, r2_test

(-0.008931305820449209, -0.008931305820449209)