In [1]:
import pandas as pd
import numpy as np
from data_mani.utils import merge_market_and_gtrends
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
from prediction.util import new_r2, add_shift
from prediction.util import get_selected_features
from prediction.models import RandomForestWrapper


from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import make_scorer
from IPython.display import display, HTML
from time import time
import os

## Variables

In [2]:
# Search-budget settings for the hyperparameter search.
# NOTE(review): the search cell further down passes literal values
# (n_iter=3, n_splits=2, n_jobs=2) instead of these constants — confirm
# which set is intended and wire the constants through if so.
N_ITER_SEARCH = 5  # presumably the number of random candidates to sample
N_SPLITS_SEARCH = 5  # presumably the number of time-series CV folds
N_JOBS = 2  # presumably the number of parallel workers

In [3]:
# Wall-clock start of the run; the only reader is the commented-out
# elapsed-time cell near the end of the notebook.
init = time()

## Get selected features 

In [4]:
# Experiment identifiers: which ticker, which feature-selection method,
# and which output folder the selection results are read from.
out_folder = "nasdaq"
fs_method = "sfi"
ticker_name = "AMZN US Equity"

# Names of the features kept by the feature-selection step for this ticker.
select = get_selected_features(
    ticker_name=ticker_name,
    out_folder=out_folder,
    fs_method=fs_method,
)

print("number of selected features:", len(select))

number of selected features: 2005


## Get merged dataframe 

In [5]:
# CSV with the market data for the chosen ticker.
ticker_path = f"data/crsp/{out_folder}/{ticker_name}.csv"

# Merge market data with the Google Trends series and split it in two;
# test_size=0.5 asks for half of the rows to be held out.
train, test = merge_market_and_gtrends(ticker_path, test_size=0.5)

In [6]:
# Sanity check on the split: (rows, columns) of each half before any lagging.
train.shape, test.shape

((2087, 183), (2087, 183))

In [7]:
# 182 matches the number of predictor columns (183 columns minus the target)
# and 20 matches the max_lag used when adding shifts, so 182*20 is the number
# of lagged features that get generated.
# NOTE(review): the square root is presumably a sqrt(n_features) reference
# point for the forest's max_features — confirm, and derive it from the data
# instead of literals if it is meant to be kept.
np.sqrt(182*20)

60.332412515993425

## Add shift + Preprocessing

In [8]:
# Names of the raw predictor columns (everything except the target).
# `columns=` is used instead of the positional axis argument: passing the
# axis positionally to DataFrame.drop was deprecated in pandas 1.3 and
# raises a TypeError from pandas 2.0 onwards.
words = train.drop(columns="target_return").columns.to_list()

# add_shift is called for its side effect on the frame it receives (its
# return value is not used); afterwards only the target and the selected
# features are kept, and missing values are replaced by 0.0.
# NOTE(review): this cell rebinds `train`/`test` to the reduced frames, so it
# cannot be re-run without re-running the loading cell first.
add_shift(merged_df=train, words=words, max_lag=20)
train = train[["target_return"] + select]
train = train.fillna(0.0)

# Same transformation for the hold-out half, using the column list taken
# from the training half.
add_shift(merged_df=test, words=words, max_lag=20)
test = test[["target_return"] + select]
test = test.fillna(0.0)

add shift: 100%|██████████| 182/182 [01:20<00:00,  2.25it/s]
add shift: 100%|██████████| 182/182 [01:06<00:00,  2.72it/s]


In [9]:
# Shapes after lagging and feature selection: the column count should equal
# the number of selected features plus one for the target.
train.shape, test.shape

((2087, 2006), (2087, 2006))

In [10]:
# NOTE(review): these imports sit mid-notebook and are only referenced by the
# commented-out parameter grid below (sp_uniform is not referenced at all in
# the visible cells) — consider moving them to the import cell or dropping
# them together with the dead code.
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Wrapper that supplies the estimator and the parameter distributions used by
# the hyperparameter search.
# NOTE(review): the search cell creates a new RandomForestWrapper again, so
# this instance is overwritten before it is used.
rf_wrapper = RandomForestWrapper()

# Earlier inline definition of the search space, kept for reference.
# model_name = "random_forest"
# rf_params =  {"max_features":['auto', 'sqrt', 'log2'],
#               "min_samples_split":sp_randint(2, 31),
#               "n_estimators": sp_randint(2, 301),
#               "max_depth": sp_randint(2, 20)}

# RandomForestRegressor()

## Hyperparameter search

In [11]:
def hyper_params_search(df,
                        wrapper,
                        n_iter,
                        n_splits,
                        n_jobs,
                        target_name="target_return",
                        random_state=None):
    """
    Run a randomized hyperparameter search for the model
    held by 'wrapper', using the dataframe 'df' as training data.

    The CV split is performed using the TimeSeriesSplit
    class, and candidates are scored with 'new_r2'.

    Reference: "Empirical Asset Pricing via Machine
    Learning"

    :param df: train data (target column plus feature columns)
    :type df: pd.DataFrame
    :param wrapper: object exposing the estimator as 'ModelClass'
                    and the search space as 'param_grid'
    :type wrapper: sklearn model wrapper
    :param n_iter: number of hyperparameter candidates to sample
    :type n_iter: int
    :param n_splits: number of cross-validation splits
    :type n_splits: int
    :param n_jobs: number of jobs run in parallel by the search
    :type n_jobs: int
    :param target_name: name of the target column in 'df'
    :type target_name: str
    :param random_state: seed for the candidate sampling; the default
                         None keeps the previous unseeded behaviour
    :type random_state: int or None
    :return: the fitted search object (best model available
             through 'best_estimator_' / 'best_params_')
    :rtype: RandomizedSearchCV
    """

    # 'columns=' instead of a positional axis: the positional form was
    # deprecated in pandas 1.3 and raises a TypeError since pandas 2.0.
    X = df.drop(columns=target_name).values
    y = df[target_name].values

    time_split = TimeSeriesSplit(n_splits=n_splits)
    r2_scorer = make_scorer(new_r2)

    model_search = RandomizedSearchCV(estimator=wrapper.ModelClass,
                                      param_distributions=wrapper.param_grid,
                                      n_iter=n_iter,
                                      cv=time_split,
                                      verbose=1,
                                      n_jobs=n_jobs,
                                      scoring=r2_scorer,
                                      random_state=random_state)

    model_search = model_search.fit(X, y)

    return model_search



In [12]:
# The wrapper provides the estimator and parameter distributions that
# hyper_params_search reads as `ModelClass` and `param_grid`.
rf_wrapper = RandomForestWrapper()

# Small search budget: 3 sampled candidates, 2 time-series folds, 2 workers.
# NOTE(review): these literals differ from N_ITER_SEARCH / N_SPLITS_SEARCH /
# N_JOBS defined in the configuration cell — confirm which values are intended.
model_search = hyper_params_search(
    df=train,
    wrapper=rf_wrapper,
    n_iter=3,
    n_splits=2,
    n_jobs=2,
)

Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:    5.8s finished


In [13]:
# Hold-out evaluation of the best model found by the search.
# `columns=` replaces the positional axis argument, which DataFrame.drop
# stopped accepting in pandas 2.0 (deprecated since 1.3).
X_test = test.drop(columns="target_return").values
y_test = test["target_return"].values

# best_estimator_ is the model refitted by the search with the best
# parameter set.
test_pred = model_search.best_estimator_.predict(X_test)

# Out-of-sample score using the same metric the search optimised.
r2_test = new_r2(y_test, test_pred)
r2_test

0.004701568881441576

In [15]:
# Parameter set of the best-scoring candidate from the randomized search.
model_search.best_params_

{'max_depth': 2,
 'max_features': 'sqrt',
 'min_samples_split': 30,
 'n_estimators': 246}

In [16]:
# assert False

In [None]:
# model_search.best_params_

## Random Forest Hyperparameter search

In [None]:
# time_split = TimeSeriesSplit(n_splits=N_SPLITS)
# r2_scorer = make_scorer(new_r2)
# n_columns = X_train.shape[1]
# rf_params =  {"max_features":list(range(1, int(np.sqrt(n_columns)+1))),
#               "n_estimators": list(range(2, 60)),
#               "max_depth": list(range(2, 21))}

# rf_search = RandomizedSearchCV(estimator=RandomForestRegressor(),
#                                param_distributions=rf_params,
#                                n_iter=N_ITER,
#                                cv=time_split,
#                                verbose=1,
#                                n_jobs=N_JOBS,
#                                scoring=r2_scorer)

# rf_search = rf_search.fit(X_train,y_train)
# rf_train_pred  = rf_search.best_estimator_.predict(X_train)
# rf_test_pred  = rf_search.best_estimator_.predict(X_test)

# r2_train = new_r2(y_train, rf_train_pred)
# r2_test = new_r2(y_test, rf_test_pred)

# results["train"].append(r2_train)
# results["test"].append(r2_test)
# results["model"].append("Tunned Random Forest")

# result = pd.DataFrame(results).set_index("model").sort_values("test", ascending=False).to_html()
# display(HTML(result))
# print(rf_search.best_params_)

In [None]:
# best_model = RandomForestRegressor(**rf_search.best_params_)

In [None]:
# print(best_model)

In [None]:
# from sklearn.model_selection import TimeSeriesSplit
# complete = pd.concat([train,test])
# splits = int(complete.shape[0]/30)
# tscv = TimeSeriesSplit(n_splits=splits)

In [None]:
# r2_hist = []


# for train_index, test_index in tscv.split(complete):
#     df_train = complete.iloc[train_index]
#     df_test = complete.iloc[test_index]
    
#     model = RandomForestRegressor(**rf_search.best_params_)
# #     print(df_train.shape)
# #     print(df_test.shape)
    
    
# #     X_train = train.drop("target_return",1).values
# #     y_train = train.target_return.values

# #     X_test = test.drop("target_return",1).values
# #     y_test = test.target_return.values

In [None]:
# train.loc[:, "prediction"] = rf_train_pred
# test.loc[:, "prediction"] = rf_test_pred
# both = [train[["target_return", "prediction"]], test[["target_return", "prediction"]]]
# complete_forecast = pd.concat(both)
# out_path_l = ["results", "forecast",fs_method,model_name,out_folder,ticker_name ]
# out_path = os.path.join(*out_path_l)
# out_path
# complete_forecast.to_csv(out_path)

In [None]:
# tempo = (time() - init) / 60
# print(np.round(tempo,2), "min")

In [None]:
# train.loc[:, "type"] = "train"
# test.loc[:, "type"] = "test"

# both = [train[["target_return", "prediction", "type"]],
#         test[["target_return", "prediction", "type"]]]

# complete_forecast = pd.concat(both)

In [None]:
# t = complete_forecast[complete_forecast.type=="test"]
# tt = new_r2(t.target_return.values, t.prediction.values)

In [None]:
# tt, r2_test