# Forecast Draft

In [2]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from glob import glob
from time import time

from data_mani.utils import merge_market_and_gtrends
from sklearn.ensemble import RandomForestRegressor
from prediction.util import new_r2, add_shift
from prediction.util import hyper_params_search
from prediction.util import get_selected_features
from prediction.util import annualy_fit_and_predict
from prediction.models import RandomForestWrapper

## Check Feature Selection

In [3]:
# Sanity check before forecasting: each feature-selection method's "spx"
# folder is expected to contain exactly 1024 CSV files.
# os.path.basename is used instead of splitting on "/" so that file names are
# extracted correctly whatever path separator glob returns on this platform.
p1 = [os.path.basename(n) for n in glob("results/feature_selection/sfi/spx/*.csv")]
p2 = [os.path.basename(n) for n in glob("results/feature_selection/mdi/spx/*.csv")]
p3 = [os.path.basename(n) for n in glob("results/feature_selection/mda/spx/*.csv")]
p4 = [os.path.basename(n) for n in glob("results/feature_selection/granger/spx/*.csv")]

# Report the per-method counts on failure so the incomplete method is obvious.
fs_counts = {"sfi": len(p1), "mdi": len(p2), "mda": len(p3), "granger": len(p4)}
assert all(count == 1024 for count in fs_counts.values()), (
    "expected 1024 feature-selection files per method, found {}".format(fs_counts)
)

## Variables

In [5]:
# --- Experiment configuration ------------------------------------------------
# Security to forecast and the market folder its inputs/outputs live under.
ticker_name = "ANDV US Equity"
market_folder = "spx"

# Feature-selection result to use; the value "all" bypasses the selection
# lookup further down and keeps every available feature.
fs_method = "sfi"

# Settings forwarded to the annual fit/predict routine
# (n_splits, n_iter and n_jobs respectively).
N_SPLITS = 4
N_ITER = 10
N_JOBS = -1

# When True, the combined dataset is truncated before fitting for a quick run.
DEBUG = False

# Wall-clock start; read again at the end of the notebook to report run time.
init = time()

# Get merged dataframe 

In [4]:
# Location of the ticker's CSV, built from the configured market folder and name.
path_template = "data/index/{}/{}.csv"
ticker_path = path_template.format(market_folder, ticker_name)

# Load the merged dataset and split it into train and test (test_size=0.5).
train, test = merge_market_and_gtrends(ticker_path, test_size=0.5)

# Show the dimensions of both splits.
for label, frame in (("train.shape = ", train), ("test.shape = ", test)):
    print(label, frame.shape)

train.shape =  (1856, 183)
test.shape =  (1856, 183)


## Add shift + Preprocessing + Selecting Features

In [5]:
# Base feature names: every column of `train` except the target.
# `drop(columns=...)` replaces the positional `axis` argument (`drop(label, 1)`),
# which pandas deprecated and later removed.
words = train.drop(columns="target_return").columns.to_list()

# The return value of add_shift is not used, so the call is relied on to add
# the lagged columns to `train` in place.
# NOTE(review): confirm the in-place behaviour against prediction.util.add_shift.
add_shift(merged_df=train, words=words, max_lag=20)

if fs_method == "all":
    # Keep every non-target column present in `train` after add_shift has run.
    select = train.drop(columns="target_return").columns.to_list()
else:
    # Otherwise use the stored feature-selection result for this ticker/method.
    select = get_selected_features(ticker_name=ticker_name,
                                   out_folder=market_folder,
                                   fs_method=fs_method)

# Same column set (target first) for both splits; missing values become 0.0.
model_columns = ["target_return"] + select
train = train[model_columns].fillna(0.0)

add_shift(merged_df=test, words=words, max_lag=20)
test = test[model_columns].fillna(0.0)

add shift: 100%|██████████| 182/182 [00:39<00:00,  4.62it/s]
add shift: 100%|██████████| 182/182 [00:31<00:00,  5.74it/s]


In [6]:
# Dimensions of both splits after lagging and feature selection.
shape_report = (("train.shape = ", train.shape), ("test.shape = ", test.shape))
for label, shape in shape_report:
    print(label, shape)

train.shape =  (1856, 2220)
test.shape =  (1856, 2220)


# Hyperparameter search + Prediction

In [7]:
# Stack the two splits back into a single frame (train rows first); the annual
# routine below receives the full sample.
complete = pd.concat([train, test])

# Debug runs keep only the rows up to the label "2006" to shorten the fit.
if DEBUG:
    complete = complete[:"2006"]

# Search / cross-validation settings come from the configuration cell.
# NOTE(review): no random seed is passed here -- confirm whether the wrapper
# fixes one, otherwise results will differ between runs.
fit_settings = dict(n_iter=N_ITER, n_jobs=N_JOBS, n_splits=N_SPLITS)
pred_results = annualy_fit_and_predict(df=complete,
                                       Wrapper=RandomForestWrapper,
                                       target_name="target_return",
                                       **fit_settings)

anual training and prediction:   0%|          | 0/14 [00:00<?, ?it/s]

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   14.5s finished
anual training and prediction:   7%|▋         | 1/14 [00:14<03:10, 14.69s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   37.5s finished
anual training and prediction:  14%|█▍        | 2/14 [00:52<04:18, 21.54s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  25 out of  40 | elapsed:   13.2s remaining:    7.9s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.3min finished
anual training and prediction:  21%|██▏       | 3/14 [02:12<07:11, 39.26s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  25 out of  40 | elapsed:    5.1s remaining:    3.1s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   19.1s finished
anual training and prediction:  29%|██▊       | 4/14 [02:32<05:35, 33.50s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.1min finished
anual training and prediction:  36%|███▌      | 5/14 [04:40<09:15, 61.77s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  25 out of  40 | elapsed:   25.7s remaining:   15.4s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.1min finished
anual training and prediction:  43%|████▎     | 6/14 [07:27<12:27, 93.44s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.0min finished
anual training and prediction:  50%|█████     | 7/14 [08:52<10:34, 90.69s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  25 out of  40 | elapsed:   39.1s remaining:   23.5s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.9min finished
anual training and prediction:  57%|█████▋    | 8/14 [11:47<11:37, 116.21s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.1min finished
anual training and prediction:  64%|██████▍   | 9/14 [13:56<09:59, 119.90s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  5.8min finished
anual training and prediction:  71%|███████▏  | 10/14 [19:42<12:30, 187.74s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.5min finished
anual training and prediction:  79%|███████▊  | 11/14 [21:12<07:55, 158.54s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.0min finished
anual training and prediction:  86%|████████▌ | 12/14 [23:14<04:55, 147.54s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.0min finished
anual training and prediction:  93%|█████████▎| 13/14 [25:15<02:19, 139.40s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  5.9min finished
anual training and prediction: 100%|██████████| 14/14 [31:09<00:00, 133.51s/it]


## Prediction DataFrame

In [8]:
# Preview the first three rows of the prediction frame.
pred_results.head(n=3)

Unnamed: 0,date,return,prediction
0,2005-01-03,-0.063089,0.003729
1,2005-01-04,-0.01139,0.003766
2,2005-01-05,-0.008811,0.00446


# Saving the predictions

In [9]:
# Destination: results/forecast/<fs_method>/<market_folder>/<ticker_name>.csv
out_path = "results/forecast/{}/{}/{}.csv".format(fs_method,market_folder,ticker_name)

# to_csv does not create missing directories, so make sure the destination
# folder exists; otherwise the first run for a new method/market pair would
# fail only after the long fitting step has already finished.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
pred_results.to_csv(out_path, index=False)

## Simple Analysis

In [10]:
# Calendar years present in the full sample, ascending, with the last one dropped.
# NOTE(review): the reason for dropping the last year (and skipping the first
# one below) is not stated in this notebook -- confirm it is intentional.
years = sorted(set(complete.index.map(lambda x: x.year)))
years = years[:-1]

# Overall score across every prediction row.
total_r2 = new_r2(pred_results["return"].values, pred_results["prediction"].values)

# Index the predictions by date so a whole year can be selected by label.
# The guard keeps the cell re-runnable: on a second execution "date" is already
# the index and no longer a column.
if "date" in pred_results.columns:
    pred_results = (
        pred_results
        .assign(date=pd.to_datetime(pred_results["date"]))
        .set_index("date")
    )

# Per-year scores; the first remaining year is skipped.
yearly_span = years[1:]
yearly_scores = []
for year in yearly_span:
    # Row selection by year string must go through .loc: the df["2005"] form
    # was deprecated and then removed from pandas.
    df_ = pred_results.loc[str(year)]
    yearly_scores.append(new_r2(df_["return"].values, df_["prediction"].values))
all_r2 = pd.Series(yearly_scores, index=yearly_span)

In [13]:
# Headline score (four decimals) followed by the per-year series.
# A single print with a newline separator emits exactly the same text as
# three consecutive prints would.
summary_line = "total r2 = {:.4f}".format(total_r2)
print(summary_line, "\nr2 by year:\n", all_r2, sep="\n")

total r2 = -0.0176

r2 by year:

2005   -0.007965
2006   -0.009746
2007    0.004973
2008   -0.005079
2009   -0.008586
2010    0.005260
2011   -0.014230
2012    0.004360
2013    0.004084
2014    0.003612
2015   -0.002404
2016   -0.268245
2017    0.010570
dtype: float64


In [12]:
# Minutes elapsed since `init` was recorded in the configuration cell.
elapsed_seconds = time() - init
tempo = elapsed_seconds / 60
print("total run time = ", np.round(tempo, 2), "min")

total run time =  32.35 min
