In [1]:
import os
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import numpy as np
import pandas as pd
from data_mani.utils import merge_market_and_gtrends
from prediction.functions import add_shift, get_features_granger_huang, new_r2

In [2]:
def hyper_params_search(df,
                        wrapper,
                        n_iter,
                        n_splits,
                        n_jobs,
                        verbose,
                        target_name="target_return"):
    """
    Use the dataframe 'df' to search for the best
    params for the model 'wrapper'.

    The CV split is performed using the TimeSeriesSplit
    class.

    We can define the size of the test set using the formula

    ``n_samples//(n_splits + 1)``,


    where ``n_samples`` is the number of samples. Hence,
    we can define

    n_splits = (n - test_size) // test_size


    :param df: train data
    :type df: pd.DataFrame
    :param wrapper: predictive model
    :type wrapper: sklearn model wrapper
    :param n_iter: number of hyperparameter searchs
    :type n_iter: int
    :param n_splits: number of splits for the cross-validation
    :type n_splits: int
    :param n_jobs: number of concurrent workers
    :type n_jobs: int
    :param verbose: param to print iteration status
    :type verbose: bool, int
    :param target_name: name of the target column in 'df'
    :type target_name: str
    :return: R2 value
    :rtype: float
    """

    X = df.drop(target_name, 1).values
    y = df[target_name].values

    time_split = TimeSeriesSplit(n_splits=n_splits)
    r2_scorer = make_scorer(new_r2)

    if wrapper.search_type == 'random':
        model_search = RandomizedSearchCV(estimator=wrapper.ModelClass,
                                          param_distributions=wrapper.param_grid,
                                          n_iter=n_iter,
                                          cv=time_split,
                                          verbose=verbose,
                                          n_jobs=n_jobs,
                                          scoring=r2_scorer)
    else:
        model_search = RandomizedSearchCV(estimator=wrapper.ModelClass,
                                          param_distributions=wrapper.param_grid,
                                          n_iter=n_iter,
                                          cv=time_split,
                                          verbose=verbose,
                                          n_jobs=n_jobs,
                                          scoring=r2_scorer)

    model_search = model_search.fit(X, y)

    return model_search

In [4]:
class LassoWrapper():
    def __init__(self, model_params=None):
        self.model_name = "lasso"
        self.search_type = 'random'
        self.param_grid = {'alphas': np.linspace(0, 1, 100, endpoint=True)}
        if model_params is None:
            self.ModelClass = Lasso()
        else:
            self.ModelClass = Lasso(**model_params)

In [7]:
ticker_name = 'SPX Index'
target_name="target_return"
max_lag = 20
n_splits = 5
verbose = False
wrapper = LassoWrapper()
path_list = ["data", "gtrends.csv"]

ticker_path = "data/indices/{}.csv".format(ticker_name)

In [14]:
train, test = merge_market_and_gtrends(ticker_path, test_size=0.5, path_gt_list=path_list)
words = train.drop(target_name, 1).columns.to_list()
complete = pd.concat([train, test])

del train, test

add_shift(merged_df=complete,
            words=words,
            max_lag=max_lag,
            verbose=verbose)
complete = complete.fillna(0.0)
all_features = complete.drop(words + [target_name], 1).columns.to_list()


In [7]:
select = get_features_granger_huang(ticker_name=ticker_name,
                                    out_folder="indices",
                                    fs_method='granger',
                                    path_list=path_list)

complete_selected = complete[[target_name] + select]

In [8]:
df = complete_selected

all_preds = []

years = df.index.map(lambda x: x.year)
years = range(np.min(years), np.max(years))
y = years[0]

train_ys = df.loc[:str(y)]
test_ys = df.loc[str(y + 1)]

In [10]:
X = df.drop(target_name, 1).values
y = df[target_name].values

time_split = TimeSeriesSplit(n_splits=n_splits)
r2_scorer = make_scorer(new_r2)

model_search = GridSearchCV(estimator=wrapper.ModelClass,
                            param_grid=wrapper.param_grid,
                            cv=time_split,
                            verbose=verbose,
                            scoring=r2_scorer)

model_search = model_search.fit(X, y)


ValueError: Parameter grid for parameter (fit_intercept) needs to be a list or numpy array, but got (<class 'bool'>). Single values need to be wrapped in a list with one element.

In [None]:
df = complete_selected

all_preds = []

years = df.index.map(lambda x: x.year)
years = range(np.min(years), np.max(years))
for y in tqdm(years,
              disable=not verbose,
              desc="anual training and prediction"):
    train_ys = df.loc[:str(y)]
    test_ys = df.loc[str(y + 1)]

    # we have some roles in the time interval
    # for some tickers, for example,
    # "SBUX UA Equity"
    if test_ys.shape[0] > 0:
        model_wrapper = Wrapper()
        model_search = hyper_params_search(df=train_ys,
                                            wrapper=model_wrapper,
                                            n_jobs=n_jobs,
                                            n_splits=n_splits,
                                            n_iter=n_iter,
                                            verbose=verbose)
        X_test = test_ys.drop(target_name, 1).values
        y_test = test_ys[target_name].values
        test_pred = model_search.best_estimator_.predict(X_test)
        dict_ = {"date": test_ys.index,
                    "return": y_test,
                    "prediction": test_pred}
        result = pd.DataFrame(dict_)
        all_preds.append(result)
    else:
        pass