In [None]:
import pandas as pd
import requests
import csv
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn import model_selection, metrics
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
import csv
from scipy.stats import spearmanr

In [None]:
# Feature matrix: one row per (date, asset) observation.
df = pd.read_parquet("../data/f_matrix.parquet")

# sklearn scorers only receive (y_true, y_pred), so the only way to hand extra
# information to a custom scorer is to carry it in the index of a pandas Series:
# https://stackoverflow.com/questions/67227646/can-i-get-extra-information-to-a-custom-scorer-function-in-sklearn
# Hence the target is reduced to a Series of "target_r" values indexed by "date".
target = (
    pd.read_parquet("../data/target.parquet")[["date", "target_r"]]
    .set_index("date")
    .squeeze()
)

In [None]:
class TimeSeriesSplitGroups(_BaseKFold):
    """Time-series cross-validator that never splits inside a group.

    Makes sure that a split is not done arbitrarily in the middle of a
    cross-section: the sorted unique values of ``groups`` are partitioned and
    every sample follows its group. Each training set is an expanding window
    made of the first groups; the matching test set is the block of
    ``n_groups // (n_splits + 1)`` groups that comes right after it.

    Group order is the sort order returned by ``np.unique``, so group labels
    must sort chronologically (e.g. dates or increasing era numbers).

    Credit: https://forum.numer.ai/t/era-wise-time-series-cross-validation/791

    Parameters
    ----------
    n_splits : int, default=5
        Number of (train, test) pairs to generate.
    """

    def __init__(self, n_splits=5):
        super().__init__(n_splits, shuffle=False, random_state=None)

    def split(self, X, y=None, groups=None):
        """Generate positional indices for each (train, test) pair.

        Parameters
        ----------
        X : array-like of shape (n_samples, ...)
            Training data; only its length is used.
        y : array-like, optional
            Ignored, present for API compatibility.
        groups : array-like of shape (n_samples,)
            Group (era/date) label of every sample. May be a pandas Series,
            a NumPy array or a list.

        Yields
        ------
        train : ndarray
            Positions of the samples belonging to the groups before the test
            block.
        test : ndarray
            Positions of the samples belonging to the test block.

        Folds are yielded from the latest test block to the earliest one.

        Raises
        ------
        ValueError
            If ``groups`` is None or if there are fewer distinct groups than
            ``n_splits + 1``.
        """
        X, y, groups = indexable(X, y, groups)
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")
        n_samples = _num_samples(X)
        n_folds = self.n_splits + 1
        # `group_codes[i]` is the rank of sample i's group in the sorted list of
        # unique groups, so group membership tests become integer comparisons
        # and work for any array-like, not only for pandas Series.
        group_list, group_codes = np.unique(np.asarray(groups), return_inverse=True)
        group_codes = group_codes.ravel()
        n_groups = len(group_list)
        if n_folds > n_groups:
            raise ValueError(
                (
                    "Cannot have number of folds={0} greater"
                    " than the number of groups: {1}."
                ).format(n_folds, n_groups)
            )
        indices = np.arange(n_samples)
        test_size = n_groups // n_folds  # number of groups per test block
        # The first training window absorbs the remainder groups so that every
        # test block has exactly `test_size` groups.
        test_starts = range(test_size + n_groups % n_folds, n_groups, test_size)
        for test_start in reversed(test_starts):
            test_stop = test_start + test_size
            yield (
                indices[group_codes < test_start],
                indices[(group_codes >= test_start) & (group_codes < test_stop)],
            )


"""
Custom scorer function.
Cross-sectional Spearman's correlation Sharpe ratio.
"""


def spearman(y_true, y_pred):
    """Score predictions by the Sharpe ratio of per-date Spearman correlations.

    Parameters
    ----------
    y_true : pandas.Series
        True target values. Its index must be named "date": the correlation
        is computed separately within each date.
    y_pred : array-like
        Predictions, in the same row order as ``y_true``.

    Returns
    -------
    float
        Mean of the per-date Spearman correlations between target and
        predictions, divided by their standard deviation.
    """
    # Put target and predictions side by side and expose "date" as a column.
    frame = pd.DataFrame(y_true).assign(preds=y_pred).reset_index()
    # One 2x2 correlation matrix per date, stacked on a (date, column) index.
    per_date_corr = frame.groupby("date").corr(method="spearman")
    # Every other row is the target row; its last column is corr(target, preds).
    ics = per_date_corr.iloc[0::2, -1]
    return ics.mean() / ics.std()

In [None]:
# Names of the model input columns: every column whose name starts with "feature".
features = [column for column in df.columns if column.startswith("feature")]
# Date of each row, used as the group label when splitting for cross-validation.
eras = df["date"]

In [None]:
# Build one XGBRegressor per hyper-parameter combination
# (learning rate x column subsampling x tree depth).
# A grid search can be plugged in here instead.
cv_score = []
models = [
    XGBRegressor(
        colsample_bytree=cs,
        learning_rate=lr,
        n_estimators=2000,
        max_depth=md,
        nthread=8,
    )
    for lr in [0.006, 0.008, 0.01, 0.012, 0.014]
    for cs in [0.06, 0.08, 0.1, 0.12, 0.14]
    for md in [4, 5, 6]
]
# Remove the date column so it is not used as a model input. Guarded so that
# re-running this cell does not raise a KeyError once the column is gone.
if "date" in df.columns:
    del df["date"]

In [None]:
# Era-wise time-series k-fold: score every candidate model with the custom
# Spearman scorer and keep the mean score across folds.
# NOTE(review): `df` is passed whole; the `features` list is not applied here,
# so confirm that `df` only contains feature columns at this point.
cv_score = []  # reset so that re-running this cell stays aligned with `models`
splitter = TimeSeriesSplitGroups(5)  # increase granularity here
scorer = metrics.make_scorer(spearman, greater_is_better=True)
for model in models:
    fold_scores = model_selection.cross_val_score(
        model,
        df,
        target,
        cv=splitter,
        n_jobs=1,
        groups=eras,
        scoring=scorer,
    )
    score = np.mean(fold_scores)
    cv_score.append(score)
    # One progress line per model instead of re-printing the whole list.
    print(f"model {len(cv_score)}/{len(models)}: mean CV score = {score:.4f}")