In [3]:
# default_exp models

In [4]:
# export

import pandas as pd
import numpy as np
import logging
# import tpot

from mlbt.utils import PurgedKFold
from math import ceil

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


def undersample(events, X, y, random_state=None):
    """Randomly undersample and keep ``events``, ``X`` and ``y`` row-aligned.

    The sampler is fitted on ``(X, y)`` only to learn which rows it keeps; the
    resampled arrays it returns are discarded. The kept positional indices are
    then applied to all three inputs with ``.iloc``.

    Parameters
    ----------
    events, X, y
        Pandas objects that are sliced positionally; they are expected to have
        the same length and row order.
    random_state : optional
        Forwarded to ``RandomUnderSampler``. The default ``None`` keeps the
        previous behaviour (no fixed seed).

    Returns
    -------
    tuple
        ``(events_re, X_re, y_re)``: the selected rows of each input, each
        sorted by its index.
    """
    # Local import: imbalanced-learn is only required when this function runs.
    from imblearn.under_sampling import RandomUnderSampler

    sampler = RandomUnderSampler(random_state=random_state)

    # `fit_sample` is the old spelling and is gone from current imbalanced-learn
    # releases; `fit_resample` is the supported method. Only the side effect
    # (populating `sample_indices_`) is needed here.
    sampler.fit_resample(X, y)
    kept_rows = sampler.sample_indices_

    events_re = events.iloc[kept_rows].sort_index()
    X_re = X.iloc[kept_rows].sort_index()
    y_re = y.iloc[kept_rows].sort_index()
    return events_re, X_re, y_re
    

def clf_hyper_fit(
    feat,
    lbl,
    t1,
    pipe_clf,
    param_grid,
    cv=5,
    bagging=[0, None, 1.0],
    rnd_search_iter=0,
    n_jobs=-1,
    pct_embargo=0,
    **fit_params,
):
    """Run a grid or randomized hyper-parameter search with purged K-fold CV.

    Parameters
    ----------
    feat
        Feature matrix passed to ``fit``.
    lbl
        Labels passed to ``fit``; must expose ``.values``. If the set of label
        values is exactly ``{0, 1}`` the search is scored with ``"f1"``,
        otherwise with ``"neg_log_loss"``.
    t1
        Forwarded unchanged to ``PurgedKFold(t1=...)``.
    pipe_clf
        Estimator (or pipeline) to tune.
    param_grid
        Used as ``param_grid`` for ``GridSearchCV`` or as
        ``param_distributions`` for ``RandomizedSearchCV``.
    cv : int, default 5
        Number of splits for ``PurgedKFold``.
    bagging
        Currently unused; kept so existing callers keep working.
    rnd_search_iter : int, default 0
        ``0`` selects an exhaustive ``GridSearchCV``; any other value selects
        ``RandomizedSearchCV`` with that many iterations.
    n_jobs : int, default -1
        Parallelism of the search object.
    pct_embargo : default 0
        Forwarded to ``PurgedKFold``.
    **fit_params
        Extra keyword arguments forwarded to the search object's ``fit``.

    Returns
    -------
    The fitted search object.
    """
    import inspect

    if set(lbl.values) == {0, 1}:
        scoring = "f1"  # f1 for meta-labeling
    else:
        scoring = "neg_log_loss"  # symmetric towards all classes

    # 1) hyperparameter searching, on train data
    inner_cv = PurgedKFold(
        n_splits=cv, t1=t1, pct_embargo=pct_embargo, random_state=None
    )

    if rnd_search_iter == 0:
        search_cls = GridSearchCV
        search_kwargs = {"param_grid": param_grid}
    else:
        search_cls = RandomizedSearchCV
        search_kwargs = {
            "param_distributions": param_grid,
            "n_iter": rnd_search_iter,
        }
    search_kwargs.update(
        estimator=pipe_clf, scoring=scoring, cv=inner_cv, n_jobs=n_jobs
    )

    # `iid` was deprecated and later removed from scikit-learn's search classes;
    # passing it unconditionally raises TypeError on releases without it. Keep
    # requesting iid=False only where the installed class still accepts it.
    if "iid" in inspect.signature(search_cls.__init__).parameters:
        search_kwargs["iid"] = False

    gs = search_cls(**search_kwargs)
    gs = gs.fit(feat, lbl, **fit_params)
    return gs


# Hyper-parameter search spaces, one per supported classifier type.
# Float ranges are built with np.arange, so individual values carry the usual
# floating-point noise (e.g. 0.8999999999999999 rather than 0.9).

# Random forest: ensemble size and tree depth.
RF_PARAM_GRID = dict(
    n_estimators=np.arange(10, 200, 10),
    max_depth=np.arange(1, 11),
)

# XGBoost.
XGB_PARAM_GRID = dict(
    eta=np.arange(0.2, 0.41, 0.01),
    max_depth=np.arange(1, 8),
    colsample_bytree=np.arange(0.3, 1.1, 0.1),
    gamma=np.arange(0.0, 0.55, 0.05),
    n_estimators=np.arange(25, 275, 25),
    min_child_weight=np.arange(1, 10),
)

# LightGBM.
LGBM_PARAM_GRID = dict(
    max_depth=np.arange(1, 8),
    num_leaves=np.arange(8, 130, 2),
    colsample_bytree=np.arange(0.3, 1.05, 0.05),
    n_estimators=np.arange(25, 275, 25),
    learning_rate=np.arange(0.01, 0.2, 0.01),
)

# k-nearest neighbours: neighbourhood size and Minkowski power `p`.
KNN_PARAM_GRID = dict(
    n_neighbors=np.arange(1, 31),
    p=np.arange(1, 4),
)

# Support-vector classifier; `probability` and `max_iter` are single-valued,
# i.e. fixed rather than searched.
SVC_PARAM_GRID = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    probability=[True],
    max_iter=[100000],
)


def get_model(
    events,
    X_all,
    y_all,
    clf_type,
    optimize_hypers,
    hypers_n_iter,
    num_threads=32,
    n_jobs=4,
    hyper_params=None,
):
    """Build a classifier of the requested type, optionally tuning it first.

    Parameters
    ----------
    events
        Event table; its ``"t1"`` column is passed to the purged-CV search.
    X_all, y_all
        Features and labels used for the search. In the wider pipeline these
        are the training split.
    clf_type : str
        One of ``"random_forest"``, ``"xgboost"``, ``"lgbm"``, ``"svc"``,
        ``"knn"``, ``"dummy"``. Any other value raises ``KeyError``.
    optimize_hypers : bool
        Run a hyper-parameter search when no ``hyper_params`` are supplied.
    hypers_n_iter : int
        Forwarded to ``clf_hyper_fit`` as ``rnd_search_iter``.
    num_threads : int, default 32
        Parallelism of the hyper-parameter search.
    n_jobs : int, default 4
        ``n_jobs`` set on the returned classifier (capped at 8 for the random
        forest).
    hyper_params : dict, optional
        Pre-chosen constructor arguments; when non-empty the search is skipped.

    Returns
    -------
    tuple
        ``(clf, hyper_params)``. After a search these are the search's
        ``best_estimator_`` and ``best_params_``.
    """
    # X_all and y_all in this context are X_train and y_train in the grander scheme
    logging.info(f"Getting model {clf_type}")
    param_grids = {
        "random_forest": RF_PARAM_GRID,
        "xgboost": XGB_PARAM_GRID,
        "lgbm": LGBM_PARAM_GRID,
        "svc": SVC_PARAM_GRID,
        "knn": KNN_PARAM_GRID,
        "dummy": {},
    }
    clfs = {
        "random_forest": RandomForestClassifier,
        "xgboost": XGBClassifier,
        "lgbm": LGBMClassifier,
        "svc": SVC,
        "knn": KNeighborsClassifier,
        "dummy": DummyClassifier,
    }

    hyper_params = hyper_params or {}
    clf = clfs[clf_type](**hyper_params)

    param_grid = param_grids[clf_type]
    if not param_grid:  # nothing to tune (e.g. the dummy classifier)
        return clf, hyper_params

    if not hyper_params and optimize_hypers:
        # The search is parallelised over `num_threads`; the classifier's own
        # n_jobs is left at its constructor default here and only reported.
        # Not every estimator has an `n_jobs` attribute (SVC does not), so it
        # is read defensively instead of raising AttributeError.
        clf_n_jobs = getattr(clf, "n_jobs", None)
        logging.info(
            f"hyperparam search n_iter={hypers_n_iter} for {clf_type} on num_threads={num_threads} and n_jobs={clf_n_jobs}"
        )
        events_re, X_re, y_re = undersample(events, X_all, y_all)
        search = clf_hyper_fit(
            feat=X_re,
            lbl=y_re,
            t1=events_re["t1"],
            pipe_clf=clf,
            param_grid=param_grid,
            rnd_search_iter=hypers_n_iter,
            n_jobs=num_threads,
        )

        clf, hyper_params = search.best_estimator_, search.best_params_

    if clf_type == 'random_forest':
        clf.n_jobs = min(8, n_jobs) # Doesn't like high n_jobs
    else:
        clf.n_jobs = n_jobs

    return clf, hyper_params

In [7]:
from path import Path

In [8]:
# Load the training events, features and labels (in that order) from the
# 'table' key of their HDF5 files under the user's home directory.
events, X, y = (
    pd.read_hdf(Path(h5_file).expanduser(), 'table')
    for h5_file in (
        '~/Dropbox/algotrading/data_trash/events_train.h5',
        '~/Dropbox/algotrading/data_trash/X_train.h5',
        '~/Dropbox/algotrading/data_trash/y_train.h5',
    )
)


In [9]:
# Class balance of the training labels.
y.value_counts()

-1.0    290022
 1.0    263049
Name: bin, dtype: int64

In [33]:
# Look the counts up by label. value_counts() orders rows by frequency, so
# plain positional unpacking only yields (neg, pos) while -1.0 happens to be
# the majority class. A label that is absent counts as 0.
neg, pos = y.value_counts().reindex([-1.0, 1.0], fill_value=0)

290022

In [49]:
from sklearn.metrics import f1_score

In [None]:
from imblearn.under_sampling import RandomUnderSampler


# Fit the sampler only to learn which rows it keeps; the resampled arrays are
# discarded (assigning to `_` also keeps the notebook from echoing them).
under = RandomUnderSampler()

# `fit_sample` is the old spelling and is gone from current imbalanced-learn
# releases; `fit_resample` is the supported method.
_, _ = under.fit_resample(X, y)

In [42]:
# Positional indices of the rows the undersampler kept.
under.sample_indices_

array([417835, 485784,  63550, ..., 553068, 553069, 553070], dtype=int64)

In [43]:
# Apply the sampler's row selection to features, labels and events alike, then
# put each result back in index order.
X_re, y_re, events_re = (
    frame.iloc[under.sample_indices_].sort_index() for frame in (X, y, events)
)


In [45]:
# Randomized LightGBM hyper-parameter search on the undersampled data.
hypers_n_iter, num_threads = 5, 32
param_grid = LGBM_PARAM_GRID
clf = LGBMClassifier()

search = clf_hyper_fit(
    X_re,
    y_re,
    events_re["t1"],
    clf,
    param_grid,
    rnd_search_iter=hypers_n_iter,
    n_jobs=num_threads,
)




In [47]:
# Display whatever `search` is currently bound to.
# NOTE(review): the saved output is an LGBMClassifier repr rather than a search
# object, so it looks stale (out-of-order execution) -- re-run to confirm.
search

LGBMClassifier(boosting_type='gbdt', class_weight=None,
               colsample_bytree=0.8999999999999999, importance_type='split',
               learning_rate=0.1, max_depth=3, min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
               n_jobs=-1, num_leaves=34, objective=None, random_state=None,
               reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [48]:
# Refit the search on the undersampled data. clf_hyper_fit already returns a
# fitted search object, so this repeats the whole search and, unlike that
# call, passes no extra fit parameters.
search.fit(X_re, y_re)



RandomizedSearchCV(cv=PurgedKFold(n_splits=3, pct_embargo=0, random_state=None,
      t1=2007-04-27 13:20:00.000042   2007-04-30 08:50:00.000042
2007-04-27 13:34:00.000043   2007-04-30 09:35:00.000043
2007-04-27 13:39:00.000042   2007-04-30 09:11:00.000042
2007-04-27 13:41:00.000021   2007-05-03 10:46:00.000021
2007-04-27 13:42:00.000043   2007-04-30 09:53:00.000043
2007-04-27 13:44:...8...
                                        'n_estimators': array([ 25,  50,  75, 100, 125, 150, 175, 200, 225, 250]),
                                        'num_leaves': array([  8,  10,  12,  14,  16,  18,  20,  22,  24,  26,  28,  30,  32,
        34,  36,  38,  40,  42,  44,  46,  48,  50,  52,  54,  56,  58,
        60,  62,  64,  66,  68,  70,  72,  74,  76,  78,  80,  82,  84,
        86,  88,  90,  92,  94,  96,  98, 100, 102, 104, 106, 108, 110,
       112, 114, 116, 118, 120, 122, 124, 126, 128])},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   

In [51]:
# Macro-averaged F1 on the same undersampled rows the search was fitted on,
# i.e. an in-sample score, not an out-of-sample estimate.
f1_score(y_re, search.predict(X_re), average='macro')

0.5331152055886863

In [52]:
# Same estimator, grid and search settings, but fitted on the full
# (not undersampled) X / y / events.
search2 = clf_hyper_fit(
    X,
    y,
    events["t1"],
    clf,
    param_grid,
    rnd_search_iter=hypers_n_iter,
    n_jobs=num_threads,
)




In [53]:
# Macro-averaged F1 on the same full X / y that search2 was fitted on, i.e. an
# in-sample score, not an out-of-sample estimate.
f1_score(y, search2.predict(X), average='macro')

0.4303082696827688

In [None]:
def undersample(events, X, y, random_state=None):
    """Randomly undersample and keep ``events``, ``X`` and ``y`` row-aligned.

    The sampler is fitted on ``(X, y)`` only to learn which rows it keeps; the
    kept positional indices are then applied to all three inputs with
    ``.iloc``.

    Parameters
    ----------
    events, X, y
        Pandas objects that are sliced positionally; they are expected to have
        the same length and row order.
    random_state : optional
        Forwarded to ``RandomUnderSampler``. The default ``None`` keeps the
        previous behaviour (no fixed seed).

    Returns
    -------
    tuple
        ``(events_re, X_re, y_re)``: the selected rows of each input, each
        sorted by its index. The order matches how ``get_model`` unpacks the
        result, so re-running this cell does not swap the outputs under it.
    """
    # Local import: imbalanced-learn is only required when this function runs.
    from imblearn.under_sampling import RandomUnderSampler

    sampler = RandomUnderSampler(random_state=random_state)

    # `fit_sample` is the old spelling and is gone from current imbalanced-learn
    # releases; `fit_resample` is the supported method. Only the side effect
    # (populating `sample_indices_`) is needed here.
    sampler.fit_resample(X, y)
    kept_rows = sampler.sample_indices_

    events_re = events.iloc[kept_rows].sort_index()
    X_re = X.iloc[kept_rows].sort_index()
    y_re = y.iloc[kept_rows].sort_index()
    return events_re, X_re, y_re
    