Copy the files **spy.2008.2021.csv.gz** and **spy.csv.gz** from https://github.com/crapher/medium/tree/main/13.TrendIntraday/data to your Colab working folder.

In [1]:
import os
import math
import numpy as np
import pandas as pd
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Constants
# Number of one-minute bars, starting at 09:30, used as the features of each day
BARS=15
# Directory where save_result writes the prediction CSV files
RESULT_DIR='./result'
# Rows with date <= this value form the training set; later rows (up to the
# first date of the test file) form the validation set
DATE_SPLIT='2019-06-01'

In [3]:
# Get the data and generate the train, validation, and test datasets
def get_datasets():
    """Load both price files and split them into train / validation / test.

    Returns:
        Tuple ``(train, validation, test)`` of DataFrames holding the
        ``date``, ``open`` and ``close`` columns. ``train`` contains the rows
        of the historical file dated up to ``DATE_SPLIT``; ``validation``
        contains the later rows that precede the first date of the test file;
        ``test`` is the second file as-is.
    """

    def _load(path):
        # Keep only the columns used downstream and parse the timestamps
        frame = pd.read_csv(path, compression='gzip')
        frame = frame[['date','open','close']]
        frame['date'] = pd.to_datetime(frame['date'])
        return frame

    history = _load('./spy.2008.2021.csv.gz')
    test = _load('./spy.csv.gz')

    stamps = history['date']
    in_train = stamps <= DATE_SPLIT
    in_validation = (stamps > DATE_SPLIT) & (stamps < test['date'].min())

    return history[in_train], history[in_validation], test

In [4]:
# Process Dataframe and return features and targets
def get_features_targets(df, scale_obs=True, bars=None):
    """Build the per-day feature vectors and up/down targets.

    For every calendar day the feature vector is the ``close`` of the first
    ``bars`` one-minute bars starting at 09:30. Days with missing bars are
    completed by forward-filling ``close``; a day is skipped when it still
    does not have exactly ``bars`` rows afterwards (e.g. the 09:30 bar itself
    is missing). The target is 1 when the first ``open`` of the day is lower
    than the last ``close`` of the day, otherwise 0.

    Args:
        df: DataFrame with a datetime ``date`` column and numeric ``open`` and
            ``close`` columns. It is not modified.
        scale_obs: When True, each feature vector is shifted so its minimum is
            0 and divided by its maximum absolute value; a flat window becomes
            all zeros.
        bars: Number of one-minute bars to use. Defaults to the module-level
            ``BARS`` constant when None.

    Returns:
        Tuple ``(features, targets)`` of numpy arrays with one row / entry per
        retained day, in chronological order.

    Side effects:
        Prints the number of feature rows and the number of targets.
    """

    if bars is None:
        bars = BARS

    feature_result = []
    dates = []

    # Remove duplicated dates
    df = df.groupby(by='date').mean().reset_index()

    # Get Features based on the bars configuration.
    # Minute-of-day arithmetic keeps the window correct even when it crosses
    # 10:00 (bars > 30), which an "hour == 9" test cannot express.
    session_open = 9 * 60 + 30
    minute_of_day = df['date'].dt.hour * 60 + df['date'].dt.minute
    in_window = (minute_of_day >= session_open) & (minute_of_day < session_open + bars)
    features = df[in_window]
    features = features.groupby(features['date'].dt.date)

    for dt, feature in features:

        if len(feature) != bars:
            # Rebuild the full minute grid for the window and fill the gaps
            first_bar = pd.Timestamp(dt) + pd.Timedelta(minutes=session_open)
            expected = pd.date_range(first_bar, periods=bars, freq='min')
            feature = feature.set_index('date').reindex(expected)
            feature['close'] = feature['close'].ffill()
            feature['open'] = feature['open'].fillna(feature['close'])
            # Rows before the first available bar cannot be filled: drop them
            feature = feature.dropna()

        if len(feature) == bars:
            # Work on a private float copy so the scaling never touches df
            closes = feature['close'].to_numpy(dtype=float, copy=True)

            if scale_obs:
                closes = closes - np.min(closes)
                # A flat window yields 0/0; silence it and map the result to 0
                with np.errstate(divide='ignore', invalid='ignore'):
                    closes = closes / np.max(np.abs(closes))
                closes = np.nan_to_num(closes, nan=0.0, posinf=0.0, neginf=0.0)

            feature_result.append(closes)
            dates.append(dt)

    # Get Targets Trend based on first and last value / day (0: DOWN - 1: UP)
    targets = df.set_index('date')
    targets = targets.resample('1D').agg({'open':'first', 'close':'last'})
    # Index with Timestamps (not datetime.date objects) and pass a plain list
    # so the 'date' index name survives reset_index
    day_keys = [pd.Timestamp(day) for day in dates]
    targets = targets.loc[day_keys].reset_index().sort_values(by='date')
    targets['trend'] = np.where(targets['open'] < targets['close'], 1, 0)

    print(len(feature_result), len(targets))
    return np.array(feature_result), np.array(targets['trend'].values)

In [5]:
# Show the result of the operation
def show_result(target, pred, ds_type='TEST'):
    """Print a banner, the confusion matrix and the classification report.

    Args:
        target: Real labels.
        pred: Predicted labels.
        ds_type: Dataset label shown (upper-cased) in the banner.
    """

    y_true = np.array(target)
    y_pred = np.array(pred)

    banner = f' RESULT {ds_type.upper()} '.center(56, '*')
    print(banner)

    print('* Confusion Matrix (Top: Predicted - Left: Real)')
    matrix = confusion_matrix(y_true=y_true, y_pred=y_pred)
    print(matrix)

    print('* Classification Report')
    report = classification_report(y_true, y_pred)
    print(report)

In [6]:
# Save the results
def save_result(target, pred, name):
    """Write predictions and real labels to ``RESULT_DIR/name`` as CSV.

    Args:
        target: Real labels, stored in the ``target`` column.
        pred: Predicted labels, stored in the ``pred`` column.
        name: File name created inside ``RESULT_DIR``.
    """

    # Create the output directory on first use
    os.makedirs(RESULT_DIR, exist_ok=True)

    out_path = f'{RESULT_DIR}/{name}'
    table = pd.DataFrame({'pred': pred, 'target': target})
    table.to_csv(out_path, index=False)

In [7]:
# Return the predicted values
def get_predicted_values(model, features):
    """Return whatever ``model.predict`` yields for ``features``."""

    return model.predict(features)

In [8]:
# Get the datasets to be used in the tests
# (get_datasets reads both CSV files from the current folder)
train, validation, test = get_datasets()

In [9]:
# Get Features and Targets
# Each call prints "<number of feature rows> <number of targets>" for its dataset
train_x, train_y = get_features_targets(train)
val_x, val_y = get_features_targets(validation)
test_x, test_y = get_features_targets(test)

2858 2858
487 487
529 529


In [10]:
# Find the best parameters
# Base estimator shared by every sampled candidate; early stopping is
# configured here and evaluated on the eval_set passed to fit below
model = xgb.XGBClassifier(
    n_estimators=10000,
    seed=42,
    early_stopping_rounds=10)

# Candidate values sampled by the randomized search
params = {
        'min_child_weight': [50, 100, 300, 500],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [4, 5, 6, 7, 8],
        'eta': [0.01, 0.05, 0.1]}

# 30 random combinations scored by ROC AUC, run on 4 parallel jobs
grid = RandomizedSearchCV(
    model,
    param_distributions=params,
    n_iter=30,
    scoring='roc_auc',
    n_jobs=4,
    random_state=42)

# eval_set is forwarded to the estimator's fit for every candidate
# NOTE(review): the validation split is used for early stopping here and again
# when training the final model - confirm this is intended
grid.fit(
    train_x,
    train_y,
    eval_set=[(train_x, train_y), (val_x, val_y)],
    verbose=False)

print(grid.best_params_)

{'subsample': 0.8, 'min_child_weight': 50, 'max_depth': 5, 'eta': 0.01, 'colsample_bytree': 1.0}


In [11]:
# Train Model
# Rebuild the classifier with the tuned values found by the search; the fixed
# settings (tree budget, seed, early stopping) match the search estimator
model = xgb.XGBClassifier(
    n_estimators=10000,
    seed=42,
    early_stopping_rounds=10,
    **{key: grid.best_params_[key]
       for key in ('max_depth', 'min_child_weight', 'colsample_bytree', 'subsample', 'eta')})

# Log the metric on both the training and the validation split while fitting
model.fit(train_x, train_y, eval_set=[(train_x, train_y), (val_x, val_y)], verbose=True)

[0]	validation_0-logloss:0.69226	validation_1-logloss:0.69251
[1]	validation_0-logloss:0.69137	validation_1-logloss:0.69182
[2]	validation_0-logloss:0.69049	validation_1-logloss:0.69111
[3]	validation_0-logloss:0.68960	validation_1-logloss:0.69045
[4]	validation_0-logloss:0.68876	validation_1-logloss:0.68984
[5]	validation_0-logloss:0.68790	validation_1-logloss:0.68914
[6]	validation_0-logloss:0.68710	validation_1-logloss:0.68843
[7]	validation_0-logloss:0.68630	validation_1-logloss:0.68789
[8]	validation_0-logloss:0.68547	validation_1-logloss:0.68729
[9]	validation_0-logloss:0.68471	validation_1-logloss:0.68671
[10]	validation_0-logloss:0.68399	validation_1-logloss:0.68615
[11]	validation_0-logloss:0.68328	validation_1-logloss:0.68562
[12]	validation_0-logloss:0.68255	validation_1-logloss:0.68510
[13]	validation_0-logloss:0.68186	validation_1-logloss:0.68460
[14]	validation_0-logloss:0.68110	validation_1-logloss:0.68403
[15]	validation_0-logloss:0.68040	validation_1-logloss:0.68348
[1

In [12]:
# Predict and save train values
# Written to RESULT_DIR/xgb.train.csv.gz by save_result
pred_y = get_predicted_values(model, train_x)
save_result(train_y, pred_y, 'xgb.train.csv.gz')

In [13]:
# Predict and save validation values
# Written to RESULT_DIR/xgb.val.csv.gz by save_result
pred_y = get_predicted_values(model, val_x)
save_result(val_y, pred_y, 'xgb.val.csv.gz')

In [14]:
# Predict, show and save test values
# show_result prints the confusion matrix and classification report;
# save_result writes RESULT_DIR/xgb.test.csv.gz
pred_y = get_predicted_values(model, test_x)
show_result(test_y, pred_y)
save_result(test_y, pred_y, 'xgb.test.csv.gz')


********************* RESULT TEST **********************
* Confusion Matrix (Top: Predicted - Left: Real)
[[115 134]
 [ 64 216]]
* Classification Report
              precision    recall  f1-score   support

           0       0.64      0.46      0.54       249
           1       0.62      0.77      0.69       280

    accuracy                           0.63       529
   macro avg       0.63      0.62      0.61       529
weighted avg       0.63      0.63      0.62       529

