In [57]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import os
import re
import sys
sys.path.append("./utils")

from generate_features import *
from get_targets import *
from dnn_utils import *

from functools import reduce

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from catboost import CatBoostClassifier

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# CSV with 15-minute price bars for several tech companies.
DATA_PATH = "./data/prices/tech_companies_15minute.csv"

# Ticker the prediction target is built for.
TARGET_TICKER = "AAPL"
# Tickers whose data is joined in as extra features.
ADDITIONAL_TICKERS = ["MSFT", "GOOG", "TSLA", "NVDA", "BABA", "AMD", "ATVI", "ZG"]
# Every ticker kept when filtering the raw table: the target first, then the extras.
TICKERS = [TARGET_TICKER, *ADDITIONAL_TICKERS]

In [4]:
# Read the price table, discard rows with any missing value,
# then keep only the rows whose ticker is listed in TICKERS.
data = pd.read_csv(DATA_PATH, index_col=0).dropna()
data = data[data["ticker"].isin(TICKERS)]
data

Unnamed: 0,volume,open,close,high,low,date,ticker
0,67612.0,62.9025,62.6025,63.0275,62.6025,2020-04-06 07:59:00,AAPL
1,36348.0,62.6100,62.7500,62.7750,62.5125,2020-04-06 08:14:00,AAPL
2,27440.0,62.8500,62.7875,62.8500,62.4875,2020-04-06 08:29:00,AAPL
3,46384.0,62.8000,62.8675,62.8875,62.6375,2020-04-06 08:44:00,AAPL
4,24700.0,62.8000,62.7175,62.8475,62.5525,2020-04-06 08:59:00,AAPL
...,...,...,...,...,...,...,...
430677,35494.0,52.5900,52.0600,52.6000,52.0100,2022-04-04 19:30:00,ZG
430678,107073.0,51.9600,52.0700,52.1600,51.9101,2022-04-04 19:45:00,ZG
430679,12749.0,52.0300,52.1500,52.1500,52.0300,2022-04-04 20:00:00,ZG
430680,2000.0,51.9600,51.9600,51.9600,51.9600,2022-04-04 20:30:00,ZG


In [5]:
# Rows of the target ticker vs. rows of the additional tickers.
target_data = data.loc[data["ticker"] == TARGET_TICKER]
additional_data = data.loc[data["ticker"].isin(ADDITIONAL_TICKERS)]
display(target_data, additional_data)

Unnamed: 0,volume,open,close,high,low,date,ticker
0,67612.0,62.9025,62.6025,63.0275,62.6025,2020-04-06 07:59:00,AAPL
1,36348.0,62.6100,62.7500,62.7750,62.5125,2020-04-06 08:14:00,AAPL
2,27440.0,62.8500,62.7875,62.8500,62.4875,2020-04-06 08:29:00,AAPL
3,46384.0,62.8000,62.8675,62.8875,62.6375,2020-04-06 08:44:00,AAPL
4,24700.0,62.8000,62.7175,62.8475,62.5525,2020-04-06 08:59:00,AAPL
...,...,...,...,...,...,...,...
31858,6897.0,178.2400,178.3400,178.3800,178.2300,2022-04-04 22:45:00,AAPL
31859,13130.0,178.3300,178.2900,178.3300,178.2900,2022-04-04 23:00:00,AAPL
31860,12549.0,178.2800,178.2900,178.3000,178.2800,2022-04-04 23:15:00,AAPL
31861,7649.0,178.3000,178.2100,178.3700,178.2100,2022-04-04 23:30:00,AAPL


Unnamed: 0,volume,open,close,high,low,date,ticker
31863,1703.0,1138.36,1138.00,1138.36,1138.0000,2020-04-06 11:00:00,GOOG
31864,397.0,1138.00,1138.10,1138.10,1138.0000,2020-04-06 11:15:00,GOOG
31865,1671.0,1138.00,1138.50,1138.50,1135.8800,2020-04-06 11:30:00,GOOG
31866,1591.0,1138.22,1138.22,1138.22,1138.2200,2020-04-06 12:00:00,GOOG
31867,1267.0,1135.00,1136.00,1138.00,1135.0000,2020-04-06 12:15:00,GOOG
...,...,...,...,...,...,...,...
430677,35494.0,52.59,52.06,52.60,52.0100,2022-04-04 19:30:00,ZG
430678,107073.0,51.96,52.07,52.16,51.9101,2022-04-04 19:45:00,ZG
430679,12749.0,52.03,52.15,52.15,52.0300,2022-04-04 20:00:00,ZG
430680,2000.0,51.96,51.96,51.96,51.9600,2022-04-04 20:30:00,ZG


In [6]:
# Distinct tickers actually present in additional_data.
additional_data["ticker"].unique()

array(['GOOG', 'MSFT', 'TSLA', 'NVDA', 'BABA', 'AMD', 'ATVI', 'ZG'],
      dtype=object)

# Define helper functions

In [7]:
def get_sample_weight(y: np.ndarray):
    """Weight every observation by the relative frequency of its own class.

    Note that the weight is the class share itself (not its inverse), so
    observations of more frequent classes receive larger weights.

    Args:
        y: 1-D array of class labels.

    Returns:
        Float array of length ``len(y)`` where element ``i`` equals
        ``(number of entries equal to y[i]) / len(y)``.
    """
    total = len(y)
    weights = np.zeros(total)

    labels, counts = np.unique(y, return_counts=True)
    for label, count in zip(labels, counts):
        weights[y == label] = count / total

    return weights

In [8]:
def compute_balanced_accuracy(y_true: np.ndarray, y_pred: np.ndarray):
    """Balanced accuracy of ``y_pred`` against ``y_true`` with per-sample weights.

    The weights come from ``get_sample_weight(y_true)`` and are forwarded to
    ``balanced_accuracy_score`` through its ``sample_weight`` argument.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels; must have the same shape as ``y_true``.

    Returns:
        The value returned by ``balanced_accuracy_score``.

    Raises:
        AssertionError: if the shapes of the labels, predictions, or computed
            weights disagree.
    """
    true_shape, pred_shape = y_true.shape, y_pred.shape
    assert true_shape == pred_shape, f"{true_shape}, {pred_shape}"

    weights = get_sample_weight(y_true)
    assert true_shape == weights.shape, f"{true_shape}, {weights.shape}"

    return balanced_accuracy_score(y_true, y_pred, sample_weight=weights)

In [9]:
def cross_validate_catboost(X: np.ndarray, y: np.ndarray, metric, n_splits: int = 5):
    """Score a class-weighted CatBoost classifier with time-ordered cross-validation.

    For each fold produced by ``TimeSeriesSplit`` a fresh ``CatBoostClassifier``
    is fitted on the training rows, using "balanced" class weights computed from
    that fold's training labels only. ``metric`` is then evaluated on the fold's
    test rows and the per-fold value is printed.

    Args:
        X: feature matrix; rows are selected with the fold index arrays.
        y: label array aligned with the rows of ``X``.
        metric: callable invoked as ``metric(y_true, y_pred)``; its results are
            averaged with ``np.mean``.
        n_splits: number of folds passed to ``TimeSeriesSplit``. The default of 5
            reproduces the five folds obtained before this parameter existed.

    Returns:
        Mean of the per-fold metric values.

    Raises:
        AssertionError: if features, labels, or predictions of a fold have
            mismatched lengths/shapes.
    """
    splitter = TimeSeriesSplit(n_splits=n_splits)

    fold_scores = []
    for fold_number, (train_idx, test_idx) in enumerate(splitter.split(X), start=1):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        assert X_train.shape[0] == y_train.shape[0]
        assert X_test.shape[0] == y_test.shape[0]

        # Weights are returned in the order of `classes`, i.e. sorted unique labels
        # of this fold's training part.
        class_weight = compute_class_weight(
            class_weight="balanced", classes=np.unique(y_train), y=y_train
        )
        model = CatBoostClassifier(class_weights=class_weight, verbose=False)
        model.fit(X_train, y_train)

        # flatten() so the predictions can be compared shape-for-shape with y_test.
        predictions = model.predict(X_test).flatten()
        assert y_test.shape == predictions.shape, f"{y_test.shape}, {predictions.shape}"

        fold_metric = metric(y_test, predictions)
        fold_scores.append(fold_metric)
        print(f"Fold {fold_number}: {fold_metric}")

    return np.mean(fold_scores)

In [10]:
# Indicator features of the target ticker, indexed by the "date" column.
all_data = simple_indicators(target_data).drop(columns=["ticker"]).set_index("date")

# Left-join every additional ticker's indicator features on the same index;
# column names that already exist get a "_<TICKER>" suffix.
for ticker in additional_data["ticker"].unique():
    same_ticker = additional_data["ticker"] == ticker
    ticker_indicators = (
        simple_indicators(additional_data[same_ticker])
        .drop(columns=["ticker"])
        .set_index("date")
    )
    all_data = all_data.join(ticker_indicators, how="left", rsuffix=f"_{ticker}")

# Back to a plain positional index, then attach the target and drop rows without one.
all_data = all_data.reset_index(drop=True)
all_data["target"] = get_direction_with_std_threshold_target(all_data, 10, 5, 5)
all_data = all_data.dropna(subset=["target"])

print(all_data.columns)
all_data

Index(['volume', 'open', 'close', 'high', 'low', 'macd', 'macds', 'macdh',
       'macd_xu_macds', 'macd_xd_macds',
       ...
       'macds_ZG', 'macdh_ZG', 'macd_xu_macds_ZG', 'macd_xd_macds_ZG',
       'boll_ZG', 'boll_ub_ZG', 'boll_lb_ZG', 'high_x_boll_ub_ZG',
       'low_x_boll_lb_ZG', 'target'],
      dtype='object', length=136)


Unnamed: 0,volume,open,close,high,low,macd,macds,macdh,macd_xu_macds,macd_xd_macds,...,macds_ZG,macdh_ZG,macd_xu_macds_ZG,macd_xd_macds_ZG,boll_ZG,boll_ub_ZG,boll_lb_ZG,high_x_boll_ub_ZG,low_x_boll_lb_ZG,target
0,67612.0,62.9025,62.6025,63.0275,62.6025,0.000000,0.000000,0.000000,False,False,...,,,,,,,,,,0.0
1,36348.0,62.6100,62.7500,62.7750,62.5125,0.003309,0.001838,0.001471,True,False,...,,,,,,,,,,0.0
2,27440.0,62.8500,62.7875,62.8500,62.4875,0.005380,0.003290,0.002090,False,False,...,,,,,,,,,,0.0
3,46384.0,62.8000,62.8675,62.8875,62.6375,0.009078,0.005250,0.003827,False,False,...,,,,,,,,,,0.0
4,24700.0,62.8000,62.7175,62.8475,62.5525,0.004927,0.005154,-0.000227,False,True,...,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31848,39099.0,178.4500,178.4700,178.5400,178.4400,0.583186,0.566229,0.016957,False,False,...,,,,,,,,,,0.0
31849,72351.0,178.4500,178.4700,178.4800,178.4400,0.578655,0.568714,0.009940,False,False,...,0.735301,-0.205013,False,False,52.33126,52.967305,51.695215,False,False,0.0
31850,21614.0,178.4500,178.3900,178.4700,178.3900,0.562128,0.567397,-0.005269,False,True,...,,,,,,,,,,0.0
31851,99154.0,178.3900,178.3400,178.4400,178.3400,0.538786,0.561675,-0.022889,False,False,...,0.685214,-0.200348,False,False,52.35826,52.904960,51.811560,False,False,0.0


In [11]:
# Number of rows per target class.
all_data["target"].value_counts()

0.0    21920
1.0     5482
2.0     4451
Name: target, dtype: int64

In [12]:
TEST_RATIO = 0.2

# Feature matrix (every column except "target") and the label vector.
X = all_data.drop(columns=["target"]).to_numpy()
y = all_data["target"].to_numpy()
print(X.shape, y.shape)

# Unshuffled hold-out split: rows keep their original order.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_RATIO, shuffle=False
)
X_train.shape, X_test.shape

(31853, 135) (31853,)


((25482, 135), (6371, 135))

In [13]:
# Cross-validated balanced accuracy on the current X, y
# (target ticker indicators joined with the additional tickers' indicators).
cross_validate_catboost(X, y, compute_balanced_accuracy)

Fold 1: 0.38344628907599726
Fold 2: 0.47223439720170174
Fold 3: 0.4297997330348273
Fold 4: 0.4107350116566824
Fold 5: 0.40212228891102947


0.4196675439760476

# Compare the results without using the correlated stocks

In [78]:
# Single-stock baseline: indicator features of the target ticker only.
#
# The CSV loaded above exposes a "date" column and no "vw"/"t"/"n" columns, while this
# cell was written against a table that has them. Dropping with errors="ignore" removes
# whichever non-feature columns are present, so the cell runs on either layout instead
# of raising KeyError after a kernel restart.
one_stock_data = (
    simple_indicators(target_data.reset_index(drop=True))
    .drop(columns=["vw", "t", "n", "date", "ticker"], errors="ignore")
)
one_stock_data["target"] = get_direction_with_std_threshold_target(one_stock_data, 10, 5, 5)
one_stock_data = one_stock_data.dropna(subset=["target"])

one_stock_data

Unnamed: 0,volume,open,close,high,low,macd,macds,macdh,macd_xu_macds,macd_xd_macds,boll,boll_ub,boll_lb,high_x_boll_ub,low_x_boll_lb,target
0,25452.0,61.7500,61.3800,61.7500,61.3800,0.000000,0.000000,0.000000,False,False,61.380000,,,False,False,0.0
1,28416.0,61.7500,61.5875,61.7500,61.4225,0.004655,0.002586,0.002069,True,False,61.483750,61.777199,61.190301,False,True,0.0
2,17296.0,61.5400,61.7500,61.7500,61.5250,0.010971,0.006023,0.004948,False,False,61.572500,61.943411,61.201589,False,False,0.0
3,6776.0,61.6750,61.9100,61.9300,61.6750,0.019351,0.010538,0.008814,False,False,61.656875,62.110332,61.203418,False,False,0.0
4,17188.0,62.0000,61.7875,62.0250,61.7875,0.018623,0.012943,0.005680,False,False,61.683000,62.092716,61.273284,False,False,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21900,17731.0,145.7100,145.6800,145.7500,145.6600,-0.008033,0.051588,-0.059621,False,False,146.030925,146.464263,145.597587,False,False,0.0
21901,10326.0,145.6800,145.7800,145.8000,145.6800,-0.017429,0.037784,-0.055214,False,False,146.028425,146.467187,145.589663,False,False,0.0
21902,5767.0,145.7301,145.7000,145.7301,145.7000,-0.030974,0.024033,-0.055007,False,False,146.018925,146.478061,145.559789,False,False,0.0
21903,38578.0,145.7100,145.7200,145.8600,145.7000,-0.039638,0.011299,-0.050936,False,False,145.993175,146.459004,145.527346,False,False,0.0


In [79]:
TEST_RATIO = 0.2

# Feature matrix (every column except "target") and label vector of the
# single-stock frame; no hold-out split is made here.
X = one_stock_data.drop(columns=["target"]).to_numpy()
y = one_stock_data["target"].to_numpy()

In [80]:
# Cross-validated balanced accuracy on the current X, y
# (built from one_stock_data: target ticker features only).
cross_validate_catboost(X, y, compute_balanced_accuracy)

Fold 1: 0.38329649374812996
Fold 2: 0.43347486632309806
Fold 3: 0.4065327339270786
Fold 4: 0.46032616247843383
Fold 5: 0.40559769630566017


0.41784559055648013

# Compare the results using the correlated stocks, but without their indicators

In [86]:
# Target ticker indicators joined with the raw (indicator-free) columns of the
# additional tickers.
#
# The timestamp column is "date" in the CSV loaded above, whereas this cell was written
# for a table that uses "t" and also carries "vw"/"n". Detect the timestamp column and
# drop the unused columns tolerantly so the cell runs on either layout.
time_column = "date" if "date" in target_data.columns else "t"
unused_columns = ["vw", "n", "ticker"]

all_data = (
    simple_indicators(target_data)
    .drop(columns=unused_columns, errors="ignore")
    .set_index(time_column)
)
for ticker in additional_data["ticker"].unique():
    # NOTE(review): RENAME_COLUMNS is not defined in this notebook; presumably it comes
    # from one of the star-imported utils modules — confirm.
    ticker_data = (
        additional_data[additional_data["ticker"] == ticker]
        .drop(columns=unused_columns, errors="ignore")
        .set_index(time_column)
        .rename(columns=RENAME_COLUMNS)
    )
    # Column names that already exist in all_data get a "_<TICKER>" suffix.
    all_data = all_data.join(ticker_data, rsuffix=f"_{ticker}", how="left")

all_data = all_data.reset_index(drop=True)
all_data["target"] = get_direction_with_std_threshold_target(all_data, 10, 5, 5)
all_data = all_data.dropna(subset=["target"])

print(all_data.columns)
all_data

Index(['volume', 'open', 'close', 'high', 'low', 'macd', 'macds', 'macdh',
       'macd_xu_macds', 'macd_xd_macds', 'boll', 'boll_ub', 'boll_lb',
       'high_x_boll_ub', 'low_x_boll_lb', 'volume_GOOG', 'open_GOOG',
       'close_GOOG', 'high_GOOG', 'low_GOOG', 'volume_MSFT', 'open_MSFT',
       'close_MSFT', 'high_MSFT', 'low_MSFT', 'volume_TSLA', 'open_TSLA',
       'close_TSLA', 'high_TSLA', 'low_TSLA', 'volume_NVDA', 'open_NVDA',
       'close_NVDA', 'high_NVDA', 'low_NVDA', 'target'],
      dtype='object')


Unnamed: 0,volume,open,close,high,low,macd,macds,macdh,macd_xu_macds,macd_xd_macds,...,open_TSLA,close_TSLA,high_TSLA,low_TSLA,volume_NVDA,open_NVDA,close_NVDA,high_NVDA,low_NVDA,target
0,25452.0,61.7500,61.3800,61.7500,61.3800,0.000000,0.000000,0.000000,False,False,...,,,,,,,,,,0.0
1,28416.0,61.7500,61.5875,61.7500,61.4225,0.004655,0.002586,0.002069,True,False,...,,,,,,,,,,0.0
2,17296.0,61.5400,61.7500,61.7500,61.5250,0.010971,0.006023,0.004948,False,False,...,,,,,,,,,,0.0
3,6776.0,61.6750,61.9100,61.9300,61.6750,0.019351,0.010538,0.008814,False,False,...,,,,,,,,,,0.0
4,17188.0,62.0000,61.7875,62.0250,61.7875,0.018623,0.012943,0.005680,False,False,...,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21900,17731.0,145.7100,145.6800,145.7500,145.6600,-0.008033,0.051588,-0.059621,False,False,...,706.67,707.2000,707.45,706.67,1232.0,196.87,196.97,196.97,196.80,0.0
21901,10326.0,145.6800,145.7800,145.8000,145.6800,-0.017429,0.037784,-0.055214,False,False,...,707.00,706.7500,707.00,706.75,1477.0,196.85,196.97,196.99,196.84,0.0
21902,5767.0,145.7301,145.7000,145.7301,145.7000,-0.030974,0.024033,-0.055007,False,False,...,706.75,707.1000,707.10,706.50,2065.0,196.88,196.91,196.99,196.88,0.0
21903,38578.0,145.7100,145.7200,145.8600,145.7000,-0.039638,0.011299,-0.050936,False,False,...,707.00,706.6800,707.00,706.68,9506.0,196.95,196.87,196.95,196.65,0.0


In [87]:
TEST_RATIO = 0.2

# Feature matrix (every column except "target") and label vector of all_data;
# no hold-out split is made here.
X = all_data.drop(columns=["target"]).to_numpy()
y = all_data["target"].to_numpy()
print(X.shape, y.shape)

(21905, 35) (21905,)


In [88]:
# Cross-validated balanced accuracy on the current X, y
# (target ticker indicators joined with the additional tickers' raw columns).
cross_validate_catboost(X, y, compute_balanced_accuracy)

Fold 1: 0.3883929409196029
Fold 2: 0.4531658230828682
Fold 3: 0.4563532921280271
Fold 4: 0.46251580530980285
Fold 5: 0.4218796144977876


0.43646149518761773