In [14]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import os
import re
import sys
sys.path.append("./utils")

from generate_features import *
from get_targets import *
from dnn_utils import *

from functools import reduce

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from catboost import CatBoostClassifier

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [85]:
# CSV with the price bars used throughout the notebook.
DATA_PATH = "./data/prices/technology_prices_15minute_train.csv"

# The ticker used to build the target, and the tickers joined in as extra features.
TARGET_TICKER = "AAPL"
ADDITIONAL_TICKERS = ["MSFT", "GOOG", "TSLA", "NVDA"]
# Every ticker kept when the raw file is filtered: the target first, then the extras.
TICKERS = [TARGET_TICKER, *ADDITIONAL_TICKERS]

# Maps the single-letter raw column names to descriptive ones.
RENAME_COLUMNS = dict(c="close", h="high", l="low", o="open", v="volume")

In [8]:
# Read the raw bars (the first CSV column becomes the row index) and drop
# every row that has a missing value.
data = pd.read_csv(DATA_PATH, index_col=0).dropna()
# Keep only the tickers listed in TICKERS. `isin` builds the boolean mask
# in one vectorised call instead of a per-row Python membership test.
data = data[data["ticker"].isin(TICKERS)]
data

Unnamed: 0,v,vw,o,c,h,l,t,n,ticker
0,25452.0,61.7090,61.750,61.3800,61.750,61.3800,2020-03-30 07:50:00,36,AAPL
1,28416.0,61.5333,61.750,61.5875,61.750,61.4225,2020-03-30 08:05:00,61,AAPL
2,17296.0,61.6269,61.540,61.7500,61.750,61.5250,2020-03-30 08:20:00,43,AAPL
3,6776.0,61.8962,61.675,61.9100,61.930,61.6750,2020-03-30 08:35:00,23,AAPL
4,17188.0,61.9149,62.000,61.7875,62.025,61.7875,2020-03-30 08:50:00,63,AAPL
...,...,...,...,...,...,...,...,...,...
131912,1630.0,196.5607,196.500,196.6000,196.600,196.5000,2021-08-11 22:30:00,18,NVDA
131913,1609.0,196.4820,196.500,196.3200,196.500,196.2100,2021-08-11 22:45:00,93,NVDA
131914,200.0,196.3200,196.320,196.3200,196.320,196.3200,2021-08-11 23:00:00,1,NVDA
131915,4302.0,196.3767,196.540,196.6000,196.600,196.1200,2021-08-11 23:30:00,52,NVDA


In [9]:
# Rows belonging to the target ticker.
target_data = data[data["ticker"] == TARGET_TICKER]
# Rows belonging to the additional tickers; vectorised `isin` replaces the
# per-row Python membership test.
additional_data = data[data["ticker"].isin(ADDITIONAL_TICKERS)]
display(target_data)
display(additional_data)

Unnamed: 0,v,vw,o,c,h,l,t,n,ticker
0,25452.0,61.7090,61.750,61.3800,61.750,61.3800,2020-03-30 07:50:00,36,AAPL
1,28416.0,61.5333,61.750,61.5875,61.750,61.4225,2020-03-30 08:05:00,61,AAPL
2,17296.0,61.6269,61.540,61.7500,61.750,61.5250,2020-03-30 08:20:00,43,AAPL
3,6776.0,61.8962,61.675,61.9100,61.930,61.6750,2020-03-30 08:35:00,23,AAPL
4,17188.0,61.9149,62.000,61.7875,62.025,61.7875,2020-03-30 08:50:00,63,AAPL
...,...,...,...,...,...,...,...,...,...
21910,5261.0,145.6801,145.700,145.6800,145.700,145.6700,2021-08-11 22:45:00,164,AAPL
21911,1284.0,145.6924,145.700,145.6800,145.700,145.6800,2021-08-11 23:00:00,19,AAPL
21912,3649.0,145.6700,145.670,145.6700,145.710,145.6600,2021-08-11 23:15:00,28,AAPL
21913,9026.0,145.6643,145.670,145.6500,145.690,145.6500,2021-08-11 23:30:00,64,AAPL


Unnamed: 0,v,vw,o,c,h,l,t,n,ticker
31822,184.0,1119.9946,1120.0000,1120.00,1120.0000,1120.00,2020-03-30 11:21:00,4,GOOG
31823,997.0,1115.8603,1116.6600,1115.94,1116.6600,1115.94,2020-03-30 11:51:00,34,GOOG
31824,5765.0,1110.9206,1114.2000,1110.71,1114.2000,1110.71,2020-03-30 12:21:00,6,GOOG
31825,610.0,1124.6381,1122.3500,1127.00,1127.0000,1122.35,2020-03-30 12:36:00,49,GOOG
31826,673.0,1127.0979,1127.0002,1127.00,1127.0002,1127.00,2020-03-30 12:51:00,31,GOOG
...,...,...,...,...,...,...,...,...,...
131912,1630.0,196.5607,196.5000,196.60,196.6000,196.50,2021-08-11 22:30:00,18,NVDA
131913,1609.0,196.4820,196.5000,196.32,196.5000,196.21,2021-08-11 22:45:00,93,NVDA
131914,200.0,196.3200,196.3200,196.32,196.3200,196.32,2021-08-11 23:00:00,1,NVDA
131915,4302.0,196.3767,196.5400,196.60,196.6000,196.12,2021-08-11 23:30:00,52,NVDA


In [10]:
# Sanity check: list the distinct tickers present in the additional frame.
pd.unique(additional_data["ticker"])

array(['GOOG', 'MSFT', 'TSLA', 'NVDA'], dtype=object)

# Define helper functions

In [37]:
def get_sample_weight(y: np.ndarray):
    """Return one weight per observation equal to its class's relative frequency.

    Every element of ``y`` receives ``count(class) / len(y)``, so members of
    a more frequent class get a larger weight.

    Args:
        y: Array of class labels.

    Returns:
        Float array of length ``len(y)`` holding the per-observation weights.
    """
    total = len(y)
    weights = np.zeros(total)
    for label in np.unique(y):
        members = y == label
        # All members of a class share that class's share of the observations.
        weights[members] = np.count_nonzero(members) / total

    return weights

In [67]:
def compute_balanced_accuracy(y_true: np.ndarray, y_pred: np.ndarray):
    """Balanced accuracy of ``y_pred`` against ``y_true`` with per-sample weights.

    The weights come from ``get_sample_weight(y_true)`` and are forwarded to
    scikit-learn's ``balanced_accuracy_score``.

    NOTE(review): these weights are constant within each true class, so they
    appear to cancel out of the per-class recall that balanced accuracy
    averages -- confirm whether the weighting is actually needed.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels; must have the same shape as ``y_true``.

    Returns:
        The value returned by ``balanced_accuracy_score``.

    Raises:
        AssertionError: If ``y_pred`` or the computed weights do not match
            the shape of ``y_true``.
    """
    expected_shape = y_true.shape
    assert expected_shape == y_pred.shape, f"{expected_shape}, {y_pred.shape}"

    weights = get_sample_weight(y_true)
    assert expected_shape == weights.shape, f"{expected_shape}, {weights.shape}"

    score = balanced_accuracy_score(y_true, y_pred, sample_weight=weights)
    return score

In [73]:
def cross_validate_catboost(X: np.ndarray, y: np.ndarray, metric, n_splits: int = 5):
    """Cross-validate a class-weighted CatBoost classifier on time-ordered data.

    Rows are split with scikit-learn's ``TimeSeriesSplit``. For every fold a
    fresh ``CatBoostClassifier`` is trained with "balanced" class weights
    computed from that fold's training labels only, and scored on the fold's
    test rows. Each fold's score is printed.

    Args:
        X: Feature matrix, one row per observation.
        y: Labels aligned with the rows of ``X``.
        metric: Callable invoked as ``metric(y_test, predictions)``; its
            results are averaged.
        n_splits: Number of time-series folds. Defaults to 5, which is the
            fold count this function used before the parameter existed.

    Returns:
        Mean of the per-fold metric values.

    Raises:
        AssertionError: If features and labels disagree in length within a
            fold, or if predictions do not match the shape of the test labels.
    """
    splitter = TimeSeriesSplit(n_splits=n_splits)

    fold_metrics = []
    for fold_number, (train_idx, test_idx) in enumerate(splitter.split(X), start=1):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]

        assert X_train.shape[0] == y_train.shape[0]
        assert X_test.shape[0] == y_test.shape[0]

        # Weights are derived from the training labels only, so the test
        # fold's class distribution never influences the fit.
        class_weight = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
        model = CatBoostClassifier(class_weights=class_weight, verbose=False)
        model.fit(X_train, y_train)

        # Flatten the predictions so they can be compared against the 1-D labels.
        predictions = model.predict(X_test).flatten()
        assert y_test.shape == predictions.shape, f"{y_test.shape}, {predictions.shape}"

        fold_metric = metric(y_test, predictions)
        fold_metrics.append(fold_metric)
        print(f"Fold {fold_number}: {fold_metric}")

    return np.mean(fold_metrics)

In [83]:
# Indicator features for the target ticker, indexed by the bar timestamp `t`.
all_data = (
    simple_indicators(target_data)
    .drop(columns=["vw", "n", "ticker"])
    .set_index("t")
)

# Left-join each additional ticker's indicator features on the timestamp.
# `rsuffix` is applied only to clashing column names, so the target ticker
# keeps the unsuffixed names and every other ticker gets `_<TICKER>`.
for ticker in additional_data["ticker"].unique():
    ticker_rows = additional_data[additional_data["ticker"] == ticker]
    ticker_indicators = (
        simple_indicators(ticker_rows)
        .drop(columns=["vw", "ticker", "n"])
        .set_index("t")
    )
    all_data = all_data.join(ticker_indicators, rsuffix=f"_{ticker}", how="left")

# Replace the timestamp index with a plain positional index.
all_data = all_data.reset_index(drop=True)
# NOTE(review): the meaning of the (10, 5, 5) arguments is defined in get_targets -- not visible here.
all_data["target"] = get_direction_with_std_threshold_target(all_data, 10, 5, 5)
# Rows without a target cannot be used for training or evaluation.
all_data = all_data.dropna(subset=["target"])

print(all_data.columns)
all_data

Index(['volume', 'open', 'close', 'high', 'low', 'macd', 'macds', 'macdh',
       'macd_xu_macds', 'macd_xd_macds', 'boll', 'boll_ub', 'boll_lb',
       'high_x_boll_ub', 'low_x_boll_lb', 'volume_GOOG', 'open_GOOG',
       'close_GOOG', 'high_GOOG', 'low_GOOG', 'macd_GOOG', 'macds_GOOG',
       'macdh_GOOG', 'macd_xu_macds_GOOG', 'macd_xd_macds_GOOG', 'boll_GOOG',
       'boll_ub_GOOG', 'boll_lb_GOOG', 'high_x_boll_ub_GOOG',
       'low_x_boll_lb_GOOG', 'volume_MSFT', 'open_MSFT', 'close_MSFT',
       'high_MSFT', 'low_MSFT', 'macd_MSFT', 'macds_MSFT', 'macdh_MSFT',
       'macd_xu_macds_MSFT', 'macd_xd_macds_MSFT', 'boll_MSFT', 'boll_ub_MSFT',
       'boll_lb_MSFT', 'high_x_boll_ub_MSFT', 'low_x_boll_lb_MSFT',
       'volume_TSLA', 'open_TSLA', 'close_TSLA', 'high_TSLA', 'low_TSLA',
       'macd_TSLA', 'macds_TSLA', 'macdh_TSLA', 'macd_xu_macds_TSLA',
       'macd_xd_macds_TSLA', 'boll_TSLA', 'boll_ub_TSLA', 'boll_lb_TSLA',
       'high_x_boll_ub_TSLA', 'low_x_boll_lb_TSLA', 'volume_N

Unnamed: 0,volume,open,close,high,low,macd,macds,macdh,macd_xu_macds,macd_xd_macds,...,macds_NVDA,macdh_NVDA,macd_xu_macds_NVDA,macd_xd_macds_NVDA,boll_NVDA,boll_ub_NVDA,boll_lb_NVDA,high_x_boll_ub_NVDA,low_x_boll_lb_NVDA,target
0,25452.0,61.7500,61.3800,61.7500,61.3800,0.000000,0.000000,0.000000,False,False,...,,,,,,,,,,0.0
1,28416.0,61.7500,61.5875,61.7500,61.4225,0.004655,0.002586,0.002069,True,False,...,,,,,,,,,,0.0
2,17296.0,61.5400,61.7500,61.7500,61.5250,0.010971,0.006023,0.004948,False,False,...,,,,,,,,,,0.0
3,6776.0,61.6750,61.9100,61.9300,61.6750,0.019351,0.010538,0.008814,False,False,...,,,,,,,,,,0.0
4,17188.0,62.0000,61.7875,62.0250,61.7875,0.018623,0.012943,0.005680,False,False,...,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21900,17731.0,145.7100,145.6800,145.7500,145.6600,-0.008033,0.051588,-0.059621,False,False,...,-0.318244,0.168126,False,False,196.31558,197.631764,194.999396,False,False,0.0
21901,10326.0,145.6800,145.7800,145.8000,145.6800,-0.017429,0.037784,-0.055214,False,False,...,-0.278440,0.159218,False,False,196.42158,197.571797,195.271363,False,False,0.0
21902,5767.0,145.7301,145.7000,145.7301,145.7000,-0.030974,0.024033,-0.055007,False,False,...,-0.242440,0.143998,False,False,196.51025,197.506493,195.514007,False,False,0.0
21903,38578.0,145.7100,145.7200,145.8600,145.7000,-0.039638,0.011299,-0.050936,False,False,...,-0.210799,0.126567,False,False,196.57975,197.460632,195.698868,False,False,0.0


In [75]:
# Class distribution of the target column.
all_data.loc[:, "target"].value_counts()

0.0    15026
1.0     3846
2.0     3033
Name: target, dtype: int64

In [76]:
# Only referenced by the disabled hold-out split at the bottom of this cell.
TEST_RATIO = 0.2

# Labels are the `target` column; features are every other column.
y = all_data["target"].to_numpy()
X = all_data.drop(columns=["target"]).to_numpy()

print(X.shape, y.shape)

# Hold-out alternative to cross-validation (kept for reference, not executed):
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_RATIO, shuffle=False)
# X_train.shape, X_test.shape

(21905, 75) (21905,)


In [77]:
# Mean balanced accuracy over the time-series folds for the current X / y.
cross_validate_catboost(X, y, metric=compute_balanced_accuracy)

Fold 1: 0.3807375451513677
Fold 2: 0.44382856934933446
Fold 3: 0.4730009239589826
Fold 4: 0.47366480844924147
Fold 5: 0.47463909131295196


0.4491741876443756

# Сравним результаты без использования коррелированных акций

In [78]:
# Baseline dataset: indicator features of the target ticker only.
one_stock_data = simple_indicators(target_data.reset_index(drop=True)).drop(
    columns=["vw", "t", "ticker", "n"]
)
# NOTE(review): the meaning of the (10, 5, 5) arguments is defined in get_targets -- not visible here.
one_stock_data["target"] = get_direction_with_std_threshold_target(one_stock_data, 10, 5, 5)
# Rows without a target cannot be used for training or evaluation.
one_stock_data = one_stock_data.dropna(subset=["target"])

one_stock_data

Unnamed: 0,volume,open,close,high,low,macd,macds,macdh,macd_xu_macds,macd_xd_macds,boll,boll_ub,boll_lb,high_x_boll_ub,low_x_boll_lb,target
0,25452.0,61.7500,61.3800,61.7500,61.3800,0.000000,0.000000,0.000000,False,False,61.380000,,,False,False,0.0
1,28416.0,61.7500,61.5875,61.7500,61.4225,0.004655,0.002586,0.002069,True,False,61.483750,61.777199,61.190301,False,True,0.0
2,17296.0,61.5400,61.7500,61.7500,61.5250,0.010971,0.006023,0.004948,False,False,61.572500,61.943411,61.201589,False,False,0.0
3,6776.0,61.6750,61.9100,61.9300,61.6750,0.019351,0.010538,0.008814,False,False,61.656875,62.110332,61.203418,False,False,0.0
4,17188.0,62.0000,61.7875,62.0250,61.7875,0.018623,0.012943,0.005680,False,False,61.683000,62.092716,61.273284,False,False,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21900,17731.0,145.7100,145.6800,145.7500,145.6600,-0.008033,0.051588,-0.059621,False,False,146.030925,146.464263,145.597587,False,False,0.0
21901,10326.0,145.6800,145.7800,145.8000,145.6800,-0.017429,0.037784,-0.055214,False,False,146.028425,146.467187,145.589663,False,False,0.0
21902,5767.0,145.7301,145.7000,145.7301,145.7000,-0.030974,0.024033,-0.055007,False,False,146.018925,146.478061,145.559789,False,False,0.0
21903,38578.0,145.7100,145.7200,145.8600,145.7000,-0.039638,0.011299,-0.050936,False,False,145.993175,146.459004,145.527346,False,False,0.0


In [79]:
# Only referenced by the disabled hold-out split at the bottom of this cell.
TEST_RATIO = 0.2

# Labels are the `target` column; features are every other column.
y = one_stock_data["target"].to_numpy()
X = one_stock_data.drop(columns=["target"]).to_numpy()

# Hold-out alternative to cross-validation (kept for reference, not executed):
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_RATIO, shuffle=False)
# X_train.shape, X_test.shape

In [80]:
# Mean balanced accuracy over the time-series folds for the current X / y.
cross_validate_catboost(X, y, metric=compute_balanced_accuracy)

Fold 1: 0.38329649374812996
Fold 2: 0.43347486632309806
Fold 3: 0.4065327339270786
Fold 4: 0.46032616247843383
Fold 5: 0.40559769630566017


0.41784559055648013

# Сравним результаты с использованием коррелированных акций, но без индикаторов для этих акций (индикаторы считаются только для целевого тикера)

In [86]:
# Indicator features for the target ticker, indexed by the bar timestamp `t`.
all_data = (
    simple_indicators(target_data)
    .drop(columns=["vw", "n", "ticker"])
    .set_index("t")
)

# For the additional tickers only the raw price/volume columns are joined
# (no indicators). They are renamed to the descriptive names first so that
# clashing columns pick up the `_<TICKER>` suffix.
for ticker in additional_data["ticker"].unique():
    ticker_data = (
        additional_data[additional_data["ticker"] == ticker]
        .drop(columns=["vw", "ticker", "n"])
        .set_index("t")
        .rename(columns=RENAME_COLUMNS)
    )
    all_data = all_data.join(ticker_data, rsuffix=f"_{ticker}", how="left")

# Replace the timestamp index with a plain positional index.
all_data = all_data.reset_index(drop=True)
# NOTE(review): the meaning of the (10, 5, 5) arguments is defined in get_targets -- not visible here.
all_data["target"] = get_direction_with_std_threshold_target(all_data, 10, 5, 5)
# Rows without a target cannot be used for training or evaluation.
all_data = all_data.dropna(subset=["target"])

print(all_data.columns)
all_data

Index(['volume', 'open', 'close', 'high', 'low', 'macd', 'macds', 'macdh',
       'macd_xu_macds', 'macd_xd_macds', 'boll', 'boll_ub', 'boll_lb',
       'high_x_boll_ub', 'low_x_boll_lb', 'volume_GOOG', 'open_GOOG',
       'close_GOOG', 'high_GOOG', 'low_GOOG', 'volume_MSFT', 'open_MSFT',
       'close_MSFT', 'high_MSFT', 'low_MSFT', 'volume_TSLA', 'open_TSLA',
       'close_TSLA', 'high_TSLA', 'low_TSLA', 'volume_NVDA', 'open_NVDA',
       'close_NVDA', 'high_NVDA', 'low_NVDA', 'target'],
      dtype='object')


Unnamed: 0,volume,open,close,high,low,macd,macds,macdh,macd_xu_macds,macd_xd_macds,...,open_TSLA,close_TSLA,high_TSLA,low_TSLA,volume_NVDA,open_NVDA,close_NVDA,high_NVDA,low_NVDA,target
0,25452.0,61.7500,61.3800,61.7500,61.3800,0.000000,0.000000,0.000000,False,False,...,,,,,,,,,,0.0
1,28416.0,61.7500,61.5875,61.7500,61.4225,0.004655,0.002586,0.002069,True,False,...,,,,,,,,,,0.0
2,17296.0,61.5400,61.7500,61.7500,61.5250,0.010971,0.006023,0.004948,False,False,...,,,,,,,,,,0.0
3,6776.0,61.6750,61.9100,61.9300,61.6750,0.019351,0.010538,0.008814,False,False,...,,,,,,,,,,0.0
4,17188.0,62.0000,61.7875,62.0250,61.7875,0.018623,0.012943,0.005680,False,False,...,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21900,17731.0,145.7100,145.6800,145.7500,145.6600,-0.008033,0.051588,-0.059621,False,False,...,706.67,707.2000,707.45,706.67,1232.0,196.87,196.97,196.97,196.80,0.0
21901,10326.0,145.6800,145.7800,145.8000,145.6800,-0.017429,0.037784,-0.055214,False,False,...,707.00,706.7500,707.00,706.75,1477.0,196.85,196.97,196.99,196.84,0.0
21902,5767.0,145.7301,145.7000,145.7301,145.7000,-0.030974,0.024033,-0.055007,False,False,...,706.75,707.1000,707.10,706.50,2065.0,196.88,196.91,196.99,196.88,0.0
21903,38578.0,145.7100,145.7200,145.8600,145.7000,-0.039638,0.011299,-0.050936,False,False,...,707.00,706.6800,707.00,706.68,9506.0,196.95,196.87,196.95,196.65,0.0


In [87]:
# Only referenced by the disabled hold-out split at the bottom of this cell.
TEST_RATIO = 0.2

# Labels are the `target` column; features are every other column.
y = all_data["target"].to_numpy()
X = all_data.drop(columns=["target"]).to_numpy()

print(X.shape, y.shape)

# Hold-out alternative to cross-validation (kept for reference, not executed):
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_RATIO, shuffle=False)
# X_train.shape, X_test.shape

(21905, 35) (21905,)


In [88]:
# Mean balanced accuracy over the time-series folds for the current X / y.
cross_validate_catboost(X, y, metric=compute_balanced_accuracy)

Fold 1: 0.3883929409196029
Fold 2: 0.4531658230828682
Fold 3: 0.4563532921280271
Fold 4: 0.46251580530980285
Fold 5: 0.4218796144977876


0.43646149518761773