In [250]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import os
import re
import sys
sys.path.append("./utils")

from generate_features import *
from get_targets import *
from utils.utils import Model, cross_validate_time_series

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import mean_absolute_error

from catboost import CatBoostClassifier
from catboost import CatBoostRegressor

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [179]:
DATA_PATH = "./data/prices/tech_companies_15minute_train.csv"
MODEL_PATH = ""

TICKERS = ["AAPL", "MSFT", "GOOG", "TSLA", "NVDA", "BABA", "AMD", "ATVI", "ZG"]
TARGET_TICKER = "AAPL"
ADDITIONAL_TICKERS = ["MSFT", "GOOG", "TSLA", "NVDA", "BABA", "AMD", "ATVI", "ZG"]

In [180]:
data = pd.read_csv(DATA_PATH, index_col=0)
data.dropna(inplace=True)
data = data[[ticker in TICKERS for ticker in data["ticker"]]]
data

Unnamed: 0,volume,open,close,high,low,date,ticker
0,67612.0,62.9025,62.6025,63.0275,62.6025,2020-04-06 07:59:00,AAPL
1,36348.0,62.6100,62.7500,62.7750,62.5125,2020-04-06 08:14:00,AAPL
2,27440.0,62.8500,62.7875,62.8500,62.4875,2020-04-06 08:29:00,AAPL
3,46384.0,62.8000,62.8675,62.8875,62.6375,2020-04-06 08:44:00,AAPL
4,24700.0,62.8000,62.7175,62.8475,62.5525,2020-04-06 08:59:00,AAPL
...,...,...,...,...,...,...,...
427458,105229.0,66.1100,66.4000,66.4700,66.1010,2021-11-09 20:30:00,ZG
427459,175685.0,66.4400,66.6300,66.7300,66.3100,2021-11-09 20:45:00,ZG
427460,8144.0,66.6200,66.6200,66.6200,66.6200,2021-11-09 21:00:00,ZG
427461,100.0,66.1900,66.1900,66.1900,66.1900,2021-11-09 21:30:00,ZG


In [181]:
target_data = data[data["ticker"] == TARGET_TICKER]
additional_data = data[[ticker in ADDITIONAL_TICKERS for ticker in data["ticker"]]]
display(target_data)
display(additional_data)

Unnamed: 0,volume,open,close,high,low,date,ticker
0,67612.0,62.9025,62.6025,63.0275,62.6025,2020-04-06 07:59:00,AAPL
1,36348.0,62.6100,62.7500,62.7750,62.5125,2020-04-06 08:14:00,AAPL
2,27440.0,62.8500,62.7875,62.8500,62.4875,2020-04-06 08:29:00,AAPL
3,46384.0,62.8000,62.8675,62.8875,62.6375,2020-04-06 08:44:00,AAPL
4,24700.0,62.8000,62.7175,62.8475,62.5525,2020-04-06 08:59:00,AAPL
...,...,...,...,...,...,...,...
25569,3613.0,150.5200,150.5000,150.5500,150.5000,2021-11-09 23:45:00,AAPL
25570,8250.0,150.4900,150.6300,150.6300,150.4900,2021-11-10 00:00:00,AAPL
25571,10936.0,150.6000,150.5500,150.6000,150.4600,2021-11-10 00:15:00,AAPL
25572,9524.0,150.5500,150.6000,150.6700,150.5000,2021-11-10 00:30:00,AAPL


Unnamed: 0,volume,open,close,high,low,date,ticker
31863,1703.0,1138.36,1138.00,1138.36,1138.000,2020-04-06 11:00:00,GOOG
31864,397.0,1138.00,1138.10,1138.10,1138.000,2020-04-06 11:15:00,GOOG
31865,1671.0,1138.00,1138.50,1138.50,1135.880,2020-04-06 11:30:00,GOOG
31866,1591.0,1138.22,1138.22,1138.22,1138.220,2020-04-06 12:00:00,GOOG
31867,1267.0,1135.00,1136.00,1138.00,1135.000,2020-04-06 12:15:00,GOOG
...,...,...,...,...,...,...,...
427458,105229.0,66.11,66.40,66.47,66.101,2021-11-09 20:30:00,ZG
427459,175685.0,66.44,66.63,66.73,66.310,2021-11-09 20:45:00,ZG
427460,8144.0,66.62,66.62,66.62,66.620,2021-11-09 21:00:00,ZG
427461,100.0,66.19,66.19,66.19,66.190,2021-11-09 21:30:00,ZG


In [182]:
additional_data["ticker"].unique()

array(['GOOG', 'MSFT', 'TSLA', 'NVDA', 'BABA', 'AMD', 'ATVI', 'ZG'],
      dtype=object)

In [183]:
def get_sample_weight(y: np.ndarray):
    num_observations = len(y)
    sample_weight = np.zeros(num_observations)
    for class_idx in np.unique(y):
        idx = (y == class_idx)
        sample_weight[idx] = idx.sum() / num_observations

    return sample_weight

In [211]:
def compute_balanced_accuracy(y_true: np.ndarray, y_pred: np.ndarray):
    assert y_true.shape == y_pred.shape, f"{y_true.shape}, {y_pred.shape}"
    sample_weight = get_sample_weight(y_true)
    assert y_true.shape == sample_weight.shape, f"{y_true.shape}, {sample_weight.shape}"
    return balanced_accuracy_score(y_true, y_pred, sample_weight=sample_weight)

In [216]:
def compute_balanced_mae(y_true: np.ndarray, y_pred: np.ndarray):
    assert y_true.shape == y_pred.shape, f"{y_true.shape}, {y_pred.shape}"
    sample_weight = get_sample_weight(y_true)
    assert y_true.shape == sample_weight.shape, f"{y_true.shape}, {sample_weight.shape}"
    return mean_absolute_error(y_true, y_pred, sample_weight=sample_weight)

In [185]:
def cross_validate_catboost(X: np.ndarray, y: np.ndarray, metric):
    tscv = TimeSeriesSplit()
    
    metrics = []
    for fold_idx, (train_idx, test_idx) in enumerate(tscv.split(X)):
        X_train = X[train_idx]
        X_test = X[test_idx]
        
        y_train = y[train_idx]
        y_test = y[test_idx]
        
        assert X_train.shape[0] == y_train.shape[0]
        assert X_test.shape[0] == y_test.shape[0]
        
        class_weight = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
        model = CatBoostClassifier(class_weights=class_weight, verbose=False)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test).flatten()
        assert y_test.shape == predictions.shape, f"{y_test.shape}, {predictions.shape}"
        
        fold_metric = metric(y_test, predictions)
        metrics.append(fold_metric)
        print(f"Fold {fold_idx + 1}: {fold_metric}")
    
    return np.mean(metrics)

In [257]:
THRESHOLD = 1.0
MAX_TIME_LAG = 30

all_data = simple_indicators(target_data).drop(["ticker"], axis=1)
for ticker in additional_data["ticker"].unique():
    ticker_data = additional_data[additional_data["ticker"] == ticker]
    ticker_indicators = simple_indicators(ticker_data).drop(["ticker"], axis=1)
    all_data = all_data.join(ticker_indicators, rsuffix=f"_{ticker}", how="left")

dates = pd.Series(all_data.index).apply(lambda val: pd.Timestamp(val))
all_data.reset_index(drop=True, inplace=True)
all_data["weekday"] = dates.dt.weekday
all_data["day"] = dates.dt.day
all_data["hour"] = dates.dt.hour
all_data["target"] = get_first_threshold_bump(all_data, threshold=THRESHOLD, max_time_lag=MAX_TIME_LAG)
all_data.dropna(subset=["target"], inplace=True)

# all_data.ffill(inplace=True)
# all_data.bfill(inplace=True)

print(all_data.columns)
all_data

Index(['volume', 'open', 'close', 'high', 'low', 'macdh', 'boll', 'rsi_14',
       'chop', 'mfi',
       ...
       'open_-5~0_min_ZG', 'open_-5~0_max_ZG', 'high_-5~0_min_ZG',
       'high_-5~0_max_ZG', 'low_-5~0_min_ZG', 'low_-5~0_max_ZG', 'weekday',
       'day', 'hour', 'target'],
      dtype='object', length=211)


Unnamed: 0,volume,open,close,high,low,macdh,boll,rsi_14,chop,mfi,...,open_-5~0_min_ZG,open_-5~0_max_ZG,high_-5~0_min_ZG,high_-5~0_max_ZG,low_-5~0_min_ZG,low_-5~0_max_ZG,weekday,day,hour,target
0,67612.0,62.9025,62.6025,63.0275,62.6025,0.000000,62.602500,,0.000000,0.500000,...,,,,,,,0,6,7,0.0
1,36348.0,62.6100,62.7500,62.7750,62.5125,0.001471,62.676250,100.000000,10.946899,0.500000,...,,,,,,,0,6,8,0.0
2,27440.0,62.8500,62.7875,62.8500,62.4875,0.002090,62.713333,100.000000,25.197494,0.500000,...,,,,,,,0,6,8,0.0
3,46384.0,62.8000,62.8675,62.8875,62.6375,0.003827,62.751875,100.000000,33.290311,0.500000,...,,,,,,,0,6,8,0.0
4,24700.0,62.8000,62.7175,62.8475,62.5525,-0.000227,62.745000,59.969754,41.511834,0.500000,...,,,,,,,0,6,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25569,3613.0,150.5200,150.5000,150.5500,150.5000,-0.033072,150.681595,44.008130,44.661970,0.566020,...,,,,,,,1,9,23,0.0
25570,8250.0,150.4900,150.6300,150.6300,150.4900,-0.022363,150.677845,49.123774,40.992106,0.468112,...,,,,,,,2,10,0,0.0
25571,10936.0,150.6000,150.5500,150.6000,150.4600,-0.019414,150.670345,46.319195,49.051106,0.019333,...,,,,,,,2,10,0,0.0
25572,9524.0,150.5500,150.6000,150.6700,150.5000,-0.013142,150.656345,48.305672,49.593726,0.157142,...,66.11,66.62,66.19,66.73,65.9043,66.62,2,10,0,0.0


In [258]:
all_data["target"].value_counts()

 0.0    9827
 1.0    8583
-1.0    7164
Name: target, dtype: int64

In [259]:
TEST_RATIO = 0.2

X = all_data.drop(["target"], axis=1).to_numpy()
y = all_data["target"].to_numpy()

print(X.shape, y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_RATIO, shuffle=False)
X_train.shape, X_test.shape

(25574, 210) (25574,)


((20459, 210), (5115, 210))

In [260]:
class CatBoostStockClassifier(Model):
    def __init__(self, X_train, y_train):
        super(CatBoostStockClassifier, self).__init__(X_train, y_train)
        
        class_weight = compute_class_weight(class_weight="balanced", classes=np.unique(self.y_train), y=self.y_train)
        self.model = CatBoostClassifier(verbose=False, class_weights=class_weight)
        
    def fit(self):
        self.model.fit(self.X_train, self.y_train)
        
    def predict(self, X_test):
        return self.model.predict(X_test)

In [261]:
class CatBoostStockRegressor(Model):
    def __init__(self, X_train, y_train):
        super(CatBoostStockRegressor, self).__init__(X_train, y_train)
        
        class_weight = compute_class_weight(class_weight="balanced", classes=np.unique(self.y_train), y=self.y_train)
        self.model = CatBoostRegressor(verbose=False, loss_function="MAE")
        
    def fit(self):
        self.model.fit(self.X_train, self.y_train)
        
    def predict(self, X_test):
        return self.model.predict(X_test)

In [262]:
cross_validate_time_series(X, y, compute_balanced_accuracy, CatBoostStockClassifier)

Fold 1: 0.37927818186621964
Fold 2: 0.35932133899169266
Fold 3: 0.3403696304910091
Fold 4: 0.4522503853537369
Fold 5: 0.45415632106762915


0.3970751715540575

In [233]:
cross_validate_time_series(X, y, compute_balanced_mae, CatBoostStockRegressor)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Fold 1: 1.066471360436089
Fold 2: 0.7831211159647611
Fold 3: 0.7905622733289464
Fold 4: 0.5456308284234155
Fold 5: 0.5510458691877238


0.7473662894681871

In [254]:
model = CatBoostStockRegressor(X_train, y_train)
model.fit()
predictions = model.predict(X_test)

In [255]:
predictions[predictions < -1] = -1
predictions[predictions > 1] = 1
predictions = np.round(predictions)

print(np.unique(y_test, return_counts=True))
confusion_matrix(y_test, predictions)

(array([-1.,  0.,  1.], dtype=float32), array([1213, 2467, 1435], dtype=int64))


array([[ 169,  983,   61],
       [ 236, 2146,   85],
       [ 250, 1093,   92]], dtype=int64)

In [256]:
feat_importance = [(imp, feat) for imp, feat in zip(model.model.get_feature_importance(), all_data.columns)]
feat_importance.sort(reverse=True)
feat_importance

[(9.857096744330663, 'day'),
 (6.05413953733913, 'weekday'),
 (3.179127518104027, 'rsi_14'),
 (2.3811252393011686, 'close_-5~0_max'),
 (2.2559016856446177, 'boll'),
 (2.102784221503548, 'boll_BABA'),
 (2.0948612690293635, 'close'),
 (1.9430839781400284, 'boll_AMD'),
 (1.7718832419200063, 'hour'),
 (1.7345964774707172, 'close_-5~0_min'),
 (1.708913368732167, 'close_-5~0_max_BABA'),
 (1.684690702171891, 'close_-5~0_max_TSLA'),
 (1.6332521886192515, 'high_BABA'),
 (1.5979443803756948, 'low'),
 (1.546437614856204, 'high'),
 (1.5415682171695329, 'volume_10_ema_AMD'),
 (1.5166089991530955, 'open_BABA'),
 (1.5022057656168681, 'close_-5~0_min_TSLA'),
 (1.4586862543479486, 'low_BABA'),
 (1.4541746028614768, 'close_-5~0_max_AMD'),
 (1.3995640308171504, 'close_-5~0_min_BABA'),
 (1.3597747311576873, 'boll_MSFT'),
 (1.353840381712722, 'volume_10_ema'),
 (1.316592556046942, 'boll_TSLA'),
 (1.2978388158743248, 'macdh'),
 (1.249899306423733, 'close_BABA'),
 (1.2294340679417082, 'chop_BABA'),
 (1.22107

In [263]:
model = CatBoostStockClassifier(X_train, y_train)
model.fit()
predictions = model.predict(X_test)

confusion_matrix(y_test, predictions)

array([[ 533,  513,  167],
       [ 472, 1690,  305],
       [ 582,  553,  300]], dtype=int64)

In [264]:
feat_importance = [(imp, feat) for imp, feat in zip(model.model.get_feature_importance(), all_data.columns)]
feat_importance.sort(reverse=True)
feat_importance

[(9.956795117635108, 'day'),
 (6.712834134838032, 'weekday'),
 (4.030728195521534, 'hour'),
 (3.988249368650029, 'rsi_14'),
 (2.0755914630796903, 'volume_10_sma'),
 (1.919790685348889, 'macdh'),
 (1.4704797905226943, 'boll'),
 (1.4461223786976707, 'close'),
 (1.2661008576072414, 'boll_AMD'),
 (1.257618784385732, 'boll_BABA'),
 (1.2451932891469562, 'high_-5~0_max'),
 (1.20022100740502, 'rsi_14_TSLA'),
 (1.1225745532607951, 'volume_10_mstd'),
 (1.117260417130515, 'volume_-5~0_min'),
 (1.1140770752515734, 'rsi_14_BABA'),
 (1.0882443138036673, 'mfi'),
 (1.0826857122249072, 'chop'),
 (1.059347354985534, 'low'),
 (1.0175703100850548, 'volume_-5~0_max'),
 (0.9061285734444137, 'high_-5~0_max_BABA'),
 (0.9045739365756225, 'close_-5~0_max'),
 (0.8798097384913522, 'low_-5~0_min'),
 (0.8568744139079616, 'low_-5~0_min_BABA'),
 (0.8557384424423855, 'boll_MSFT'),
 (0.8333052161411083, 'boll_TSLA'),
 (0.8046834367945311, 'rsi_14_AMD'),
 (0.7880476141314654, 'volume_10_ema'),
 (0.769772446300661, 'macd