In [7]:
import numpy as np
import pandas as pd

from data.dataset import StockDataset
from data.scaler import HybridScaler
from data.split import StratifiedTimeSeriesSplit
from data.utils import sliding_window
from model.arima import grid_search
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

In [8]:
dataset = StockDataset('^GSPC')
df = dataset.get_hist(start_date='1950-01-01', end_date='2021-10-23', time_interval='daily')

In [9]:
x = pd.concat(
    [df, 
     dataset.lookback_agg(lookback_len=30), 
     dataset.lookback_agg(lookback_len=60), 
     dataset.lookback_agg(lookback_len=120)],
    axis=1)

# x = df

y = dataset.get_change_forecast_label(forecast_len=30, is_up=False, method='past_all')

In [10]:
TRAIN_START = '1951-01-01'
TEST_START = '2018-01-01'

window_len = 120
scaler = HybridScaler()
scaler.fit(x[TRAIN_START:TEST_START])

indices, windows = sliding_window(scaler.transform(x), window_len=window_len, step_size=1)
train_start = (np.array(indices) <= TRAIN_START).sum()
test_start = (np.array(indices) <= TEST_START).sum()

train_x = windows[train_start:test_start]
train_y = y[indices][train_start:test_start]
test_x = windows[test_start:]
test_y = y[indices][test_start:]

split = TimeSeriesSplit(n_splits=10, test_size=120)
# split = StratifiedTimeSeriesSplit(n_splits=10, test_size=120, min_positive_ratio=0.25)

# Use `keras` to build up some basic seq models

In [11]:
from keras import Sequential
from keras.layers import LSTM, Dense, Conv1D, MaxPooling1D

In [12]:
# define LSTM model
def get_model():
    model = Sequential()
    model.add(LSTM(64, activation='relu', input_shape=windows.shape[1:], return_sequences=True))
    model.add(LSTM(16, activation='relu')) 
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [13]:
# # define CNN-LSTM model
# def get_model():
#     model = Sequential()
#     model.add(Conv1D(filters=64, kernel_size=5, activation='relu', input_shape=windows.shape[1:]))
#     model.add(LSTM(16, activation='relu')) 
#     # model.add(MaxPooling1D(pool_size=window_len))
#     model.add(Dense(8, activation='relu'))
#     model.add(Dense(1, activation='sigmoid'))
#     model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
#     return model

# Use split to cross-validation

In [14]:
scores = []

for i, (train_idx, valid_idx) in enumerate(split.split(train_y.index)):
    print(f"=== Start Fold {i + 1} ===")
    model = get_model()
    model.fit(train_x[train_idx], train_y[train_idx], epochs=10, verbose=0)
    
    pred_y = model.predict(train_x[valid_idx])
    pred_y_int = (pred_y >= 0.5).astype(int).reshape(-1)
    accuracy = (pred_y_int == train_y[valid_idx]).mean()
    print(F"=== Fold {i + 1} Validation Score is {accuracy}")
    scores.append(accuracy)

=== Start Fold 1 ===


2021-12-11 12:51:03.926944: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


=== Fold 1 Validation Score is 0.5
=== Start Fold 2 ===
=== Fold 2 Validation Score is 1.0
=== Start Fold 3 ===
=== Fold 3 Validation Score is 0.9666666666666667
=== Start Fold 4 ===
=== Fold 4 Validation Score is 0.7833333333333333
=== Start Fold 5 ===
=== Fold 5 Validation Score is 0.7583333333333333
=== Start Fold 6 ===
=== Fold 6 Validation Score is 0.36666666666666664
=== Start Fold 7 ===
=== Fold 7 Validation Score is 0.7416666666666667
=== Start Fold 8 ===
=== Fold 8 Validation Score is 1.0
=== Start Fold 9 ===
=== Fold 9 Validation Score is 0.9833333333333333
=== Start Fold 10 ===
=== Fold 10 Validation Score is 0.43333333333333335


In [15]:
np.mean(scores)

0.7533333333333334

# Train with full train set 

In [16]:
model = get_model()
model.fit(train_x, train_y, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x15885c3d0>

In [17]:
pred_y = model.predict(test_x)

In [18]:
pred_y_int = (pred_y >= 0.5).astype(int)

In [19]:
pred_y_int.sum(), test_y.sum()

(21, 323)

In [20]:
confusion_matrix(test_y, pred_y_int.reshape(-1))

array([[618,  19],
       [321,   2]])

In [21]:
(test_y == pred_y_int.reshape(-1)).mean()

0.6458333333333334