In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn

import pytorch_lightning as pl
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import torchmetrics as TM
# Seed the global random number generators so runs are repeatable.
# Uses the top-level `pl.seed_everything` entry point: the
# `pl.utilities.seed.seed_everything` path is deprecated (PyTorch Lightning 1.8)
# and no longer available in 2.x.
pl.seed_everything(seed=42)
import numpy as np
import pandas as pd

import sys, os


def add_source_dir(dir_name):
    """Make a project directory importable and return the path that was used.

    The path is built as ``<cwd>/<os.pardir>/<dir_name>`` and appended to
    ``sys.path`` only when it is not already listed, so re-running the cell
    does not accumulate duplicate entries.
    """
    path = os.path.join(os.getcwd(), os.pardir, dir_name)
    if path not in sys.path:
        sys.path.append(path)
    return path


# Directories holding the local modules imported below, resolved relative to
# the current working directory. `source_path` ends up pointing at the last
# directory added, exactly as it did before.
for _dir_name in ('src', 'preprocessing', 'metrics'):
    source_path = add_source_dir(_dir_name)

from dl import NeuralNetwork, Trainer
from preprocess import (
    show_df, 
    date_features, 
    preprocess, 
    ToTorch, 
    get_loader, 
    ts_split,
    cont_cat_split
)
from metrics import calc_spread_return_sharpe

Global seed set to 42
Global seed set to 42


##  Get Data and train a Neural Network

In [2]:
# Root folder that contains the `train_files/` directory. Defaults to the
# original local checkout; set the TOKYO_DATA_ROOT environment variable to point
# elsewhere (e.g. '/kaggle/input/jpx-tokyo-stock-exchange-prediction' on Kaggle)
# instead of editing the notebook.
ROOT_PATH = os.environ.get('TOKYO_DATA_ROOT', 'c:/Users/gilbe/Documents/TokyoData')
TRAIN_DIR = f'{ROOT_PATH}/train_files'

# Price table, indexed by the parsed `Date` column.
train_df = pd.read_csv(f'{TRAIN_DIR}/stock_prices.csv')
train_df['Date'] = pd.to_datetime(train_df['Date'])
train_df.set_index('Date', inplace=True)

# Auxiliary tables. low_memory=False makes pandas infer each column's dtype from
# the whole file rather than chunk by chunk, avoiding mixed-type columns.
train_options = pd.read_csv(f'{TRAIN_DIR}/options.csv', low_memory=False)
train_financials = pd.read_csv(f'{TRAIN_DIR}/financials.csv', low_memory=False)
train_trades = pd.read_csv(f'{TRAIN_DIR}/trades.csv', low_memory=False)

import matplotlib.pyplot as plt
# Quick look at the raw price table: its dimensions, then the number of
# distinct securities it contains.
raw_shape = train_df.shape
print('Raw Time Series data shape:', raw_shape)
n_securities = train_df['SecuritiesCode'].nunique()
print('No Unique Securities code:', n_securities)

# Keep only the rows of security 1301 and discard the identifier and volume columns.
is_1301 = train_df['SecuritiesCode'] == 1301
df_1301 = train_df.loc[is_1301].drop(columns=['SecuritiesCode', 'Volume'])

# Project helper; presumably adds calendar columns derived from the date index
# (the printed categorical frame shows day_of_year, month, day_of_week).
df_1301 = date_features(df_1301)

# Separate the columns into two frames (the 'int64' argument suggests the split
# is by dtype - TODO confirm), then split each into a training and a validation part.
cont, cat = cont_cat_split(df_1301, 'int64')
df_train_cat, df_val_cat = ts_split(cat)
df_train, df_val = ts_split(cont)

# Build model inputs and targets for the training part first, then the validation part.
# NOTE(review): the recorded output shows xtrain with 900 rows while ytrain and
# df_train_cat have 901 - confirm how preprocess/get_loader align them.
(xtrain, ytrain), (xval, yval) = (
    preprocess(frame, 'Target', 1, continous_cols=['Close'])
    for frame in (df_train, df_val)
)

# Inspect the prepared training inputs: shape plus the first few rows of each.
print('xtrain.shape:', xtrain.shape)
print(xtrain[:5])
print()
print('ytrain.shape:', ytrain.shape)
print(ytrain[:5])
print('df_train_cat.shape:', df_train_cat.shape)
print(df_train_cat.head())

# The three inputs are passed together to the data loader, so they are expected
# to have one row per sample. Check this against the actual data instead of
# recording it in a fixed string.
# NOTE(review): in the recorded run xtrain had 900 rows while ytrain and
# df_train_cat had 901, and ytrain[0] equalled xtrain[1]; this suggests xtrain
# lost its first row inside `preprocess` - TODO confirm and align.
_row_counts = {
    'xtrain': len(xtrain),
    'ytrain': len(ytrain),
    'df_train_cat': len(df_train_cat),
}
if len(set(_row_counts.values())) > 1:
    print(f'WARNING: training inputs have different numbers of rows: {_row_counts}')

Raw Time Series data shape: (2332531, 11)
No Unique Securities code: 2000
xtrain.shape: (900, 1)
[[-0.00145879]
 [ 0.00073046]
 [ 0.00291971]
 [-0.0010917 ]
 [-0.00510018]]

ytrain.shape: (901, 1)
[[ 0.00073046]
 [ 0.00291971]
 [-0.0010917 ]
 [-0.00510018]
 [-0.0032955 ]]
df_train_cat.shape: (901, 3)
            day_of_year  month  day_of_week
Date                                       
2017-01-04            4      1            4
2017-01-05            5      1            5
2017-01-06            6      1            6
2017-01-10           10      1           10
2017-01-11           11      1           11


' xtrain and df_train_cat have different shapes!!!!!'

### Train the model

In [3]:

%%time
import torch
from sklearn.impute import SimpleImputer


# NOTE(review): `imp` is not used in this cell; kept because later cells may rely on it.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# Wrap the continuous inputs, targets and categorical inputs into batched loaders.
batch_size = 64
train_cat_values = df_train_cat.to_numpy()
val_cat_values = df_val_cat.to_numpy()
train_dataloader = get_loader(x=xtrain, y=ytrain, batch_size=batch_size, x_cat=train_cat_values)
val_dataloader = get_loader(x=xval, y=yval, batch_size=batch_size, x_cat=val_cat_values)

# Number of categorical columns, taken from the data rather than hard-coded
# (3 for the frame printed earlier).
cat_features = df_train_cat.shape[1]
embedding_dim = 10

# Size of the embedding table. The printed model shows `no_embedding` becoming
# the first argument of nn.Embedding, i.e. the number of distinct indices it can
# look up. Presumably the categorical values themselves are used as indices
# (TODO confirm in NeuralNetwork), so the table must hold at least
# `largest value + 1` rows. The previous size (number of training rows) is kept
# as a lower bound so existing runs build the same model.
largest_cat_index = max(
    (int(values.max()) for values in (train_cat_values, val_cat_values) if values.size),
    default=-1,
)
no_embedding = max(len(df_train_cat), largest_cat_index + 1)

model = NeuralNetwork(
    in_features=xtrain.shape[1],
    units=1024,
    out_features=1,
    categorical_dim=cat_features,
    no_embedding=no_embedding,
    emb_dim=embedding_dim
)

print(model)

# Train for 60 epochs with the cyclic learning-rate option; x_cat=True tells the
# trainer that the loaders also carry categorical inputs (presumably - see Trainer).
trainer = Trainer(model, lr=3e-5)
trainer.fit_epochs(train_dataloader, val_dataloader, use_cyclic_lr=True, x_cat=True, epochs=60)


NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (embedding): Embedding(901, 10)
  (embedding_to_hidden): Linear(in_features=10, out_features=1024, bias=True)
  (embedding_output): Linear(in_features=1024, out_features=1, bias=True)
  (cont_input): Linear(in_features=1, out_features=1024, bias=True)
  (hidden_layer): Linear(in_features=1027, out_features=1027, bias=True)
  (output_layer): Linear(in_features=1027, out_features=1, bias=True)
  (layernom_embedding): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
  (layernorm_cont): LayerNorm((1,), eps=1e-05, elementwise_affine=True)
  (batch_norm_emb): BatchNorm2d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm_cont): BatchNorm1d(1027, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.1, inplace=False)
)
Using cpu-device
Epoch: <<< 0 >>>
Train-Loss: 0.23054751753807068 [0/15]
Train-Loss: 0.007708520628511906 [1/15]
Train-Loss: 1.2378548383712769 [

  Variable._execution_engine.run_backward(



Train-Loss: 1.66890549659729 [7/15]
Train-Loss: 2.2491190433502197 [8/15]
Train-Loss: 1.9662394523620605 [9/15]
Train-Loss: 1.0834407806396484 [10/15]
Train-Loss: 1.4593175649642944 [11/15]
Train-Loss: 1.3014317750930786 [12/15]
Train-Loss: 0.9487611651420593 [13/15]
Train-Loss: 2.2905216217041016 [14/15]
Val-Loss: 0.41815271973609924 [1/5]
Val-Loss: 0.5358166098594666 [2/5]
Val-Loss: 0.6240096092224121 [3/5]
Val-Loss: 0.5117567181587219 [4/5]
Val-Loss: 0.3937821388244629 [5/5]
Epoch: <<< 1 >>>
Train-Loss: 0.6435340642929077 [0/15]
Train-Loss: 0.6047403812408447 [1/15]
Train-Loss: 0.2005728930234909 [2/15]
Train-Loss: 0.10708507150411606 [3/15]
Train-Loss: 0.3578179180622101 [4/15]
Train-Loss: 0.7647048234939575 [5/15]
Train-Loss: 0.8530535697937012 [6/15]
Train-Loss: 0.0657423660159111 [7/15]
Train-Loss: 0.6083141565322876 [8/15]
Train-Loss: 0.7817118763923645 [9/15]
Train-Loss: 0.27181848883628845 [10/15]
Train-Loss: 0.03490080684423447 [11/15]
Train-Loss: 0.3917311728000641 [12/15]