In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn

import pytorch_lightning as pl
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import torchmetrics as TM
# Seed the global RNGs through Lightning's public top-level helper. The
# `pl.utilities.seed` module path used before is an internal location that
# later Lightning releases deprecate, while `pl.seed_everything` stays available.
pl.seed_everything(seed=42)
import numpy as np
import pandas as pd

import sys, os

# Make the sibling project folders (one level above the working directory)
# importable. `source_path` ends up holding the last folder, 'metrics'.
for source_path in (
    os.path.join(os.getcwd(), os.pardir, folder)
    for folder in ('src', 'preprocessing', 'metrics')
):
    sys.path.append(source_path)

from dl import NeuralNetwork, Trainer
from preprocess import (
    show_df, 
    date_features, 
    preprocess, 
    ToTorch, 
    get_loader, 
    ts_split,
    cont_cat_split
)
from metrics import calc_spread_return_sharpe

Global seed set to 42
Global seed set to 42


## Get data and train a neural network

In [2]:
# Root folder of the dataset. Set the JPX_DATA_DIR environment variable to
# point somewhere else; the fallback is the original local path, so existing
# runs keep working unchanged.
ROOT_PATH = os.environ.get('JPX_DATA_DIR', 'c:/Users/gilbe/Documents/TokyoData')

# Path notes kept from the original cell (they were bare string statements
# with no effect):
#   '/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/financials.csv'
#   '/train_files/trades.csv'

# Stock prices with the 'Date' column parsed to datetimes and used as index.
# Built in one chain (no inplace mutation) so re-running the cell is idempotent.
train_df = (
    pd.read_csv(f'{ROOT_PATH}/train_files/stock_prices.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)
# train_df = date_features(train_df)
print(train_df.head())

# Additional tables shipped next to the price data; they are only loaded here.
train_options = pd.read_csv(f'{ROOT_PATH}/train_files/options.csv', low_memory=False)
train_financials = pd.read_csv(f'{ROOT_PATH}/train_files/financials.csv', low_memory=False)
train_trades = pd.read_csv(f'{ROOT_PATH}/train_files/trades.csv', low_memory=False)

import matplotlib.pyplot as plt
print('Raw Time Series data shape:', train_df.shape)
print('No Unique Securities code:', train_df['SecuritiesCode'].nunique())

# Keep only the rows of security 1301. 'SecuritiesCode' is constant after the
# filter and is dropped together with 'Volume'.
df_1301 = train_df[train_df['SecuritiesCode'] == 1301].drop(['SecuritiesCode', 'Volume'], axis=1)

print('df_1301.head()')
print(df_1301.head())
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df_1301.info()

# Project helper from preprocess.py; presumably adds the calendar columns
# (day_of_year, month, day_of_week) used further down - TODO confirm.
df_1301 = date_features(df_1301)


# Earlier variant, kept for reference: cont, cat = cont_cat_split(df_1301, 'int64')
# Columns routed to the categorical frame; presumably everything else ends up
# in `cont` - confirm against cont_cat_split in preprocess.py.
cat_cols = ['day_of_year', 'month', 'day_of_week', 'RowId']
cont, cat = cont_cat_split(df_1301, cat_cols=cat_cols)

# Train/validation split of both frames, categorical frame first.
(df_train_cat, df_val_cat), (df_train, df_val) = (
    ts_split(part) for part in (cat, cont)
)

# Same preprocessing arguments for the training and the validation part.
(xtrain, ytrain), (xval, yval) = (
    preprocess(part, 'Target', 1, continous_cols=['Close'])
    for part in (df_train, df_val)
)

# Quick look at the preprocessed training data and the categorical frame.
print('xtrain.shape:', xtrain.shape)
print(xtrain[:5])
print()
print('ytrain.shape:', ytrain.shape)
print(ytrain[:5])
print('df_train_cat.shape:', df_train_cat.shape)
print(df_train_cat.head())

# xtrain and df_train_cat are later handed to the same loader, so they
# presumably have to be row-aligned (confirm in get_loader). Compute the
# comparison instead of stating it as a fixed string, so the warning only
# shows up while the mismatch really exists.
if xtrain.shape[0] != df_train_cat.shape[0]:
    print(
        'WARNING: xtrain and df_train_cat have different shapes: '
        f'{xtrain.shape[0]} rows vs {df_train_cat.shape[0]} rows'
    )

                    RowId  SecuritiesCode    Open    High     Low   Close  \
Date                                                                        
2017-01-04  20170104_1301            1301  2734.0  2755.0  2730.0  2742.0   
2017-01-04  20170104_1332            1332   568.0   576.0   563.0   571.0   
2017-01-04  20170104_1333            1333  3150.0  3210.0  3140.0  3210.0   
2017-01-04  20170104_1376            1376  1510.0  1550.0  1510.0  1550.0   
2017-01-04  20170104_1377            1377  3270.0  3350.0  3270.0  3330.0   

             Volume  AdjustmentFactor  ExpectedDividend  SupervisionFlag  \
Date                                                                       
2017-01-04    31400               1.0               NaN            False   
2017-01-04  2798500               1.0               NaN            False   
2017-01-04   270800               1.0               NaN            False   
2017-01-04    11300               1.0               NaN            False   
2017

' xtrain and df_train_cat have different shapes!!!!!'

### Train the model

In [3]:

%%time
import torch
from sklearn.impute import SimpleImputer


imp = SimpleImputer(missing_values=np.nan, strategy='mean')
batch_size = 64
train_dataloader = get_loader(x=xtrain, y=ytrain, batch_size=batch_size, x_cat=df_train_cat.to_numpy())
val_dataloader = get_loader(x=xval, y=yval, batch_size=batch_size, x_cat=df_val_cat.to_numpy())


cat_features = 3 
embedding_dim = 10
# cat_features = cat_features * embedding_dim
# print('in_features:', xtrain.shape[1] + cat_features)

model = NeuralNetwork(
    in_features=xtrain.shape[1], 
    units=1024,
    out_features=1, 
    categorical_dim=cat_features,
    no_embedding=len(df_train_cat), 
    emb_dim=embedding_dim
)

print(model)

trainer = Trainer(model, lr=3.3e-6)
trainer.fit_epochs(train_dataloader, val_dataloader, use_cyclic_lr=True, x_cat=True, epochs=25)


NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (embedding): Embedding(901, 10)
  (embedding_to_hidden): Linear(in_features=10, out_features=1024, bias=True)
  (embedding_output): Linear(in_features=1024, out_features=1, bias=True)
  (cont_input): Linear(in_features=1, out_features=1024, bias=True)
  (hidden_layer): Linear(in_features=1027, out_features=1027, bias=True)
  (output_layer): Linear(in_features=1027, out_features=1, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (position_enc): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
)
Using cpu-device
Epoch: <<< 0 >>>
Train-Loss: 0.2804062068462372 [0/15]

Train-Loss: 0.09930188953876495 [1/15]
train metrics: <<< {'mse': 0.09930188953876495, 'mae': 0.3113971948623657} >>>

Train-Loss: 21.489505767822266 [2/15]

  Variable._execution_engine.run_backward(



train metrics: <<< {'mse': 21.489505767822266, 'mae': 4.63192892074585} >>>

Train-Loss: 15.379437446594238 [3/15]
train metrics: <<< {'mse': 15.379437446594238, 'mae': 3.917937755584717} >>>

Train-Loss: 18.010099411010742 [4/15]
train metrics: <<< {'mse': 18.010099411010742, 'mae': 4.236782073974609} >>>

Train-Loss: 11.682656288146973 [5/15]
train metrics: <<< {'mse': 11.682656288146973, 'mae': 3.408567190170288} >>>

Train-Loss: 72.7455062866211 [6/15]
train metrics: <<< {'mse': 72.7455062866211, 'mae': 8.507243156433105} >>>

Train-Loss: 37.55063247680664 [7/15]
train metrics: <<< {'mse': 37.55063247680664, 'mae': 6.101986885070801} >>>

Train-Loss: 20.961393356323242 [8/15]
train metrics: <<< {'mse': 20.961393356323242, 'mae': 4.547069072723389} >>>

Train-Loss: 25.526771545410156 [9/15]
train metrics: <<< {'mse': 25.526771545410156, 'mae': 5.022468090057373} >>>

Train-Loss: 0.21198077499866486 [10/15]

Train-Loss: 13.507098197937012 [11/15]
train metrics: <<< {'mse': 13.507098