In [8]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [9]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn


import pytorch_lightning as pl
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import torchmetrics as TM
# Seed the random number generators through Lightning's public top-level helper;
# the `pl.utilities.seed` path is deprecated and absent from newer releases.
pl.seed_everything(seed=42)

# Utility functions and classes

In [17]:


def show_df(
    df,
    show_info=True,
    show_head=True,
    show_tail=True,
    dataframe_name='financials'
):
    """Print a labelled overview of a DataFrame.

    Prints a banner with ``dataframe_name`` and the frame's shape, then
    optionally its ``info()`` summary, its first rows and its last rows,
    followed by a separator line.

    Args:
        df: DataFrame to summarise.
        show_info: When true, print the ``df.info()`` summary.
        show_head: When true, display ``df.head()``.
        show_tail: When true, display ``df.tail()``.
        dataframe_name: Label printed in the banner.

    Returns:
        None.
    """
    print(f'<<< {dataframe_name} >>>')
    print(df.shape)
    if show_info:
        # DataFrame.info() prints the summary itself and returns None, so
        # passing its result to display() only rendered a stray "None".
        df.info()
    if show_head:
        display(df.head())
    if show_tail:
        display(df.tail())
    print('-.' * 80)
    
    
def date_features(df, date_col=None):
    """Add calendar feature columns derived from the frame's datetime index.

    Args:
        df: DataFrame to annotate. It is modified in place and also returned.
        date_col: Optional name of a column to parse with ``pd.to_datetime``
            and install as the index before the features are derived. When
            omitted, ``df.index`` must already provide ``dayofyear``,
            ``month`` and ``dayofweek`` (e.g. a ``DatetimeIndex``).

    Returns:
        The same ``df`` with ``day_of_year``, ``month`` and ``day_of_week``
        columns added.
    """
    if date_col:
        df[date_col] = pd.to_datetime(df[date_col])
        df.set_index(date_col, inplace=True)

    df.loc[:, 'day_of_year'] = df.index.dayofyear
    df.loc[:, 'month'] = df.index.month
    # `.day` is the day of the month; the weekday is `.dayofweek` (Monday=0).
    df.loc[:, 'day_of_week'] = df.index.dayofweek
    return df


def preprocess(df, diff_cols=['Open', 'Close', 'High', 'Low', 'Volume']):
    """Convert selected columns to percentage changes and keep int/float columns.

    Args:
        df: Input DataFrame. It is not modified; the transformation is applied
            to a copy.
        diff_cols: Column label(s) replaced by their row-over-row
            ``pct_change()``. Pass a falsy value (``None`` or ``[]``) to skip
            this step. The default list is only read, never mutated.

    Returns:
        A new DataFrame restricted to the ``int`` and ``float`` columns. The
        first row of every column in ``diff_cols`` is NaN because it has no
        preceding row to compare against.
    """
    if diff_cols:
        # Copy first: assigning into `df` directly would silently overwrite the
        # caller's columns, so re-running a cell would diff already-diffed data.
        df = df.copy()
        df[diff_cols] = df[diff_cols].pct_change()
    return df.select_dtypes(include=[int, float])



class ToTorch(Dataset):
    """Dataset pairing indexable features with indexable targets.

    Every item is a dict with the keys ``'features'`` and ``'target'``, each
    holding the indexed value converted to a float tensor.
    """

    def __init__(self, features, target):
        self.features, self.target = features, target

    def __len__(self):
        # The number of items is taken from the features container.
        return len(self.features)

    @staticmethod
    def _to_float_tensor(value):
        """Convert ``value`` to a float tensor via a NumPy array."""
        return torch.from_numpy(np.array(value)).float()

    def __getitem__(self, idx):
        sample = (self.features[idx], self.target[idx])
        keys = ('features', 'target')
        return {
            key: self._to_float_tensor(value)
            for key, value in zip(keys, sample)
        }
    

def get_loader(x, y, batch_size):
    """Build a ``DataLoader`` over a ``ToTorch`` dataset made from ``x`` and ``y``.

    ``ToTorch`` items are dicts keyed ``'features'`` and ``'target'``.
    """
    dataset = ToTorch(x, y)
    loader = DataLoader(dataset, batch_size=batch_size)
    return loader

# Models

In [25]:
class NeuralNetwork(nn.Module):
    """Fully connected network with two ReLU hidden layers and a linear head.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        hidden_features: Width of both hidden layers (512 by default).
    """

    def __init__(self, in_features, out_features, hidden_features=512):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.hidden_features = hidden_features
        # Layer sizes follow the constructor arguments; they were previously
        # hard-coded to 28*28 inputs and 10 outputs, ignoring both arguments.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, out_features),
        )

    def forward(self, x):
        """Return the output of the linear/ReLU stack applied to ``x``."""
        return self.linear_relu_stack(x)
    




class Trainer:
    """Minimal training loop around a ``torch.nn.Module``.

    Args:
        model: Module to train.
        optimizer_name: Case-insensitive key of ``Trainer.OPTIMIZERS``.
        lr: Learning rate handed to the optimizer.
        loss_fn_name: Case-insensitive key of ``Trainer.LOSS_FUNCTIONS``.

    Raises:
        ValueError: If the optimizer or loss name is not supported.

    Attributes set by the constructor include ``optimizer``, ``loss_fn``,
    ``device`` and the history lists ``train_loss`` (one entry per training
    batch), ``valid_loss`` (one entry per validated epoch) and ``test_loss``
    (one entry per ``evaluate`` call).

    Batches may be dicts with the keys ``'features'`` and ``'target'`` or
    ``(x, y)`` pairs.
    """

    # Supported names mapped to the classes that build them.
    OPTIMIZERS = {
        'rmsprop': torch.optim.RMSprop,
        'adam': torch.optim.Adam,
        'sgd': torch.optim.SGD,
    }
    LOSS_FUNCTIONS = {
        'mse': nn.MSELoss,
        'mae': nn.L1Loss,
    }

    def __init__(
        self,
        model,
        optimizer_name='rmsprop',
        lr=0.003,
        loss_fn_name='mse'
        ):

        self.model = model
        self.lr = lr
        self.optimizer_name = optimizer_name
        self.loss_fn_name = loss_fn_name
        self.train_loss = []
        self.valid_loss = []
        self.test_loss = []
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using {self.device}-device")

        self._set_loss()
        self._set_optimizer()

    def _set_optimizer(self):
        """Create ``self.optimizer`` from ``self.optimizer_name`` and ``self.lr``."""
        key = str(self.optimizer_name).lower()
        if key not in self.OPTIMIZERS:
            raise ValueError(
                f'Unknown optimizer {self.optimizer_name!r}; '
                f'allowed names are: {sorted(self.OPTIMIZERS)}'
            )
        self.optimizer = self.OPTIMIZERS[key](self.model.parameters(), lr=self.lr)

    def _set_loss(self):
        """Create ``self.loss_fn`` from ``self.loss_fn_name``."""
        key = str(self.loss_fn_name).lower()
        if key not in self.LOSS_FUNCTIONS:
            raise ValueError(
                f'Unknown loss function {self.loss_fn_name!r}; '
                f'allowed names are: {sorted(self.LOSS_FUNCTIONS)}'
            )
        self.loss_fn = self.LOSS_FUNCTIONS[key]()

    def check_optimizer_loss_args(self):
        """Print the optimizer and loss-function names the trainer accepts."""
        print(f'Allowed optimizer names are: {sorted(self.OPTIMIZERS)}')
        print(f'Allowed loss function names are: {sorted(self.LOSS_FUNCTIONS)}')

    @staticmethod
    def _unpack(data):
        """Return ``(x, y)`` from a dict batch or an ``(x, y)`` pair."""
        if isinstance(data, dict):
            return data['features'], data['target']
        x, y = data
        return x, y

    @staticmethod
    def _match_shape(y, pred):
        """Reshape ``y`` to ``pred``'s shape when both hold the same number of elements.

        Without this, a ``(N,)`` target against ``(N, 1)`` predictions is
        broadcast to ``(N, N)`` inside the loss and yields a wrong value.
        """
        if y.shape != pred.shape and y.numel() == pred.numel():
            return y.reshape(pred.shape)
        return y

    @staticmethod
    def _num_samples(loader):
        """Return the size of ``loader.dataset``, falling back to ``len(loader)``."""
        try:
            return len(getattr(loader, 'dataset', None))
        except TypeError:
            return len(loader)

    def _mean_loss(self, loader):
        """Return the mean per-batch loss over ``loader`` without tracking gradients."""
        self.model.to(self.device).eval()
        total, batches = 0.0, 0
        with torch.no_grad():
            for data in loader:
                x, y = self._unpack(data)
                x, y = x.to(self.device), y.to(self.device)
                pred = self.model(x)
                total += self.loss_fn(pred, self._match_shape(y, pred)).item()
                batches += 1
        # An empty loader has no defined mean.
        return total / batches if batches else float('nan')

    def fit_epochs(self, train_loader, valid_loader=None, epochs=5):
        """Run ``fit_one_epoch`` ``epochs`` times."""
        for _ in range(epochs):
            self.fit_one_epoch(train_loader, valid_loader)

    def fit_one_epoch(self, train_loader, valid_loader=None):
        """Train for one pass over ``train_loader``, then optionally validate.

        When ``valid_loader`` is given, its mean loss is appended to
        ``self.valid_loss`` and printed.
        """
        size = self._num_samples(train_loader)
        self.model.to(self.device).train()
        for batch, data in enumerate(train_loader):
            x, y = self._unpack(data)
            x, y = x.to(self.device), y.to(self.device)
            self._run_train_step(x, y, batch, size)

        if valid_loader is not None:
            loss = self._mean_loss(valid_loader)
            self.valid_loss.append(loss)
            print(f'val-loss: {loss}')

    def evaluate(self, test_loader, batch=None, size=None):
        """Compute the mean loss over ``test_loader``.

        Args:
            test_loader: Iterable of batches to score.
            batch: Unused; accepted so existing three-argument calls keep working.
            size: Unused; accepted so existing three-argument calls keep working.

        Returns:
            The mean per-batch loss as a float (NaN for an empty loader). The
            value is also appended to ``self.test_loss`` and printed.
        """
        loss = self._mean_loss(test_loader)
        self.test_loss.append(loss)
        print(f'test-loss: {loss}')
        return loss

    def _run_train_step(self, x, y, batch, size):
        """Run one optimisation step on a batch and record its loss.

        ``batch`` and ``size`` are only used for the progress message.
        """
        pred = self.model(x)
        loss = self.loss_fn(pred, self._match_shape(y, pred))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.train_loss.append(loss.item())
        print(f'loss: {loss.item()} [{batch * len(x)}/{size}]')


In [12]:
# Root directory of the competition dataset; the CSV files read by this
# notebook are located in the `train_files/` folder beneath it.
ROOT_PATH = '/kaggle/input/jpx-tokyo-stock-exchange-prediction'

# Start data analysis 

### API data

In [13]:
# Load the training CSVs from f'{ROOT_PATH}/train_files/'.
# Stock prices are indexed by their parsed `Date` column at read time.
train_df = pd.read_csv(
    f'{ROOT_PATH}/train_files/stock_prices.csv',
    parse_dates=['Date'],
    index_col='Date',
)

# low_memory=False makes pandas read each file in one pass instead of chunks.
train_options = pd.read_csv(f'{ROOT_PATH}/train_files/options.csv', low_memory=False)
train_financials = pd.read_csv(f'{ROOT_PATH}/train_files/financials.csv', low_memory=False)
train_trades = pd.read_csv(f'{ROOT_PATH}/train_files/trades.csv', low_memory=False)

In [14]:
# Headline numbers for the stock-price frame; each entry is computed lazily so
# the values are evaluated in the same order they are printed.
print(train_df.shape)
summary_items = (
    ('Unique values for Adjustment factor:', lambda: train_df.AdjustmentFactor.unique()),
    ('Number of Unique Securities code:', lambda: train_df.SecuritiesCode.nunique()),
    ('Number of Unique Expected dividends:', lambda: train_df.ExpectedDividend.nunique()),
)
for heading, compute in summary_items:
    print()
    print(heading)
    print(compute())

# Overview of every loaded frame, in loading order.
for frame, label in (
    (train_df, 'stock_data'),
    (train_options, 'options_data'),
    (train_financials, 'financials'),
    (train_trades, 'trades'),
):
    show_df(frame, dataframe_name=label)

In [18]:
import matplotlib.pyplot as plt

# Take the rows of security 1301, drop the now-constant code column and run
# the shared preprocessing on them.
df_1301 = train_df[train_df['SecuritiesCode'] == 1301].drop('SecuritiesCode', axis=1)
df_1301 = preprocess(df_1301)

# One subplot per remaining column.
df_1301.plot(figsize=(15, 20), subplots=True)
plt.show()

# Distribution of the target.
df_1301.Target.hist(bins=50)
plt.show()

# DataFrame.info() prints its summary and returns None, so wrapping it in
# print() only added a stray "None" line.
df_1301.info()

## Models
Todo:

    1) Preprocess the data and use the correct features (date features, etc.)
    2) Deep learning for prediction of stock-returns
    3) Ranking with XGBoost or other methods

In [27]:
import torch

# Split the single-security frame into the target vector and feature matrix.
ytrain_stock = df_1301['Target'].to_numpy()
xtrain_stock = df_1301.drop(columns='Target').to_numpy()
print(ytrain_stock)

# Batches of 64 samples.
train_dataloader = get_loader(x=xtrain_stock, y=ytrain_stock, batch_size=64)

# One network input per feature column, a single output value.
model = NeuralNetwork(in_features=xtrain_stock.shape[1], out_features=1)
trainer = Trainer(model=model)


# # Display image and label.
# train_features, train_labels = next(iter(train_dataloader))
# print(train_features['features'])
# print(f"Feature batch shape: {train_features.size()}")
# print(f"Labels batch shape: {train_labels.size()}")
# # img = train_features[0].squeeze()
# # label = train_labels[0]
# # plt.imshow(img, cmap="gray")
# # plt.show()
# # print(f"Label: {label}")