# Stock Predict Model

## Set Up

### Import Modules

In [1]:
# Numerical Operations
import math
import numpy as np

# Reading/Writing Data
import pandas as pd
import os
import csv

# For Progress Bar
from tqdm import tqdm

# For Graph
import matplotlib.pyplot as plt

# Pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# For plotting learning curve
from torch.utils.tensorboard import SummaryWriter

# Yahoo API
import yfinance as yf

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

### Configurations

In [2]:
device = 'cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu')
scaler = StandardScaler()
config = {
    'seed': 666999,
    'select_all': False,
    'valid_ratio': 0.2,
    'test_ratio': 0.2,
    'n_epochs': 5000,
    'learning_rate': 1e-5,
    'early_stop': 300,
    'save_path': './models/stock.ckpt',
    'data_loader': {
        'batch_size': 128,
        'pin_memory': True,
        'num_workers': 0,
        'shuffle': True
    }
}

### Utility Functions

In [20]:
def same_seed(seed):
    '''Fixes random number generator seeds for reproducibility.'''
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.backends.mps.is_available():
        torch.mps.manual_seed(seed)

def predict(test_loader, model, device):
    model.eval()
    result = {'features': [], 'preds': [], 'ans': []}
    for x, y in tqdm(test_loader):
        x, y = x.to(device), y.to(device)
        with torch.no_grad():
            pred = model(x)
            result['features'].append(x.detach().cpu())
            result['preds'].append(pred.detach().cpu())
            result['ans'].append(y.detach().cpu())

    # Combination tensor
    result['features'] = torch.cat(result['features'], 0).numpy()
    result['preds'] = torch.cat(result['preds'], 0).numpy()
    result['ans'] = torch.cat(result['ans'], 0).numpy()

    predstack = np.column_stack((result['features'], result['preds']))
    targetstack = np.column_stack((result['features'], result['ans']))

    return np.column_stack((predstack[:, -1], targetstack[:, -1]))

def plot_error_rate(pred, target):
    '''Plot error rate.'''
    errors = (np.absolute(pred - target) / target) * 100
    plt.plot(errors, '-r')

    plt.xlabel('')
    plt.ylabel('Percentage')
    plt.title('Error Rate')

    plt.show()

def plot_pred(pred, target):
    '''Plot predict.'''
    plt.plot(pred, '-g', label='Prediction')
    plt.plot(target, '--r', label='Target')

    plt.xlabel('Predictions')
    plt.ylabel('Targets')
    plt.title('Predictions')
    plt.legend()

    plt.show()

def plot_trend(pred, target):
    X = np.array(pred).reshape(-1, 1)
    y = np.array(target)

    model = LinearRegression()
    model.fit(X, y)

    y_pred = model.predict(X)

    plt.scatter(pred, target, c='b', label='Data')
    plt.plot(pred, y_pred, c='r', label='Trend Line')

    plt.xlabel('Predictions')
    plt.ylabel('Targets')
    plt.title('Trend')
    plt.legend()

    plt.show()

def tqdm_span(elapsed, total, n):
    return elapsed * (total or 0) / max(n, 1)


## Data

### Dataset

In [4]:
class Stock_Dataset(Dataset):
    '''
    x: Features.
    y: Targets, if none, do prediction.
    '''
    def __init__(self, x, y=None, normalized=False, initscaler=True):
        # Normalized
        stack = np.column_stack((x, y))

        if (normalized):
            normalstack = scaler.fit_transform(stack) if initscaler else scaler.transform(stack)
            x = normalstack[:, :-1]
            y = y if y is None else normalstack[:, -1]

        self.y = y if y is None else torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)

    def __getitem__(self, idx):
        if self.y is None:
            return self.x[idx]
        else:
            return self.x[idx], self.y[idx]

    def __len__(self):
        return len(self.x)

### DataLoader

In [5]:
def stock_dataloader(config, x, y=None, normalized=False, initscaler=True):
    ''' Generates a dataset, then is put into a dataloader. '''

    x = np.array(x)
    y = np.array(y) if y is not None else y

    dataset = Stock_Dataset(x, y, normalized, initscaler)
    dataloader = DataLoader(dataset, **config['data_loader'])

    return dataloader

### Feature Selection

In [6]:
def select_feat(train_data, valid_data, test_data, select_all=True):
    '''Selects useful features to perform regression'''
    y_train, y_valid, y_test = train_data[:,-1], valid_data[:,-1], test_data[:, -1]
    raw_x_train, raw_x_valid, raw_x_test = train_data[:, :-1], valid_data[:, :-1], test_data[:, :-1]

    if select_all:
        feat_idx = list(range(raw_x_train.shape[1]))
    else:
        feat_idx = [0,1,2,3] # TODO: Select suitable feature columns.

    return raw_x_train[:,feat_idx], raw_x_valid[:,feat_idx], raw_x_test[:,feat_idx], y_train, y_valid, y_test

### Set Data

In [7]:
same_seed(config['seed'])

# re-index
index = ["Open", "High", "Low", "Adj Close", "Volume", "Close"]
raw_data = pd.DataFrame(yf.download('2330.TW', start='2000-01-01', end='2023-12-31'))
# raw_data.to_csv("raw.csv", index=False)
train_data, test_data = train_test_split(
    raw_data[index].values,
    test_size=config['test_ratio'],
    random_state=config['seed'],
    shuffle=False
)

train_data, valid_data = train_test_split(
    train_data,
    test_size=config['valid_ratio'],
    random_state=config['seed'],
    shuffle=True
)

# Print out the data size.
print(f"""train_data size: {train_data.shape}
valid_data size: {valid_data.shape}
test_data size: {test_data.shape}""")

# Select features
x_train, x_valid, x_test, y_train, y_valid, y_test = select_feat(train_data, valid_data, test_data, config['select_all'])

# Print out the number of features.
print(f'number of features: {x_train.shape[1]}')

# Pytorch data loader loads pytorch dataset into batches.
train_loader = stock_dataloader(config, x_train, y_train)
valid_loader = stock_dataloader(config, x_valid, y_valid, initscaler=False)
test_loader = stock_dataloader(config, x_test, y_test, initscaler=False)

[*********************100%***********************]  1 of 1 completed
train_data size: (3737, 6)
valid_data size: (935, 6)
test_data size: (1168, 6)
number of features: 4


## Neural Network

### Model

In [8]:
class Stock_Model(nn.Module):
    def __init__(self, input_dim):
        super(Stock_Model, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.Linear(16, 1)
        )

    def forward(self, x):
        x = self.layers(x)
        x = x.squeeze(1)
        return x

### Training Loop

In [29]:
def trainer(train_loader, valid_loader, model, config, device):

    criterion = nn.MSELoss(reduction='mean')
    optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.7)

    writer = SummaryWriter()

    if not os.path.isdir('./models'):
        os.mkdir('./models')

    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0

    for epoch in range(n_epochs):

        postfix = [{"Loss": 0, "Span": "", "Title": "Train Loss"}, {"Loss": 0, "Span": "", "Title": "Valid Loss"}]
        bar_format = "{l_bar}{bar}| {n_fmt}/{total_fmt} {postfix[0][Span]} {postfix[0][Title]} {postfix[0][Loss]:>2.4f}, {postfix[1][Span]} {postfix[1][Title]} {postfix[1][Loss]:>2.4f}"

        # train
        model.train()
        loss_train_record = []

        # with torch.inference_mode():
        with tqdm(train_loader,
                bar_format=bar_format,
                desc=f'Epoch [{epoch+1}/{n_epochs}]',
                postfix=postfix,
                leave=False) as t:
            for x, y in t:
                optimizer.zero_grad()
                x, y = x.to(device), y.to(device)
                pred = model(x)
                loss = criterion(pred, y)
                loss.backward()
                optimizer.step()
                step += 1
                loss_train_record.append(loss.detach().item())
                mean_train_loss = sum(loss_train_record)/len(loss_train_record)

                t.postfix[0]["Loss"] = mean_train_loss
                t.postfix[0]["Span"] = f"[{t.format_interval(tqdm_span(t.format_dict['elapsed'], t.format_dict['total'], t.format_dict['n']))}]"
                t.update()

        writer.add_scalar('Loss/train', mean_train_loss, step)

        # valid
        model.eval()
        loss_valid_record = []

        with tqdm(valid_loader,
                bar_format=bar_format,
                desc=f'Epoch [{epoch+1}/{n_epochs}]',
                postfix=postfix) as t:
            for x, y in t:
                x, y = x.to(device), y.to(device)
                with torch.no_grad():
                    pred = model(x)
                    loss = criterion(pred, y)
                    loss_valid_record.append(loss.item())
                    mean_valid_loss = sum(loss_valid_record)/len(loss_valid_record)

                    t.postfix[1]["Loss"] = mean_valid_loss
                    t.postfix[1]["Span"] = f"[{t.format_interval(tqdm_span(t.format_dict['elapsed'], t.format_dict['total'], t.format_dict['n']))}]"
                    t.update()

        writer.add_scalar('Loss/valid', mean_valid_loss, step)

        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['save_path']) # Save your best model
            print('Saving model with loss {:.3f}...'.format(best_loss))
            early_stop_count = 0
        else: 
            early_stop_count += 1

        if early_stop_count >= config['early_stop']:
            print('\nModel is not improving, so we halt the training session.')
            return

## Training

In [30]:
model = Stock_Model(input_dim=x_train.shape[1]).to(device)
trainer(train_loader, valid_loader, model, config, device)

# with torch.profiler.profile(
#     activities=[torch.profiler.ProfilerActivity.CPU],
#     schedule = torch.profiler.schedule(
#         wait=0,
#         warmup=0,
#         active=1
#     ),
#     on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')
# ) as prof:
#     trainer(train_loader, valid_loader, model, config, device)
#     prof.step()

Epoch [1/5000]: 100%|██████████| 8/8 [00:00] Train Loss 10334.7887, [00:00] Valid Loss 9700.0710


Saving model with loss 9700.071...


Epoch [2/5000]: 100%|██████████| 8/8 [00:00] Train Loss 10199.9362, [00:00] Valid Loss 10215.7963
Epoch [3/5000]: 100%|██████████| 8/8 [00:00] Train Loss 9957.7567, [00:00] Valid Loss 9593.1353


Saving model with loss 9593.135...


Epoch [4/5000]: 100%|██████████| 8/8 [00:00] Train Loss 9726.9792, [00:00] Valid Loss 9299.7559


Saving model with loss 9299.756...


Epoch [5/5000]: 100%|██████████| 8/8 [00:00] Train Loss 9247.8996, [00:00] Valid Loss 9029.3652


Saving model with loss 9029.365...


Epoch [6/5000]: 100%|██████████| 8/8 [00:00] Train Loss 8759.7200, [00:00] Valid Loss 8363.1643


Saving model with loss 8363.164...


Epoch [7/5000]: 100%|██████████| 8/8 [00:00] Train Loss 8274.3751, [00:00] Valid Loss 7518.2339


Saving model with loss 7518.234...


Epoch [8/5000]: 100%|██████████| 8/8 [00:00] Train Loss 7616.4768, [00:00] Valid Loss 7036.3840


Saving model with loss 7036.384...


Epoch [9/5000]: 100%|██████████| 8/8 [00:00] Train Loss 6517.1173, [00:00] Valid Loss 5611.0981


Saving model with loss 5611.098...


Epoch [10/5000]: 100%|██████████| 8/8 [00:00] Train Loss 5486.7646, [00:00] Valid Loss 4884.0630


Saving model with loss 4884.063...


Epoch [11/5000]: 100%|██████████| 8/8 [00:00] Train Loss 4529.3223, [00:00] Valid Loss 4057.2119


Saving model with loss 4057.212...


Epoch [12/5000]: 100%|██████████| 8/8 [00:00] Train Loss 3645.8864, [00:00] Valid Loss 2789.0328


Saving model with loss 2789.033...


Epoch [13/5000]: 100%|██████████| 8/8 [00:00] Train Loss 2706.3340, [00:00] Valid Loss 2240.4183


Saving model with loss 2240.418...


Epoch [14/5000]: 100%|██████████| 8/8 [00:00] Train Loss 2036.3561, [00:00] Valid Loss 1541.2503


Saving model with loss 1541.250...


Epoch [15/5000]: 100%|██████████| 8/8 [00:00] Train Loss 1497.8151, [00:00] Valid Loss 1631.4460
Epoch [16/5000]: 100%|██████████| 8/8 [00:00] Train Loss 1035.7303, [00:00] Valid Loss 543.7464


Saving model with loss 543.746...


Epoch [17/5000]: 100%|██████████| 8/8 [00:00] Train Loss 828.0910, [00:00] Valid Loss 270.9149


Saving model with loss 270.915...


Epoch [18/5000]: 100%|██████████| 8/8 [00:00] Train Loss 570.4146, [00:00] Valid Loss 989.1570
Epoch [19/5000]: 100%|██████████| 8/8 [00:00] Train Loss 395.6162, [00:00] Valid Loss 207.1596


Saving model with loss 207.160...


Epoch [20/5000]: 100%|██████████| 8/8 [00:00] Train Loss 276.6129, [00:00] Valid Loss 557.7354
Epoch [21/5000]: 100%|██████████| 8/8 [00:00] Train Loss 204.4479, [00:00] Valid Loss 274.7121
Epoch [22/5000]: 100%|██████████| 8/8 [00:00] Train Loss 173.9715, [00:00] Valid Loss 93.1218


Saving model with loss 93.122...


Epoch [23/5000]: 100%|██████████| 8/8 [00:00] Train Loss 122.8432, [00:00] Valid Loss 68.0684


Saving model with loss 68.068...


Epoch [24/5000]: 100%|██████████| 8/8 [00:00] Train Loss 99.8710, [00:00] Valid Loss 28.1981


Saving model with loss 28.198...


Epoch [25/5000]: 100%|██████████| 8/8 [00:00] Train Loss 85.0881, [00:00] Valid Loss 49.1583
Epoch [26/5000]: 100%|██████████| 8/8 [00:00] Train Loss 64.3533, [00:00] Valid Loss 20.7448


Saving model with loss 20.745...


                                                                                      

KeyboardInterrupt: 

In [None]:
# print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))


In [None]:
%tensorboard --logdir=./log/

## Plot learning curves with tensorboard

In [None]:
%reload_ext tensorboard
%tensorboard --logdir=./runs/

## Testing

In [None]:
def save_pred(preds, file):
    ''' Save predictions to specified file '''
    np.savetxt(file, preds, delimiter=',', fmt='%d', header='pred,target', comments='')

model = Stock_Model(input_dim=x_train.shape[1]).to(device)
model.load_state_dict(torch.load(config['save_path']))
preds = predict(test_loader, model, device)
save_pred(preds, 'pred.csv')
plot_error_rate(preds[:,0], preds[:,1])
plot_pred(preds[:,0], preds[:,1])
plot_trend(preds[:,0], preds[:,1])

In [None]:
model = Stock_Model(input_dim=x_train.shape[1]).to(device)
model.load_state_dict(torch.load(config['save_path']))

single_loader = stock_dataloader(config, [[100,110,90,101]], [103])
predict(single_loader, model, device)

# for pred, target in single_pred:
#     print(f"Prediction: {pred}, Target: {target}")