# 引入套件

In [1]:
from get_data import IB_data, YF_data, AV_data
from pytorch_fit import set_seed, setup_dataloader, Test

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import talib

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support as precision_recall_fscore

import torch.nn as nn
from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm


# 資料前處理

## 抓取資料(IB, yfinance or alpha_vantage)

In [2]:
# df_IB = IB_data('EUR', 'USD', endDateTime='20220726 23:59:59')
# df_YF = YF_data('EUR', 'USD', start='2012-07-30', end='2022-07-26')
df_AV = AV_data('EUR', 'USD', start='2012-07-30', end='2022-07-26')

## 計算指標(頻率為小時)及製作模型輸入

In [3]:
# # 取出匯率每小時收盤價
# data = df[['Close']]

# # 計算匯率日報酬
# day_data = data.resample('D').last().dropna()
# day_rtn = day_data.pct_change().dropna()
# day_rtn

# hourly_std_byday = data.pct_change().resample('D').std().dropna()
# hourly_skew_byday = 3 * (data.pct_change().resample('D').mean().dropna() - data.pct_change().resample('D').median().dropna()) \
#                        / data.pct_change().resample('D').std().dropna()
# cumu_24day_return = ((day_rtn + 1).rolling(window=24).apply(np.prod, raw=True) - 1).dropna()

# data = df
# data = data.drop(index=data.index[-1])
# # print(data.info())

# rtn = data.pct_change().dropna()
# # print(rtn.info())

# mean_last24hour = rtn.rolling(24).mean().dropna()
# # print(mean_last24hour.info())

# std_last24hour = rtn.rolling(24).std().dropna()
# # print(std_last24hour.info())

# median_last24hour = rtn.rolling(24).median().dropna()
# # print(median_last24hour.info())

# skew_last24hour = 3 * (mean_last24hour - median_last24hour) / std_last24hour
# # print(skew_last24hour.info())

# cumu_24hour_return = ((rtn + 1).rolling(24).apply(np.prod, raw=True) - 1).dropna()
# # print(cumu_24hour_return.info())


# # 製作label和對應之features
# y = np.array(rtn.Close[cumu_24hour_return.index[1]:cumu_24hour_return.index[-1]]).flatten()

# x0 = np.array(rtn[cumu_24hour_return.index[0]:cumu_24hour_return.index[-2]])
# x1 = np.array(mean_last24hour[cumu_24hour_return.index[0]:cumu_24hour_return.index[-2]])
# x2 = np.array(std_last24hour[cumu_24hour_return.index[0]:cumu_24hour_return.index[-2]])
# x3 = np.array(median_last24hour[cumu_24hour_return.index[0]:cumu_24hour_return.index[-2]])
# x4 = np.array(skew_last24hour[cumu_24hour_return.index[0]:cumu_24hour_return.index[-2]])
# x5 = np.array(cumu_24hour_return[cumu_24hour_return.index[0]:cumu_24hour_return.index[-2]])
# X = np.hstack([x0, x1, x2, x3, x4, x5])

# print(x0.shape, x1.shape, x2.shape, x3.shape, x4.shape, x5.shape)
# print(y.shape, X.shape)

## 計算指標及製作模型輸入

In [4]:
data = df_AV
# print(data.info())

rtn = data.pct_change().dropna()
# print(rtn.info())

mean_last20day = rtn.rolling(20).mean().dropna()
# print(mean_last20day.info())

std_last20day = rtn.rolling(20).std().dropna()
# print(std_last20day.info())

median_last20day = rtn.rolling(20).median().dropna()
# print(median_last20day.info())

skew_last20day = 3 * (mean_last20day - median_last20day) / std_last20day
# print(skew_last20day.info())

cumu_20day_return = ((rtn + 1).rolling(20).apply(np.prod, raw=True) - 1).dropna()
# print(cumu_20day_return.info())


# 製作label和對應之features
label = rtn.Close[cumu_20day_return.index[1]:cumu_20day_return.index[-1]]
label[label.between(-.0015, .0015)] = 0
label[label > 0] = 1
label[label < 0] = 2
print((label == 0).sum()/len(label), (label == 1).sum()/len(label), (label == 2).sum()/len(label))
y = np.array(label).flatten()

x0 = np.array(rtn[cumu_20day_return.index[0]:cumu_20day_return.index[-2]])
x1 = np.array(std_last20day[cumu_20day_return.index[0]:cumu_20day_return.index[-2]])
x2 = np.array(skew_last20day[cumu_20day_return.index[0]:cumu_20day_return.index[-2]])
x3 = np.array(cumu_20day_return[cumu_20day_return.index[0]:cumu_20day_return.index[-2]])
X = np.hstack([x0, x1, x2, x3])

print(x0.shape, x1.shape, x2.shape, x3.shape)
print(y.shape, X.shape)

0.29466357308584684 0.34493426140757927 0.36040216550657383
(2586, 4) (2586, 4) (2586, 4) (2586, 4)
(2586,) (2586, 16)


# 模型訓練

In [5]:
random_seed = 323014

In [6]:
# 資料集分割
train_test_num = len(y)
train_num = int(0.9 * train_test_num)
X_train, X_test, y_train, y_test = X[:train_num], X[train_num:], y[:train_num], y[train_num:]

In [7]:
# 隨機森林

# 透過Cross-Validation得到最佳超參數，並在整個訓練集上訓練
parameters = {'n_estimators':list(range(10, 21, 10))}
RFC = RandomForestClassifier(random_state=random_seed)
RFCCV = GridSearchCV(RFC, parameters, cv=5, return_train_score=True)
RFCCV.fit(X_train, y_train)

# 衡量in_sample和out_sample表現
in_sample_pred = RFCCV.predict(X_train)
in_sample_acc = (in_sample_pred == y_train).sum()/len(y_train)
in_sample_precision_recall_f1score = precision_recall_fscore(y_train, in_sample_pred, labels=[0, 1, 2])
out_sample_pred = RFCCV.predict(X_test)
out_sample_acc = (out_sample_pred == y_test).sum()/len(y_test)
out_sample_precision_recall_f1score = precision_recall_fscore(y_test, out_sample_pred, labels=[0, 1, 2])

# 配適結果：
print(f'in_sample_acc: {in_sample_acc*100:.2f}%')
print('in_sample_precision:', in_sample_precision_recall_f1score[0])
print('in_sample_recall:', in_sample_precision_recall_f1score[1])
print('in_sample_f1score:', in_sample_precision_recall_f1score[2])
print()
print(f'out_sample_acc: {out_sample_acc*100:.2f}%')
print('out_sample_precision:', out_sample_precision_recall_f1score[0])
print('out_sample_recall:', out_sample_precision_recall_f1score[1])
print('out_sample_f1score:', out_sample_precision_recall_f1score[2])

RFCCV.best_params_

in_sample_acc: 99.83%
in_sample_precision: [0.99709724 0.997558   1.        ]
in_sample_recall: [1.         1.         0.99513973]
in_sample_f1score: [0.99854651 0.99877751 0.99756395]

out_sample_acc: 34.36%
out_sample_precision: [0.35106383 0.23863636 0.45454545]
out_sample_recall: [0.44       0.28       0.32110092]
out_sample_f1score: [0.39053254 0.25766871 0.37634409]


{'n_estimators': 20}

In [8]:
# def Optim_hidden_size_with_earlystop(input_size, hidden_size_list, train_dataloader, valid_dataloader, epochs):
    
#     min_valid_loss_of_best_hidden_size = 100000
#     min_valid_loss_hidden_size_with_epoch = (0, 0)
#     for hidden_size in hidden_size_list:
#         # model, criterion & optimizer
#         model = NN(input_size, hidden_size)
#         criterion = nn.CrossEntropyLoss()
#         optimizer = optim.Adam(model.parameters())
        
#         min_valid_loss_at_best_epoch = 100000
#         min_valid_loss_epoch = 0
#         for epoch in range(epochs):
#             # training
#             train_acc = 0
#             len_train = 0
#             model.train()
#             for X, y in train_dataloader:
#                 len_train += len(X)
#                 # X, y = X.to(device), y.to(device)
#                 optimizer.zero_grad()
#                 ypred = model(X)
#                 # print('ypred: ', ypred, '\n', 'y: ', y)
#                 loss = criterion(ypred, y)
#                 _, train_pred = torch.max(ypred, 1)
#                 # print('ypred: ', ypred, '\n', 'train_pred: ', train_pred)
#                 loss.backward()
#                 optimizer.step()
                
#                 train_acc += (train_pred.cpu() == y.cpu()).sum().item()
                
#             # evaluating
#             valid_loss = 0
#             valid_acc = 0
#             len_valid = 0
#             model.eval()
#             for X, y in  valid_dataloader:
#                 len_valid += len(X)
#                 # X, y = X.to(device), y.to(device)
#                 with torch.no_grad():
#                     ypred = model(X)
#                     loss = criterion(ypred, y)
#                     _, valid_pred = torch.max(ypred, 1)
                
#                     valid_acc += (valid_pred.cpu() == y.cpu()).sum().item()
#                     valid_loss += loss.item() * len(X)
#             valid_loss = valid_loss / len_valid
#             # print(f'Train on hidden size: {hidden_size}, [{(epoch + 1):02d}/{epochs}]Epochs: Train_acc: {train_acc*100/len_train:.2f}% | Valid_acc: {valid_acc*100/len_valid:.2f}%')
#             if valid_loss < min_valid_loss_at_best_epoch:
#                 min_valid_loss_at_best_epoch = valid_loss
#                 min_valid_loss_epoch = epoch + 1
#         if min_valid_loss_at_best_epoch < min_valid_loss_of_best_hidden_size:
#             min_valid_loss_of_best_hidden_size = min_valid_loss_at_best_epoch
#             min_valid_loss_hidden_size_with_epoch = (hidden_size, min_valid_loss_epoch)
    
#     return min_valid_loss_hidden_size_with_epoch

In [9]:
# # 找到最佳超參數hidden_size和對應之epochs(early stopping)
# Optim_hidden_size, Optim_epoch = Optim_hidden_size_with_earlystop(input_size, hidden_size_list, train_dataloader, valid_dataloader, epochs)

# # 以前述超參數訓練模型(在train_valid_set)
# model = NN(input_size, Optim_hidden_size)
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters())

# for epoch in range(Optim_epoch):
#     model.train()
#     for X, y in train_valid_dataloader:
#         # X, y = X.to(device), y.to(device)
#         optimizer.zero_grad()
#         ypred = model(X)
#         loss = criterion(ypred, y)
#         loss.backward()
#         optimizer.step()

# # 衡量in_sample和out_sample表現
# model.eval()

# in_sample_pred = np.array([])
# for X, y in train_valid_dataloader:
#     # X= X.to(device)
#     with torch.no_grad():
#         ypred = model(X)
#         _, ypred = torch.max(ypred, 1)
#         in_sample_pred = np.append(in_sample_pred, ypred.cpu().numpy())

# in_sample_acc = (in_sample_pred == y_train).sum()/len(y_train)
# in_sample_precision_recall_f1score = precision_recall_fscore(y_train, in_sample_pred, labels=[0, 1, 2])

# out_sample_pred = np.array([])
# for X, y in test_dataloader:
#     # X= X.to(device)
#     with torch.no_grad():
#         ypred = model(X)
#         _, ypred = torch.max(ypred, 1)
#         out_sample_pred = np.append(out_sample_pred, ypred.cpu().numpy())

# out_sample_acc = (out_sample_pred == y_test).sum()/len(y_test)
# out_sample_precision_recall_f1score = precision_recall_fscore(y_test, out_sample_pred, labels=[0, 1, 2])

# # 配適結果：
# print(f'in_sample_acc: {in_sample_acc*100:.2f}%')
# print('in_sample_precision:', in_sample_precision_recall_f1score[0])
# print('in_sample_recall:', in_sample_precision_recall_f1score[1])
# print('in_sample_f1score:', in_sample_precision_recall_f1score[2])
# print()
# print(f'out_sample_acc: {out_sample_acc*100:.2f}%')
# print('out_sample_precision:', out_sample_precision_recall_f1score[0])
# print('out_sample_recall:', out_sample_precision_recall_f1score[1])
# print('out_sample_f1score:', out_sample_precision_recall_f1score[2])

# Optim_hidden_size, Optim_epoch

In [10]:
class CostumDataset(Dataset):
    def __init__(self, X, y):
        self.X = X.astype('float32')
        self.y = y.astype('long')
        
    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return (self.X[idx], self.y[idx])

class model_structure(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.InputLayer = nn.Linear(input_size, hidden_size)
        self.HiddenLayer =  nn.Sequential(nn.Linear(hidden_size, 2*hidden_size),
                                          nn.ReLU(),
                                          nn.Linear(2*hidden_size, 4*hidden_size),
                                          nn.ReLU(),
                                          nn.Linear(4*hidden_size, 2*hidden_size),
                                          nn.ReLU(),
                                          nn.Linear(2*hidden_size, hidden_size),
                                          nn.ReLU()
                                          )
        self.OutputLayer = nn.Linear(hidden_size, 3)

    def forward(self, input):
        hidden = self.InputLayer(input)
        hidden = self.HiddenLayer(hidden)
        output = self.OutputLayer(hidden)
        return output

In [12]:
hparams = {'input_size': X_train.shape[1],
           'hidden_size': 64,
           'valid_ratio': 0.1,
           'batch_size': 8,
           'lr': 1e-4,
           'num_epochs': 100}

hidden_size_list = [64, 128, 256]

# 隨機種子
set_seed()

# Dataset與DataLoader
train_valid_set = CostumDataset(X_train, y_train)
test_set = CostumDataset(X_test, y_test)

train_dataloader, valid_dataloader, train_valid_dataloader, test_dataloader = \
    setup_dataloader(hparams, train_valid_set, test_set)

# 在 valid set 上最好的參數, 在 test set 上的準確度, 在 train_valid set 上的準確度, 超參數尋找過程中在 valid set 上最高的準確度
best_hparams, best_model_valid_acc, best_model_train_acc, best_valid_acc = \
    Test(hidden_size_list, hparams, model_structure, train_dataloader, valid_dataloader, train_valid_dataloader, test_dataloader)

print(best_hparams)
print('best_model_valid_acc, best_model_train_acc, best_valid_acc')
print(best_model_valid_acc, best_model_train_acc, best_valid_acc)

hparams: {'input_size': 16, 'hidden_size': 64, 'valid_ratio': 0.1, 'batch_size': 8, 'lr': 0.0001, 'num_epochs': 100}


100%|██████████| 100/100 [00:24<00:00,  4.05it/s]


train_acc: 0.5052531041069723
valid_acc: 0.3261802575107296
hparams: {'input_size': 16, 'hidden_size': 128, 'valid_ratio': 0.1, 'batch_size': 8, 'lr': 0.0001, 'num_epochs': 100}


 75%|███████▌  | 75/100 [00:38<00:13,  1.92it/s]