In [None]:
%matplotlib inline
import torch
import random
import pandas as pd
import numpy as np
import pandas_datareader.data as web
from matplotlib import pyplot
import seaborn as sns
import os

#Plotting 
from pandas.plotting import scatter_matrix

#Libraries for Statistical Models
import statsmodels.api as sm

#logging
from myutil.logconf import logging
log = logging.getLogger(__name__)
# log.setLevel(logging.ERROR)
log.setLevel(logging.INFO)
# log.setLevel(logging.WARN)
# log.setLevel(logging.DEBUG)


#Disable the warnings
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide pandas display settings: show every column, never wrap wide
# frames, and print floats with three decimals.
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False
pd.options.display.float_format = '{:.3f}'.format

# Seed every RNG used by the notebook. `torch.manual_seed` is the seeding call;
# assigning to `torch.seed` would only overwrite that function and seed nothing.
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)

# Execute the helper notebook inside this kernel. `get_filename_of_ipynb`, used
# on the next line, is not defined in this notebook, so it presumably comes
# from nb_utils.ipynb -- TODO confirm.
%run 'nb_utils.ipynb'
task_name = get_filename_of_ipynb()
print(task_name)
# data_dir holds the per-symbol price CSVs read/written by the loading cell;
# log_dir is a per-task directory under ./runs. Both are rooted at the current
# working directory.
data_dir = f'{os.getcwd()}/data/'
log_dir_base = f'{os.getcwd()}/runs/{task_name}'
log_dir = log_dir_base
print(f'{data_dir}\n{log_dir}')


In [None]:
# hyperparameter tuning
# NOTE(review): `from ray import ..., ray` relies on the package exposing
# itself as an attribute; a plain `import ray` is the usual form -- confirm
# this import works with the installed Ray version.
from ray import tune, train, ray
from ray.tune.schedulers import ASHAScheduler

# Start Ray for this session with log_to_driver=False.
ray.init(log_to_driver=False)

In [None]:
from datetime import datetime
import yfinance as yfin

# Loading the data
# Ticker universe for the whole notebook. Order matters: the loading loop
# appends one frame per symbol to `ticks_data`, and later cells look symbols up
# by position (`stk_symbols[i]`) in parallel with `ticks_data[i]`.
# NOTE(review): "GBP" and "USD" look like currency codes rather than equity
# tickers -- confirm they are intended members of this list.
stk_symbols = [
    "AAPL",
    "MSFT",
    "AMZN",
    "NVDA",
    "GOOGL",
    "GOOG",
    "META",
    "TSLA",
    "UNH",
    "LLY",
    "JPM",
    "XOM",
    "JNJ",
    "V",
    "PG",
    "AVGO",
    "MA",
    "HD",
    "CVX",
    "MRK",
    "ABBV",
    "PEP",
    "COST",
    "ADBE",
    "KO",
    "CSCO",
    "WMT",
    "TMO",
    "MCD",
    "PFE",
    "CRM",
    "BAC",
    "ACN",
    "CMCSA",
    "LIN",
    "NFLX",
    "ABT",
    "ORCL",
    "DHR",
    "AMD",
    "WFC",
    "DIS",
    "TXN",
    "PM",
    "VZ",
    "INTU",
    "COP",
    "CAT",
    "AMGN",
    "NEE",
    "INTC",
    "UNP",
    "LOW",
    "IBM",
    "BMY",
    "SPGI",
    "RTX",
    "HON",
    "BA",
    "UPS",
    "GE",
    "QCOM",
    "AMAT",
    "NKE",
    "PLD",
    "NOW",
    "BKNG",
    "SBUX",
    "MS",
    "ELV",
    "MDT",
    "GS",
    "DE",
    "ADP",
    "LMT",
    "TJX",
    "T",
    "BLK",
    "ISRG",
    "MDLZ",
    "GILD",
    "MMC",
    "AXP",
    "SYK",
    "REGN",
    "VRTX",
    "ETN",
    "LRCX",
    "ADI",
    "SCHW",
    "CVS",
    "ZTS",
    "CI",
    "CB",
    "AMT",
    "SLB",
    "C",
    "BDX",
    "MO",
    "PGR",
    "TMUS",
    "FI",
    "SO",
    "EOG",
    "BSX",
    "CME",
    "EQIX",
    "MU",
    "DUK",
    "PANW",
    "PYPL",
    "AON",
    "SNPS",
    "ITW",
    "KLAC",
    "LULU",
    "ICE",
    "APD",
    "SHW",
    "CDNS",
    "CSX",
    "NOC",
    "CL",
    "MPC",
    "HUM",
    "FDX",
    "WM",
    "MCK",
    "TGT",
    "ORLY",
    "HCA",
    "FCX",
    "EMR",
    "PXD",
    "MMM",
    "MCO",
    "ROP",
    "CMG",
    "PSX",
    "MAR",
    "PH",
    "APH",
    "GD",
    "USB",
    "NXPI",
    "AJG",
    "NSC",
    "PNC",
    "VLO",
    "GBP",
    "F",
    "MSI",
    "GM",
    "TT",
    "EW",
    "CARR",
    "AZO",
    "ADSK",
    "TDG",
    "ANET",
    "SRE",
    "ECL",
    "OXY",
    "PCAR",
    "ADM",
    "MNST",
    "KMB",
    "PSA",
    "CCI",
    "CHTR",
    "MCHP",
    "MSCI",
    "CTAS",
    "WMB",
    "AIG",
    "STZ",
    "HES",
    "NUE",
    "ROST",
    "AFL",
    "AEP",
    "IDXX",
    "D",
    "TEL",
    "JCI",
    "MET",
    "GIS",
    "IQV",
    "EXC",
    "WELL",
    "DXCM",
    "HLT",
    "ON",
    "COF",
    "PAYX",
    "TFC",
    "USD",
    "BIIB",
    "O",
    "FTNT",
    "DOW",
    "TRV",
    "DLR",
    "MRNA",
    "CPRT",
    "ODFL",
    "DHI",
    "YUM",
    "SPG",
    "CTSH",
    "AME",
    "BKR",
    "SYY",
    "A",
    "CTVA",
    "CNC",
    "EL",
    "AMP",
    "CEG",  # PCT <= -0.05,  size = 0
    "HAL",
    "OTIS",  # PCT <= -0.05,  size = 0
    "ROK",
    "PRU",
    "DD",
    "KMI",
    "VRSK",
    "LHX",
    "DG",
    "FIS",
    "CMI",
    "CSGP",
    "FAST",
    "PPG",
    "GPN",
    "GWW",
    "HSY",
    "BK",
    "XEL",
    "DVN",
    "EA",
    "NEM",
    "ED",
    "URI",
    "VICI",
    "PEG",
    "KR",
    "RSG",
    "LEN",
    "PWR",
    "WST",
    "COR",
    "OKE",
    "VMC",
    "KDP",
    "WBD",
    "ACGL",
    "ALL",
    "IR",
    "CDW",
    "FANG",
    "MLM",
    "PCG",
    "DAL",
    "EXR",
    "FTV",
    "AWK",
    "IT",
    "KHC",
    "GEHC",  # PCT <= -0.05,  size = 0
    "WEC",
    "HPQ",
    "EIX",
    "CBRE",
    "APTV",
    "ANSS",
    "MTD",
    "DLTR",
    "AVB",
    "ILMN",
    "ALGN",
    "LYB",
    "TROW",
    "GLW",
    "EFX",
    "WY",
    "ZBH",
    "XYL",
    "SBAC",
    "RMD",
    "TSCO",
    "EBAY",
    "KEYS",
    "CHD",
    "STT",
    "DFS",
    "HIG",
    "ALB",
    "STE",
    "ES",
    "TTWO",
    "MPWR",
    "CAH",
    "EQR",
    "RCL",
    "WTW",
    "HPE",
    "DTE",
    "GPC",
    "BR",
    "ULTA",
    "FICO",
    "CTRA",
    "BAX",
    "AEE",
    "MTB",
    "MKC",
    "ETR",
    "WAB",
    "DOV",
    "FE",
    "RJF",
    "INVH",
    "FLT",
    "CLX",
    "TDY",
    "TRGP",
    "DRI",
    "LH",
    "HOLX",
    "VRSN",
    "MOH",
    "LUV",
    "PPL",
    "ARE",
    "NVR",
    "COO",
    "WBA",
    "PHM",
    "NDAQ",
    "HWM",
    "RF",
    "CNP",
    "IRM",
    "LVS",
    "FITB",
    "EXPD",
    "VTR",
    "FSLR",
    "PFG",
    "BRO",
    "J",
    "IEX",
    "BG",
    "ATO",
    "FDS",
    "ENPH",
    "MAA",
    "CMS",
    "IFF",
    "BALL",
    "SWKS",
    "CINF",
    "NTAP",
    "STLD",
    "UAL",
    "WAT",
    "OMC",
    "TER",
    "CCL",
    "JBHT",
    "MRO",
    "TYL",
    "HBAN",
    "K",
    "GRMN",
    "CBOE",
    "NTRS",
    "TSN",
    "AKAM",
    "EG",
    "ESS",
    "EQT",
    "TXT",
    "EXPE",
    "SJM",
    "PTC",
    "DGX",
    "AVY",
    "RVTY",
    "BBY",
    "CF",
    "CAG",
    "EPAM",
    "AMCR",
    "LW",
    "PAYC",
    "SNA",
    "AXON",
    "POOL",
    "SYF",
    "SWK",
    "ZBRA",
    "DPZ",
    "PKG",
    "CFG",
    "LDOS",
    "VTRS",
    "PODD",
    "LKQ",
    "MOS",
    "APA",
    "EVRG",
    "TRMB",
    "MGM",
    "NDSN",
    "WDC",
    "MAS",
    "LNT",
    "IPG",
    "MTCH",
    "STX",
    "KMX",
    "TECH",
    "WRB",
    "LYV",
    "IP",
    "UDR",
    "AES",
    "CE",
    "INCY",
    "L",
    "TAP",
    "GEN",
    "CPT",
    "KIM",
    "JKHY",
    "HRL",
    "HST",
    "FMC",
    "CZR",
    "PEAK",
    "CDAY",
    "PNR",
    "NI",
    "CHRW",
    "HSIC",
    "CRL",
    "REG",
    "QRVO",
    "TFX",
    "KEY",
    "GL",
    "EMN",
    "WYNN",
    "ALLE",
    "AAL",
    "FFIV",
    "BWA",
    "BXP",
    "MKTX",
    "ROL",
    "JNPR",
    "PNW",
    "ETSY",
    "BLDR",
    "FOXA",
    "AOS",
    "HAS",
    "HII",
    "NRG",
    "CPB",
    "UHS",
    "BIO",
    "WRK",
    "RHI",
    "CTLT",
    "XRAY",
    "BBWI",
    "NWSA",
    "TPR",
    "PARA",
    "WHR",
    "BEN",
    "AIZ",
    "NCLH",
    "GNRC",
    "FRT",
    "IVZ",
    "VFC",
    "CMA",
    "DVA",
    "JBL",
    "HUBB",
    "ZION",
    "UBER",
    "MHK",
    "RL",
    "FOX",
    "BX",
    "ABNB",
    "NWS",
]
# # stk_symbols = [
# #     "AAPL",
# #     "MSFT",
# #     "AMZN",
# #     "NVDA",
# #     "GOOGL",
# #     "TSLA",
# #     "META",
# #     "GOOG",
# #     "ADBE",
# #     "NFLX",
# #     "CSCO",
# #     "INTC",
# #     "INTU",
# #     "CMCSA",
# #     "TXN",
# #     "AMAT",
# #     "ADSK",
# #     "AMD",
# #     "QCOM",
# #     "MU",
# # ]
# stk_symbols = [
#     "AAPL",
#     "MSFT",
#     "AMZN",
#     "NVDA",
#     "GOOGL",
#     "TSLA",
#     "META",
#     "GOOG",
# ]

# Date range requested from yfinance when a symbol is not cached locally.
start = datetime(2014, 1, 1)
end = datetime(2023, 12, 31)

# The CSV cache lives in data_dir; create it up front so the first download
# does not fail on a missing directory.
os.makedirs(data_dir, exist_ok=True)

# One price frame per entry of stk_symbols, in the same order.
ticks_data = []
for symbol in stk_symbols:
    stk_file = f"{data_dir}{symbol}.csv"
    bLoad = False
    if os.path.isfile(stk_file):
        try:
            _stk_data = pd.read_csv(stk_file).set_index("Date")
            bLoad = True
            print(f"read {stk_file} completely!")
        except Exception as exc:
            # An unreadable cache file is not fatal: report it and fall through
            # to a fresh download. Catching Exception (not a bare except) keeps
            # KeyboardInterrupt/SystemExit working.
            log.warning(f"could not read cache {stk_file}: {exc!r}; downloading again")
    if not bLoad:
        # _stk_data = web.get_data_yahoo(stk_tickers, start, end)
        _stk_data = yfin.download([symbol], start, end).dropna()
        _stk_data.to_csv(stk_file)
        print(f"download {symbol} from yfin and write to {stk_file} completely!")
    ticks_data.append(_stk_data)
    print(f"{symbol}, size:{len(_stk_data)}")

In [None]:
import torch

# Pick the best available backend: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device_name = "cuda"
elif torch.backends.mps.is_available():
    device_name = "mps"
else:
    device_name = "cpu"
device = torch.device(device_name)

# Labelling / windowing defaults used by the cells below.
return_period = 5  # look-ahead window (rows) passed to gen_pct_label
seq_len = 3  # rows per LSTM input window
validation_size = 0.2  # trailing fraction of each ticker held out for testing

# Training-loop and DataLoader defaults.
epoch_num = 100
batch_size = 32
num_workers = 3
pin_memory = True
shuffle = True

print(f"device_name:{device}")

In [None]:
# Relative price move (as a fraction, i.e. 0.05 == 5%) that separates the classes.
pct_threshold = 0.05
class_percentage_threshold = 0.08  # minimum share of rows a class needs for a ticker to be kept
classificationThreshold = 0.5


# Previous three-class labelling (kept for reference, currently disabled):
# 0: PCT <= -0.05
# 1: -0.05 < PCT < 0.05
# 2: PCT >= 0.05
# num_classes = 3

# def gen_pct_label(stk_data, _return_period):
#     max_price_period = (
#         stk_data["Adj Close"].rolling(_return_period).max().shift(-_return_period)
#     )
#     max_pct_period = (max_price_period - stk_data["Adj Close"]) / stk_data["Adj Close"]
#     pct_label = max_pct_period.apply(
#         lambda x: 2 if x >= pct_threshold else 0 if x <= -pct_threshold else 1
#     ).astype("int8")
#     pct_label.name = "label"
#     return pct_label


# Active two-class labelling:
# 0: PCT < 0.05
# 1: PCT >= 0.05
num_classes = 2


def gen_pct_label(stk_data, _return_period):
    """Label each row 1 if the price rises by at least `pct_threshold` soon after.

    For every row the highest "Adj Close" of the following `_return_period`
    rows is compared with the row's own "Adj Close"; the label is 1 when the
    relative gain is >= `pct_threshold`, otherwise 0.

    Note: the last `_return_period` rows have no complete look-ahead window,
    so their gain is NaN; a NaN comparison is False and those rows get 0.

    Returns:
        An int8 Series named "label", indexed like `stk_data`.
    """
    price = stk_data["Adj Close"]
    # Rolling max over n rows, shifted back by n, is the max of rows t+1..t+n.
    future_high = price.rolling(_return_period).max().shift(-_return_period)
    relative_gain = (future_high - price) / price
    labels = (relative_gain >= pct_threshold).astype("int8")
    labels.name = "label"
    return labels


def class_percentage(analysis_data):
    """Return the share of rows per class label.

    Args:
        analysis_data: frame with a "label" column.

    Returns:
        A list of length `num_classes`; entry i is the fraction of rows whose
        label is i, or 0.0 when that label does not occur.
    """
    counts = analysis_data.groupby("label").size()
    n_rows = len(analysis_data)
    return [
        counts[label] / n_rows if label in counts.index else 0.0
        for label in range(num_classes)
    ]

In [None]:
def gen_buy_sell_signal(stk_data):
    """Build a one-column "Signal" frame from a 10/60-row SMA crossover.

    The signal is 1 where SMA_10 of "Adj Close" is above SMA_60 and 0
    otherwise. It is computed only on rows where both averages exist; the
    returned frame keeps the full index of `stk_data`, so other rows are
    left unfilled by the index-aligned assignment.
    """
    import pandas_ta as ta  # imported for its side effect: registers the `.ta` accessor

    fast_sma = stk_data.ta.sma(close="Adj Close", length=10)
    slow_sma = stk_data.ta.sma(close="Adj Close", length=60)
    averages = pd.concat([fast_sma, slow_sma], axis=1).dropna()

    # Empty-column copy keeps only the original index to align the signal on.
    signal_frame = stk_data[[]].copy()
    signal_frame["Signal"] = (averages["SMA_10"] > averages["SMA_60"]).astype("int")

    return signal_frame


def gen_analysis_data(stk_data, _return_period):
    """Assemble the feature matrix plus label column for one ticker.

    Features, in column order: ADOSC, KVO, RSI/100 for lengths 10/30/200 on
    "Adj Close", STOCH/100 for k = 10/30/200, and the SMA crossover signal.
    Features are cast to float32, the label from `gen_pct_label` is appended
    as the last column, and every row containing a NaN is dropped.
    """
    import pandas_ta as ta  # imported for its side effect: registers the `.ta` accessor

    periods = (10, 30, 200)
    feature_parts = [stk_data.ta.adosc(), stk_data.ta.kvo()]
    feature_parts.extend(
        stk_data.ta.rsi(close="Adj Close", length=period) / 100 for period in periods
    )
    feature_parts.extend(stk_data.ta.stoch(k=period) / 100 for period in periods)
    feature_parts.append(gen_buy_sell_signal(stk_data))

    features = pd.concat(feature_parts, axis=1).astype("float32")
    labels = gen_pct_label(stk_data, _return_period)
    return pd.concat([features, labels], axis=1).dropna()


def prepare_dataset(_return_period, verbose=False):
    """Build analysis frames for all tickers, dropping badly imbalanced ones.

    A ticker is skipped when any class is absent or when any class holds less
    than `class_percentage_threshold` of its rows.

    Args:
        _return_period: look-ahead window forwarded to `gen_analysis_data`.
        verbose: when True, print the reason each skipped ticker was dropped.

    Returns:
        List of the analysis frames that passed the filter (input order kept).
    """
    from tqdm import tqdm

    kept_frames = []
    skipped = 0
    for position, raw_frame in enumerate(tqdm(ticks_data)):
        frame = gen_analysis_data(raw_frame, _return_period)
        shares = class_percentage(frame)

        if 0 in shares:
            skipped += 1
            if verbose:
                print(
                    f"Some classes don't have any data  : {stk_symbols[position]}, {shares}"
                )
            continue

        if any(share < class_percentage_threshold for share in shares):
            skipped += 1
            if verbose:
                print(
                    f"Some classes are too small  : {stk_symbols[position]}, {shares}"
                )
            continue

        kept_frames.append(frame)

    if skipped > 0:
        print(
            f"There are {skipped} stocks in total, some classes have no data or are too small"
        )
    return kept_frames

In [None]:
# Scratch run of the filtering pipeline with a literal look-ahead of 5 rows;
# the filtered frames are kept in `ttt`.
ttt = prepare_dataset(5)

In [None]:
def class_percentage(stk_data):
    """Return, per class label, the fraction of rows of `stk_data` carrying it.

    NOTE(review): this re-defines (and therefore shadows) the function of the
    same name defined in an earlier cell; the two bodies compute the same
    thing and differ only in the parameter name.

    Returns:
        A list of length `num_classes`; labels that never occur map to 0.0.
    """
    label_counts = stk_data.groupby("label").size()
    row_count = len(stk_data)
    shares = []
    for label in range(num_classes):
        if label in label_counts.index:
            shares.append(label_counts[label] / row_count)
        else:
            shares.append(0.0)
    return shares


# Spot-check: class shares for the first ticker at the default return_period.
r = class_percentage(gen_analysis_data(ticks_data[0], return_period))

print(r)

In [None]:
# Per-ticker label counts: one column per symbol, one row per class label.
# Collect the Series first and concatenate once (growing a DataFrame with
# pd.concat inside the loop re-copies it on every iteration).
label_stats = []
for i, stk_data in enumerate(ticks_data):
    label_stat = gen_analysis_data(stk_data, return_period).groupby("label").size()
    label_stat.name = stk_symbols[i]
    label_stats.append(label_stat)
classes_df = pd.concat(label_stats, axis=1)
print(classes_df)

# Sum by label value (.loc), not by row position, so the totals stay correct
# whatever order the labels ended up in; a label that never occurs counts 0.
classes_count = [
    classes_df.loc[i].sum() if i in classes_df.index else 0
    for i in range(num_classes)
]
total_recs = sum(classes_count)
for i, v in enumerate(classes_count):
    print(f"class {i}: {v*100/total_recs:.3f}%, {v}")

# pct_threshold is a fraction (0.05 == 5%), so format it as a percentage.
pyplot.bar([f"< {pct_threshold:.0%}", f">= {pct_threshold:.0%}"], classes_count)
pyplot.show()

In [None]:
# Rebuilds `classes_df` (label counts per ticker: one column per symbol) with
# the same loop as the previous cell; the aggregate statistics and bar chart
# are commented out here.
classes_df = pd.DataFrame()
for i, stk_data in enumerate(ticks_data):
    label_stat = gen_analysis_data(stk_data, return_period).groupby("label").size()
    label_stat.name = stk_symbols[i]
    classes_df = pd.concat([classes_df, label_stat], axis=1)
print(classes_df)
# classes_count = [classes_df.iloc[i].sum() for i in range(num_classes)]
# total_recs = sum(classes_count)
# for i, v in enumerate(classes_count):
#     print(f"class {i}: {v*100/total_recs:.3f}%, {v}")
# pyplot.bar(["<= -0.5%", " between ", ">= 0.05%"], classes_count)
# pyplot.show()

In [None]:
# One row per ticker, columns 0/1 = row counts per label.
classes_df_t = classes_df.T
# Share of positive (label 1) rows for each ticker.
classes_df_t["pos_%"] = classes_df_t[1] / (classes_df_t[0] + classes_df_t[1])
# Use a dedicated name: binding the result to `sorted` would shadow the
# builtin for every later cell in this kernel.
classes_sorted_by_pos = classes_df_t.sort_values("pos_%")
print(classes_sorted_by_pos)

In [None]:
# Tickers whose positive-class share falls below the class-size threshold
# (class_percentage_threshold, 0.08) used by prepare_dataset.
too_small = classes_df_t[classes_df_t["pos_%"] < class_percentage_threshold]
print(too_small)

In [None]:
# Horizontal grouped bars: per ticker, rows labelled 0 vs rows labelled 1.
classes_df_t = classes_df.T
print(classes_df_t)
fig, ax = pyplot.subplots(figsize=(10, 200))
# pyplot.tight_layout()
labels = classes_df_t.index
width = 0.4  # the width of the bars
y = np.arange(len(labels))
# Offset each bar by half its width so the pair sits side by side around y;
# a full-width offset would overlap the neighbouring ticker's bars.
ax.barh(y + width / 2, classes_df_t[0], width, label=f"< {pct_threshold:.0%}")
ax.barh(y - width / 2, classes_df_t[1], width, label=f">= {pct_threshold:.0%}")
# Name each pair of bars after its ticker.
ax.set_yticks(y)
ax.set_yticklabels(labels)

ax.legend()

pyplot.show()

In [None]:
# List positions of the three symbols annotated in stk_symbols as having an empty class.
for flagged_symbol in ("CEG", "OTIS", "GEHC"):
    print(stk_symbols.index(flagged_symbol))

In [None]:
# Show, without row truncation, every ticker that lacks at least one class.
classes_df_t = classes_df.T
rows_missing_a_class = classes_df_t.isna().any(axis=1)
with pd.option_context("display.max_rows", None):
    print(classes_df_t[rows_missing_a_class])

In [None]:
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import math
from myutil.util import ivmax


class LSTMDataSet(Dataset):
    """Sliding-window dataset over several per-ticker feature/label frames.

    Every ticker contributes `len(frame) - seq_len + 1` windows. Windows of all
    tickers share one global index space; `idx_boundary[k]` is the exclusive
    upper bound of ticker k's windows in that space.

    An item is `(x, y)`: `x` is an array of `seq_len` consecutive feature rows
    of one ticker and `y` is the integer label of the window's last row.

    With `balanced=True` the incoming index is remapped onto the per-class
    index lists (shuffled once at construction) so classes are drawn either
    round-robin or following `pattern`, a sequence of class ids such as
    `[0, 0, 1]`. Classes smaller than the largest one wrap around.

    Args:
        ticks_data_X: list of per-ticker feature frames.
        ticks_data_Y: list of per-ticker frames with a single "label" column.
        _seq_len: number of rows per window.
        balanced: enable class-balanced index remapping.
        pattern: optional class-id sequence used when balanced; may be None.
    """

    def __init__(self, ticks_data_X, ticks_data_Y, _seq_len, balanced, pattern):
        self.ticks_data_X = ticks_data_X
        self.ticks_data_Y = ticks_data_Y
        self.seq_len = _seq_len
        self.balanced = balanced
        self.pattern = pattern
        # `is None` (not `== None`) so array-like patterns are handled too.
        self.pattern_size = 0 if pattern is None else len(pattern)

        # Windows per ticker, accumulated into exclusive upper bounds.
        len_array = [len(d) - self.seq_len + 1 for d in ticks_data_X]
        self.idx_boundary = [len_array[0]]
        for i in range(1, len(len_array)):
            self.idx_boundary.append(len_array[i] + self.idx_boundary[i - 1])

        self.build_class_indices()
        if self.balanced and self._uses_pattern():
            self.build_pattern_info()

    def _uses_pattern(self):
        """True when a non-empty class pattern was supplied."""
        return self.pattern is not None and len(self.pattern) > 0

    def build_class_indices(self):
        """Collect, per class, the shuffled global window indices with that label."""
        # Dropping the first seq_len - 1 labels of each ticker lines the
        # concatenated positions up with the global window index space.
        total_y = pd.concat(
            [t[self.seq_len - 1 :]["label"] for t in self.ticks_data_Y]
        ).reset_index()
        self.class_indices = []
        for i in range(num_classes):
            class_idx_list = total_y.index[total_y["label"] == i].tolist()
            random.shuffle(class_idx_list)
            self.class_indices.append(class_idx_list)

        self.class_num_of_max_size, self.max_class_size = ivmax(
            [len(class_idx_list) for class_idx_list in self.class_indices]
        )

    def build_pattern_info(self):
        """Pre-compute per-class counts and per-slot offsets of `pattern`.

        Raises:
            ValueError: if the pattern never selects the largest class, which
                would make the balanced length undefined (division by zero).
        """
        self.inner_class_count_of_pattern = [0] * num_classes
        self.inner_offset_of_pattern = [0] * self.pattern_size
        for i, c in enumerate(self.pattern):
            # Offset of slot i = how many earlier slots picked the same class.
            self.inner_offset_of_pattern[i] = self.inner_class_count_of_pattern[c]
            self.inner_class_count_of_pattern[c] += 1

        if self.inner_class_count_of_pattern[self.class_num_of_max_size] == 0:
            raise ValueError(
                "pattern must contain the largest class "
                f"({self.class_num_of_max_size}) at least once"
            )

    def __len__(self):
        """Number of items: balanced length when balancing, else total windows."""
        if not self.balanced:
            return self.idx_boundary[-1]

        if self._uses_pattern():
            # Enough pattern repetitions to visit every sample of the largest class.
            return math.ceil(
                self.max_class_size
                * (
                    len(self.pattern)
                    / self.inner_class_count_of_pattern[self.class_num_of_max_size]
                )
            )
        return self.max_class_size * num_classes

    def idx_of_balanced_data_to_original_idx(self, idx_of_balanced_data):
        """Map an index of the balanced view to a global window index."""
        if self.pattern is not None and self.pattern_size > 0:
            pattern_idx = idx_of_balanced_data % self.pattern_size
            selected_class = self.pattern[pattern_idx]
            idx_of_balanced_class = (
                (idx_of_balanced_data // self.pattern_size)
                * self.inner_class_count_of_pattern[selected_class]
            ) + self.inner_offset_of_pattern[pattern_idx]
        else:
            selected_class = idx_of_balanced_data % num_classes
            idx_of_balanced_class = idx_of_balanced_data // num_classes

        # Smaller classes wrap around so they are re-used (oversampled).
        offset_balanced_class = idx_of_balanced_class % len(
            self.class_indices[selected_class]
        )
        return self.class_indices[selected_class][offset_balanced_class]

    def __getitem__(self, idx_of_balanced_data):
        """Return `(x, y)` for one window.

        Raises:
            IndexError: if the resolved global index is outside the window range.
        """
        idx = (
            self.idx_of_balanced_data_to_original_idx(idx_of_balanced_data)
            if self.balanced
            else idx_of_balanced_data
        )
        if not 0 <= idx < self.idx_boundary[-1]:
            raise IndexError(f"window index {idx} out of range")

        # First ticker whose exclusive upper bound is greater than idx.
        ticks_data_idx = int(np.searchsorted(self.idx_boundary, idx, side="right"))
        offset = (
            idx if ticks_data_idx == 0 else idx - self.idx_boundary[ticks_data_idx - 1]
        )
        x = np.array(self.ticks_data_X[ticks_data_idx][offset : offset + self.seq_len])
        # Read the scalar label directly instead of calling int() on a Series.
        y = int(self.ticks_data_Y[ticks_data_idx].iloc[offset + self.seq_len - 1, 0])
        return (x, y)

In [None]:
# Scratch run: filtered analysis frames for the default return_period, kept in `t`.
t = prepare_dataset(return_period)

In [None]:
# Number of tickers that survived the class-balance filter.
print(len(t))

In [None]:
import random
import math
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn_pandas import DataFrameMapper

# Scratch version of the dataset build. Unlike prepare_LSTMDataset it uses
# every ticker (no class-balance filtering) and only creates the training set.
_return_period = return_period
_seq_len = seq_len


ticks_dataset = [gen_analysis_data(d, _return_period) for d in ticks_data]
ticks_X_train_data = []
ticks_Y_train_data = []
ticks_X_test_data = []
ticks_Y_test_data = []
ticks_X_dfm = []
for dataset in ticks_dataset:
    # Chronological split: leading (1 - validation_size) share for training.
    train_size = int(dataset.shape[0] * (1 - validation_size))
    train_data = dataset.iloc[0:train_size]
    # Start the test slice _seq_len - 1 rows early so the first test window
    # ends on the first held-out row. Uses the local alias _seq_len so it
    # matches the window length given to LSTMDataSet below.
    test_data = dataset.iloc[train_size - _seq_len + 1 :]

    # The label is the last column; everything before it is a feature.
    X_train_data = train_data.iloc[:, :-1]
    Y_train_data = train_data.iloc[:, -1:]

    X_test_data = test_data.iloc[:, :-1]
    Y_test_data = test_data.iloc[:, -1:]

    # Standardise the first three feature columns, pass the rest through.
    features = [
        ([column], StandardScaler()) for column in X_train_data.columns[:3].values
    ]
    features.extend([([column], None) for column in X_train_data.columns[3:].values])
    X_dfm = DataFrameMapper(features, input_df=True, df_out=True)
    # Fit on the training rows only, then apply the same scaling to the test rows.
    X_train_data = X_dfm.fit_transform(X_train_data)
    X_test_data = X_dfm.transform(X_test_data)

    ticks_X_dfm.append(X_dfm)
    ticks_X_train_data.append(X_train_data)
    ticks_Y_train_data.append(Y_train_data)
    ticks_X_test_data.append(X_test_data)
    ticks_Y_test_data.append(Y_test_data)

train_ds = LSTMDataSet(
    ticks_X_train_data, ticks_Y_train_data, _seq_len, True, [0, 0, 1]
)
# test_ds = LSTMDataSet(ticks_X_test_data, ticks_Y_test_data, _seq_len)

In [None]:
# Inspect the scratch training dataset: pattern, balanced length, per-ticker
# window boundaries and 1.5x the size of every class.
print(train_ds.pattern)
print(len(train_ds))
print(train_ds.idx_boundary)
for class_idx_list in train_ds.class_indices:
    print(len(class_idx_list) * 1.5)

In [None]:
# Debug cell: dump every indicator that feeds gen_analysis_data for the ticker
# at position 180.
d = ticks_data[180]
x = gen_analysis_data(d, return_period)
# The symbol list is `stk_symbols`; `stk_tickers` is not defined in this notebook.
print(f"{stk_symbols[180]}, {len(x.columns)}")
# print(ticks_data[180])
# print(x)

print(d.ta.adosc())
print(d.ta.kvo())
print(d.ta.rsi(close="Adj Close", length=10) / 100)
print(d.ta.rsi(close="Adj Close", length=30) / 100)
print(d.ta.rsi(close="Adj Close", length=200) / 100)
print(d.ta.stoch(k=10) / 100)
print(d.ta.stoch(k=30) / 100)
print(d.ta.stoch(k=200) / 100)
print(gen_buy_sell_signal(d))

In [None]:
import random
import math
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn_pandas import DataFrameMapper


def prepare_LSTMDataset(_return_period, _seq_len, train_data_pattern=None):
    """Build the balanced training dataset and the plain test dataset.

    Each filtered ticker frame from `prepare_dataset` is split chronologically:
    the leading `1 - validation_size` share is used for training and the rest
    for testing. The first three feature columns are standardised with
    statistics fitted on the training rows only; remaining features pass
    through unchanged. The label is the last column of each frame.

    Args:
        _return_period: look-ahead window used for labelling.
        _seq_len: number of rows per LSTM input window.
        train_data_pattern: optional class pattern (e.g. `[0, 0, 1]`) passed to
            the balanced training `LSTMDataSet`.

    Returns:
        `[train_dataset, test_dataset]`.

    Raises:
        ValueError: if no ticker passes the class-balance filter.
    """
    ticks_dataset = prepare_dataset(_return_period)
    if not ticks_dataset:
        raise ValueError(
            "prepare_dataset returned no usable tickers; cannot build datasets"
        )

    ticks_X_train_data = []
    ticks_Y_train_data = []
    ticks_X_test_data = []
    ticks_Y_test_data = []
    for dataset in ticks_dataset:
        train_size = int(dataset.shape[0] * (1 - validation_size))
        train_data = dataset.iloc[0:train_size]
        # Begin the test slice _seq_len - 1 rows early so the first test window
        # ends on the first held-out row. The *parameter* _seq_len must be used
        # here (not the notebook-level seq_len) or the overlap is wrong whenever
        # the two differ. Clamp at 0 so tiny frames never slice from the end.
        test_start = max(0, train_size - _seq_len + 1)
        test_data = dataset.iloc[test_start:]

        X_train_data = train_data.iloc[:, :-1]
        Y_train_data = train_data.iloc[:, -1:]

        X_test_data = test_data.iloc[:, :-1]
        Y_test_data = test_data.iloc[:, -1:]

        features = [
            ([column], StandardScaler()) for column in X_train_data.columns[:3].values
        ]
        features.extend(
            [([column], None) for column in X_train_data.columns[3:].values]
        )
        X_dfm = DataFrameMapper(features, input_df=True, df_out=True)
        # Fit on training rows only; reuse the fitted scaler for the test rows.
        X_train_data = X_dfm.fit_transform(X_train_data)
        X_test_data = X_dfm.transform(X_test_data)

        ticks_X_train_data.append(X_train_data)
        ticks_Y_train_data.append(Y_train_data)
        ticks_X_test_data.append(X_test_data)
        ticks_Y_test_data.append(Y_test_data)

    train_dataset = LSTMDataSet(
        ticks_X_train_data,
        ticks_Y_train_data,
        _seq_len,
        balanced=True,
        pattern=train_data_pattern,
    )
    test_dataset = LSTMDataSet(
        ticks_X_test_data, ticks_Y_test_data, _seq_len, balanced=False, pattern=None
    )

    def _report(kind_lower, kind_title, ds):
        # Print raw window count, per-class share/count and the effective length.
        print(f"Original {kind_lower} data size: {ds.idx_boundary[-1]}")
        for i in range(num_classes):
            print(
                f"class {i}: {len(ds.class_indices[i]) * 100 /ds.idx_boundary[-1]:.1f}% {len(ds.class_indices[i])}"
            )
        print(f"{kind_title} dataset size: {ds.__len__()}")

    _report("training", "Training", train_dataset)
    _report("testing", "Testing", test_dataset)

    return [train_dataset, test_dataset]

In [None]:
# Build both datasets with the default hyper-parameters; the training set
# draws classes in the repeating order 0, 0, 1.
tr_ds, test_ds = prepare_LSTMDataset(return_period, seq_len, [0, 0, 1])

In [None]:
# Debug prints for the balanced training set: class sizes, the first shuffled
# indices of each class, and the per-ticker window boundaries.
print(f"{len(tr_ds.class_indices[0])}, {len(tr_ds.class_indices[1])}")
print(f"{tr_ds.class_indices[0][:20]}")
print(f"{tr_ds.class_indices[1][:10]}")
print(tr_ds.idx_boundary)
# NOTE(review): 1847 and 1844 are hard-coded positions from one particular
# run; they presumably need updating when the data or seq_len changes.
print(tr_ds.ticks_data_X[0].iloc[1847])
print(tr_ds.ticks_data_Y[0].iloc[1847])
print(tr_ds.__getitem__(1844))
# for i in range(7):
#     print(tr_ds.__getitem__(i))

In [None]:
import random
import math
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn_pandas import DataFrameMapper


def prepare_dataloader(_return_period, _seq_len, train_data_pattern=None):
    """Wrap the train/test datasets in DataLoaders.

    Both loaders use the notebook-level `batch_size`, `num_workers`,
    `pin_memory` and `device_name`; only the training loader is given the
    notebook-level `shuffle` setting.

    Returns:
        `(train_loader, test_loader, n_features)` where `n_features` is the
        column count of the first training feature frame.
    """
    train_dataset, test_dataset = prepare_LSTMDataset(
        _return_period, _seq_len, train_data_pattern
    )

    # Options common to both loaders.
    shared_options = dict(
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=pin_memory,
        pin_memory_device=device_name,
    )
    train_loader = DataLoader(train_dataset, shuffle=shuffle, **shared_options)
    test_loader = DataLoader(test_dataset, **shared_options)

    n_features = train_dataset.ticks_data_X[0].shape[1]
    return train_loader, test_loader, n_features

In [None]:
# Debug cell: peek into the dataset behind a DataLoader.
# NOTE(review): `t_r` is not assigned in any cell shown here -- presumably a
# training loader returned by prepare_dataloader; confirm it is defined before
# this cell runs, otherwise this raises NameError on a fresh kernel.
ds = t_r.dataset
# print(ds.ticks_data_Y[0][2:39])
print(ds.class_indices[1][:20])
print(ds.class_indices[0][:20])
print(ds.__getitem__(1))
print(ds.ticks_data_Y[0].head(20))
print(ds.ticks_data_X[0].head(20))
print(ds.ticks_data_X[0][12:15])

In [None]:
from torch import nn


class StockPCTLabelPredictLSTM(nn.Module):
    """LSTM encoder followed by a small fully connected classification head.

    The last layer's final hidden state is fed through up to
    `num_fc_layers - 1` halving Linear+activation stages and a final Linear
    that outputs `num_classes` logits.

    Construction accepts either a config mapping or explicit keyword values:

        StockPCTLabelPredictLSTM(input_size, config)
        StockPCTLabelPredictLSTM(input_size=..., hidden_size=..., num_layers=...,
                                 num_fc_layers=..., activation_type=...)

    (Python keeps only the last `def __init__` of a class body, so the two
    forms must live in a single initialiser.)

    Args:
        input_size: number of features per time step.
        config: mapping with keys "hidden_size", "num_layers",
            "num_fc_layers" and "activation_type".
        hidden_size, num_layers, num_fc_layers, activation_type: explicit
            values; each one overrides the corresponding `config` entry.
            `activation_type`: 1 = ReLU, 2 = Sigmoid, anything else = Tanh.

    Raises:
        TypeError: if a setting is supplied neither explicitly nor via config.
    """

    def __init__(
        self,
        input_size,
        config=None,
        *,
        hidden_size=None,
        num_layers=None,
        num_fc_layers=None,
        activation_type=None,
    ):
        super().__init__()
        settings = {
            "hidden_size": hidden_size,
            "num_layers": num_layers,
            "num_fc_layers": num_fc_layers,
            "activation_type": activation_type,
        }
        if config is not None:
            # Fill every setting that was not given explicitly from the config.
            for key, value in settings.items():
                if value is None:
                    settings[key] = config[key]

        missing = [key for key, value in settings.items() if value is None]
        if missing:
            raise TypeError(
                "missing model settings: "
                + ", ".join(missing)
                + " (pass `config` or explicit keyword arguments)"
            )
        self.setup_model(input_size=input_size, **settings)

    @staticmethod
    def _make_activation(activation_type):
        """Return a new activation module: 1 = ReLU, 2 = Sigmoid, otherwise Tanh."""
        if activation_type == 1:
            return nn.ReLU()
        if activation_type == 2:
            return nn.Sigmoid()
        return nn.Tanh()

    def setup_model(
        self,
        input_size,
        hidden_size,
        num_layers,
        num_fc_layers,
        activation_type,
    ):
        """Create the LSTM and the fully connected head.

        Args:
            input_size: number of expected features in the input x.
            hidden_size: number of features in the LSTM hidden state.
            num_layers: number of stacked recurrent layers.
            num_fc_layers: upper bound on Linear layers in the head; fewer are
                built once halving would reach `num_classes` or less.
            activation_type: see the class docstring.
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # batch_first=True: inputs/outputs are (batch, seq, feature).
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        layers = []
        in_features = self.hidden_size
        for i in range(1, num_fc_layers):
            out_features = int(in_features / 2)
            if out_features <= num_classes:
                break
            layers.append(nn.Linear(in_features, out_features))
            # Every hidden Linear is followed by its activation. A nested
            # conditional expression here used to append a module only for
            # activation_type == 2, leaving types 1 and 3 without activations.
            # NOTE: checkpoints saved before that fix with activation_type != 2
            # have different `fc.<n>` indices and will not load unchanged.
            layers.append(self._make_activation(activation_type))
            in_features = out_features

        layers.append(nn.Linear(in_features, num_classes))
        self.fc = nn.Sequential(*layers)
        self.fc.apply(self.init_weights)

    def init_weights(self, m):
        """Uniform(-0.5, 0.5) weights and zero bias for every Linear layer."""
        if isinstance(m, nn.Linear):
            initrange = 0.5
            nn.init.uniform_(m.weight, -initrange, initrange)
            nn.init.zeros_(m.bias)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for x of shape (batch, seq, feature)."""
        # Create the initial states on the input's own device/dtype instead of
        # a notebook-level `device`, so the module works wherever x lives.
        h_0 = torch.zeros(
            self.num_layers, x.shape[0], self.hidden_size, device=x.device, dtype=x.dtype
        )
        c_0 = torch.zeros(
            self.num_layers, x.shape[0], self.hidden_size, device=x.device, dtype=x.dtype
        )
        out, (h_out, _) = self.rnn(x, (h_0, c_0))

        # Final hidden state of the top LSTM layer.
        fc_input = h_out[-1].view(-1, self.hidden_size)
        return self.fc(fc_input)


def save_model(model, hyper_parameters, file_path, epoch_num=None):
    """Write a checkpoint with everything `load_model` needs.

    Args:
        model: module exposing `state_dict()` and an `input_size` attribute.
        hyper_parameters: mapping stored verbatim under "hyper_parameters".
        file_path: destination passed to `torch.save`.
        epoch_num: optional epoch counter stored with the checkpoint.
    """
    state = {
        "epoch_num": epoch_num,
        # Call now(): str(datetime.now) would store the method's repr instead
        # of a timestamp.
        "time": datetime.now().isoformat(),
        "model_state": model.state_dict(),
        "input_size": model.input_size,
        "hyper_parameters": hyper_parameters,
    }
    torch.save(state, file_path)


def load_model(file_path):
    """Rebuild a `StockPCTLabelPredictLSTM` from a `save_model` checkpoint.

    Args:
        file_path: checkpoint written by `save_model`.

    Returns:
        `(model, hyper_parameters)`; `hyper_parameters` is returned as stored.
    """
    # map_location="cpu" lets checkpoints written on a GPU load on machines
    # without one; the weights are copied into the new (CPU) model either way.
    data_dict = torch.load(file_path, map_location="cpu")
    hyper_parameters = data_dict["hyper_parameters"]
    # The model's initialiser takes (input_size, config); pass the structural
    # settings as an int-cast config mapping.
    model_config = {
        key: int(hyper_parameters[key])
        for key in ("hidden_size", "num_layers", "num_fc_layers", "activation_type")
    }
    model = StockPCTLabelPredictLSTM(data_dict["input_size"], model_config)
    model.load_state_dict(data_dict["model_state"])
    return model, hyper_parameters

In [None]:
import torch.nn as nn

# Row indices into the per-sample metrics tensor (indexed as metrics_t[ROW, sample]).
METRICS_LABEL_NDX = 0  # ground truth
METRICS_PBTY_NDX = 1  # probability of the prediction
METRICS_PRED_NDX = 2  # predicted class (label)
METRICS_LOSS_NDX = 3
METRICS_SIZE = 4
# Softmax over dim 1.
softmax = nn.Softmax(dim=1)
totalTrainingSamples_count = 0

In [None]:
from collections import namedtuple
from torch.utils.tensorboard import SummaryWriter


def logMetrics(
    epoch_ndx,
    mode_str,
    metrics_t,
    classificationThreshold=0.5,
    config=None,
    log_hparam=False,
):
    """Log one epoch's classification metrics and mirror them to TensorBoard.

    For every class (only the highest class index when ``num_classes == 2``)
    the samples are split one-vs-rest and precision, recall and F1 are
    computed. The values go to the module logger and to a ``SummaryWriter``
    under ``{log_dir}/{mode_str}_cls``, using ``totalTrainingSamples_count``
    as the step.

    Args:
        epoch_ndx: Epoch index; used only in the log lines.
        mode_str: Run-mode tag (the callers in this file pass "trn" or "val").
        metrics_t: Tensor indexed by the ``METRICS_*_NDX`` rows with one
            column per sample.
        classificationThreshold: A prediction only counts as a true positive
            when its probability is above this value.
        config: Hyper-parameter mapping recorded by ``add_hparams``; may be
            ``None``, in which case only the trn/val flag is recorded.
        log_hparam: When true, also write an hparams record.

    Returns:
        ``(mean_loss, F1_metrics)``: the mean per-sample loss as a float and
        the list of per-class ``f1_rec`` records, ordered from the highest
        class index downwards.
    """
    log.info(
        "E{} {}".format(
            epoch_ndx,
            task_name,
        )
    )
    F1_rec = namedtuple(
        "f1_rec",
        "target_class pos_correct neg_correct pos_count neg_count pos_loss neg_loss precision recall F1",
    )
    F1_metrics = []
    # Loop-invariant: samples whose prediction is confident enough to count.
    threshold_mask = metrics_t[METRICS_PBTY_NDX] > classificationThreshold
    for target_class in reversed(range(num_classes)):
        posLabel_mask = metrics_t[METRICS_LABEL_NDX] == target_class
        pos_count = posLabel_mask.sum()
        negLabel_mask = metrics_t[METRICS_LABEL_NDX] != target_class
        neg_count = negLabel_mask.sum()

        posPred_mask = metrics_t[METRICS_PRED_NDX] == target_class
        # TP, truePos_count
        TP = pos_correct = int((posLabel_mask & posPred_mask & threshold_mask).sum())

        negPred_mask = metrics_t[METRICS_PRED_NDX] != target_class
        # TN, trueNeg_count
        neg_correct = int((negLabel_mask & negPred_mask).sum())

        # FP, falsePos_count
        FP = neg_count - neg_correct
        # FN, falseNeg_count
        FN = pos_count - pos_correct

        # precision = TP / (TP + FP)
        precision = 0.0 if (TP + FP) == 0 else TP / np.float32(TP + FP)
        # recall = TP / (TP + FN)
        recall = 0.0 if (TP + FN) == 0 else TP / np.float32(TP + FN)
        # F1 = 2 * precision * recall / (precision + recall)
        F1 = (
            0.0
            if (precision + recall) == 0.0
            else (2 * precision * recall) / np.float32(precision + recall)
        )
        F1_metrics.append(
            F1_rec(
                target_class,
                pos_correct,
                neg_correct,
                pos_count,
                neg_count,
                metrics_t[METRICS_LOSS_NDX, posLabel_mask].mean(),
                metrics_t[METRICS_LOSS_NDX, negLabel_mask].mean(),
                precision,
                recall,
                F1,
            )
        )

        # Binary case: the positive class alone describes the problem.
        if num_classes == 2:
            break

    metrics_dict = {}
    metrics_dict[" e_loss/all"] = metrics_t[METRICS_LOSS_NDX].mean()
    log.info(
        "E{} {:8} {:.4f} loss".format(
            epoch_ndx,
            mode_str,
            metrics_dict[" e_loss/all"],
        )
    )

    for rec in F1_metrics:
        target_class_str = f"class {rec.target_class}" if num_classes > 2 else ""
        correct_all = (rec.pos_correct + rec.neg_correct) / metrics_t.shape[1] * 100
        correct_neg = (rec.neg_correct) / rec.neg_count * 100
        correct_pos = (rec.pos_correct) / rec.pos_count * 100

        metrics_dict[f"{target_class_str} e_loss/pos"] = rec.pos_loss
        metrics_dict[f"{target_class_str} e_loss/neg"] = rec.neg_loss
        metrics_dict[f"{target_class_str} correct/all"] = correct_all
        metrics_dict[f"{target_class_str} correct/neg"] = correct_neg
        metrics_dict[f"{target_class_str} correct/pos"] = correct_pos
        metrics_dict[f"{target_class_str} pr/precision"] = rec.precision
        metrics_dict[f"{target_class_str} pr/recall"] = rec.recall
        metrics_dict[f"{target_class_str} pr/f1_score"] = rec.F1

        # The values are formatted directly rather than looked up by key name
        # in the format string: the previous templates referenced the
        # un-prefixed keys (" correct/all", " e_loss/neg", " e_loss/pos"),
        # which do not exist when num_classes > 2 and raised KeyError.
        log.info(
            "E{} {:8} {} {:-5.1f}% correct, {:.4f} precision, {:.4f} recall, {:.4f} f1 score".format(
                epoch_ndx,
                mode_str,
                target_class_str,
                correct_all,
                rec.precision,
                rec.recall,
                rec.F1,
            )
        )
        log.info(
            "E{} {:8} {} {:.4f} loss, {:-5.1f}% correct ({} of {})".format(
                epoch_ndx,
                mode_str + "_neg",
                target_class_str,
                rec.neg_loss,
                correct_neg,
                rec.neg_correct,
                rec.neg_count,
            )
        )
        log.info(
            "E{} {:8} {} {:.4f} loss, {:-5.1f}% correct ({} of {})".format(
                epoch_ndx,
                mode_str + "_pos",
                target_class_str,
                rec.pos_loss,
                correct_pos,
                rec.pos_correct,
                rec.pos_count,
            )
        )

    writer = SummaryWriter(log_dir=log_dir + f"/{mode_str}_cls")
    try:
        for key, value in metrics_dict.items():
            writer.add_scalar(key, value, totalTrainingSamples_count)

        # NOTE(review): the curve is fed the predicted class index, not a
        # probability of the positive class — confirm this is intended.
        writer.add_pr_curve(
            "pr",
            metrics_t[METRICS_LABEL_NDX],
            metrics_t[METRICS_PRED_NDX],
            totalTrainingSamples_count,
        )

        # posLabel_mask / negLabel_mask still hold the split of the last
        # class processed by the loop above.
        bins = [x / 50.0 for x in range(51)]
        negHist_mask = negLabel_mask & (metrics_t[METRICS_PBTY_NDX] > 0.01)
        posHist_mask = posLabel_mask & (metrics_t[METRICS_PBTY_NDX] < 0.99)
        if negHist_mask.any():
            writer.add_histogram(
                "is_neg",
                metrics_t[METRICS_PBTY_NDX, negHist_mask],
                totalTrainingSamples_count,
                bins=bins,
            )
        if posHist_mask.any():
            writer.add_histogram(
                "is_pos",
                metrics_t[METRICS_PBTY_NDX, posHist_mask],
                totalTrainingSamples_count,
                bins=bins,
            )

        if log_hparam:
            # config defaults to None; fall back to an empty mapping instead
            # of failing on None.copy().
            hparam = dict(config) if config is not None else {}
            hparam["0:trn,1:val"] = 0 if mode_str == "trn" else 1
            writer.add_hparams(
                hparam,
                {
                    "loss": metrics_t[METRICS_LOSS_NDX].mean(),
                    "F1": F1_metrics[-1].F1,
                },
            )
    finally:
        # Always release the event file, even if one of the writes fails.
        writer.close()

    return float(metrics_dict[" e_loss/all"]), F1_metrics

In [None]:
def computeBatchLoss(model, loss_fn, x, y, metrics_g, batch_idx):
    """Run one batch through ``model`` and record its per-sample metrics.

    Args:
        model: Module called on the batch.
        loss_fn: Loss callable; its result is stored per sample and then
            averaged, so it must return one loss value per sample (the
            callers in this file use ``reduction="none"``).
        x: Batch inputs; moved to the module-level ``device``.
        y: Batch labels; moved to the module-level ``device``.
        metrics_g: Metrics buffer indexed by the ``METRICS_*_NDX`` rows; the
            columns belonging to this batch are filled in place.
        batch_idx: Zero-based batch index; together with the module-level
            ``batch_size`` it selects the columns to write.

    Returns:
        The mean loss of the batch, still attached to the autograd graph so
        the caller can call ``backward()`` on it.
    """
    x_g = x.to(device)
    y_g = y.to(device)
    outputs = model(x_g)
    loss_g = loss_fn(outputs, y_g)
    # Bookkeeping values are detached: writing graph-attached tensors into the
    # metrics buffer would make the buffer itself part of the autograd graph
    # and chain every batch of the epoch onto it.
    probability_g, prediction_g = torch.max(softmax(outputs.detach()), dim=1)

    start_ndx = batch_idx * batch_size
    end_ndx = start_ndx + y.size(0)

    metrics_g[METRICS_LABEL_NDX, start_ndx:end_ndx] = y_g
    metrics_g[METRICS_PBTY_NDX, start_ndx:end_ndx] = probability_g
    metrics_g[METRICS_PRED_NDX, start_ndx:end_ndx] = prediction_g
    metrics_g[METRICS_LOSS_NDX, start_ndx:end_ndx] = loss_g.detach()

    return loss_g.mean()

In [None]:
from myutil.util import enumerateWithEstimate
import torch
from tqdm import tqdm


def doTraining(model, optimizer, loss_fn, epoch_ndx, train_dl):
    """Train ``model`` for one pass over ``train_dl``.

    Each batch is pushed through ``computeBatchLoss``, back-propagated and
    applied with ``optimizer``. Afterwards the module-level
    ``totalTrainingSamples_count`` is increased by the dataset size.

    Returns:
        The per-sample metrics tensor of this epoch, moved to the CPU.
    """
    global totalTrainingSamples_count

    model.train()
    sample_count = len(train_dl.dataset)
    trnMetrics_g = torch.zeros(METRICS_SIZE, sample_count, device=device)

    progress_label = "E{} Training".format(epoch_ndx)
    for batch_ndx, (x, y) in enumerateWithEstimate(
        train_dl,
        progress_label,
        start_ndx=train_dl.num_workers,
    ):
        optimizer.zero_grad()
        batch_loss = computeBatchLoss(model, loss_fn, x, y, trnMetrics_g, batch_ndx)
        batch_loss.backward()
        optimizer.step()

    totalTrainingSamples_count += sample_count
    return trnMetrics_g.to("cpu")

In [None]:
def doValidation(model, loss_fn, epoch_ndx, val_dl):
    """Evaluate ``model`` on ``val_dl`` without tracking gradients.

    The model is switched to eval mode and every batch is passed through
    ``computeBatchLoss`` to fill the metrics buffer.

    Returns:
        The per-sample metrics tensor of this pass, moved to the CPU.
    """
    # The function-scope `from sklearn.metrics import f1_score` was never
    # used and has been dropped; it only added an import on every call.
    with torch.no_grad():
        model.eval()
        valMetrics_g = torch.zeros(
            METRICS_SIZE,
            len(val_dl.dataset),
            device=device,
        )

        batch_iter = enumerateWithEstimate(
            val_dl,
            "E{} Validation ".format(epoch_ndx),
            start_ndx=val_dl.num_workers,
        )
        for batch_ndx, (x, y) in batch_iter:
            computeBatchLoss(model, loss_fn, x, y, valMetrics_g, batch_ndx)

    return valMetrics_g.to("cpu")

In [None]:
def train_LSTM(config):
    """Train one model for ``epoch_num`` epochs with the given hyper-parameters.

    Data loaders come from ``prepare_dataloader``; metrics are logged after
    every training and validation pass. Whenever the validation F1 of the
    first reported class improves, the model is checkpointed to
    ``{log_dir}/{id_str}.pt``, where ``id_str`` joins the config values.

    Args:
        config: Mapping with the keys ``lr``, ``momentum``, ``optim_type``
            (1 selects Adam, anything else SGD), ``return_period`` and
            ``seq_len``, plus whatever the model constructor reads.
            ``"weight decay"`` is optional and applies to SGD only.
    """
    global totalTrainingSamples_count
    best_f1 = 0

    lr = config["lr"]
    momentum = config["momentum"]
    optim_type = config["optim_type"]
    # Previously this value was configured but never reached the optimizer,
    # unlike in ray_train_task. Optional, so configs without the key keep the
    # old no-decay behaviour.
    weight_decay = config.get("weight decay", 0.0)
    totalTrainingSamples_count = 0

    id_str = "_".join(str(v) if v < 1 else f"{v:g}" for v in config.values())
    model_name = f"{log_dir}/{id_str}.pt"

    train_loader, test_loader, features_size = prepare_dataloader(
        config["return_period"], config["seq_len"], train_data_pattern=[0, 0, 1]
    )

    model = StockPCTLabelPredictLSTM(input_size=features_size, config=config)
    model = model.to(device)

    optimizer = (
        torch.optim.Adam(model.parameters(), lr=lr)
        if optim_type == 1
        else torch.optim.SGD(
            model.parameters(),
            lr=lr,
            momentum=momentum,
            weight_decay=weight_decay,
        )
    )
    # Per-sample losses are required by the metrics bookkeeping.
    loss_fn = torch.nn.CrossEntropyLoss(reduction="none")

    for epoch_ndx in range(epoch_num):
        is_last_epoch = epoch_ndx == epoch_num - 1

        trnMetrics_t = doTraining(model, optimizer, loss_fn, epoch_ndx, train_loader)
        loss, _ = logMetrics(
            epoch_ndx,
            "trn",
            trnMetrics_t,
            classificationThreshold,
            config,
            is_last_epoch,
        )

        valMetrics_t = doValidation(model, loss_fn, epoch_ndx, test_loader)
        _, F1_metrics = logMetrics(
            epoch_ndx,
            "val",
            valMetrics_t,
            classificationThreshold,
            config,
            is_last_epoch,
        )
        # Keep only the checkpoint with the best validation F1 so far.
        if F1_metrics[0].F1 > best_f1:
            best_f1 = F1_metrics[0].F1
            save_model(model, config, model_name)
            print(f"current loss: {loss}")

In [None]:
# Single training run with a fixed hyper-parameter set (no Ray Tune).
# Each run gets its own time-stamped directory below log_dir_base.
time_str = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
log_dir = f"{log_dir_base}/{time_str}"
# Hyper-parameters consumed by train_LSTM; the model checkpoint file name is
# derived from these values.
config = {
    "return_period": return_period,
    "seq_len": seq_len,
    "lr": 0.1,
    "momentum": 0.11646759543664197,
    "optim_type": 2,  # 1: Adam, 2: SGD  => Adam bad result
    "weight decay": 0.00001,
    "num_layers": 4,
    "hidden_size": 256,
    "num_fc_layers": 1,
    "activation_type": 2,  # Sigmoid
}
# Number of epochs; read as a global by train_LSTM and ray_train_task.
epoch_num = 3
# os.mkdir(log_dir)
# NOTE(review): report_f1 is not read anywhere in the visible cells.
report_f1 = False
print(log_dir)
# Time the whole run.
start = datetime.now()
train_LSTM(config)
print(f"Elapsed time:{datetime.now() - start}")

In [None]:
def ray_train_task(config, data):
    """Ray Tune trainable: train one configuration and report metrics per epoch.

    Args:
        config: Hyper-parameter mapping for this trial (``lr``, ``momentum``,
            ``optim_type``, ``"weight decay"`` and the model settings).
        data: Indexable pair; ``data[0]`` is the training dataset and
            ``data[1]`` the validation dataset.

    Side effects:
        Rebinds the module-level ``log_dir`` to a per-trial directory (read by
        ``logMetrics``), resets ``totalTrainingSamples_count``, saves the best
        checkpoint as ``{log_dir}/{id_str}.pt`` and calls ``train.report``
        after every epoch.
    """
    global totalTrainingSamples_count
    global log_dir

    best_f1 = 0

    lr = config["lr"]
    momentum = config["momentum"]
    optim_type = config["optim_type"]
    totalTrainingSamples_count = 0

    id_str = "_".join(str(v) if v < 1 else f"{v:g}" for v in config.values())
    log_dir = f"{log_dir_base}/{time_str}/{id_str}"
    # makedirs instead of mkdir: creates missing parents and does not fail
    # when the directory already exists (e.g. a restarted trial).
    os.makedirs(log_dir, exist_ok=True)

    model_name = f"{log_dir}/{id_str}.pt"

    train_loader = DataLoader(
        data[0],
        batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory,
        pin_memory_device=device_name,
    )
    test_loader = DataLoader(
        data[1],
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=pin_memory,
        pin_memory_device=device_name,
    )

    # Feature count is taken from the second dimension of the first sample block.
    features_size = data[0].ticks_data_X[0].shape[1]

    model = StockPCTLabelPredictLSTM(input_size=features_size, config=config)
    model = model.to(device)

    optimizer = (
        torch.optim.Adam(model.parameters(), lr=lr)
        if optim_type == 1
        else torch.optim.SGD(
            model.parameters(),
            lr=lr,
            momentum=momentum,
            weight_decay=config["weight decay"],
        )
    )
    # Per-sample losses are required by the metrics bookkeeping.
    loss_fn = torch.nn.CrossEntropyLoss(reduction="none")

    for epoch_ndx in range(epoch_num):
        is_last_epoch = epoch_ndx == epoch_num - 1

        trnMetrics_t = doTraining(model, optimizer, loss_fn, epoch_ndx, train_loader)
        loss, _ = logMetrics(
            epoch_ndx,
            "trn",
            trnMetrics_t,
            classificationThreshold,
            config,
            is_last_epoch,
        )

        valMetrics_t = doValidation(model, loss_fn, epoch_ndx, test_loader)
        _, F1_metrics = logMetrics(
            epoch_ndx,
            "val",
            valMetrics_t,
            classificationThreshold,
            config,
            is_last_epoch,
        )
        # Keep only the checkpoint with the best validation F1 so far.
        if F1_metrics[0].F1 > best_f1:
            best_f1 = F1_metrics[0].F1
            save_model(model, config, model_name)

        # "loss" is the training loss; the other values come from validation.
        train.report(
            {
                "loss": loss,
                "f1_score": F1_metrics[0].F1,
                "precision": F1_metrics[0].precision,
                "recall": F1_metrics[0].recall,
            }
        )

In [None]:
# Hyper-parameter grid handed to Ray Tune; trailing comments list values tried earlier.
search_space = {
    # "return_period": tune.grid_search([5]),  # [2,3,5,10]
    # "seq_len": tune.grid_search([3]),  # 10]),
    "lr": tune.grid_search([0.1]),  # , 0.01, 0.1, 0.08, 0.12]
    "momentum": tune.grid_search([0.14647, 0]),  # tune.uniform(0.1, 0.9),
    "optim_type": tune.grid_search([2]),  # 1: Adam, 2: SGD  => Adam bad result
    "weight decay": tune.grid_search([0.00001]),  # best value
    "num_layers": tune.grid_search([1, 2, 3]),  # [1, 2, 4, 8] best value = 4
    "hidden_size": tune.grid_search([256]),  # [8, 16, 32, 64, 128]
    "num_fc_layers": tune.grid_search([1]),  # 1, 2, 3]),
    "activation_type": tune.grid_search(
        [2]
    ),  # 1: ReLU(),  2: Sigmoid(),  3: Tanh()  => meaningless num_fc_layers == 1
}

# Collect the parameters that are really swept (grid with more than one
# candidate) and multiply their candidate counts to get the sweep size.
turning_parameters = []
total_configs = 1
for k, v in search_space.items():
    if type(v).__name__ != "dict":
        continue
    grid_key, grid_values = list(v.items())[0]
    if grid_key == "grid_search" and len(grid_values) > 1:
        turning_parameters.append(k)
        total_configs *= len(grid_values)
print(turning_parameters)
print(f"Total count of configs = {total_configs}")

In [None]:
import warnings


warnings.filterwarnings("ignore", category=Warning)

# Every sweep gets its own time-stamped directory below log_dir_base.
time_str = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
log_dir = f"{log_dir_base}/{time_str}"
# makedirs instead of mkdir: on a fresh checkout the parent run directory
# does not exist yet, and re-running the cell must not fail either.
os.makedirs(log_dir, exist_ok=True)

# Build the datasets once and share them with every trial.
data = prepare_LSTMDataset(return_period, seq_len, train_data_pattern=[0, 1, 0, 1, 0])
# Legacy entry point, kept for reference:
# analysis = tune.run(
#     train_LSTM,
#     config=search_space,
#     resources_per_trial={"cpu": 0.1, "gpu": 0.1},
#     metric="f1_score",
#     mode="max",
# )
tuner = tune.Tuner(
    tune.with_resources(
        tune.with_parameters(ray_train_task, data=data),
        resources={"cpu": 0.33, "gpu": 0.33},
    ),
    tune_config=tune.TuneConfig(
        metric="f1_score",
        mode="max",
    ),
    param_space=search_space,
)
results = tuner.fit()

In [None]:
# Summarise every trial of the sweep returned by `tuner.fit()`.
# The previous version read `analysis.trial_dataframes`, but `analysis` is
# only assigned by the commented-out `tune.run` call, and it averaged a
# "mean_accuracy" metric that the trainable never reports. The trials report
# "f1_score", so that is what gets averaged here; the column keeps the name
# "mean_accuracy" because the following cells select it by that name.
accuracy_list = []
for trial_result in results:
    trial = trial_result.metrics_dataframe  # one row per report; may be None
    if trial is not None and not trial.empty:
        accuracy_list.append(
            {
                "mean_accuracy": trial["f1_score"].mean(),
                "trial_id": trial["trial_id"].iloc[0],
            }
        )
    else:
        # np.nan: the np.NaN alias no longer exists in NumPy 2.
        accuracy_list.append({"mean_accuracy": np.nan, "trial_id": np.nan})
accuracy_df = pd.DataFrame(accuracy_list, columns=["mean_accuracy", "trial_id"])
print(accuracy_df)

In [None]:
import shutil

# Trial configurations in the same order as the sweep results. The previous
# version read them from `analysis`, which is only assigned by the
# commented-out `tune.run` call.
trial_configs = [trial_result.config for trial_result in results]
config_df = pd.DataFrame(trial_configs)
print(config_df)

# Kept under its own name so the sweep object in `results` is not overwritten
# and this cell can be re-run.
results_df = pd.concat([accuracy_df, config_df], axis=1)
print(results_df)

sorted_results = results_df.sort_values(by="mean_accuracy", ascending=False)
print(sorted_results.head(100))
sorted_results_file = f"{log_dir}/sorted_results.csv"
sorted_results.to_csv(sorted_results_file)

best_ndx = sorted_results.index[0]
best_config = config_df.iloc[best_ndx]
# Rebuild the trial id from the original config values: a DataFrame row is
# upcast to float, which would turn e.g. 0 into "0.0" and no longer match the
# name the trial used when saving.
id_str = "_".join(
    str(v) if v < 1 else f"{v:g}" for v in trial_configs[best_ndx].values()
)
# ray_train_task saves each checkpoint inside its per-trial sub-directory.
best_model_name = f"{log_dir}/{id_str}/{id_str}.pt"
print(best_model_name)

In [None]:
# Publish the best checkpoint under the task name; the evaluation cell loads it from this path.
shutil.copy(best_model_name, f"{log_dir_base}/{task_name}.pt")

In [None]:
# Common x-range for all histograms: one standard deviation beyond the
# observed minimum and maximum score.
accuracy_desc = sorted_results["mean_accuracy"].astype("float32").describe()
xlimit_range = [
    accuracy_desc["min"] - accuracy_desc["std"],
    accuracy_desc["max"] + accuracy_desc["std"],
]
# One figure per swept hyper-parameter, one panel per value of it.
for hperparameter_name in turning_parameters:
    parameter_group = sorted_results.groupby(hperparameter_name)
    fig, axs = pyplot.subplots(
        1,
        len(parameter_group),
        layout="constrained",
        sharex=False,
        sharey=True,
        figsize=(12, 2),
        squeeze=False,  # always an array, even when there is a single group
    )
    axs = axs.ravel()
    for i, g in enumerate(parameter_group):
        g[1]["mean_accuracy"].astype("float32").plot(
            kind="hist", bins=50, subplots=True, sharex=False, sharey=True, ax=axs[i]
        )
        axs[i].set_title(f"{hperparameter_name}_{g[0]}")
        # Set the range on every panel: a single pyplot.xlim call after the
        # loops only reaches the last axes that was drawn.
        axs[i].set_xlim(xlimit_range)

pyplot.show()

In [None]:
# Reload the ranked sweep results from disk. Everything is read as strings,
# and `sorted_results` / `best_config` are rebound to the reloaded data.
sorted_results_file = f"{log_dir}/sorted_results.csv"
sorted_results = pd.read_csv(sorted_results_file, dtype="str")
# The file was written in descending score order, so row label 0 of the
# freshly read frame is its first row, i.e. the best trial.
best_config = sorted_results.loc[0]
print(best_config)
# id_str_of_best = f"5_5_0.01_{best_config.momentum}_{best_config.optim_type}_{best_config.num_layers}_{best_config.hidden_size}_{best_config.num_fc_layers}_{best_config.activation_type}"
# best_model_name = f"/mnt/AIWorkSpace/work/fin-ml/runs/{_TARGET_STK}/{time_str}/{id_str_of_best}.pt"
# print(best_model_name)

In [None]:
import math
from sklearn.metrics import mean_squared_error

pd.set_option("display.precision", 5)

# Load the published best checkpoint together with the hyper-parameters it
# was trained with.
model, config = load_model(f"{log_dir_base}/{task_name}.pt")
model.to(device)

# Checkpoints written by the sweep carry no "return_period" / "seq_len"
# entries (both are commented out of the search space), so fall back to the
# notebook-level values instead of raising KeyError. When the entries are
# present, evaluate with the same windowing the model was trained on.
train_loader, test_loader, features_size = prepare_dataloader(
    config.get("return_period", return_period),
    config.get("seq_len", seq_len),
)
model.eval()

(trainAccuracy, trainF1) = eval_dl_method(model, train_loader, device=device)
(testAccuracy, testF1) = eval_dl_method(model, test_loader, device=device)
print(f"Train Accuracy: {trainAccuracy:.2f}\nTest Accuracy: {testAccuracy:.5f}")
print(f"Train F1: {trainF1:.2f}\nTest F1: {testF1:.5f}")