In [1]:
import sys
sys.path.append("..")

# Reload modules in /src/ when changed
%load_ext autoreload
%autoreload 2

from fxml.data.labeling.barrier_method import BarrierMethod

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
# plt.rcParams["figure.figsize"] = (12, 2)
import seaborn as sns
sns.set(style="ticks", palette="tab10")

In [2]:
# Load the preprocessed price frame (the file name suggests USDJPY 15-minute bars, 2021-2024).
# The path is relative to the notebook's working directory.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling; only load trusted files.
data = pd.read_pickle('../../data/processed/USDJPY-15m-20210101-20241231_TBM.pkl')

In [3]:
# Show which columns the loaded frame provides.
data.columns

Index(['open', 'high', 'low', 'close', 'volume', 'spread', 'close_pct_return',
       'close_return', 'close_log_return', 'close_fd_return',
       'close_log_fd_return', 'EMA_5', 'EMA_9', 'EMA_20', 'EMA_24', 'EMA_36',
       'EMA_50', 'EMA_100', 'ATRr_14', 'ATRr_60', 'ATRr_120', 'ADX_14',
       'ADXR_14_2', 'DMP_14', 'DMN_14', 'RSI_14', 'RSI_25', 'RSI_50',
       'BBL_5_2.0_2.0', 'BBM_5_2.0_2.0', 'BBU_5_2.0_2.0', 'BBB_5_2.0_2.0',
       'BBP_5_2.0_2.0', 'BBL_10_2.0_2.0', 'BBM_10_2.0_2.0', 'BBU_10_2.0_2.0',
       'BBB_10_2.0_2.0', 'BBP_10_2.0_2.0', 'BBL_15_2.0_2.0', 'BBM_15_2.0_2.0',
       'BBU_15_2.0_2.0', 'BBB_15_2.0_2.0', 'BBP_15_2.0_2.0', 'BBL_20_2.0_2.0',
       'BBM_20_2.0_2.0', 'BBU_20_2.0_2.0', 'BBB_20_2.0_2.0', 'BBP_20_2.0_2.0',
       'MACD_8_17_9', 'MACDh_8_17_9', 'MACDs_8_17_9', 'MACD_12_26_9',
       'MACDh_12_26_9', 'MACDs_12_26_9', 'unix_time', 'minute', 'hour', 'dow',
       'dom', 'month', 'minute_sin', 'minute_cos', 'hour_sin', 'hour_cos',
       'dow_sin', 'dow_co

In [4]:
# Number of rows per calendar year of the index; used to pick the split year below.
data.index.year.value_counts()

timestamp
2022    24958
2023    24895
2024    24791
2021    24031
Name: count, dtype: int64

In [5]:
# Chronological split on the index year: rows before 2024 are used for training,
# rows from 2024 onwards are held out (later divided into validation and test).
# .copy() gives each subset its own frame: the feature columns are overwritten
# in place further down, and writing into a boolean-mask selection of `data`
# would otherwise raise SettingWithCopyWarning.
train_data = data[data.index.year < 2024].copy()
temp_data = data[data.index.year >= 2024].copy()

In [6]:
# (rows, columns) of the training subset.
train_data.shape

(73884, 71)

In [7]:
# (rows, columns) of the held-out subset.
temp_data.shape

(24791, 71)

In [8]:
# Settings for the windowed dataset built below.
lookback = 30                 # rows per input window
len_df = len(data)            # total number of rows in the full frame
feature_cols = [
    "open",
    "high",
    "low",
    "close",
]
target_col = "label"

In [30]:
def create_sequences(df, lookback, feat_cols, target_col):
    """Build sliding-window samples for a sequence model.

    Sample ``i`` pairs the ``lookback`` consecutive feature rows
    ``i .. i + lookback - 1`` with the target value of the row that
    immediately follows the window (row ``i + lookback``).

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows, in the order the windows should follow.
    lookback : int
        Number of rows per window; must be non-negative.
    feat_cols : list of str
        Columns used as features.
    target_col : str
        Column holding the target value.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_samples, lookback, n_features)`` with
        ``n_samples = max(len(df) - lookback, 0)``.
    y : numpy.ndarray
        Shape ``(n_samples,)``.

    Raises
    ------
    ValueError
        If ``lookback`` is negative.
    """
    if lookback < 0:
        raise ValueError(f"lookback must be non-negative, got {lookback}")

    feature_values = df[feat_cols].values
    target_values = df[target_col].values

    # A frame shorter than the window yields zero samples; clamping keeps X
    # three-dimensional instead of collapsing to shape (0,).
    n_samples = max(len(feature_values) - lookback, 0)

    # Row i of window_idx holds the positions i, i+1, ..., i+lookback-1, so a
    # single fancy-index gathers every window (as a copy) in one step.
    window_idx = np.arange(n_samples)[:, None] + np.arange(lookback)[None, :]
    X = feature_values[window_idx]

    # The target for window i is the row right after it, i.e. position i + lookback.
    y = np.array(target_values[lookback:])

    return X, y

In [31]:
# NOTE(review): this runs before the scaling cell; X_train / y_train are
# reassigned by the same call once the feature columns have been scaled.
X_train, y_train = create_sequences(
    df=train_data,
    lookback=lookback,
    feat_cols=feature_cols,
    target_col=target_col,
)
X_train.shape, y_train.shape

((73854, 30, 4), (73854,))

In [32]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [33]:
# Scaler with default settings; it is fitted on the training rows only in the following cell.
scaler = MinMaxScaler()

In [34]:
# Fit the scaler on the training rows only, then reuse the fitted
# parameters for the held-out rows so no hold-out statistics leak into training.
train_scaled = scaler.fit_transform(train_data[feature_cols].values)
train_data.loc[:, feature_cols] = train_scaled

temp_scaled = scaler.transform(temp_data[feature_cols].values)
temp_data.loc[:, feature_cols] = temp_scaled

In [35]:
# Windowed training samples, built from the scaled feature columns.
X_train, y_train = create_sequences(
    df=train_data,
    lookback=lookback,
    feat_cols=feature_cols,
    target_col=target_col,
)
X_train.shape, y_train.shape

((73854, 30, 4), (73854,))

In [36]:
# Windowed samples for the held-out rows; divided into validation and test further down.
X_test, y_test = create_sequences(
    df=temp_data,
    lookback=lookback,
    feat_cols=feature_cols,
    target_col=target_col,
)
X_test.shape, y_test.shape

((24761, 30, 4), (24761,))

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
# Divide the held-out windows in order: first half -> validation, second half -> test.
# shuffle=False because consecutive windows share all but one row; a random split
# would place near-duplicate sequences on both sides and inflate the test score.
# (random_state is dropped: it has no effect once shuffling is disabled.)
# NOTE: this cell overwrites X_test / y_test, so re-running it alone halves the
# test set again; re-run the sequence-building cell above first.
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.50, shuffle=False)