In [1]:
import pandas as pd
import numpy as np

import os

In [2]:
PROCESSED_DIR = '../data/processed/'
PROCESSED_FILENAME = "usdjpy-bar-2020-01-01-2024-12-31_processed.pkl"
PROCESSED_FILE_PATH = os.path.join(PROCESSED_DIR, PROCESSED_FILENAME)

In [3]:
df = pd.read_pickle(PROCESSED_FILE_PATH)
df.head()

Unnamed: 0,timestamp,open,high,low,close,volume,time_group,close_delta,close_return,close_direction,label
1,2020-01-01 22:01:00,108.757,108.759,108.7495,108.7495,13300.000012,1,-0.0095,-8.7e-05,down,0
2,2020-01-01 22:02:00,108.7495,108.7535,108.7495,108.7535,4500.0,1,0.004,3.7e-05,up,2
3,2020-01-01 22:03:00,108.754,108.7555,108.7535,108.7555,10490.00001,1,0.002,1.8e-05,flat,1
4,2020-01-01 22:04:00,108.7575,108.765,108.7555,108.765,11600.000024,1,0.0095,8.7e-05,up,2
5,2020-01-01 22:05:00,108.77,108.77,108.769,108.77,1059.999987,1,0.005,4.6e-05,up,2


In [5]:
df.describe()

Unnamed: 0,timestamp,open,high,low,close,volume,time_group,close_delta,close_return,label
count,1818676,1818676.0,1818676.0,1818676.0,1818676.0,1818676.0,1818676.0,1818676.0,1818676.0,1818676.0
mean,2022-07-05 13:20:55.965527552,128.1896,128.2,128.1792,128.1896,431148.9,4203.974,3.065142e-05,2.461495e-07,1.005389
min,2020-01-01 22:01:00,101.202,101.338,101.182,101.2045,0.0,1.0,-2.065,-0.01380804,0.0
25%,2021-04-01 18:33:45,109.4595,109.465,109.454,109.46,118037.5,1847.0,-0.007,-5.705061e-05,0.0
50%,2022-07-11 20:03:30,131.215,131.234,131.199,131.215,272600.0,4956.0,0.0,0.0,1.0
75%,2023-10-04 07:22:15,145.661,145.674,145.6491,145.6615,585840.0,6143.0,0.0075,5.846136e-05,2.0
max,2024-12-30 23:59:00,161.95,161.951,161.9435,161.9495,122645500.0,7375.0,1.5795,0.01228753,2.0
std,,18.39441,18.39714,18.39161,18.39442,505069.8,2341.641,0.02094446,0.00015585,0.8257596


In [6]:
GROUP_COL = 'time_group'
SEQ_LEN = 30
HORIZON = 1

In [7]:
def get_sequence_start_indices(df, sequence_length=30, horizon=1, group_col='time_group'):
    indices = []

    for _, group in df.groupby(group_col):
        if len(group) >= sequence_length:
            group_indices = group.index.to_numpy()
            # Only take indices where a full sequence fits
            valid_starts = group_indices[:len(group) - sequence_length - horizon + 1]
            indices.extend(valid_starts)

    return indices


In [8]:
valid_start_idxs = get_sequence_start_indices(
    df, 
    sequence_length=30,
    horizon=1,
    group_col='time_group',
)

In [9]:
valid_start_idxs[0]

np.int64(1)

In [10]:
df.iloc[1:1+SEQ_LEN], df.iloc[1+SEQ_LEN: 1+SEQ_LEN+HORIZON]

(             timestamp      open      high       low     close         volume  \
 2  2020-01-01 22:02:00  108.7495  108.7535  108.7495  108.7535    4500.000000   
 3  2020-01-01 22:03:00  108.7540  108.7555  108.7535  108.7555   10490.000010   
 4  2020-01-01 22:04:00  108.7575  108.7650  108.7555  108.7650   11600.000024   
 5  2020-01-01 22:05:00  108.7700  108.7700  108.7690  108.7700    1059.999987   
 6  2020-01-01 22:06:00  108.7685  108.7685  108.7515  108.7570  603299.998939   
 7  2020-01-01 22:07:00  108.7595  108.7645  108.7510  108.7575  777979.996204   
 8  2020-01-01 22:08:00  108.7580  108.7635  108.7495  108.7545  251150.001049   
 9  2020-01-01 22:09:00  108.7565  108.7625  108.7435  108.7460  487400.000215   
 10 2020-01-01 22:10:00  108.7455  108.7480  108.7430  108.7450  145529.998422   
 11 2020-01-01 22:11:00  108.7445  108.7470  108.7395  108.7400   56379.999280   
 12 2020-01-01 22:12:00  108.7340  108.7600  108.7340  108.7575   95219.999909   
 13 2020-01-01 2

In [45]:
import torch
from torch.utils.data import Dataset

class ForexClassificationDataset(Dataset):
    """Dataset for sequence classification/forecasting with multi-step horizon."""

    def __init__(self, data, sequence_length, horizon, features, target, group_col='time_group'):
        self.sequence_length = sequence_length
        self.horizon = horizon
        self.features = features
        self.target = target
        self.group_col = group_col

        # Reset index to ensure integer indexing is valid
        self.data = data.reset_index(drop=True)
        self.feature_data = self.data[self.features].values
        self.target_data = self.data[self.target].values
        self.group_labels = self.data[self.group_col].values

        self.IDs = self._get_valid_sequence_starts()

    def _get_valid_sequence_starts(self):
        indices = []
        group_indices = {}

        for idx, group in enumerate(self.group_labels):
            group_indices.setdefault(group, []).append(idx)

        for group, idxs in group_indices.items():
            if len(idxs) >= self.sequence_length + self.horizon:
                valid = idxs[:len(idxs) - (self.sequence_length + self.horizon) + 1]
                indices.extend(valid)

        return indices

    def __len__(self):
        return len(self.IDs)

    def __getitem__(self, idx):
        start = self.IDs[idx]
        end = start + self.sequence_length
        target_idx = start + self.sequence_length + self.horizon - 1

        x = torch.tensor(self.feature_data[start:end], dtype=torch.float32)
        y = torch.tensor(self.target_data[target_idx], dtype=torch.long)

        return x, y


In [56]:
fx_dataset = ForexClassificationDataset(
    data=df, 
    sequence_length=SEQ_LEN, 
    horizon=HORIZON, 
    features=['close_return'],
    target='label',
    group_col='time_group'
)
    

In [60]:
fx_dataset[777][0].shape

torch.Size([30, 1])

## Good old train / test split

In [None]:
from sklearn.model_selection import train_test_split

train_idx, val_idx = train_test_split(
    valid_start_idxs,
    test_size=0.2,
    shuffle=True,
    random_state=42
)

dataset = fx_dataset
train_dataset = torch.utils.data.Subset(dataset, train_idx)
val_dataset = torch.utils.data.Subset(dataset, val_idx)


In [None]:
len(train_dataset), len(val_dataset)

## Stratified Shuffle

In [None]:
valid_start_idxs[:5]

In [None]:
target_indices = np.array(valid_start_idxs) + SEQ_LEN + HORIZON - 1


In [None]:
target_indices

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

def get_stratified_split(df, valid_start_indices, sequence_length, horizon, target_col, test_size=0.2):
    valid_start_indices = np.array(valid_start_indices)  # ensure array

    target_indices = valid_start_indices + sequence_length + horizon - 1
    labels = df.loc[target_indices, target_col].values  # get labels efficiently

    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=42)
    train_idx, val_idx = next(splitter.split(valid_start_indices, labels))

    train_start_indices = valid_start_indices[train_idx]
    val_start_indices = valid_start_indices[val_idx]

    return train_start_indices.tolist(), val_start_indices.tolist()


In [None]:
train_idx, test_idx = get_stratified_split(
    df,
    valid_start_indices=valid_start_idxs,
    sequence_length=SEQ_LEN,
    horizon=HORIZON,
    target_col='label'
)

In [None]:
len(train_idx), len(test_idx)