In [29]:
import pandas as pd
import numpy as np

import os

In [30]:
# Relative location of the preprocessed USD/JPY bar data that this notebook loads.
PROCESSED_DIR = "../data/processed/"
PROCESSED_FILENAME = "usdjpy-bar-2020-01-01-2024-12-31_processed.pkl"
PROCESSED_FILE_PATH = os.path.join(PROCESSED_DIR, PROCESSED_FILENAME)

In [31]:
# Load the preprocessed bars into memory and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code, so only load files
# produced by a trusted pipeline; a typed format such as parquet would avoid this.
df = pd.read_pickle(PROCESSED_FILE_PATH)
df.head()

Unnamed: 0,timestamp,open,high,low,close,volume,time_group,close_delta,close_return,close_direction,prob_down,prob_flat,prob_up,label
1,2020-01-01 22:01:00,108.757,108.759,108.7495,108.7495,13300.000012,1,-0.0095,-8.7e-05,down,1.0,0.0,0.0,0
2,2020-01-01 22:02:00,108.7495,108.7535,108.7495,108.7535,4500.0,1,0.004,3.7e-05,up,0.0,0.0,1.0,2
3,2020-01-01 22:03:00,108.754,108.7555,108.7535,108.7555,10490.00001,1,0.002,1.8e-05,flat,0.0,1.0,0.0,1
4,2020-01-01 22:04:00,108.7575,108.765,108.7555,108.765,11600.000024,1,0.0095,8.7e-05,up,0.0,0.0,1.0,2
5,2020-01-01 22:05:00,108.77,108.77,108.769,108.77,1059.999987,1,0.005,4.6e-05,up,0.0,0.0,1.0,2


In [32]:
df.describe()

Unnamed: 0,timestamp,open,high,low,close,volume,time_group,close_delta,close_return,prob_down,prob_flat,prob_up,label
count,1818676,1818676.0,1818676.0,1818676.0,1818676.0,1818676.0,1818676.0,1818676.0,1818676.0,1818676.0,1818676.0,1818676.0,1818676.0
mean,2022-07-05 13:20:55.965527552,128.1896,128.2,128.1792,128.1896,431148.9,4203.974,3.065142e-05,2.461495e-07,0.3382592,0.3180924,0.3436483,1.005389
min,2020-01-01 22:01:00,101.202,101.338,101.182,101.2045,0.0,1.0,-2.065,-0.01380804,0.0,0.0,0.0,0.0
25%,2021-04-01 18:33:45,109.4595,109.465,109.454,109.46,118037.5,1847.0,-0.007,-5.705061e-05,0.0,0.0,0.0,0.0
50%,2022-07-11 20:03:30,131.215,131.234,131.199,131.215,272600.0,4956.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,2023-10-04 07:22:15,145.661,145.674,145.6491,145.6615,585840.0,6143.0,0.0075,5.846136e-05,1.0,1.0,1.0,2.0
max,2024-12-30 23:59:00,161.95,161.951,161.9435,161.9495,122645500.0,7375.0,1.5795,0.01228753,1.0,1.0,1.0,2.0
std,,18.39441,18.39714,18.39161,18.39442,505069.8,2341.641,0.02094446,0.00015585,0.4731174,0.4657357,0.4749255,0.8257596


In [33]:
# Column holding the group id; same value as the `group_col` default of
# get_sequence_start_indices.
GROUP_COL = 'time_group'
# Number of consecutive rows in each input window (passed as `sequence_length`).
SEQ_LEN = 30
# Offset, in rows after the end of the window, of the target row (passed as `horizon`).
HORIZON = 1

In [34]:
def get_sequence_start_indices(df, sequence_length=30, horizon=1, stride=1, group_col='time_group'):
    """Return the index labels at which a full window plus its target fits in one group.

    Rows are bucketed by the value in ``group_col``. Within every bucket the
    index labels are sorted, and every ``stride``-th label is kept as a window
    start as long as ``sequence_length + horizon`` rows of the same bucket are
    available from that position onwards.

    Args:
        df: Frame whose index labels identify rows and which contains ``group_col``.
        sequence_length: Number of rows in each input window.
        horizon: Number of rows after the window that must still exist.
        stride: Step, in rows, between consecutive window starts inside a group.
        group_col: Name of the column holding the group id.

    Returns:
        A list of index labels, grouped in order of first appearance of each
        group and ascending within a group.
    """
    # Rows a group must supply from a start position: the window and its horizon.
    span = sequence_length + horizon

    # Bucket the row labels by group id, preserving first-appearance order.
    rows_by_group = {}
    for row_label, group_key in zip(df.index, df[group_col]):
        if group_key in rows_by_group:
            rows_by_group[group_key].append(row_label)
        else:
            rows_by_group[group_key] = [row_label]

    starts = []
    for row_labels in rows_by_group.values():
        n_rows = len(row_labels)
        if n_rows >= span:
            ordered = sorted(row_labels)
            # The last admissible start leaves exactly `span` rows in the group.
            starts.extend(ordered[pos] for pos in range(0, n_rows - span + 1, stride))

    return starts


In [35]:
# Build the window start labels. Windows start every 5 rows inside a group,
# so consecutive windows overlap heavily (SEQ_LEN rows each).
IDs = get_sequence_start_indices(
    df,
    sequence_length=SEQ_LEN,
    horizon=HORIZON,
    stride=5,
    # Use the configured constant instead of repeating the column name literal.
    group_col=GROUP_COL,
)

In [46]:
# Sanity check for the first window: show the SEQ_LEN input rows, then the
# single target row located HORIZON rows after the end of the window.
# NOTE(review): the label arithmetic below assumes consecutive integer index
# labels from `idx` onwards; confirm the index has no gaps.
idx = IDs[0]

print(df.loc[idx:idx+SEQ_LEN-1])
print(df.loc[idx+SEQ_LEN+HORIZON-1])

             timestamp      open      high       low     close         volume  \
1  2020-01-01 22:01:00  108.7570  108.7590  108.7495  108.7495   13300.000012   
2  2020-01-01 22:02:00  108.7495  108.7535  108.7495  108.7535    4500.000000   
3  2020-01-01 22:03:00  108.7540  108.7555  108.7535  108.7555   10490.000010   
4  2020-01-01 22:04:00  108.7575  108.7650  108.7555  108.7650   11600.000024   
5  2020-01-01 22:05:00  108.7700  108.7700  108.7690  108.7700    1059.999987   
6  2020-01-01 22:06:00  108.7685  108.7685  108.7515  108.7570  603299.998939   
7  2020-01-01 22:07:00  108.7595  108.7645  108.7510  108.7575  777979.996204   
8  2020-01-01 22:08:00  108.7580  108.7635  108.7495  108.7545  251150.001049   
9  2020-01-01 22:09:00  108.7565  108.7625  108.7435  108.7460  487400.000215   
10 2020-01-01 22:10:00  108.7455  108.7480  108.7430  108.7450  145529.998422   
11 2020-01-01 22:11:00  108.7445  108.7470  108.7395  108.7400   56379.999280   
12 2020-01-01 22:12:00  108.

In [37]:
import torch
from torch.utils.data import Dataset
import numpy as np

class ForexDataset(Dataset):
    """Windowed dataset: one feature window and one target row per start label.

    Each item is addressed by a start label taken from ``IDs``. The label is
    resolved to its row position in ``data``; the feature window is the
    ``sequence_length`` rows starting there, and the target is the single row
    ``horizon`` rows after the last row of the window.

    Windows are cut by row position rather than by label arithmetic, so a gap
    in the index can neither shorten a window nor shift the target row. For a
    gap-free integer index both approaches select the same rows.

    Args:
        data: Frame containing the ``features`` and ``target`` columns.
        IDs: Sequence of index labels of ``data`` at which windows start.
        sequence_length: Number of rows in each feature window.
        horizon: Offset, in rows after the end of the window, of the target row.
        features: Column selection used for the input window.
        target: Column selection used for the target row.

    Raises:
        ValueError: If the index of ``data`` contains duplicate labels, because
            a start label could then not be resolved to a single row.
    """

    def __init__(self, data, IDs, sequence_length, horizon, features, target):
        if not data.index.is_unique:
            raise ValueError("ForexDataset requires `data` to have a unique index")

        self.data = data
        self.IDs = IDs
        self.sequence_length = sequence_length
        self.horizon = horizon
        self.feature_data = data[features]
        self.target_data = data[target]

        # Materialise the columns once so item access is plain array slicing
        # instead of a label-based pandas lookup per sample.
        self._index = data.index
        self._feature_values = self.feature_data.to_numpy()
        self._target_values = self.target_data.to_numpy()

    def __len__(self):
        """Return the number of windows, i.e. the number of start labels."""
        return len(self.IDs)

    def __getitem__(self, idx):
        """Return ``(X, y, start_label)`` for the ``idx``-th start label.

        Raises:
            KeyError: If the start label is not in the index, or if the target
                row would lie beyond the last row of the data.
        """
        i = self.IDs[idx]

        # Resolve the start label to a row position (an int, the index is unique).
        start = self._index.get_loc(i)
        stop = start + self.sequence_length
        target_pos = stop + self.horizon - 1
        if target_pos >= len(self._index):
            raise KeyError(
                f"window starting at label {i!r} needs row position {target_pos}, "
                f"but the data has only {len(self._index)} rows"
            )

        # Copy so the returned tensors own their memory and cannot alias the
        # arrays cached on the dataset.
        X = self._feature_values[start:stop].copy()
        y = self._target_values[target_pos].copy()

        return torch.from_numpy(X), torch.from_numpy(y), i


In [38]:
# Regression-style setup: a window of SEQ_LEN past `close_return` values is the
# input, and the `close_return` HORIZON rows after the window is the target.
fx_dataset = ForexDataset(
    data=df, 
    IDs=IDs,
    sequence_length=SEQ_LEN, 
    horizon=HORIZON, 
    features=['close_return'],
    target=['close_return'],
)
    

In [43]:
fx_dataset[1]

(tensor([[-1.1952e-04],
         [ 4.5974e-06],
         [-2.7584e-05],
         [-7.8158e-05],
         [-9.1957e-06],
         [-4.5979e-05],
         [ 1.6093e-04],
         [-9.1948e-05],
         [-2.2989e-05],
         [ 2.2990e-05],
         [-4.5978e-06],
         [-9.1957e-06],
         [-8.6440e-04],
         [ 3.5894e-04],
         [ 1.3801e-05],
         [ 5.0601e-05],
         [-4.5999e-06],
         [ 1.6560e-04],
         [ 5.0591e-05],
         [-1.0578e-04],
         [ 4.5994e-06],
         [-4.5994e-06],
         [ 1.7478e-04],
         [-1.4256e-04],
         [-4.5993e-06],
         [-4.5993e-06],
         [-3.2195e-05],
         [ 0.0000e+00],
         [ 9.1989e-06],
         [-1.3798e-04]], dtype=torch.float64),
 tensor([-0.0001], dtype=torch.float64),
 6)

In [135]:
df.loc[51:82, ['close_return']]

Unnamed: 0,close_return
51,-9e-06
52,0.0
53,5e-06
54,-5e-06
55,-9e-06
56,2.3e-05
57,-5e-06
58,1.4e-05
59,5e-06
60,-0.00057


In [105]:
# NOTE(review): `ForexClassificationDataset` is not defined in any cell shown
# here, so this cell fails on a fresh kernel (Restart & Run All) unless the
# class is defined or imported elsewhere; confirm where it comes from.
# NOTE(review): `horizon` is the literal 1 here while other cells pass HORIZON.
fx_dataset = ForexClassificationDataset(
    data=df, 
    IDs=IDs,
    sequence_length=SEQ_LEN, 
    horizon=1, 
    features=['close_return'],
    target=['close_return'],
)
    

In [106]:
fx_dataset[5]

(tensor([[ 4.5994e-06],
         [-4.5994e-06],
         [ 1.7478e-04],
         [-1.4256e-04],
         [-4.5993e-06],
         [-4.5993e-06],
         [-3.2195e-05],
         [ 0.0000e+00],
         [ 9.1989e-06],
         [-1.3798e-04],
         [-1.2420e-04],
         [ 4.6006e-06],
         [ 4.6006e-06],
         [-1.3802e-05],
         [ 1.3802e-05],
         [-2.3003e-05],
         [ 1.8403e-05],
         [-4.6006e-06],
         [ 9.2012e-06],
         [ 3.2204e-05],
         [ 1.8402e-05],
         [ 0.0000e+00],
         [ 9.2007e-06],
         [ 0.0000e+00],
         [ 1.8401e-05],
         [-9.2004e-06],
         [ 0.0000e+00],
         [ 4.6003e-06],
         [-4.6002e-06],
         [-9.2005e-06]]),
 tensor([2.3002e-05]),
 26)

## Good old train / test split

In [79]:
from sklearn.model_selection import train_test_split

# Random 80/20 split of the window start labels, reproducible via random_state.
# NOTE(review): IDs were generated with stride=5 while each window spans
# SEQ_LEN=30 rows, so neighbouring windows share most of their rows. Shuffling
# before splitting places overlapping windows on both sides of the split, which
# leaks training rows into validation; a chronological or group-wise split
# would avoid that.
train_idx, val_idx = train_test_split(
    IDs,
    test_size=0.2,
    shuffle=True,
    random_state=42
)


In [80]:
def _make_split_dataset(start_ids):
    """Build the classification dataset for one split; only the start IDs differ."""
    return ForexClassificationDataset(
        data=df,
        IDs=start_ids,
        sequence_length=SEQ_LEN,
        horizon=HORIZON,
        features=['close_return'],
        target=['prob_down', 'prob_flat', 'prob_up'],
    )


# Same configuration for both splits: past close_return values as input and the
# three class-probability columns as target.
train_dataset = _make_split_dataset(train_idx)
val_dataset = _make_split_dataset(val_idx)

In [81]:
len(train_dataset), len(val_dataset)

(278025, 69507)

In [85]:
train_dataset[-1]

(tensor([[-3.9572e-05],
         [-1.7588e-04],
         [ 4.8376e-05],
         [-8.7952e-06],
         [ 4.3976e-06],
         [ 7.9157e-05],
         [-8.3548e-05],
         [ 4.3976e-06],
         [ 4.8374e-05],
         [-4.3974e-06],
         [ 7.0358e-05],
         [-1.1432e-04],
         [ 4.3976e-05],
         [-8.7948e-06],
         [-3.9577e-05],
         [ 1.8470e-04],
         [-4.3968e-05],
         [-3.9573e-05],
         [ 1.0993e-04],
         [-7.9140e-05],
         [ 3.5176e-05],
         [-7.9144e-05],
         [ 5.7164e-05],
         [-7.0352e-05],
         [ 5.7165e-05],
         [ 1.4071e-04],
         [ 8.7928e-06],
         [-4.3964e-05],
         [ 3.5173e-05],
         [ 0.0000e+00]]),
 tensor([[0., 0., 1.]]),
 678787)

## Stratified Shuffle

In [None]:
valid_start_idxs[:5]

In [None]:
# Label of the target row for every window start: the row HORIZON rows after
# the end of the SEQ_LEN-row window (assumes consecutive integer index labels).
# NOTE(review): `valid_start_idxs` is not defined in any cell shown here;
# presumably it is the `IDs` list built earlier. Confirm before running.
target_indices = np.array(valid_start_idxs) + SEQ_LEN + HORIZON - 1


In [None]:
target_indices

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

def get_stratified_split(df, valid_start_indices, sequence_length, horizon, target_col,
                         test_size=0.2, random_state=42):
    """Split window start labels into train/validation parts, stratified by target label.

    For every start label the label of its target row is computed as
    ``start + sequence_length + horizon - 1``. The value of ``target_col`` at
    that row is used as the stratification class, so both parts keep
    approximately the class proportions of the full set.

    Args:
        df: Frame indexed by the labels used in ``valid_start_indices`` and
            containing ``target_col``.
        valid_start_indices: Sequence of integer window start labels.
        sequence_length: Number of rows in each input window.
        horizon: Offset, in rows after the end of the window, of the target row.
        target_col: Column of ``df`` holding the class used for stratification.
        test_size: Share (or absolute number) of windows put in the validation
            part; forwarded to ``StratifiedShuffleSplit``.
        random_state: Seed forwarded to ``StratifiedShuffleSplit``. The default
            of 42 reproduces the previously hard-coded behaviour.

    Returns:
        A ``(train_start_indices, val_start_indices)`` tuple of Python lists.
    """
    valid_start_indices = np.array(valid_start_indices)  # allow vectorised arithmetic

    # NOTE(review): label arithmetic assumes consecutive integer index labels
    # inside every window; confirm the index of `df` has no gaps.
    target_indices = valid_start_indices + sequence_length + horizon - 1
    labels = df.loc[target_indices, target_col].values

    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    # split() yields positions into `valid_start_indices`, not the labels themselves.
    train_idx, val_idx = next(splitter.split(valid_start_indices, labels))

    train_start_indices = valid_start_indices[train_idx]
    val_start_indices = valid_start_indices[val_idx]

    return train_start_indices.tolist(), val_start_indices.tolist()


In [None]:
# Stratify the window starts by the integer class in the `label` column.
# NOTE(review): `valid_start_idxs` is not defined in any cell shown here;
# presumably it is the `IDs` list built earlier. Confirm before running.
# NOTE(review): the second return value is the validation part of the split,
# although it is bound to the name `test_idx` here.
train_idx, test_idx = get_stratified_split(
    df,
    valid_start_indices=valid_start_idxs,
    sequence_length=SEQ_LEN,
    horizon=HORIZON,
    target_col='label'
)

In [None]:
len(train_idx), len(test_idx)