In [1]:
import pandas as pd
import numpy as np

import os

In [2]:
from pathlib import Path

# --- Dataset identity: edit these to point the notebook at another export ---
SOURCE = "dukascopy"
SYMBOL = "usdjpy"
TIMEFRAME = "m5"
START_DATE = "2020-01-01"
END_DATE = "2024-12-31"

# Shared file stem: "<source>-<symbol>-<timeframe>-<start>-<end>"
BASE_NAME = "-".join((SOURCE, SYMBOL, TIMEFRAME, START_DATE, END_DATE))

# --- Directory layout, all rooted at BASE_DIR (relative to the working dir) ---
BASE_DIR = Path("../data")
RAW_DIR = BASE_DIR.joinpath("raw")
PROCESSED_DIR = BASE_DIR.joinpath("processed")
NORMALIZED_DIR = BASE_DIR.joinpath("normalized")
SCALER_DIR = BASE_DIR.joinpath("scalers")

# --- Concrete artifact paths derived from the stem above ---
RAW_FILE_PATH = RAW_DIR.joinpath(f"{BASE_NAME}.csv")
PROCESSED_FILE_PATH = PROCESSED_DIR.joinpath(f"{BASE_NAME}_processed.pkl")
NORMALIZED_FILE_PATH = NORMALIZED_DIR.joinpath(f"{BASE_NAME}_normalized.pkl")
STD_SCALER_PATH = SCALER_DIR.joinpath(f"{BASE_NAME}_standard_scaler.pkl")
MINMAX_SCALER_PATH = SCALER_DIR.joinpath(f"{BASE_NAME}_minmax_scaler.pkl")

In [3]:
# Load the processed feature table from PROCESSED_FILE_PATH and preview it.
# NOTE(review): unpickling can execute arbitrary code — only load pickle
# files that you produced yourself or otherwise trust.
df = pd.read_pickle(PROCESSED_FILE_PATH)
df.head()

Unnamed: 0,timestamp,open,high,low,close,volume,spread,log_volume,close_delta,close_return,...,dow_sin,dow_cos,dom,dom_sin,dom_cos,month,month_sin,month_cos,label,train_label
0,2020-01-02 02:55:00+00:00,108.675,108.679,108.675,108.6785,103450.000763,0.002567,11.546853,0.004,3.7e-05,...,0.433884,-0.900969,2,0.394356,0.918958,1,0.5,0.866025,0,1
1,2020-01-02 03:00:00+00:00,108.68,108.6895,108.68,108.683,218760.001183,0.002691,12.295735,0.0045,4.1e-05,...,0.433884,-0.900969,2,0.394356,0.918958,1,0.5,0.866025,0,1
2,2020-01-02 03:05:00+00:00,108.682,108.682,108.6765,108.6785,162950.000763,0.002311,12.001205,-0.0045,-4.1e-05,...,0.433884,-0.900969,2,0.394356,0.918958,1,0.5,0.866025,0,1
3,2020-01-02 03:10:00+00:00,108.68,108.682,108.6725,108.6765,108110.00061,0.002414,11.590914,-0.002,-1.8e-05,...,0.433884,-0.900969,2,0.394356,0.918958,1,0.5,0.866025,0,1
4,2020-01-02 03:15:00+00:00,108.676,108.6915,108.676,108.6895,222470.000625,0.002458,12.312552,0.013,0.00012,...,0.433884,-0.900969,2,0.394356,0.918958,1,0.5,0.866025,0,1


In [4]:
# Per-column summary statistics, as a quick sanity check on value ranges.
df.describe()

Unnamed: 0,open,high,low,close,volume,spread,log_volume,close_delta,close_return,close_log_return,...,dow_sin,dow_cos,dom,dom_sin,dom_cos,month,month_sin,month_cos,label,train_label
count,375217.0,375217.0,375217.0,375217.0,375217.0,375217.0,375217.0,375217.0,375217.0,375217.0,...,375217.0,375217.0,375217.0,375217.0,375217.0,375217.0,375217.0,375217.0,375217.0,375217.0
mean,128.000752,128.026106,127.975028,128.000897,2094723.0,0.006989,13.920938,0.000129,1e-06,9.78601e-07,...,0.344738,-0.055133,15.725178,0.001203956,-0.019419,6.524003,-0.004575586,-0.003618632,0.013739,1.013739
std,18.413384,18.419955,18.406509,18.413497,2221860.0,0.011741,1.71708,0.046421,0.000345,0.0003449617,...,0.524687,0.776416,8.792097,0.7135848,0.7003,3.444146,0.706442,0.7077488,0.805673,0.805673
min,101.349,101.401,101.182,101.349,0.0,0.001167,0.0,-3.1725,-0.021205,-0.0214329,...,-0.781831,-0.900969,1.0,-0.9987165,-0.994869,1.0,-1.0,-1.0,-1.0,0.0
25%,109.4265,109.441,109.4115,109.4265,604880.0,0.003184,13.312787,-0.0155,-0.000124,-0.0001244239,...,0.0,-0.900969,8.0,-0.7247928,-0.758758,4.0,-0.8660254,-0.8660254,-1.0,0.0
50%,130.815,130.851,130.779,130.815,1362000.0,0.00532,14.124465,0.0,0.0,0.0,...,0.433884,-0.222521,16.0,-2.449294e-16,-0.050649,7.0,-2.449294e-16,-1.83697e-16,0.0,1.0
75%,145.523,145.5545,145.493,145.523,2865900.0,0.007012,14.868393,0.0165,0.00013,0.0001302023,...,0.781831,0.62349,23.0,0.7247928,0.688967,10.0,0.5,0.8660254,1.0,2.0
max,161.95,161.951,161.9315,161.9495,126289400.0,0.328607,18.654087,1.749,0.01298,0.01289637,...,0.974928,1.0,31.0,0.9987165,1.0,12.0,1.0,1.0,1.0,2.0


In [5]:
# Windowing parameters for the sequence dataset built further down.
# Rows per input window; passed to ForexDataset as `sequence_length`.
SEQ_LEN = 30
# NOTE(review): HORIZON is not referenced by any cell visible in this notebook;
# presumably it is meant to be the prediction offset — confirm or remove.
HORIZON = 1
# Step between consecutive window start rows; passed to ForexDataset as `stride`.
STRIDE = 1

In [6]:
# Columns of `df` fed to the model as per-row input features (21 entries).
# NOTE(review): the recorded DataLoader output further down reports 25 features
# per step, so the saved outputs appear to come from an earlier revision of
# this list — restart the kernel and re-run all cells to refresh them.
FEATURES_COLS = [
    # Basic Data
    'close_log_return',
    'log_volume',
    'spread',

    # Other
    'ret_mean_5',
    'ret_mean_10',


    # TA
    'rsi_14',
    'ema_21',
    'sma_50',
    'atr_14',

    'bb_upper',
    'bb_lower',
    'bb_mavg',
    'bb_width',

    'donchian_upper',
    'donchian_lower',
    'donchian_mid',

    'stoch_k',
    'stoch_d',

    'macd',
    'macd_signal',
    'macd_diff',
]

# Time-related columns.
# NOTE(review): TIME_COLS is not used by any cell visible here; 'timestamp'
# would presumably not convert to float32 if it were added to the features.
TIME_COLS = [
    'timestamp',
    'hour_cos',
    'dow_cos',
    'dom_cos',
    'month_cos',
]

# Label column(s). Kept as a one-element list, so selecting it from the frame
# yields a 2-D array and each sample's label tensor has shape (1,).
TARGET_COLS = ['train_label']

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

In [8]:
class ForexDataset(Dataset):
    """Sliding-window classification dataset over a row-ordered table.

    Each sample is a window of ``sequence_length`` consecutive rows of the
    feature columns, paired with the label found ``horizon`` rows after the
    last row of the window (``horizon=1`` means the row immediately following
    the window, which is the original behaviour).

    Args:
        data: pandas DataFrame containing the feature and target columns.
            Rows are used in their existing order; the index is ignored.
        sequence_length: number of rows in each input window (>= 1).
        features: list of column names used as model inputs; values are
            converted to ``float32``.
        target: column name, or list of column names, holding integer class
            labels; values are converted to ``int64``. With a list, each
            label tensor has shape ``(len(target),)``; with a single string
            it is a 0-d tensor.
        stride: distance in rows between consecutive window starts (>= 1).
        horizon: how many rows past the end of the window the label is read
            from (>= 1).

    Raises:
        ValueError: if ``sequence_length``, ``stride`` or ``horizon`` is
            smaller than 1.

    Indexing returns ``(X, y, start)`` where ``X`` is a float32 tensor of
    shape ``(sequence_length, len(features))``, ``y`` is the long label tensor
    and ``start`` is the row position at which the window begins.
    """

    def __init__(self, data, sequence_length, features, target, stride=1, horizon=1):
        # Fail early with a clear message instead of a cryptic range() error
        # (stride == 0) or a silently empty/degenerate dataset.
        if sequence_length < 1:
            raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")
        if horizon < 1:
            raise ValueError(f"horizon must be >= 1, got {horizon}")

        self.data = data.reset_index(drop=True)
        self.sequence_length = sequence_length
        self.features = features
        self.target = target
        self.stride = stride
        self.horizon = horizon

        # Convert features/labels to NumPy once so __getitem__ is a cheap slice.
        self.feature_data = self.data[features].to_numpy(dtype=np.float32)
        self.target_data = self.data[target].to_numpy(dtype=np.int64)

        # Precompute valid window start positions.
        self.indices = self._generate_indices()

    def _generate_indices(self):
        """Return every window start for which both window and label exist."""
        # A sample needs sequence_length rows plus `horizon` more for the label.
        # A negative max_start (table too short) yields an empty list.
        max_start = len(self.data) - self.sequence_length - self.horizon
        return list(range(0, max_start + 1, self.stride))

    def __len__(self):
        """Number of windows available."""
        return len(self.indices)

    def __getitem__(self, i):
        """Return ``(window, label, start_row)`` for the i-th window."""
        idx = self.indices[i]
        window_end = idx + self.sequence_length
        X = self.feature_data[idx:window_end]
        # horizon == 1 reads the row directly after the window.
        y = self.target_data[window_end + self.horizon - 1]  # classification target
        return torch.from_numpy(X), torch.tensor(y, dtype=torch.long), idx

In [9]:
# Build the windowed dataset over the full processed frame.
fx_dataset = ForexDataset(
    data=df,
    sequence_length=SEQ_LEN,
    features=FEATURES_COLS,
    target=TARGET_COLS,
    stride=STRIDE,
)

In [10]:
# Inspect one sample: a (feature window, label, window start row) tuple.
fx_dataset[100]

(tensor([[-2.4358e-04,  1.3397e+01,  2.1437e-03, -7.6286e-05, -5.1007e-05,
          -9.6593e-01, -9.0097e-01,  9.1896e-01,  8.6603e-01,  3.7123e+01,
           1.0882e+02,  1.0881e+02,  1.7073e-02,  1.0886e+02,  1.0879e+02,
           1.0883e+02,  7.3268e-02,  1.0886e+02,  1.0878e+02,  1.0882e+02,
           4.3478e+00,  1.5605e+01, -1.4308e-03,  5.9459e-03, -7.3767e-03],
         [ 5.9752e-05,  1.3130e+01,  2.2110e-03, -3.7685e-05, -3.4006e-05,
          -9.6593e-01, -9.0097e-01,  9.1896e-01,  8.6603e-01,  4.0018e+01,
           1.0881e+02,  1.0881e+02,  1.6889e-02,  1.0887e+02,  1.0878e+02,
           1.0883e+02,  8.1321e-02,  1.0886e+02,  1.0878e+02,  1.0882e+02,
           1.2422e+01,  1.2439e+01, -3.4887e-03,  4.0590e-03, -7.5477e-03],
         [ 2.7576e-05,  1.3205e+01,  2.1533e-03, -2.6655e-05, -3.5842e-05,
          -9.6593e-01, -9.0097e-01,  9.1896e-01,  8.6603e-01,  4.1360e+01,
           1.0881e+02,  1.0881e+02,  1.6325e-02,  1.0887e+02,  1.0878e+02,
           1.0882e+02, 

In [11]:
# Batch the windows in their stored order (no shuffling).
fx_loader = DataLoader(
    dataset=fx_dataset,
    batch_size=64,
    shuffle=False,
)

In [14]:
# Pull the first batch to sanity-check shapes and contents.
features, labels, indices = next(iter(fx_loader))

print("Features:", features.shape)   # (batch, seq_len, num_features)
# flatten() instead of squeeze(): squeeze() collapses a batch of one to a 0-d
# tensor, so .tolist() would return a bare int rather than a list.
print("Labels:", labels.flatten().tolist())    # [1, 1, 1, 1, ..., 0]
print("Indices:", indices[:10])      # First 10 window start rows
print("First sample shape:", features[0].shape)  # (seq_len, num_features)
print("First sample:", features[0])  # Print actual data for one sample


Features: torch.Size([64, 30, 25])
Labels: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0]
Indices: tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
First sample shape: torch.Size([30, 25])
First sample: tensor([[ 3.6806e-05,  1.1547e+01,  2.5667e-03,  5.5209e-06, -5.9808e-06,
          8.6603e-01, -9.0097e-01,  9.1896e-01,  8.6603e-01,  5.1624e+01,
          1.0867e+02,  1.0868e+02,  1.5972e-02,  1.0869e+02,  1.0865e+02,
          1.0867e+02,  3.9474e-02,  1.0869e+02,  1.0864e+02,  1.0866e+02,
          7.2500e+01,  5.8333e+01, -3.3173e-03, -4.7999e-03,  1.4826e-03],
        [ 4.1406e-05,  1.2296e+01,  2.6909e-03,  2.4844e-05,  5.5208e-06,
          7.0711e-01, -9.0097e-01,  9.1896e-01,  8.6603e-01,  5.3536e+01,
          1.0867e+02,  1.0868e+02,  1.5617e-02,  1.0869e+02,  1.0865e+02,
          1.0867e+02,  4.0710e-02,  1.0869e+02,  1.0865e+02,  1.0867