In [14]:
import pandas as pd
import numpy as np

import os

In [15]:
# Build the path to the processed USD/JPY bar CSV (nothing is read here).
# The directory is relative, so it resolves against the notebook's working directory.
data_dir = "../data/processed/"
file_name = "usdjpy-bar-processed-2020-01-01-2024-12-31.csv"
file_path = os.path.join(data_dir, file_name)

In [16]:
# Read the whole CSV with default parsing and preview the first rows.
# NOTE(review): the preview shows two leftover index columns ("Unnamed: 0.1",
# "Unnamed: 0") — presumably the file was written with its index more than once;
# confirm upstream and drop them at the source if so.
df = pd.read_csv(file_path)
df.head()

Unnamed: 0.1,Unnamed: 0,timestamp,open,high,low,close,volume,avg_spread,tick_count,close_delta,close_pct_change,sma_15,rsi_14,direction
0,0,1577916000.0,108.757,108.759,108.7495,108.7495,13300.000012,0.060333,9,,,,,-1
1,1,1577916000.0,108.7495,108.7555,108.7495,108.7555,10500.0,0.059143,7,0.006,0.005517,,,-1
2,2,1577916000.0,108.755,108.765,108.754,108.765,16090.000033,0.056308,13,0.0095,0.008735,108.756667,,-1
3,3,1577916000.0,108.77,108.77,108.769,108.77,1059.999987,0.021333,3,0.005,0.004597,108.7635,,1
4,4,1577916000.0,108.7685,108.7685,108.7515,108.7565,631789.998829,0.021845,239,-0.0135,-0.012412,108.763833,,1


In [20]:
class BaseData:
    """Load a CSV file into a pandas DataFrame and expose basic metadata.

    Attributes set by ``__init__``:
        file_path: path handed to ``pd.read_csv``.
        df: the frame returned by ``read_data``.
        features: column labels of ``df``.
        IDs: unique values of the index of ``df``.
    """

    def __init__(self, file_path: str) -> None:
        """Read ``file_path`` immediately and record its columns and index values."""
        self.file_path = file_path
        self.df = self.read_data()
        self.features = self.df.columns
        self.IDs = self.df.index.unique()

    def read_data(self) -> pd.DataFrame:
        """Parse ``self.file_path`` with ``pd.read_csv`` and return the frame."""
        return pd.read_csv(self.file_path)

    def select_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Column-selection hook for subclasses; the base class returns ``df`` unchanged."""
        return df

    def save_data(self, output_path: str) -> None:
        """Pickle ``self.df`` to ``output_path``.

        If ``self.df`` is ``None`` the CSV is (re)loaded first.

        Args:
            output_path: destination file for ``DataFrame.to_pickle``.

        Raises:
            ValueError: if writing the pickle fails for any reason; the
                original exception is attached as ``__cause__``.
        """
        # Identity check, not `== None`: `DataFrame == None` compares element-wise
        # and the resulting frame cannot be used as an `if` condition (pandas
        # raises "The truth value of a DataFrame is ambiguous").
        if self.df is None:
            self.df = self.read_data()
        try:
            self.df.to_pickle(output_path)
        except Exception as e:
            raise ValueError(f"Cannot save pkl file to {output_path}: {e}") from e


class ForexData(BaseData):
    """Timestamp-indexed OHLCV view of a processed Forex bar CSV.

    Attributes set by ``__init__`` (in addition to those of ``BaseData``):
        df: the ``open``/``high``/``low``/``close``/``volume`` columns, sorted
            by and indexed on ``timestamp`` (converted to datetimes).
        all_IDs: unique values of that timestamp index.
        max_seq_len: the value passed to the constructor, stored as-is; it is
            not used inside this class.
        feature_names: the five OHLCV column names.
        feature_df: ``df`` restricted to ``feature_names``.

    ``features`` and ``IDs`` are assigned by ``BaseData.__init__`` from the raw
    CSV frame and are not refreshed here.
    """

    def __init__(self, file_path, max_seq_len: int = 66):
        """Load ``file_path`` and build the timestamp-indexed frame.

        Args:
            file_path: CSV path, forwarded to ``BaseData.__init__``.
            max_seq_len: stored on the instance; defaults to 66.
        """
        super().__init__(file_path)  # parses the CSV into self.df

        # Hand the already-parsed frame to load_single so the CSV is not read
        # from disk a second time.
        self.df = self.load_single(self.df)
        # Sort explicitly rather than relying on the file's row order.
        self.df = self.df.sort_values(by=["timestamp"])
        self.df = self.df.set_index("timestamp")
        self.all_IDs = self.df.index.unique()  # unique bar timestamps
        self.max_seq_len = max_seq_len

        self.feature_names = ["open", "high", "low", "close", "volume"]
        self.feature_df = self.df[self.feature_names]

    def load_single(self, df=None):
        """Return the column-selected frame.

        Args:
            df: an already-loaded raw frame to reuse. When omitted, the CSV at
                ``self.file_path`` is read via ``read_data``.

        Returns:
            The result of ``select_columns`` applied to the raw frame.
        """
        if df is None:
            df = self.read_data()
        return self.select_columns(df)

    def select_columns(self, df):
        """Keep timestamp + OHLCV columns and convert the timestamp to datetimes.

        The ``timestamp`` column is interpreted with ``unit='s'``. The input
        frame is left unmodified; a new frame is returned.

        Raises:
            KeyError: if any of the expected columns is missing from ``df``.
        """
        keep_cols = ["timestamp", "open", "high", "low", "close", "volume"]
        # assign() builds a new frame, so neither the caller's frame nor a
        # slice of it is written to in place.
        return df[keep_cols].assign(
            timestamp=lambda frame: pd.to_datetime(frame["timestamp"], unit="s")
        )


In [21]:
# Build the timestamp-indexed OHLCV wrapper from the processed CSV.
fx_data = ForexData(file_path)

In [22]:
# Unique values of the timestamp index of fx_data.df.
fx_data.all_IDs

DatetimeIndex(['2020-01-01 22:02:21.078000069',
               '2020-01-01 22:03:39.944999933',
               '2020-01-01 22:05:01.982000113',
               '2020-01-01 22:06:01.993000031',
                         '2020-01-01 22:07:02',
               '2020-01-01 22:08:02.361999989',
               '2020-01-01 22:09:02.516000032',
               '2020-01-01 22:10:14.404999971',
               '2020-01-01 22:11:14.525000095',
               '2020-01-01 22:12:14.578999996',
               ...
               '2024-12-30 23:49:28.072999954',
               '2024-12-30 23:50:45.842000008',
               '2024-12-30 23:51:47.213000059',
               '2024-12-30 23:52:48.740999937',
               '2024-12-30 23:53:52.779000044',
               '2024-12-30 23:55:02.053999901',
               '2024-12-30 23:56:02.312999964',
               '2024-12-30 23:57:11.861000061',
               '2024-12-30 23:58:13.168999910',
               '2024-12-30 23:59:13.348999977'],
              dtype=

In [41]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset

class DatasetSplitter:
    """Cut a dataset into contiguous train / validation / test subsets.

    Nothing is shuffled: the leading ``train_ratio`` share of positions forms
    the training subset, the following ``val_ratio`` share the validation
    subset, and every remaining position the test subset.
    """

    def __init__(self, dataset, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
        """Store the dataset, its positional indices and the three ratios.

        Args:
            dataset: any object supporting ``len()``.
            train_ratio: share of positions for training.
            val_ratio: share of positions for validation.
            test_ratio: only used to check that the ratios sum to 1.

        Raises:
            AssertionError: if the ratios do not sum to 1 (tolerance 1e-6).
        """
        ratio_sum = train_ratio + val_ratio + test_ratio
        assert abs(ratio_sum - 1.0) < 1e-6
        self.dataset = dataset
        self.indices = np.arange(len(dataset))
        self.train_ratio, self.val_ratio, self.test_ratio = (
            train_ratio,
            val_ratio,
            test_ratio,
        )

    def get_splits(self):
        """Return ``(train, val, test)`` as ``torch.utils.data.Subset`` objects.

        Train and validation sizes are truncated with ``int()``; the test
        subset receives whatever is left.
        """
        n_items = len(self.indices)
        val_start = int(n_items * self.train_ratio)
        test_start = val_start + int(n_items * self.val_ratio)

        # (start, stop) pairs for the three consecutive slices.
        edges = ((None, val_start), (val_start, test_start), (test_start, None))
        return tuple(
            Subset(self.dataset, self.indices[lo:hi]) for lo, hi in edges
        )

In [43]:
# Positional split of the timestamp-indexed frame using the default ratios.
# NOTE(review): torch's Subset fetches items as `dataset[idx]`; on a DataFrame an
# integer key is a column lookup, not a row lookup — confirm the subsets are only
# consumed through `.indices` (e.g. with `.iloc`) or wrap the frame in a Dataset.
DatasetSplitter(fx_data.df).get_splits()

(<torch.utils.data.dataset.Subset at 0x72d1315ae2a0>,
 <torch.utils.data.dataset.Subset at 0x72d131563e30>,
 <torch.utils.data.dataset.Subset at 0x72d131111af0>)