In [1]:
import pandas as pd
import numpy as np

import os

In [2]:
from pathlib import Path

# --- Dataset identity: edit these to point the notebook at another dataset ---
SOURCE = "dukascopy"
SYMBOL = "USDJPY"
MINUTES = 1
EVENT = "58m-dollar"
START_DATE = "20210101"
END_DATE = "20241231"

# File-name stem shared by every artifact: <symbol>-<minutes>m-<start>-<end>
RESAMPLED_NAME = "-".join([SYMBOL, f"{MINUTES}m", START_DATE, END_DATE])
# Alternative stem built from SOURCE and EVENT (currently disabled):
# RESAMPLED_NAME = "-".join([SOURCE, SYMBOL, EVENT, START_DATE, END_DATE])

# --- Directory layout, all relative to the data root ---
BASE_DIR = Path("../data")
RESAMPLED_DIR, PROCESSED_DIR, NORMALIZED_DIR, SCALER_DIR = (
    BASE_DIR / folder
    for folder in ("resampled", "processed", "normalized", "scalers")
)

# --- Concrete files: one input (processed) and three outputs ---
PROCESSED_FILE_PATH = PROCESSED_DIR.joinpath(RESAMPLED_NAME + "_processed.pkl")
NORMALIZED_FILE_PATH = NORMALIZED_DIR.joinpath(RESAMPLED_NAME + "_normalized.pkl")
STD_SCALER_PATH = SCALER_DIR.joinpath(RESAMPLED_NAME + "_standard_scaler.pkl")
MINMAX_SCALER_PATH = SCALER_DIR.joinpath(RESAMPLED_NAME + "_minmax_scaler.pkl")

In [3]:
# Load the processed feature frame for the configured dataset stem and preview it.
# NOTE(review): unpickling can execute arbitrary code; only load files from a
# trusted source (presumably written by an earlier processing step; confirm).
df = pd.read_pickle(PROCESSED_FILE_PATH)
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,spread,log_volume,close_delta,close_return,close_log_return,...,hour_cos,dow,dow_sin,dow_cos,dom,dom_sin,dom_cos,month,month_sin,month_cos
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-03 22:47:00,103.2105,103.2165,103.2045,103.2135,36890000000.0,0.031562,24.331206,0.0,0.0,0.0,...,0.866025,6,-0.781831,0.62349,3,0.571268,0.820763,1,0.5,0.866025
2021-01-03 22:48:00,103.2155,103.2175,103.2095,103.216,57100000000.0,0.02881,24.76807,0.0025,2.4e-05,2.4e-05,...,0.866025,6,-0.781831,0.62349,3,0.571268,0.820763,1,0.5,0.866025
2021-01-03 22:49:00,103.212,103.214,103.2105,103.212,57910000000.0,0.03795,24.782156,-0.004,-3.9e-05,-3.9e-05,...,0.866025,6,-0.781831,0.62349,3,0.571268,0.820763,1,0.5,0.866025
2021-01-03 22:50:00,103.2105,103.2135,103.2105,103.212,28590000000.0,0.037818,24.076323,0.0,0.0,0.0,...,0.866025,6,-0.781831,0.62349,3,0.571268,0.820763,1,0.5,0.866025
2021-01-03 22:51:00,103.207,103.2135,103.2045,103.2115,22050000000.0,0.034875,23.816578,-0.0005,-5e-06,-5e-06,...,0.866025,6,-0.781831,0.62349,3,0.571268,0.820763,1,0.5,0.866025


In [4]:
# List every available column; the feature groups defined below are chosen from these.
df.columns

Index(['open', 'high', 'low', 'close', 'volume', 'spread', 'log_volume',
       'close_delta', 'close_return', 'close_log_return', 'ret_mean_5',
       'ret_mean_10', 'ema5', 'ema5_slope', 'ema20', 'ema20_slope', 'atr14',
       'atr20', 'vol_adj_return', 'close_to_atr', 'bb_upper', 'bb_lower',
       'bb_mavg', 'bb_width', 'bb_position', 'donchian_upper',
       'donchian_lower', 'donchian_mid', 'donchian_width', 'stoch_k',
       'stoch_d', 'rsi14', 'macd', 'macd_signal', 'macd_diff', 'unix_time',
       'hour', 'hour_sin', 'hour_cos', 'dow', 'dow_sin', 'dow_cos', 'dom',
       'dom_sin', 'dom_cos', 'month', 'month_sin', 'month_cos'],
      dtype='object')

In [5]:
# Summary statistics BEFORE scaling (baseline to compare against after normalization).
df.describe()

Unnamed: 0,open,high,low,close,volume,spread,log_volume,close_delta,close_return,close_log_return,...,hour_cos,dow,dow_sin,dow_cos,dom,dom_sin,dom_cos,month,month_sin,month_cos
count,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,...,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0
mean,133.3599,133.3709,133.3489,133.36,462173400000.0,0.007371343,26.24949,3.617044e-05,2.947861e-07,2.820617e-07,...,-0.003159345,2.043148,0.3444055,-0.04867108,15.70863,0.002627637,-0.02241271,6.520214,-0.00498098,-0.006688097
std,16.75104,16.75366,16.74836,16.75104,480436600000.0,0.01091137,1.300103,0.02215688,0.0001595134,0.000159533,...,0.7070467,1.499731,0.5273959,0.7751581,8.773026,0.7139855,0.6997972,3.434585,0.7077746,0.7063896
min,102.6025,102.6055,102.5925,102.6025,0.0,0.001,0.0,-2.065,-0.01380804,-0.01390426,...,-1.0,0.0,-0.7818315,-0.9009689,1.0,-0.9987165,-0.9948693,1.0,-1.0,-1.0
25%,114.5905,114.5965,114.584,114.5905,119740000000.0,0.003973684,25.50859,-0.0075,-5.791981e-05,-5.792149e-05,...,-0.7071068,1.0,0.0,-0.9009689,8.0,-0.7247928,-0.7587581,4.0,-0.8660254,-0.8660254
50%,136.5375,136.5507,136.524,136.5375,303970000000.0,0.005715596,26.44019,0.0,0.0,0.0,...,-1.83697e-16,2.0,0.4338837,-0.2225209,16.0,-2.449294e-16,-0.05064917,7.0,-2.449294e-16,-1.83697e-16
75%,147.704,147.7155,147.692,147.704,646320000000.0,0.007305263,27.19456,0.008,5.96859e-05,5.968412e-05,...,0.7071068,3.0,0.7818315,0.6234898,23.0,0.7247928,0.6889669,9.0,0.5,0.5
max,161.95,161.951,161.9435,161.9495,5114110000000.0,0.374,29.26302,1.5795,0.01228753,0.01221265,...,1.0,6.0,0.9749279,1.0,31.0,0.9987165,1.0,12.0,1.0,1.0


In [6]:
# Column groups used to decide which scaler each feature goes through.
# Order inside each list is significant: it fixes the column order the
# scalers are fitted with.

FEATURES_COLS = [
    "close_log_return", "log_volume", "spread",  # basic data
    "ret_mean_5", "ret_mean_10",                 # other
]

# Cosine components of the time encodings (the matching *_sin columns also
# exist in the frame). Not referenced by the scaling cells in this notebook.
TIME_FEATURE_COLS = ["hour_cos", "dow_cos", "dom_cos", "month_cos"]

# Technical indicators routed to the min-max scaler.
TA_MIN_MAX = ["rsi14"]

# Technical indicators routed to the standard scaler, grouped by family.
TA_STD = [
    "ema5", "ema5_slope", "ema20", "ema20_slope",
    "atr14", "atr20", "vol_adj_return", "close_to_atr",
    "bb_upper", "bb_lower", "bb_mavg", "bb_width", "bb_position",
    "donchian_upper", "donchian_lower", "donchian_mid", "donchian_width",
    "stoch_k", "stoch_d",
    "macd", "macd_signal", "macd_diff",
]

In [7]:
# Columns for the standard scaler: base features first, then the TA_STD indicators.
COLS_TO_STD = [*FEATURES_COLS, *TA_STD]
# Columns for the min-max scaler: the TA_MIN_MAX list itself (same object, not a copy).
COLS_TO_MIN_MAX = TA_MIN_MAX

In [8]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# One scaler per column group; both are fitted in the next cell and saved at the end.
# Constructor arguments are scikit-learn's defaults, written out for readability.
standard_scaler = StandardScaler(with_mean=True, with_std=True)
minmax_scaler = MinMaxScaler(feature_range=(0, 1))

In [9]:
# Fit each scaler on its column group and overwrite those columns in `df`.
#
# This cell mutates `df` in place, so it must run exactly once per freshly
# loaded frame. Running it again would refit the scalers on already-scaled
# values, and the scalers saved at the end of the notebook would then no longer
# map raw data to the normalized data. Fail loudly instead of silently
# corrupting the artifacts.
if hasattr(standard_scaler, "scale_") or hasattr(minmax_scaler, "scale_"):
    raise RuntimeError(
        "Scalers are already fitted, so `df` has probably been scaled already. "
        "Reload `df` and re-create the scalers before running this cell again."
    )

# NOTE(review): statistics are fitted on the full frame (entire date range).
# If the data is later split into train/test by time, consider fitting on the
# training span only to avoid look-ahead leakage; confirm against downstream use.
df[COLS_TO_STD] = standard_scaler.fit_transform(df[COLS_TO_STD])
df[COLS_TO_MIN_MAX] = minmax_scaler.fit_transform(df[COLS_TO_MIN_MAX])

In [10]:
# Summary statistics AFTER scaling; only the columns in the scaler groups should have changed.
df.describe()

Unnamed: 0,open,high,low,close,volume,spread,log_volume,close_delta,close_return,close_log_return,...,hour_cos,dow,dow_sin,dow_cos,dom,dom_sin,dom_cos,month,month_sin,month_cos
count,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,...,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0,1487264.0
mean,133.3599,133.3709,133.3489,133.36,462173400000.0,-4.8921760000000005e-17,1.174122e-15,3.617044e-05,2.947861e-07,1.3510810000000002e-17,...,-0.003159345,2.043148,0.3444055,-0.04867108,15.70863,0.002627637,-0.02241271,6.520214,-0.00498098,-0.006688097
std,16.75104,16.75366,16.74836,16.75104,480436600000.0,1.0,1.0,0.02215688,0.0001595134,1.0,...,0.7070467,1.499731,0.5273959,0.7751581,8.773026,0.7139855,0.6997972,3.434585,0.7077746,0.7063896
min,102.6025,102.6055,102.5925,102.6025,0.0,-0.5839181,-20.19033,-2.065,-0.01380804,-87.15783,...,-1.0,0.0,-0.7818315,-0.9009689,1.0,-0.9987165,-0.9948693,1.0,-1.0,-1.0
25%,114.5905,114.5965,114.584,114.5905,119740000000.0,-0.3113872,-0.5698804,-0.0075,-5.791981e-05,-0.3648372,...,-0.7071068,1.0,0.0,-0.9009689,8.0,-0.7247928,-0.7587581,4.0,-0.8660254,-0.8660254
50%,136.5375,136.5507,136.524,136.5375,303970000000.0,-0.1517452,0.1466835,0.0,0.0,-0.001768047,...,-1.83697e-16,2.0,0.4338837,-0.2225209,16.0,-2.449294e-16,-0.05064917,7.0,-2.449294e-16,-1.83697e-16
75%,147.704,147.7155,147.692,147.704,646320000000.0,-0.006056067,0.7269192,0.008,5.96859e-05,0.3723498,...,0.7071068,3.0,0.7818315,0.6234898,23.0,0.7247928,0.6889669,9.0,0.5,0.5
max,161.95,161.951,161.9435,161.9495,5114110000000.0,33.60063,2.31792,1.5795,0.01228753,76.55074,...,1.0,6.0,0.9749279,1.0,31.0,0.9987165,1.0,12.0,1.0,1.0


In [11]:
# Sanity check on the standardized columns.
standardized = df[COLS_TO_STD]
print(standardized.mean())         # Should be ~0
print(standardized.std(ddof=0))    # Should be ~1 (ddof=0: population std)

close_log_return    1.351081e-17
log_volume          1.174122e-15
spread             -4.892176e-17
ret_mean_5          6.115220e-18
ret_mean_10        -1.375925e-18
ema5               -6.261986e-16
ema5_slope          3.172271e-18
ema20              -3.130993e-16
ema20_slope         3.745572e-18
atr14              -1.956870e-17
atr20              -2.152558e-16
vol_adj_return     -7.644025e-19
close_to_atr        1.375925e-17
bb_upper           -4.696489e-16
bb_lower            7.827482e-17
bb_mavg            -1.565496e-16
bb_width           -6.849047e-17
bb_position         2.323784e-17
donchian_upper      4.696489e-16
donchian_lower     -3.913741e-16
donchian_mid        7.827482e-17
donchian_width     -5.870611e-17
stoch_k            -1.481412e-16
stoch_d            -2.996458e-17
macd               -2.216767e-18
macd_signal         5.733019e-18
macd_diff          -6.505782e-18
dtype: float64
close_log_return    1.0
log_volume          1.0
spread              1.0
ret_mean_5          1.

In [12]:
import joblib
# Persist the fitted scalers and the normalized frame.
# Create the target folders first: on a fresh checkout they may not exist,
# and joblib.dump / to_pickle would otherwise fail with FileNotFoundError.
for output_path in (STD_SCALER_PATH, MINMAX_SCALER_PATH, NORMALIZED_FILE_PATH):
    output_path.parent.mkdir(parents=True, exist_ok=True)

# The scalers are saved so the same transformation can be re-applied later.
joblib.dump(standard_scaler, STD_SCALER_PATH)
joblib.dump(minmax_scaler, MINMAX_SCALER_PATH)
df.to_pickle(NORMALIZED_FILE_PATH)