In [1]:
import pandas as pd
import numpy as np

import os

In [2]:
from pathlib import Path

# --- Dataset identity: edit these to target a different extract ---
SOURCE = "dukascopy"
SYMBOL = "usdjpy"
MINUTES = 15
START_DATE = "2020-01-01"
END_DATE = "2024-12-31"

# Shared file-name stem for every artifact of this extract,
# i.e. "<source>-<symbol>-<minutes>m-<start>-<end>".
RESAMPLED_NAME = "-".join([SOURCE, SYMBOL, f"{MINUTES}m", START_DATE, END_DATE])

# --- Directory layout, everything rooted at BASE_DIR ---
BASE_DIR = Path("../data")
RESAMPLED_DIR = BASE_DIR / "resampled"
PROCESSED_DIR = BASE_DIR / "processed"
NORMALIZED_DIR = BASE_DIR / "normalized"
SCALER_DIR = BASE_DIR / "scalers"

# --- Artifact files ---
# Input: the processed frame this notebook loads.
PROCESSED_FILE_PATH = PROCESSED_DIR / (RESAMPLED_NAME + "_processed.pkl")
# Outputs: the scaled frame and the two fitted scalers saved in the last cell.
NORMALIZED_FILE_PATH = NORMALIZED_DIR / (RESAMPLED_NAME + "_normalized.pkl")
STD_SCALER_PATH = SCALER_DIR / (RESAMPLED_NAME + "_standard_scaler.pkl")
MINMAX_SCALER_PATH = SCALER_DIR / (RESAMPLED_NAME + "_minmax_scaler.pkl")

In [3]:
# Load the processed frame from PROCESSED_FILE_PATH and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code; presumably this file is
# produced by this project's own pipeline — confirm before loading a pickle
# obtained from anywhere else.
df = pd.read_pickle(PROCESSED_FILE_PATH)
df.head()

Unnamed: 0,timestamp,open,high,low,close,volume,spread,log_volume,close_delta,close_return,...,dom_sin,dom_cos,month,month_sin,month_cos,label,train_label,3b_label,3b_train_label,ttrain_label
0,2020-01-02 12:45:00+00:00,108.789,108.7955,108.7635,108.787,1398160.0,0.001986,14.150668,-0.001,-9e-06,...,0.394356,0.918958,1,0.5,0.866025,0,0,-1,0,1
1,2020-01-02 13:00:00+00:00,108.787,108.79,108.772,108.778,1224630.0,0.001864,14.01815,-0.009,-8.3e-05,...,0.394356,0.918958,1,0.5,0.866025,0,0,-1,0,1
2,2020-01-02 13:15:00+00:00,108.777,108.777,108.709,108.727,2424340.0,0.001921,14.70107,-0.051,-0.000469,...,0.394356,0.918958,1,0.5,0.866025,-1,0,-1,0,0
3,2020-01-02 13:30:00+00:00,108.726,108.739,108.657,108.6945,2223350.0,0.002018,14.614526,-0.0325,-0.000299,...,0.394356,0.918958,1,0.5,0.866025,-1,1,-1,0,0
4,2020-01-02 13:45:00+00:00,108.694,108.7245,108.6885,108.694,6457450.0,0.002147,15.680745,-0.0005,-5e-06,...,0.394356,0.918958,1,0.5,0.866025,0,0,-1,0,1


In [4]:
# Summary statistics of the processed frame before any scaling is applied;
# the same call is repeated after the scaling cell for comparison.
df.describe()

Unnamed: 0,open,high,low,close,volume,spread,log_volume,close_delta,close_return,close_log_return,...,dom_sin,dom_cos,month,month_sin,month_cos,label,train_label,3b_label,3b_train_label,ttrain_label
count,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0,...,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0
mean,127.990416,128.035766,127.943652,127.99086,6337400.0,0.006639,15.18226,0.000394,3e-06,3e-06,...,0.002353235,-0.022177,6.521973,-0.004503608,-0.004841234,0.017354,1.018484,0.023938,1.023938,1.017354
std,18.3986,18.410248,18.38579,18.398718,6365523.0,0.009127,1.066178,0.081064,0.000601,0.000602,...,0.7139405,0.699857,3.439884,0.7071395,0.7070488,0.795467,0.823626,0.797676,0.797676,0.795467
min,101.349,101.954,101.182,101.349,0.0,0.001347,0.0,-3.8855,-0.028329,-0.028738,...,-0.9987165,-0.994869,1.0,-1.0,-1.0,-1.0,0.0,-1.0,0.0,0.0
25%,109.436,109.4605,109.4105,109.4355,1971010.0,0.003175,14.494057,-0.027,-0.000217,-0.000217,...,-0.7247928,-0.758758,4.0,-0.8660254,-0.8660254,-1.0,0.0,-1.0,0.0,0.0
50%,130.7905,130.857,130.7255,130.7905,4233180.0,0.005348,15.258464,0.001,7e-06,7e-06,...,-2.449294e-16,-0.050649,7.0,-2.449294e-16,-1.83697e-16,0.0,1.0,0.0,1.0,1.0
75%,145.526,145.5805,145.471,145.527,8654850.0,0.006976,15.973631,0.0295,0.000232,0.000232,...,0.7247928,0.688967,10.0,0.5,0.5,1.0,2.0,1.0,2.0,2.0
max,161.941,161.951,161.9175,161.94,239390500.0,0.276667,19.293607,2.052,0.014463,0.01436,...,0.9987165,1.0,12.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0


In [5]:
# Base feature columns; concatenated with TA_STD to form COLS_TO_STD.
FEATURES_COLS = [
    "close_log_return", "log_volume", "spread",  # basic data
    "ret_mean_5", "ret_mean_10",                 # other
]

# `*_cos` time columns. They are not part of either scaler column list
# (COLS_TO_STD / COLS_TO_MIN_MAX) built in the next cell.
TIME_FEATURE_COLS = ["hour_cos", "dow_cos", "dom_cos", "month_cos"]

# TA column(s) that COLS_TO_MIN_MAX points at.
TA_MIN_MAX = ["rsi_14"]

# TA columns appended to FEATURES_COLS in COLS_TO_STD, one name group per line.
TA_STD = [
    "ema_21", "sma_50", "atr_14",
    "bb_upper", "bb_lower", "bb_mavg", "bb_width",
    "donchian_upper", "donchian_lower", "donchian_mid",
    "stoch_k", "stoch_d",
    "macd", "macd_signal", "macd_diff",
]

In [6]:
# Route columns to scalers: the min-max list is the TA_MIN_MAX list itself
# (same object), the standardization list is a new list of FEATURES_COLS
# followed by TA_STD.
COLS_TO_MIN_MAX = TA_MIN_MAX
COLS_TO_STD = [*FEATURES_COLS, *TA_STD]

In [7]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Fresh, unfitted scalers with default settings: one for the columns in
# COLS_TO_STD, one for the columns in COLS_TO_MIN_MAX.
standard_scaler, minmax_scaler = StandardScaler(), MinMaxScaler()

In [8]:
# Fit each scaler on its column group and overwrite those columns in `df`
# with the scaled values.
# NOTE(review): both scalers are fitted on every row of `df`. If a later stage
# holds out a validation/test period, its statistics leak into the scaler
# parameters — confirm whether fitting should be restricted to training rows.
# NOTE(review): this cell mutates `df` in place. Running it a second time
# without reloading `df` re-fits the scalers on already-scaled values, so the
# scaler objects saved later would no longer describe the raw data.
df[COLS_TO_STD] = standard_scaler.fit(df[COLS_TO_STD]).transform(df[COLS_TO_STD])
df[COLS_TO_MIN_MAX] = minmax_scaler.fit(df[COLS_TO_MIN_MAX]).transform(df[COLS_TO_MIN_MAX])

In [9]:
# Summary statistics after scaling. describe() reports the sample standard
# deviation, so standardized columns show a value marginally above 1
# (1.000004 in the recorded output) rather than exactly 1.
df.describe()

Unnamed: 0,open,high,low,close,volume,spread,log_volume,close_delta,close_return,close_log_return,...,dom_sin,dom_cos,month,month_sin,month_cos,label,train_label,3b_label,3b_train_label,ttrain_label
count,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0,...,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0,123945.0
mean,127.990416,128.035766,127.943652,127.99086,6337400.0,-1.797783e-16,1.429971e-15,0.000394,3e-06,-7.165908e-18,...,0.002353235,-0.022177,6.521973,-0.004503608,-0.004841234,0.017354,1.018484,0.023938,1.023938,1.017354
std,18.3986,18.410248,18.38579,18.398718,6365523.0,1.000004,1.000004,0.081064,0.000601,1.000004,...,0.7139405,0.699857,3.439884,0.7071395,0.7070488,0.795467,0.823626,0.797676,0.797676,0.795467
min,101.349,101.954,101.182,101.349,0.0,-0.5798381,-14.23994,-3.8855,-0.028329,-47.76263,...,-0.9987165,-0.994869,1.0,-1.0,-1.0,-1.0,0.0,-1.0,0.0,0.0
25%,109.436,109.4605,109.4105,109.4355,1971010.0,-0.3795095,-0.6454883,-0.027,-0.000217,-0.3651262,...,-0.7247928,-0.758758,4.0,-0.8660254,-0.8660254,-1.0,0.0,-1.0,0.0,0.0
50%,130.7905,130.857,130.7255,130.7905,4233180.0,-0.1414836,0.07147442,0.001,7e-06,0.006416603,...,-2.449294e-16,-0.050649,7.0,-2.449294e-16,-1.83697e-16,0.0,1.0,0.0,1.0,1.0
75%,145.526,145.5805,145.471,145.527,8654850.0,0.03693064,0.7422525,0.0295,0.000232,0.3807668,...,0.7247928,0.688967,10.0,0.5,0.5,1.0,2.0,1.0,2.0,2.0
max,161.941,161.951,161.9175,161.94,239390500.0,29.58454,3.856168,2.052,0.014463,23.85868,...,0.9987165,1.0,12.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0


In [10]:
# Sanity-check the scaling. The two prints are kept for the reader; the
# assertions make a "Restart & Run All" fail loudly if the scaling is off
# (e.g. a constant column, or the wrong frame being inspected).
standardized_means = df[COLS_TO_STD].mean()
standardized_stds = df[COLS_TO_STD].std(ddof=0)  # ddof=0: population std
print(standardized_means)         # Should be ~0
print(standardized_stds)          # Should be ~1

assert np.allclose(standardized_means, 0.0, atol=1e-6), \
    "standardized columns are not zero-mean"
assert np.allclose(standardized_stds, 1.0, atol=1e-6), \
    "standardized columns do not have unit standard deviation"

# The min-max scaled columns were previously not checked at all: with the
# default MinMaxScaler range they should span exactly [0, 1].
assert np.allclose(df[COLS_TO_MIN_MAX].min(), 0.0, atol=1e-6), \
    "min-max scaled columns do not start at 0"
assert np.allclose(df[COLS_TO_MIN_MAX].max(), 1.0, atol=1e-6), \
    "min-max scaled columns do not end at 1"

close_log_return   -7.165908e-18
log_volume          1.429971e-15
spread             -1.797783e-16
ret_mean_5          4.356872e-18
ret_mean_10         1.238269e-17
ema_21             -1.445564e-15
sma_50             -5.503417e-17
atr_14              1.866576e-16
bb_upper            1.161221e-15
bb_lower           -1.456571e-15
bb_mavg             1.357510e-15
bb_width            1.801223e-16
donchian_upper     -7.337889e-17
donchian_lower     -1.052987e-15
donchian_mid       -2.531572e-16
stoch_k             2.678330e-16
stoch_d             1.016986e-16
macd               -9.630980e-18
macd_signal        -9.630980e-18
macd_diff           2.465072e-18
dtype: float64
close_log_return    1.0
log_volume          1.0
spread              1.0
ret_mean_5          1.0
ret_mean_10         1.0
ema_21              1.0
sma_50              1.0
atr_14              1.0
bb_upper            1.0
bb_lower            1.0
bb_mavg             1.0
bb_width            1.0
donchian_upper      1.0
donchian_lowe

In [11]:
import joblib

# Persist the two fitted scalers and the scaled frame.
# The target directories are created first: joblib.dump and to_pickle do not
# create missing parent directories, so on a fresh checkout (no scalers/ or
# normalized/ folder yet) the cell would otherwise fail with FileNotFoundError
# after all the preceding work.
for artifact_path in (STD_SCALER_PATH, MINMAX_SCALER_PATH, NORMALIZED_FILE_PATH):
    artifact_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(standard_scaler, STD_SCALER_PATH)
joblib.dump(minmax_scaler, MINMAX_SCALER_PATH)
df.to_pickle(NORMALIZED_FILE_PATH)