In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

import os

In [2]:
from pathlib import Path

# Configurable parts
SOURCE = "dukascopy"
SYMBOL = "USDJPY"
MINUTES = 1
EVENT = '58m-dollar'
START_DATE = "20210101"
END_DATE = "20241231"

# Build base name
RESAMPLED_NAME = f"{SYMBOL}-{MINUTES}m-{START_DATE}-{END_DATE}"
# RESAMPLED_NAME = f"{SOURCE}-{SYMBOL}-{EVENT}-{START_DATE}-{END_DATE}"

# Base directories
BASE_DIR = Path("../data")
RESAMPLED_DIR = BASE_DIR / "resampled"
PROCESSED_DIR = BASE_DIR / "processed"

# Final paths
RESAMPLED_FILE_PATH = RESAMPLED_DIR / f"{RESAMPLED_NAME}.pkl"
PROCESSED_FILE_PATH = PROCESSED_DIR / f"{RESAMPLED_NAME}_processed.pkl"

In [3]:
df = pd.read_pickle(PROCESSED_FILE_PATH)
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,spread,log_volume,close_delta,close_return,close_log_return,ret_mean_5,ret_mean_10
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-01-03 22:10:00,103.227,103.2355,103.227,103.227,6870000000.0,0.023,22.65043,0.0045,4.4e-05,4.4e-05,3.197094e-05,5.3e-05
2021-01-03 22:11:00,103.227,103.228,103.227,103.228,7900000000.0,0.028667,22.790129,0.001,1e-05,1e-05,3.874955e-06,5.2e-05
2021-01-03 22:12:00,103.228,103.228,103.2255,103.2255,8750000000.0,0.02725,22.89232,-0.0025,-2.4e-05,-2.4e-05,5.812602e-06,-7e-06
2021-01-03 22:13:00,103.2275,103.2275,103.2275,103.2275,2000000000.0,0.029,21.416413,0.002,1.9e-05,1.9e-05,-9.687318e-07,3.1e-05
2021-01-03 22:14:00,103.228,103.228,103.228,103.228,2000000000.0,0.03,21.416413,0.0005,5e-06,5e-06,1.065631e-05,2e-05


### Reset Timestamp into Column

In [4]:
df = df.reset_index()

# Techinical Indicator

In [5]:
import numpy as np
from ta.volatility import AverageTrueRange, BollingerBands, DonchianChannel
from ta.momentum import StochasticOscillator, RSIIndicator
from ta.trend import MACD, EMAIndicator, SMAIndicator

### EMA & SMA

In [6]:
# 5-period EMA
ema5 = EMAIndicator(close=df['close'], window=5)
df['ema5'] = ema5.ema_indicator()
ema5_slope = df['ema5'].diff()
df['ema5_slope'] = ema5_slope
# 20-period EMA
ema20 = EMAIndicator(close=df['close'], window=20)
df['ema20'] = ema20.ema_indicator()
ema20_slope = df['ema20'].diff()
df['ema20_slope'] = ema20_slope

### ATR

In [7]:
atr14 = AverageTrueRange(
    high=df['high'],
    low=df['low'],
    close=df['close'],
    window=14
)
df['atr14'] = atr14.average_true_range()

atr20 = AverageTrueRange(
    high=df['high'],
    low=df['low'],
    close=df['close'],
    window=20
)
df['atr20'] = atr20.average_true_range()
df['vol_adj_return'] = df['close_log_return'] / df['atr20']
df['close_to_atr'] = df['close_delta'] / df['atr20']

### BollingerBands

In [8]:
bb = BollingerBands(
    close=df['close'],
    window=20,
    window_dev=2
)
df['bb_upper'] = bb.bollinger_hband()
df['bb_lower'] = bb.bollinger_lband()
df['bb_mavg'] = bb.bollinger_mavg()
df['bb_width'] = df['bb_upper'] - df['bb_lower']
df['bb_position'] = (df['close'] - df['bb_lower']) / df['bb_width']

### Donchian Channel

In [9]:
dc = DonchianChannel(
    high=df['high'],
    low=df['low'],
    close=df['close'],
    window=20
)
df['donchian_upper'] = dc.donchian_channel_hband()
df['donchian_lower'] = dc.donchian_channel_lband()
df['donchian_mid'] = dc.donchian_channel_mband()
df['donchian_width'] = df['donchian_upper'] - df['donchian_lower']

### Stochastic Oscillator

In [10]:
stoch = StochasticOscillator(
    high=df['high'],
    low=df['low'],
    close=df['close'],
    window=14,
    smooth_window=3
)
df['stoch_k'] = stoch.stoch()
df['stoch_d'] = stoch.stoch_signal()

### RSI (Relative Strength Index)

In [11]:
rsi = RSIIndicator(close=df['close'], window=14)
df['rsi14'] = rsi.rsi()


### MACD

In [12]:
macd = MACD(close=df['close'], window_slow=26, window_fast=12, window_sign=9)
df['macd'] = macd.macd()
df['macd_signal'] = macd.macd_signal()
df['macd_diff'] = macd.macd_diff()

# Time Features

In [13]:
df['unix_time'] = df['timestamp'].astype('int64') / 1e9

In [14]:
df['hour'] = df['timestamp'].dt.hour
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

In [15]:
df['dow'] = df['timestamp'].dt.dayofweek  # Monday=0
df['dow_sin'] = np.sin(2 * np.pi * df['dow'] / 7)
df['dow_cos'] = np.cos(2 * np.pi * df['dow'] / 7)

In [16]:
df['dom'] = df['timestamp'].dt.day
df['dom_sin'] = np.sin(2 * np.pi * df['dom'] / 31)
df['dom_cos'] = np.cos(2 * np.pi * df['dom'] / 31)

In [17]:
df['month'] = df['timestamp'].dt.month
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

## Drop NaN

In [18]:
df.isna().sum()

timestamp            0
open                 0
high                 0
low                  0
close                0
volume               0
spread               0
log_volume           0
close_delta          0
close_return         0
close_log_return     0
ret_mean_5           0
ret_mean_10          0
ema5                 4
ema5_slope           5
ema20               19
ema20_slope         20
atr14                0
atr20                0
vol_adj_return       2
close_to_atr         2
bb_upper            19
bb_lower            19
bb_mavg             19
bb_width            19
bb_position         19
donchian_upper      19
donchian_lower      19
donchian_mid        19
donchian_width      19
stoch_k             13
stoch_d             15
rsi14               13
macd                25
macd_signal         33
macd_diff           33
unix_time            0
hour                 0
hour_sin             0
hour_cos             0
dow                  0
dow_sin              0
dow_cos              0
dom        

In [19]:
df = df.dropna()
df.isna().sum()

timestamp           0
open                0
high                0
low                 0
close               0
volume              0
spread              0
log_volume          0
close_delta         0
close_return        0
close_log_return    0
ret_mean_5          0
ret_mean_10         0
ema5                0
ema5_slope          0
ema20               0
ema20_slope         0
atr14               0
atr20               0
vol_adj_return      0
close_to_atr        0
bb_upper            0
bb_lower            0
bb_mavg             0
bb_width            0
bb_position         0
donchian_upper      0
donchian_lower      0
donchian_mid        0
donchian_width      0
stoch_k             0
stoch_d             0
rsi14               0
macd                0
macd_signal         0
macd_diff           0
unix_time           0
hour                0
hour_sin            0
hour_cos            0
dow                 0
dow_sin             0
dow_cos             0
dom                 0
dom_sin             0
dom_cos   

In [20]:
df = df.set_index('timestamp')

In [21]:
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,spread,log_volume,close_delta,close_return,close_log_return,...,hour_cos,dow,dow_sin,dow_cos,dom,dom_sin,dom_cos,month,month_sin,month_cos
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-03 22:47:00,103.2105,103.2165,103.2045,103.2135,36890000000.0,0.031562,24.331206,0.0,0.0,0.0,...,0.866025,6,-0.781831,0.62349,3,0.571268,0.820763,1,0.5,0.866025
2021-01-03 22:48:00,103.2155,103.2175,103.2095,103.216,57100000000.0,0.02881,24.76807,0.0025,2.4e-05,2.4e-05,...,0.866025,6,-0.781831,0.62349,3,0.571268,0.820763,1,0.5,0.866025
2021-01-03 22:49:00,103.212,103.214,103.2105,103.212,57910000000.0,0.03795,24.782156,-0.004,-3.9e-05,-3.9e-05,...,0.866025,6,-0.781831,0.62349,3,0.571268,0.820763,1,0.5,0.866025
2021-01-03 22:50:00,103.2105,103.2135,103.2105,103.212,28590000000.0,0.037818,24.076323,0.0,0.0,0.0,...,0.866025,6,-0.781831,0.62349,3,0.571268,0.820763,1,0.5,0.866025
2021-01-03 22:51:00,103.207,103.2135,103.2045,103.2115,22050000000.0,0.034875,23.816578,-0.0005,-5e-06,-5e-06,...,0.866025,6,-0.781831,0.62349,3,0.571268,0.820763,1,0.5,0.866025


In [22]:
df.columns

Index(['open', 'high', 'low', 'close', 'volume', 'spread', 'log_volume',
       'close_delta', 'close_return', 'close_log_return', 'ret_mean_5',
       'ret_mean_10', 'ema5', 'ema5_slope', 'ema20', 'ema20_slope', 'atr14',
       'atr20', 'vol_adj_return', 'close_to_atr', 'bb_upper', 'bb_lower',
       'bb_mavg', 'bb_width', 'bb_position', 'donchian_upper',
       'donchian_lower', 'donchian_mid', 'donchian_width', 'stoch_k',
       'stoch_d', 'rsi14', 'macd', 'macd_signal', 'macd_diff', 'unix_time',
       'hour', 'hour_sin', 'hour_cos', 'dow', 'dow_sin', 'dow_cos', 'dom',
       'dom_sin', 'dom_cos', 'month', 'month_sin', 'month_cos'],
      dtype='object')

# Saving the file

In [23]:
df.to_pickle(PROCESSED_FILE_PATH)