In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

import os

In [2]:
from pathlib import Path

# --- Dataset identity: edit these to point the notebook at another extract ---
SOURCE = "dukascopy"
SYMBOL = "usdjpy"
MINUTES = 15
START_DATE = "2020-01-01"
END_DATE = "2024-12-31"

# Shared file stem: <source>-<symbol>-<minutes>m-<start>-<end>
RESAMPLED_NAME = "-".join((SOURCE, SYMBOL, f"{MINUTES}m", START_DATE, END_DATE))

# --- Directory layout, relative to the notebook's working directory ---
BASE_DIR = Path("../data")
RESAMPLED_DIR = BASE_DIR.joinpath("resampled")
PROCESSED_DIR = BASE_DIR.joinpath("processed")

# --- Pickle locations derived from the shared stem ---
RESAMPLED_FILE_PATH = RESAMPLED_DIR / (RESAMPLED_NAME + ".pkl")
PROCESSED_FILE_PATH = PROCESSED_DIR / (RESAMPLED_NAME + "_processed.pkl")

In [3]:
# Load the frame stored at PROCESSED_FILE_PATH and preview its first rows.
# NOTE(review): the last cell of this notebook writes back to this same path, so a
# second run starts from the already-augmented, already-trimmed frame.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
df = pd.read_pickle(PROCESSED_FILE_PATH)
df.head()

Unnamed: 0,timestamp,open,high,low,close,volume,spread,log_volume,close_delta,close_return,close_log_return,ret_mean_5,ret_mean_10
0,2020-01-02 00:30:00+00:00,108.683,108.711,108.682,108.695,862670.0,0.002198,13.667789,0.012,0.00011,0.00011,4e-06,-4.6e-05
1,2020-01-02 00:45:00+00:00,108.695,108.721,108.694,108.7025,801130.0,0.002384,13.59378,0.0075,6.9e-05,6.9e-05,4.9e-05,-1e-05
2,2020-01-02 01:00:00+00:00,108.702,108.732,108.702,108.7095,1456550.0,0.002392,14.191582,0.007,6.4e-05,6.4e-05,-2.9e-05,2.5e-05
3,2020-01-02 01:15:00+00:00,108.7085,108.7085,108.638,108.651,2111600.0,0.002312,14.562957,-0.0585,-0.000538,-0.000538,-9.8e-05,-3.8e-05
4,2020-01-02 01:30:00+00:00,108.652,108.6665,108.646,108.6565,1210730.0,0.002337,14.006735,0.0055,5.1e-05,5.1e-05,-4.9e-05,-4e-06


In [4]:
# Make sure 'timestamp' is a datetime column; the time-feature cells below rely on
# its `.dt` accessor.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Technical Indicators

In [5]:
import numpy as np
from ta.volatility import AverageTrueRange, BollingerBands, DonchianChannel
from ta.momentum import StochasticOscillator, RSIIndicator
from ta.trend import MACD, EMAIndicator, SMAIndicator

### EMA & SMA

In [6]:
# 21-period EMA of the close price
ema_21 = EMAIndicator(close=df['close'], window=21)
df['ema_21'] = ema_21.ema_indicator()

# 50-period SMA of the close price
sma_50 = SMAIndicator(close=df['close'], window=50)
df['sma_50'] = sma_50.sma_indicator()

### ATR

In [7]:
# 14-period Average True Range built from the high/low/close columns.
# NOTE(review): the NaN summary further down reports 0 missing values for atr_14 even
# though the other windowed indicators have NaN warm-up rows -- presumably the
# library emits a non-NaN placeholder there; confirm those rows are removed by the
# later dropna (driven by the longer-window columns).
atr = AverageTrueRange(window=14, high=df['high'], low=df['low'], close=df['close'])
df['atr_14'] = atr.average_true_range()


### BollingerBands

In [8]:
# Bollinger Bands on the close: 20-period window, 2 deviations.
bb = BollingerBands(window=20, window_dev=2, close=df['close'])

upper_band = bb.bollinger_hband()
lower_band = bb.bollinger_lband()

df['bb_upper'] = upper_band
df['bb_lower'] = lower_band
df['bb_mavg'] = bb.bollinger_mavg()
# Band width = upper band minus lower band.
df['bb_width'] = upper_band - lower_band


### Donchian Channel

In [9]:
# 20-period Donchian Channel from the high/low/close columns.
dc = DonchianChannel(window=20, high=df['high'], low=df['low'], close=df['close'])

# Store the upper, lower and middle band series.
df['donchian_upper'] = dc.donchian_channel_hband()
df['donchian_lower'] = dc.donchian_channel_lband()
df['donchian_mid'] = dc.donchian_channel_mband()

### Stochastic Oscillator

In [10]:
# Stochastic Oscillator: 14-period window, 3-period smoothing for the signal line.
stoch = StochasticOscillator(
    close=df['close'],
    high=df['high'],
    low=df['low'],
    smooth_window=3,
    window=14,
)

# stoch_k holds the oscillator itself, stoch_d its signal series.
df['stoch_k'] = stoch.stoch()
df['stoch_d'] = stoch.stoch_signal()

### RSI (Relative Strength Index)

In [11]:
# 14-period Relative Strength Index of the close price.
rsi = RSIIndicator(window=14, close=df['close'])
df['rsi_14'] = rsi.rsi()


### MACD

In [12]:
# MACD on the close price: fast=12, slow=26, signal=9.
macd = MACD(
    close=df['close'],
    window_fast=12,
    window_slow=26,
    window_sign=9,
)

# Store the MACD line, its signal line and the diff series.
df['macd'] = macd.macd()
df['macd_signal'] = macd.macd_signal()
df['macd_diff'] = macd.macd_diff()

# Time Features

In [13]:
# Seconds since the Unix epoch as a float feature.
# Subtracting the epoch and dividing by a one-second Timedelta yields seconds whatever
# resolution the datetime column is stored in; casting to int64 and dividing by 1e9
# is only correct when the underlying integers are nanoseconds.
# The epoch is built with the column's own timezone so tz-aware and naive columns
# are both handled.
epoch = pd.Timestamp(0, tz=df['timestamp'].dt.tz)
df['unix_time'] = (df['timestamp'] - epoch) / pd.Timedelta(seconds=1)

In [14]:
# Hour of day plus its sine/cosine encoding over a 24-hour period.
df['hour'] = df['timestamp'].dt.hour
hour_angle = 2 * np.pi * df['hour'] / 24
df['hour_sin'] = np.sin(hour_angle)
df['hour_cos'] = np.cos(hour_angle)

In [15]:
# Day of week (Monday=0) plus its sine/cosine encoding over a 7-day period.
df['dow'] = df['timestamp'].dt.dayofweek
dow_angle = 2 * np.pi * df['dow'] / 7
df['dow_sin'] = np.sin(dow_angle)
df['dow_cos'] = np.cos(dow_angle)

In [16]:
# Day of month plus its sine/cosine encoding.
# The period is fixed at 31 for every month, regardless of the month's real length.
df['dom'] = df['timestamp'].dt.day
dom_angle = 2 * np.pi * df['dom'] / 31
df['dom_sin'] = np.sin(dom_angle)
df['dom_cos'] = np.cos(dom_angle)

In [17]:
# Calendar month plus its sine/cosine encoding over a 12-month period.
df['month'] = df['timestamp'].dt.month
month_angle = 2 * np.pi * df['month'] / 12
df['month_sin'] = np.sin(month_angle)
df['month_cos'] = np.cos(month_angle)

## Drop NaN

In [18]:
# Count missing values per column; the windowed indicators leave NaN in their
# leading (warm-up) rows, which is what the non-zero counts below reflect.
df.isna().sum()

timestamp            0
open                 0
high                 0
low                  0
close                0
volume               0
spread               0
log_volume           0
close_delta          0
close_return         0
close_log_return     0
ret_mean_5           0
ret_mean_10          0
ema_21              20
sma_50              49
atr_14               0
bb_upper            19
bb_lower            19
bb_mavg             19
bb_width            19
donchian_upper      19
donchian_lower      19
donchian_mid        19
stoch_k             13
stoch_d             15
rsi_14              13
macd                25
macd_signal         33
macd_diff           33
unix_time            0
hour                 0
hour_sin             0
hour_cos             0
dow                  0
dow_sin              0
dow_cos              0
dom                  0
dom_sin              0
dom_cos              0
month                0
month_sin            0
month_cos            0
dtype: int64

In [19]:
# Remove every row that has a missing value in any column, then re-check the
# per-column NaN counts (all should now be zero).
df = df.dropna(axis=0, how='any')
df.isna().sum()

timestamp           0
open                0
high                0
low                 0
close               0
volume              0
spread              0
log_volume          0
close_delta         0
close_return        0
close_log_return    0
ret_mean_5          0
ret_mean_10         0
ema_21              0
sma_50              0
atr_14              0
bb_upper            0
bb_lower            0
bb_mavg             0
bb_width            0
donchian_upper      0
donchian_lower      0
donchian_mid        0
stoch_k             0
stoch_d             0
rsi_14              0
macd                0
macd_signal         0
macd_diff           0
unix_time           0
hour                0
hour_sin            0
hour_cos            0
dow                 0
dow_sin             0
dow_cos             0
dom                 0
dom_sin             0
dom_cos             0
month               0
month_sin           0
month_cos           0
dtype: int64

In [20]:
# Display the cleaned frame; the rendered output is abbreviated to the first and last
# rows/columns.
df

Unnamed: 0,timestamp,open,high,low,close,volume,spread,log_volume,close_delta,close_return,...,hour_cos,dow,dow_sin,dow_cos,dom,dom_sin,dom_cos,month,month_sin,month_cos
49,2020-01-02 12:45:00+00:00,108.7890,108.7955,108.7635,108.7870,1.398160e+06,0.001986,14.150668,-0.0010,-0.000009,...,-1.000000,3,0.433884,-0.900969,2,0.394356,0.918958,1,5.000000e-01,0.866025
50,2020-01-02 13:00:00+00:00,108.7870,108.7900,108.7720,108.7780,1.224630e+06,0.001864,14.018150,-0.0090,-0.000083,...,-0.965926,3,0.433884,-0.900969,2,0.394356,0.918958,1,5.000000e-01,0.866025
51,2020-01-02 13:15:00+00:00,108.7770,108.7770,108.7090,108.7270,2.424340e+06,0.001921,14.701070,-0.0510,-0.000469,...,-0.965926,3,0.433884,-0.900969,2,0.394356,0.918958,1,5.000000e-01,0.866025
52,2020-01-02 13:30:00+00:00,108.7260,108.7390,108.6570,108.6945,2.223350e+06,0.002018,14.614526,-0.0325,-0.000299,...,-0.965926,3,0.433884,-0.900969,2,0.394356,0.918958,1,5.000000e-01,0.866025
53,2020-01-02 13:45:00+00:00,108.6940,108.7245,108.6885,108.6940,6.457450e+06,0.002147,15.680745,-0.0005,-0.000005,...,-0.965926,3,0.433884,-0.900969,2,0.394356,0.918958,1,5.000000e-01,0.866025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124031,2024-12-30 22:45:00+00:00,156.9225,156.9225,156.8825,156.8900,5.675900e+05,0.049048,13.249156,-0.0275,-0.000175,...,0.866025,0,0.000000,1.000000,30,-0.201299,0.979530,12,-2.449294e-16,1.000000
124032,2024-12-30 23:00:00+00:00,156.8955,157.0250,156.8650,157.0205,2.453130e+06,0.024706,14.712876,0.1305,0.000832,...,0.965926,0,0.000000,1.000000,30,-0.201299,0.979530,12,-2.449294e-16,1.000000
124033,2024-12-30 23:15:00+00:00,157.0230,157.0675,157.0110,157.0270,1.410730e+06,0.012274,14.159619,0.0065,0.000041,...,0.965926,0,0.000000,1.000000,30,-0.201299,0.979530,12,-2.449294e-16,1.000000
124034,2024-12-30 23:30:00+00:00,157.0280,157.0520,156.9610,157.0335,4.898170e+06,0.011930,15.404372,0.0065,0.000041,...,0.965926,0,0.000000,1.000000,30,-0.201299,0.979530,12,-2.449294e-16,1.000000


In [21]:
# After dropping the warm-up rows the index no longer starts at 0; rebuild a
# contiguous 0-based index and discard the old one.
df = df.reset_index(drop=True)

In [22]:
# List the final column set that will be written to disk.
df.columns

Index(['timestamp', 'open', 'high', 'low', 'close', 'volume', 'spread',
       'log_volume', 'close_delta', 'close_return', 'close_log_return',
       'ret_mean_5', 'ret_mean_10', 'ema_21', 'sma_50', 'atr_14', 'bb_upper',
       'bb_lower', 'bb_mavg', 'bb_width', 'donchian_upper', 'donchian_lower',
       'donchian_mid', 'stoch_k', 'stoch_d', 'rsi_14', 'macd', 'macd_signal',
       'macd_diff', 'unix_time', 'hour', 'hour_sin', 'hour_cos', 'dow',
       'dow_sin', 'dow_cos', 'dom', 'dom_sin', 'dom_cos', 'month', 'month_sin',
       'month_cos'],
      dtype='object')

# Saving the file

In [23]:
# Persist the feature-augmented frame.
# NOTE(review): PROCESSED_FILE_PATH is also the file this notebook loads at the top,
# so every full run overwrites its own input and the dropna step then trims further
# rows from an already-trimmed frame. Consider writing to a separate output path
# once downstream readers of this file have been checked.
df.to_pickle(PROCESSED_FILE_PATH)