In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

import os

In [2]:
from pathlib import Path

# ---- Dataset selection: edit these to point the notebook at another file ----
SOURCE = "dukascopy"
SYMBOL = "USDJPY"
MINUTES = 1
EVENT = '58m-dollar'
START_DATE = "20210101"
END_DATE = "20241231"

# File stem shared by the resampled and the processed pickle.
RESAMPLED_NAME = f"{SYMBOL}-{MINUTES}m-{START_DATE}-{END_DATE}"
# Alternative naming scheme built from SOURCE and EVENT (kept for reference):
# RESAMPLED_NAME = f"{SOURCE}-{SYMBOL}-{EVENT}-{START_DATE}-{END_DATE}"

# Directory layout; the relative path is resolved against the working directory.
BASE_DIR = Path("../data")
RESAMPLED_DIR = BASE_DIR.joinpath("resampled")
PROCESSED_DIR = BASE_DIR.joinpath("processed")

# Full pickle paths derived from the stem above.
RESAMPLED_FILE_PATH = RESAMPLED_DIR.joinpath(f"{RESAMPLED_NAME}.pkl")
PROCESSED_FILE_PATH = PROCESSED_DIR.joinpath(f"{RESAMPLED_NAME}-processed.pkl")

In [3]:
# Load the processed frame that the indicator cells below extend.
# NOTE(review): this is the same path the final cell writes back to, so a second
# end-to-end run starts from this notebook's own output rather than from the
# upstream file - confirm that is intended.
# NOTE(review): unpickling executes arbitrary code; only load files you created.
df = pd.read_pickle(PROCESSED_FILE_PATH)
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,spread,close_delta,close_return,close_log_return,ret_mean_5,...,dom,month,hour_sin,hour_cos,dow_sin,dow_cos,dom_sin,dom_cos,month_sin,month_cos
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-03 23:44:00,103.292,103.3005,103.2875,103.289,114740000000.0,0.003725,-0.003,-2.9e-05,-2.9e-05,-1.9e-05,...,3,1,-0.258819,0.965926,-0.781831,0.62349,0.571268,0.820763,0.5,0.866025
2021-01-03 23:45:00,103.286,103.309,103.2835,103.304,263700000000.0,0.004039,0.015,0.000145,0.000145,2.2e-05,...,3,1,-0.258819,0.965926,-0.781831,0.62349,0.571268,0.820763,0.5,0.866025
2021-01-03 23:46:00,103.3045,103.318,103.298,103.298,162260000000.0,0.003704,-0.006,-5.8e-05,-5.8e-05,1e-05,...,3,1,-0.258819,0.965926,-0.781831,0.62349,0.571268,0.820763,0.5,0.866025
2021-01-03 23:47:00,103.301,103.3015,103.2805,103.2895,199110000000.0,0.003133,-0.0085,-8.2e-05,-8.2e-05,-5e-06,...,3,1,-0.258819,0.965926,-0.781831,0.62349,0.571268,0.820763,0.5,0.866025
2021-01-03 23:48:00,103.2855,103.2855,103.269,103.2825,127510000000.0,0.002381,-0.007,-6.8e-05,-6.8e-05,-1.8e-05,...,3,1,-0.258819,0.965926,-0.781831,0.62349,0.571268,0.820763,0.5,0.866025


### Reset Timestamp into Column

In [4]:
# Move the 'timestamp' index into a regular column so the time-feature cells
# further down can read it as df['timestamp'].
# The guard keeps this cell idempotent: running it a second time would otherwise
# push the fresh RangeIndex into a stray 'index' column that ends up in the
# saved file.
if 'timestamp' not in df.columns:
    df = df.reset_index()

In [7]:
# Check that 'timestamp' is now an ordinary column.
df.head()

Unnamed: 0,timestamp,open,high,low,close,volume,spread,close_delta,close_return,close_log_return,...,dom,month,hour_sin,hour_cos,dow_sin,dow_cos,dom_sin,dom_cos,month_sin,month_cos
0,2021-01-03 23:44:00,103.292,103.3005,103.2875,103.289,114740000000.0,0.003725,-0.003,-2.9e-05,-2.9e-05,...,3,1,-0.258819,0.965926,-0.781831,0.62349,0.571268,0.820763,0.5,0.866025
1,2021-01-03 23:45:00,103.286,103.309,103.2835,103.304,263700000000.0,0.004039,0.015,0.000145,0.000145,...,3,1,-0.258819,0.965926,-0.781831,0.62349,0.571268,0.820763,0.5,0.866025
2,2021-01-03 23:46:00,103.3045,103.318,103.298,103.298,162260000000.0,0.003704,-0.006,-5.8e-05,-5.8e-05,...,3,1,-0.258819,0.965926,-0.781831,0.62349,0.571268,0.820763,0.5,0.866025
3,2021-01-03 23:47:00,103.301,103.3015,103.2805,103.2895,199110000000.0,0.003133,-0.0085,-8.2e-05,-8.2e-05,...,3,1,-0.258819,0.965926,-0.781831,0.62349,0.571268,0.820763,0.5,0.866025
4,2021-01-03 23:48:00,103.2855,103.2855,103.269,103.2825,127510000000.0,0.002381,-0.007,-6.8e-05,-6.8e-05,...,3,1,-0.258819,0.965926,-0.781831,0.62349,0.571268,0.820763,0.5,0.866025


# Technical Indicators

In [9]:
import numpy as np
from ta.volatility import AverageTrueRange, BollingerBands, DonchianChannel
from ta.momentum import StochasticOscillator, RSIIndicator
from ta.trend import MACD, EMAIndicator, SMAIndicator

### EMA & SMA

In [10]:
# 5-period EMA
ema5 = EMAIndicator(close=df['close'], window=5)
df['ema5'] = ema5.ema_indicator()
ema5_slope = df['ema5'].diff()
df['ema5_slope'] = ema5_slope
# 20-period EMA
ema20 = EMAIndicator(close=df['close'], window=20)
df['ema20'] = ema20.ema_indicator()
ema20_slope = df['ema20'].diff()
df['ema20_slope'] = ema20_slope

### ATR

In [11]:
# Average True Range over 14 bars.
atr14 = AverageTrueRange(
    high=df['high'],
    low=df['low'],
    close=df['close'],
    window=14
)
df['atr14'] = atr14.average_true_range()

# Average True Range over 20 bars; also used to normalise the return features.
atr20 = AverageTrueRange(
    high=df['high'],
    low=df['low'],
    close=df['close'],
    window=20
)
df['atr20'] = atr20.average_true_range()

# Dividing by a zero ATR yields +/-inf (or NaN for 0/0), and inf is NOT removed
# by the later dropna(). The ta implementation reports 0.0 for its warm-up
# bars, so mask every non-positive ATR to NaN before normalising; those rows
# are then dropped together with the other warm-up rows.
atr20_positive = df['atr20'].where(df['atr20'] > 0)
df['vol_adj_return'] = df['close_log_return'] / atr20_positive
df['close_to_atr'] = df['close_delta'] / atr20_positive

### BollingerBands

In [None]:
# Bollinger Bands on the close: 20-bar window, band multiplier window_dev=2.
bands = BollingerBands(close=df['close'], window=20, window_dev=2)
upper_band = bands.bollinger_hband()
lower_band = bands.bollinger_lband()

df['bb_upper'] = upper_band
df['bb_lower'] = lower_band
df['bb_mavg'] = bands.bollinger_mavg()
# Distance between the bands, and where the close sits inside them
# (0 = on the lower band, 1 = on the upper band).
df['bb_width'] = upper_band - lower_band
df['bb_position'] = (df['close'] - lower_band) / df['bb_width']

### Donchian Channel

In [None]:
# Donchian Channel over a 20-bar window.
channel = DonchianChannel(high=df['high'], low=df['low'], close=df['close'], window=20)
channel_high = channel.donchian_channel_hband()
channel_low = channel.donchian_channel_lband()

df['donchian_upper'] = channel_high
df['donchian_lower'] = channel_low
df['donchian_mid'] = channel.donchian_channel_mband()
# Channel height: upper band minus lower band
df['donchian_width'] = channel_high - channel_low

### Stochastic Oscillator

In [None]:
# Stochastic oscillator: 14-bar window, signal line smoothed over 3 bars.
oscillator = StochasticOscillator(
    high=df['high'], low=df['low'], close=df['close'],
    window=14, smooth_window=3,
)
df['stoch_k'] = oscillator.stoch()          # raw oscillator line
df['stoch_d'] = oscillator.stoch_signal()   # signal line

### RSI (Relative Strength Index)

In [22]:
# 14-bar RSI of the close and its bar-to-bar change
df['rsi14'] = RSIIndicator(close=df['close'], window=14).rsi()
df['rsi14_slope'] = df['rsi14'].diff()

In [None]:
# Inspect the RSI column.
df['rsi14']

### MACD

In [None]:
# MACD on the close with fast/slow/signal windows of 12/26/9 bars.
macd_indicator = MACD(
    close=df['close'],
    window_fast=12,
    window_slow=26,
    window_sign=9,
)
df['macd'] = macd_indicator.macd()                 # MACD line
df['macd_signal'] = macd_indicator.macd_signal()   # signal line
df['macd_diff'] = macd_indicator.macd_diff()       # difference series from ta's macd_diff()

### ADX

In [None]:
from ta.trend import ADXIndicator

# Build the 14-bar ADX indicator (window corresponds to ta-lib's `timeperiod`).
adx_indicator = ADXIndicator(
    high=df["high"], low=df["low"], close=df["close"],
    window=14, fillna=False,
)

# ADX value
df["adx14"] = adx_indicator.adx()

# Directional indicators: +DI / -DI
df["plus_di14"] = adx_indicator.adx_pos()
df["minus_di14"] = adx_indicator.adx_neg()

# Time Features

In [None]:
# Timestamp as float seconds: integer view of the datetime column divided by 1e9.
# NOTE(review): the 1e9 divisor assumes the column has nanosecond resolution - confirm.
df['unix_time'] = df['timestamp'].astype('int64').div(1e9)

In [None]:
# Hour of day, encoded on a 24-hour circle.
df['hour'] = df['timestamp'].dt.hour
hour_angle = 2 * np.pi * df['hour'] / 24
df['hour_sin'] = np.sin(hour_angle)
df['hour_cos'] = np.cos(hour_angle)

In [None]:
# Day of week (Monday=0), encoded on a 7-day circle.
df['dow'] = df['timestamp'].dt.dayofweek
dow_angle = 2 * np.pi * df['dow'] / 7
df['dow_sin'] = np.sin(dow_angle)
df['dow_cos'] = np.cos(dow_angle)

In [None]:
# Day of month, encoded on a circle with a fixed period of 31.
df['dom'] = df['timestamp'].dt.day
dom_angle = 2 * np.pi * df['dom'] / 31
df['dom_sin'] = np.sin(dom_angle)
df['dom_cos'] = np.cos(dom_angle)

In [None]:
# Month of year, encoded on a 12-month circle.
df['month'] = df['timestamp'].dt.month
month_angle = 2 * np.pi * df['month'] / 12
df['month_sin'] = np.sin(month_angle)
df['month_cos'] = np.cos(month_angle)

## Drop NaN

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Remove every row that contains at least one NaN, then re-count per column
# to confirm nothing is left.
df = df.dropna(axis=0, how='any')
df.isna().sum()

In [None]:
# Make 'timestamp' the index again (it was turned into a column earlier in the notebook).
df = df.set_index('timestamp')

In [None]:
# Preview the frame, now indexed by timestamp.
df.head()

In [None]:
# List every column present in the frame.
df.columns

# Saving the file

In [None]:
# Persist the frame with the added indicator and time features.
# NOTE(review): PROCESSED_FILE_PATH is also the file this notebook loads at the
# top, so the input is overwritten in place; each full re-run recomputes the
# indicators on already-trimmed data and dropna() removes rows again - confirm
# this is intended or write to a separate output file.
df.to_pickle(PROCESSED_FILE_PATH)