In [48]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

import os

In [49]:
from pathlib import Path

# Configurable parts
SYMBOL = "USDJPY"
SAMPLE_TYPE = "time"
MINUTES = 1
DOLLAR_THRESHOLD = "115009542m"
EVENT_NAME = "CUSUM"
EVENT_THRESHOLD = 2.52e-04
LABEL_NAME = "Z"

START_DATE = "20210101"
END_DATE = "20241231"

# Build base name
if SAMPLE_TYPE == "dollar":
    RESAMPLED_NAME = f"{SYMBOL}-{DOLLAR_THRESHOLD}-dollar-{START_DATE}-{END_DATE}"
else:
    RESAMPLED_NAME = f"{SYMBOL}-{MINUTES}m-{START_DATE}-{END_DATE}"

# Base directories
BASE_DIR = Path("../data")
RESAMPLED_DIR = BASE_DIR / "resampled"
PROCESSED_DIR = BASE_DIR / "processed"
EVENTS_DIR = BASE_DIR / "events"

# Final paths
RESAMPLED_FILE_PATH = RESAMPLED_DIR / f"{RESAMPLED_NAME}.pkl"
PROCESSED_FILE_PATH = PROCESSED_DIR / f"{RESAMPLED_NAME}-processed.pkl"
EVENT_FILE_PATH = EVENTS_DIR / f"{RESAMPLED_NAME}-{EVENT_NAME}.pkl"

In [50]:
DIRECTION_LABEL_DIR = BASE_DIR / "direction_labels"
DIRECTION_LABEL_DIR.mkdir(parents=True, exist_ok=True)
DIRECTION_LABEL_FILE_PATH = DIRECTION_LABEL_DIR / f"{RESAMPLED_NAME}-{EVENT_NAME}-{LABEL_NAME}.pkl"

In [51]:
print(PROCESSED_FILE_PATH)
print(EVENT_FILE_PATH)

../data/processed/USDJPY-1m-20210101-20241231-processed.pkl
../data/events/USDJPY-1m-20210101-20241231-CUSUM.pkl


In [52]:
%%time
df = pd.read_pickle(PROCESSED_FILE_PATH)
t_events = pd.read_pickle(EVENT_FILE_PATH)

CPU times: user 1.71 ms, sys: 213 ms, total: 214 ms
Wall time: 266 ms


In [53]:
df.columns

Index(['open', 'high', 'low', 'close', 'volume', 'spread', 'close_delta',
       'close_return', 'close_log_return', 'ret_mean_5', 'ret_mean_10',
       'ret_mean_15', 'ret_mean_20', 'log_volume', 'rv5', 'log_rv5',
       'sqrt_rv5', 'rv15', 'log_rv15', 'sqrt_rv15', 'rv50', 'log_rv50',
       'sqrt_rv50', 'ema5', 'ema5_slope', 'close_above_ema5', 'ema20',
       'ema20_slope', 'close_above_ema20', 'ema50', 'ema50_slope',
       'close_above_ema50', 'ema100', 'ema100_slope', 'close_above_ema100',
       'atr14', 'atr60', 'atr120', 'log_atr14', 'atr14_percent',
       'atr14_adjusted_return', 'log_atr60', 'atr60_percent',
       'atr60_adjusted_return', 'log_atr120', 'atr120_percent',
       'atr120_adjusted_return', 'adx14', 'plus_di14', 'minus_di14',
       'bb_upper', 'bb_lower', 'bb_mavg', 'bb_width', 'bb_position',
       'dc20_upper', 'dc20_lower', 'dc20_mid', 'dc20_width',
       'close_above_dc20_mid', 'dc20_breakout', 'dc20_breakdown', 'dc50_upper',
       'dc50_lower', 'dc50_mi

In [54]:
df.loc[t_events].head()

Unnamed: 0_level_0,open,high,low,close,volume,spread,close_delta,close_return,close_log_return,ret_mean_5,...,dom,month,hour_sin,hour_cos,dow_sin,dow_cos,dom_sin,dom_cos,month_sin,month_cos
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-04 12:47:00,102.9085,102.92,102.9085,102.92,96930000000.0,0.002548,0.0125,0.000121,0.000121,5.6e-05,...,4,1,1.224647e-16,-1.0,0.0,1.0,0.724793,0.688967,0.5,0.866025
2021-01-04 12:51:00,102.8995,102.8995,102.8825,102.8905,162330000000.0,0.002224,-0.009,-8.7e-05,-8.7e-05,-3.3e-05,...,4,1,1.224647e-16,-1.0,0.0,1.0,0.724793,0.688967,0.5,0.866025
2021-01-04 12:56:00,102.913,102.92,102.9115,102.92,277740000000.0,0.0031,0.0075,7.3e-05,7.3e-05,5.7e-05,...,4,1,1.224647e-16,-1.0,0.0,1.0,0.724793,0.688967,0.5,0.866025
2021-01-04 13:02:00,102.9185,102.9395,102.917,102.9375,218490000000.0,0.00271,0.02,0.000194,0.000194,2.6e-05,...,4,1,-0.258819,-0.965926,0.0,1.0,0.724793,0.688967,0.5,0.866025
2021-01-04 13:06:00,102.9635,102.978,102.9585,102.966,557230000000.0,0.001892,0.0045,4.4e-05,4.4e-05,9.4e-05,...,4,1,-0.258819,-0.965926,0.0,1.0,0.724793,0.688967,0.5,0.866025


## Calculate future return

In [55]:
from math import sqrt

def event_side_label_zscore(
    df: pd.DataFrame,
    t_events: pd.DatetimeIndex,
    horizon: int = 15,           # 垂直時限 H（以 bar 數或你資料頻率計）
    vol_window: int = 60,        # 波動率估計用的 rolling window（只看過去）
    tau: float = 0.6,            # z 的 dead-zone 門檻
    price_col: str = "close",
) -> pd.DataFrame:
    """
    事件驅動的 side labeling：只在 t_events 上做 z = r_H / sigma_H，並以 tau 分桶成 +1/-1/0。
    這是 event-based（不在每根 bar 標註），與 triple-barrier 同樣以事件為中心，
    但把上下 barrier 規則改成 z-score 的極端分桶（dead zone）。

    Returns: DataFrame(index=t_events) with ['r_H','sigma_H','z','label','t1']
    """
    px = df[price_col]

    # 嚴格「只用過去」的 1-bar 波動率估計（避免洩漏）
    r1 = np.log(px).diff()
    sigma_1 = r1.rolling(vol_window).std().shift(1)

    out_rows = []
    for t0 in t_events:
        if t0 not in df.index:
            continue
        # 垂直時限
        # 用「index 位置 + horizon」簡單取得 t1；你也可以改成「加 timedelta」
        try:
            loc0 = df.index.get_loc(t0)
        except KeyError:
            continue
        if isinstance(loc0, slice):  # 萬一遇到重複索引
            loc0 = loc0.start
        loc1 = loc0 + horizon
        if loc1 >= len(df.index):
            continue
        t1 = df.index[loc1]

        # 未來報酬（事件報酬）
        r_H = np.log(px.loc[t1] / px.loc[t0])

        # 當下估計的 H 視窗波動率
        sig1 = sigma_1.loc[t0]
        if pd.isna(sig1) or sig1 <= 0:
            continue
        sigma_H = sig1 * sqrt(horizon)

        z = r_H / sigma_H

        # 極端分桶
        if z >  tau:
            bin = 1
        elif z < -tau:
            bin = -1
        else:
            bin = 0

        out_rows.append((t0, r_H, sigma_H, z, bin, t1))

    out = pd.DataFrame(
        out_rows, columns=["t0", "r_H", "sigma_H", "z", "bin", "t1"]
    ).set_index("t0")

    return out

In [56]:
%%time
labels = event_side_label_zscore(
    df, t_events, horizon=5, vol_window=60, tau=0.6, price_col="close"
)
labels.head()

CPU times: user 3.62 s, sys: 65.6 ms, total: 3.69 s
Wall time: 3.69 s


Unnamed: 0_level_0,r_H,sigma_H,z,bin,t1
t0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-01-04 13:46:00,0.000379,0.000184,2.06035,1,2021-01-04 13:51:00
2021-01-04 13:50:00,0.000316,0.000191,1.652069,1,2021-01-04 13:55:00
2021-01-04 13:55:00,-0.000131,0.000199,-0.657322,-1,2021-01-04 14:00:00
2021-01-04 13:57:00,-0.000209,0.000225,-0.926273,-1,2021-01-04 14:02:00
2021-01-04 13:59:00,0.0,0.000224,0.0,0,2021-01-04 14:04:00


In [57]:
labels['bin'].value_counts()

bin
 0    120579
 1     65684
-1     62879
Name: count, dtype: int64

In [58]:
labels['bin_class'] = labels['bin'] + 1

## Save events to disk

In [59]:
labels.to_pickle(DIRECTION_LABEL_FILE_PATH)

In [60]:
DIRECTION_LABEL_FILE_PATH

PosixPath('../data/direction_labels/USDJPY-1m-20210101-20241231-CUSUM-Z.pkl')