In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

import os

In [23]:
from pathlib import Path

# ---- Settings a reader may want to tune ----
SYMBOL = "USDJPY"
SAMPLE_TYPE = "time"  # "dollar" switches to dollar-bar naming; any other value uses the time-bar naming
MINUTES = 1  # only appears in the time-bar file name
DOLLAR_THRESHOLD = "115009542m"  # only appears in the dollar-bar file name

START_DATE = "20210101"
END_DATE = "20241231"

# ---- Stem shared by every file derived from this resampled dataset ----
RESAMPLED_NAME = (
    f"{SYMBOL}-{DOLLAR_THRESHOLD}-dollar-{START_DATE}-{END_DATE}"
    if SAMPLE_TYPE == "dollar"
    else f"{SYMBOL}-{MINUTES}m-{START_DATE}-{END_DATE}"
)

# ---- Data folders, all relative to the notebook's working directory ----
BASE_DIR = Path("../data")
RESAMPLED_DIR = BASE_DIR.joinpath("resampled")
PROCESSED_DIR = BASE_DIR.joinpath("processed")
EVENTS_DIR = BASE_DIR.joinpath("events")

# ---- Concrete input files ----
RESAMPLED_FILE_PATH = RESAMPLED_DIR.joinpath(f"{RESAMPLED_NAME}.pkl")
PROCESSED_FILE_PATH = PROCESSED_DIR.joinpath(f"{RESAMPLED_NAME}_processed.pkl")

In [7]:
%%time
# Load the processed feature frame for the configured symbol / bar type.
# NOTE(review): unpickling can execute arbitrary code — only load files you produced yourself.
df = pd.read_pickle(PROCESSED_FILE_PATH)

CPU times: user 1.64 ms, sys: 73 ms, total: 74.6 ms
Wall time: 72 ms


In [8]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(1487264, 48)

In [9]:
# Peek at the first rows to confirm the timestamp index and the feature columns.
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,spread,log_volume,close_delta,close_return,close_log_return,...,hour_cos,dow,dow_sin,dow_cos,dom,dom_sin,dom_cos,month,month_sin,month_cos
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-03 22:47:00,103.2105,103.2165,103.2045,103.2135,36890000000.0,0.031562,24.331206,0.0,0.0,0.0,...,0.866025,6,-0.781831,0.62349,3,0.571268,0.820763,1,0.5,0.866025
2021-01-03 22:48:00,103.2155,103.2175,103.2095,103.216,57100000000.0,0.02881,24.76807,0.0025,2.4e-05,2.4e-05,...,0.866025,6,-0.781831,0.62349,3,0.571268,0.820763,1,0.5,0.866025
2021-01-03 22:49:00,103.212,103.214,103.2105,103.212,57910000000.0,0.03795,24.782156,-0.004,-3.9e-05,-3.9e-05,...,0.866025,6,-0.781831,0.62349,3,0.571268,0.820763,1,0.5,0.866025
2021-01-03 22:50:00,103.2105,103.2135,103.2105,103.212,28590000000.0,0.037818,24.076323,0.0,0.0,0.0,...,0.866025,6,-0.781831,0.62349,3,0.571268,0.820763,1,0.5,0.866025
2021-01-03 22:51:00,103.207,103.2135,103.2045,103.2115,22050000000.0,0.034875,23.816578,-0.0005,-5e-06,-5e-06,...,0.866025,6,-0.781831,0.62349,3,0.571268,0.820763,1,0.5,0.866025


In [10]:
# List every available column name.
df.columns

Index(['open', 'high', 'low', 'close', 'volume', 'spread', 'log_volume',
       'close_delta', 'close_return', 'close_log_return', 'ret_mean_5',
       'ret_mean_10', 'ema5', 'ema5_slope', 'ema20', 'ema20_slope', 'atr14',
       'atr20', 'vol_adj_return', 'close_to_atr', 'bb_upper', 'bb_lower',
       'bb_mavg', 'bb_width', 'bb_position', 'donchian_upper',
       'donchian_lower', 'donchian_mid', 'donchian_width', 'stoch_k',
       'stoch_d', 'rsi14', 'macd', 'macd_signal', 'macd_diff', 'unix_time',
       'hour', 'hour_sin', 'hour_cos', 'dow', 'dow_sin', 'dow_cos', 'dom',
       'dom_sin', 'dom_cos', 'month', 'month_sin', 'month_cos'],
      dtype='object')

## Calculate Rolling Volatility (60-Bar Window)

In [28]:
# Rolling standard deviation of the log returns over a 60-bar window
# (60 minutes with the 1-minute time bars configured above).
vol = df["close_log_return"].rolling(60).std()
vol.mean()

np.float64(0.00012612068875852208)

## Get Events Start Time (t0)

In [12]:
import numpy as np
import pandas as pd
from tqdm import tqdm

def get_tevents_optimized(
    data: pd.Series,
    threshold: float,
    show_progress: bool = True,
) -> pd.DatetimeIndex:
    """Select event timestamps with a symmetric CUSUM filter.

    Two running sums of ``data`` are maintained: an upward sum that is
    floored at zero and a downward sum that is capped at zero. A row is
    flagged as an event when the upward sum exceeds ``threshold`` or the
    downward sum falls below ``-threshold``; the sum that fired is then reset
    to zero so the next event needs a fresh move of that size.

    Parameters
    ----------
    data : pd.Series
        Per-row increments to accumulate (the notebook passes log returns).
        Its index supplies the labels that are returned. A NaN increment
        resets both running sums to zero, because ``max(0.0, nan)`` and
        ``min(0.0, nan)`` both evaluate to ``0.0``.
    threshold : float
        Non-negative magnitude a running sum must exceed to trigger an event.
    show_progress : bool, default True
        When True the loop is wrapped in a ``tqdm`` progress bar (the
        original behaviour). Pass False for silent runs.

    Returns
    -------
    pd.DatetimeIndex
        The entries of ``data.index`` at which an event fired, in their
        original order. (The result has the same index type as
        ``data.index``.)

    Raises
    ------
    ValueError
        If ``threshold`` is negative or NaN.
    """
    # `not (threshold >= 0)` is also True for NaN, which would otherwise
    # silently yield zero events; a negative threshold would flag every row.
    if not threshold >= 0:
        raise ValueError(
            f"threshold must be a non-negative number, got {threshold!r}"
        )

    values = data.values
    timestamps = data.index

    # One flag per row; True marks a row where either running sum fired.
    t_events_mask = np.zeros(len(values), dtype=bool)

    steps = range(len(values))
    if show_progress:
        steps = tqdm(steps)

    cum_pos, cum_neg = 0.0, 0.0
    for i in steps:
        # Keep 0.0 as the first argument: that is what makes a NaN increment
        # reset the sum instead of poisoning it.
        cum_pos = max(0.0, cum_pos + values[i])
        cum_neg = min(0.0, cum_neg + values[i])

        if cum_pos > threshold:
            t_events_mask[i] = True
            cum_pos = 0.0
        if cum_neg < -threshold:
            t_events_mask[i] = True
            cum_neg = 0.0

    return timestamps[t_events_mask]


In [29]:
# Event-filter settings. The threshold is twice the mean of the rolling
# volatility series, and it is written into the output file name in
# scientific notation so the file records which threshold produced it.
EVENT_NAME = "CUSUM"
THRESHOLD = 2 * vol.mean()
EVENT_PATH = EVENTS_DIR.joinpath(f"{RESAMPLED_NAME}_{EVENT_NAME}_{THRESHOLD:.2e}.pkl")

In [14]:
%%time

# Run the CUSUM filter on the log returns, leaving out the first row.
# NOTE(review): presumably the first row is skipped because its return has no
# previous bar to compare against — confirm.
t_events = get_tevents_optimized(
    df["close_log_return"].iloc[1:],
    threshold=THRESHOLD,
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1487263/1487263 [00:00<00:00, 3411624.81it/s]

CPU times: user 444 ms, sys: 21.6 ms, total: 466 ms
Wall time: 466 ms





In [15]:
# Inspect the detected event timestamps.
t_events

DatetimeIndex(['2021-01-03 23:00:00', '2021-01-03 23:10:00',
               '2021-01-03 23:20:00', '2021-01-03 23:30:00',
               '2021-01-03 23:32:00', '2021-01-03 23:35:00',
               '2021-01-03 23:49:00', '2021-01-03 23:52:00',
               '2021-01-04 00:01:00', '2021-01-04 00:05:00',
               ...
               '2024-12-30 22:29:00', '2024-12-30 23:00:00',
               '2024-12-30 23:06:00', '2024-12-30 23:10:00',
               '2024-12-30 23:12:00', '2024-12-30 23:17:00',
               '2024-12-30 23:18:00', '2024-12-30 23:35:00',
               '2024-12-30 23:40:00', '2024-12-30 23:47:00'],
              dtype='datetime64[ns]', name='timestamp', length=248998, freq=None)

In [30]:
# Show where the events will be saved.
EVENT_PATH

PosixPath('../data/events/USDJPY-1m-20210101-20241231_CUSUM_2.52e-04.pkl')

In [31]:
# Persist the event timestamps as a pickled Series (index and values are both
# the timestamps). The target folder is created first because to_pickle does
# not create missing parent directories, so a fresh checkout would otherwise
# fail here.
EVENT_PATH.parent.mkdir(parents=True, exist_ok=True)
t_events.to_series().to_pickle(EVENT_PATH)