In [27]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

import os

In [28]:
from pathlib import Path

# --- User-tunable settings ---
SYMBOL = "USDJPY"
SAMPLE_TYPE = "time"  # "dollar" selects dollar bars; any other value selects time bars
MINUTES = 1
DOLLAR_THRESHOLD = "115009542m"

START_DATE = "20240101"
END_DATE = "20241231"

# File-name stem shared by everything derived from this resampled series
RESAMPLED_NAME = (
    f"{SYMBOL}-{DOLLAR_THRESHOLD}-dollar-{START_DATE}-{END_DATE}"
    if SAMPLE_TYPE == "dollar"
    else f"{SYMBOL}-{MINUTES}m-{START_DATE}-{END_DATE}"
)

# Directory layout under the data root
BASE_DIR = Path("../data")
RESAMPLED_DIR = BASE_DIR.joinpath("resampled")
PROCESSED_DIR = BASE_DIR.joinpath("processed")
EVENTS_DIR = BASE_DIR.joinpath("events")

# Fully resolved file paths
RESAMPLED_FILE_PATH = RESAMPLED_DIR.joinpath(f"{RESAMPLED_NAME}.pkl")
PROCESSED_FILE_PATH = PROCESSED_DIR.joinpath(f"{RESAMPLED_NAME}-processed.pkl")

In [58]:
%%time
# Load the processed frame for the configured symbol and date range.
# pickle can execute arbitrary code on load, so only read files produced by this project.
df = pd.read_pickle(PROCESSED_FILE_PATH)

CPU times: user 2.22 ms, sys: 68.5 ms, total: 70.7 ms
Wall time: 71.4 ms


In [59]:
# (rows, columns) of the loaded frame.
df.shape

(371236, 100)

In [60]:
# Preview the first rows to sanity-check the loaded data.
# NOTE(review): the captured preview shows prices around 1.10, which is not a
# typical USDJPY level — confirm that SYMBOL matches the data in this file.
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,spread,close_delta,close_return,close_log_return,ret_mean_5,...,dom,month,hour_sin,hour_cos,dow_sin,dow_cos,dom_sin,dom_cos,month_sin,month_cos
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-01-02 01:44:00,1.10381,1.103845,1.103805,1.103835,209060000000.0,3.7e-05,2e-05,1.8e-05,1.8e-05,1.6e-05,...,2,1,0.258819,0.965926,0.781831,0.62349,0.394356,0.918958,0.5,0.866025
2024-01-02 01:45:00,1.10385,1.10401,1.10385,1.10389,300160000000.0,3.3e-05,5.5e-05,5e-05,5e-05,2.6e-05,...,2,1,0.258819,0.965926,0.781831,0.62349,0.394356,0.918958,0.5,0.866025
2024-01-02 01:46:00,1.1039,1.10396,1.103885,1.103905,399720000000.0,3.4e-05,1.5e-05,1.4e-05,1.4e-05,2.6e-05,...,2,1,0.258819,0.965926,0.781831,0.62349,0.394356,0.918958,0.5,0.866025
2024-01-02 01:47:00,1.10391,1.10399,1.10386,1.1039,519310000000.0,3.2e-05,-5e-06,-5e-06,-5e-06,2.4e-05,...,2,1,0.258819,0.965926,0.781831,0.62349,0.394356,0.918958,0.5,0.866025
2024-01-02 01:48:00,1.10389,1.103955,1.10387,1.1039,198320000000.0,2.8e-05,0.0,0.0,0.0,1.5e-05,...,2,1,0.258819,0.965926,0.781831,0.62349,0.394356,0.918958,0.5,0.866025


In [61]:
# Full list of column names available in the frame.
df.columns

Index(['open', 'high', 'low', 'close', 'volume', 'spread', 'close_delta',
       'close_return', 'close_log_return', 'ret_mean_5', 'ret_mean_10',
       'ret_mean_15', 'ret_mean_20', 'log_volume', 'rv5', 'log_rv5',
       'sqrt_rv5', 'rv15', 'log_rv15', 'sqrt_rv15', 'rv50', 'log_rv50',
       'sqrt_rv50', 'ema5', 'ema5_slope', 'close_above_ema5', 'ema20',
       'ema20_slope', 'close_above_ema20', 'ema50', 'ema50_slope',
       'close_above_ema50', 'ema100', 'ema100_slope', 'close_above_ema100',
       'atr14', 'atr60', 'atr120', 'log_atr14', 'atr14_percent',
       'atr14_adjusted_return', 'log_atr60', 'atr60_percent',
       'atr60_adjusted_return', 'log_atr120', 'atr120_percent',
       'atr120_adjusted_return', 'adx14', 'plus_di14', 'minus_di14',
       'bb_upper', 'bb_lower', 'bb_mavg', 'bb_width', 'bb_position',
       'dc20_upper', 'dc20_lower', 'dc20_mid', 'dc20_width',
       'close_above_dc20_mid', 'dc20_breakout', 'dc20_breakdown', 'dc50_upper',
       'dc50_lower', 'dc50_mi

## Apply CUSUM Filter

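## Calculate CUSUM Threshold

The threshold is set to roughly three standard deviations of the bar-to-bar log returns of the close price.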

In [116]:
# numpy is imported here because this cell runs before the cell further down
# that imports it; without this line a fresh kernel raises NameError on `np`.
import numpy as np

# Standard deviation of the bar-to-bar log returns of the close price.
# diff() leaves a NaN in the first row, which .iloc[1:] drops.
return_std = np.log(df['close']).diff().iloc[1:].std()
# Candidate CUSUM threshold: three standard deviations.
return_std * 3

np.float64(0.0003098871099078162)

In [117]:
# Event label and CUSUM threshold (about three standard deviations of the log returns, rounded)
EVENT_NAME, THRESHOLD = 'CUSUM', 0.00031
# Output file for the detected events; the name encodes the series, event type and threshold
EVENT_PATH = EVENTS_DIR.joinpath(f"{RESAMPLED_NAME}_{EVENT_NAME}_{THRESHOLD}.pkl")

In [118]:
# Show the resolved output path for the event file.
EVENT_PATH

PosixPath('../data/events/USDJPY-1m-20240101-20241231_CUSUM_0.00031.pkl')

## Get Event Start Times (t0)

In [119]:
import numpy as np
import pandas as pd
from tqdm import tqdm

def cusum_filter(closes: pd.Series, threshold: float) -> pd.DatetimeIndex:
    """Select event timestamps with a symmetric CUSUM filter on log returns.

    Two running sums of the bar-to-bar log returns are kept: one clipped at
    zero from below (upward drift) and one clipped at zero from above
    (downward drift). A bar is flagged as an event when either sum moves
    past ``threshold`` in its direction; only the sum that triggered is
    reset to zero.

    Args:
        closes: Close prices; the returned labels are taken from its index.
        threshold: Non-negative cumulative log-return size that triggers an
            event.

    Returns:
        The index labels of ``closes`` at which an event fired, in order.
        The first bar is never an event because it has no return.

    Raises:
        ValueError: If ``threshold`` is negative or NaN.
    """
    # A negative threshold would flag every bar (the upward sum is never
    # below zero) and a NaN threshold would flag none, both silently.
    if np.isnan(threshold) or threshold < 0:
        raise ValueError(f"threshold must be a non-negative number, got {threshold!r}")

    # Log returns; the first diff is NaN (no previous bar), so drop it.
    ret = np.log(closes).diff().iloc[1:]
    values = ret.values
    timestamps = ret.index

    # One flag per return, set to True where an event fires.
    t_events_mask = np.zeros_like(values, dtype=bool)

    cum_pos, cum_neg = 0.0, 0.0

    for i in tqdm(range(len(values))):
        # Clip each running sum at zero so it only tracks drift in its own direction.
        cum_pos = max(0.0, cum_pos + values[i])
        cum_neg = min(0.0, cum_neg + values[i])

        if cum_pos > threshold:
            t_events_mask[i] = True
            cum_pos = 0.0
        elif cum_neg < -threshold:
            t_events_mask[i] = True
            cum_neg = 0.0

    return timestamps[t_events_mask]


In [120]:
%%time

# Run the CUSUM filter on the close prices; t_events holds the timestamps where an event fired.
t_events = cusum_filter(df['close'], threshold=THRESHOLD)

100%|█████████████████████████████████████| 371235/371235 [00:00<00:00, 3144057.45it/s]

CPU times: user 125 ms, sys: 6.61 ms, total: 132 ms
Wall time: 128 ms





In [121]:
# Inspect the detected event timestamps (pandas abbreviates the middle of a long index).
t_events

DatetimeIndex(['2024-01-02 01:57:00', '2024-01-02 02:12:00',
               '2024-01-02 02:22:00', '2024-01-02 02:40:00',
               '2024-01-02 02:53:00', '2024-01-02 03:09:00',
               '2024-01-02 03:45:00', '2024-01-02 04:10:00',
               '2024-01-02 04:42:00', '2024-01-02 04:58:00',
               ...
               '2024-12-30 20:31:00', '2024-12-30 20:57:00',
               '2024-12-30 21:03:00', '2024-12-30 21:43:00',
               '2024-12-30 21:58:00', '2024-12-30 22:05:00',
               '2024-12-30 22:52:00', '2024-12-30 23:00:00',
               '2024-12-30 23:15:00', '2024-12-30 23:36:00'],
              dtype='datetime64[ns]', name='timestamp', length=29678, freq=None)

In [122]:
# Re-check the destination before the event timestamps are written in the next cell.
EVENT_PATH

PosixPath('../data/events/USDJPY-1m-20240101-20241231_CUSUM_0.00031.pkl')

In [123]:
# Make sure the events directory exists first: to_pickle does not create missing parent folders.
EVENT_PATH.parent.mkdir(parents=True, exist_ok=True)
# Persist the event timestamps as a Series whose index and values are both the timestamps.
t_events.to_series().to_pickle(EVENT_PATH)