In [80]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

import os

In [81]:
from pathlib import Path

# --- Tunable run parameters -------------------------------------------------
SYMBOL = "USDJPY"
SAMPLE_TYPE = "time"
MINUTES = 5

START_DATE = "20210101"
END_DATE = "20241231"

# File stem shared by the resampled file and everything derived from it:
# <symbol>-<minutes>m-<start>-<end>
RESAMPLED_NAME = "-".join([SYMBOL, f"{MINUTES}m", START_DATE, END_DATE])

# --- Data folders (relative to the notebook's working directory) ------------
BASE_DIR = Path("../data")
RESAMPLED_DIR = BASE_DIR.joinpath("resampled")
PROCESSED_DIR = BASE_DIR.joinpath("processed")
EVENTS_DIR = BASE_DIR.joinpath("interm", "events")

# --- Concrete file locations -------------------------------------------------
RESAMPLED_FILE_PATH = RESAMPLED_DIR / (RESAMPLED_NAME + ".pkl")
PROCESSED_FILE_PATH = PROCESSED_DIR / (RESAMPLED_NAME + "_FEATURES.pkl")

In [82]:
%%time
# Load the feature frame stored at PROCESSED_FILE_PATH (the "*_FEATURES.pkl" file
# for the configured symbol / bar size / date range).
# NOTE(review): unpickling can execute arbitrary code — only load pickles that
# were produced by a trusted source.
df = pd.read_pickle(PROCESSED_FILE_PATH)

CPU times: user 1.16 ms, sys: 19.7 ms, total: 20.9 ms
Wall time: 38.9 ms


In [83]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df.shape

(298164, 67)

In [84]:
# Peek at the first five rows to confirm the timestamp index and column layout.
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,spread,close_pct_return,close_return,close_log_return,close_fd_return,...,dom,month,hour_sin,hour_cos,dow_sin,dow_cos,dom_sin,dom_cos,month_sin,month_cos
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-07 03:25:00,103.1615,103.167,103.148,103.1505,722520000000.0,0.003158,-9.2e-05,-0.0095,-9.2e-05,1.913549,...,7,1,0.707107,0.707107,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025
2021-01-07 03:30:00,103.1485,103.159,103.1365,103.1365,867600000000.0,0.00323,-0.000136,-0.014,-0.000136,1.903814,...,7,1,0.707107,0.707107,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025
2021-01-07 03:35:00,103.1375,103.1475,103.131,103.131,728550000000.0,0.003208,-5.3e-05,-0.0055,-5.3e-05,1.906306,...,7,1,0.707107,0.707107,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025
2021-01-07 03:40:00,103.1315,103.1335,103.113,103.127,764210000000.0,0.003317,-3.9e-05,-0.004,-3.9e-05,1.907271,...,7,1,0.707107,0.707107,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025
2021-01-07 03:45:00,103.128,103.132,103.122,103.128,362270000000.0,0.002974,1e-05,0.001,1e-05,1.912092,...,7,1,0.707107,0.707107,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025


In [85]:
# List every available column name (df.head() elides the middle columns).
df.columns

Index(['open', 'high', 'low', 'close', 'volume', 'spread', 'close_pct_return',
       'close_return', 'close_log_return', 'close_fd_return',
       'close_log_fd_return', 'EMA_5', 'EMA_9', 'EMA_20', 'EMA_24', 'EMA_36',
       'EMA_50', 'EMA_100', 'ATRr_14', 'ATRr_60', 'ATRr_120', 'ADX_14',
       'ADXR_14_2', 'DMP_14', 'DMN_14', 'RSI_14', 'RSI_25', 'RSI_50',
       'BBL_5_2.0_2.0', 'BBM_5_2.0_2.0', 'BBU_5_2.0_2.0', 'BBB_5_2.0_2.0',
       'BBP_5_2.0_2.0', 'BBL_10_2.0_2.0', 'BBM_10_2.0_2.0', 'BBU_10_2.0_2.0',
       'BBB_10_2.0_2.0', 'BBP_10_2.0_2.0', 'BBL_15_2.0_2.0', 'BBM_15_2.0_2.0',
       'BBU_15_2.0_2.0', 'BBB_15_2.0_2.0', 'BBP_15_2.0_2.0', 'BBL_20_2.0_2.0',
       'BBM_20_2.0_2.0', 'BBU_20_2.0_2.0', 'BBB_20_2.0_2.0', 'BBP_20_2.0_2.0',
       'MACD_8_17_9', 'MACDh_8_17_9', 'MACDs_8_17_9', 'MACD_12_26_9',
       'MACDh_12_26_9', 'MACDs_12_26_9', 'unix_time', 'hour', 'dow', 'dom',
       'month', 'hour_sin', 'hour_cos', 'dow_sin', 'dow_cos', 'dom_sin',
       'dom_cos', 'month_sin',

## Apply CUSUM Filter

## Calculate CUSUM Threshold

In [86]:
import numpy as np

In [87]:
# Standard deviation of bar-to-bar log returns of `close`. The first element of
# the diff has no predecessor (NaN), so it is sliced off before taking the std.
return_std = df['close'].pipe(np.log).diff().iloc[1:].std()

# Display three standard deviations as a candidate CUSUM threshold.
3 * return_std

np.float64(0.0010667252441945762)

In [88]:
# Trigger level passed to the CUSUM filter, in log-return units.
THRESHOLD = 0.001

# Label for this event set; it is embedded in the output file name.
EVENT_NAME = 'CUSUM_' + str(THRESHOLD)

# Destination pickle for the detected event timestamps.
EVENT_PATH = EVENTS_DIR.joinpath(RESAMPLED_NAME + "_" + EVENT_NAME + ".pkl")

## Get Events Start Time (t0)

In [89]:
import pandas as pd
from tqdm import tqdm

def cusum_filter(
    closes: pd.Series, threshold: float, show_progress: bool = True
) -> pd.DatetimeIndex:
    """Return the timestamps at which a symmetric CUSUM filter fires.

    Log returns of ``closes`` are accumulated into two running sums: an upward
    sum that is floored at zero and a downward sum that is capped at zero.
    Whenever the upward sum exceeds ``threshold`` or the downward sum falls
    below ``-threshold``, the bar is flagged as an event and the sum that
    fired is reset to zero.

    Parameters
    ----------
    closes : pd.Series
        Price series; its index supplies the returned timestamps. The first
        element only serves as the base of the first return, so it can never
        be an event.
    threshold : float
        Trigger level in log-return units. Must be strictly positive.
    show_progress : bool, default True
        Wrap the loop in a ``tqdm`` progress bar. Pass ``False`` to run
        silently (``tqdm`` is then not needed at all).

    Returns
    -------
    pd.DatetimeIndex
        The subset of ``closes.index[1:]`` at which an event fired, in the
        original order. Empty when ``closes`` has fewer than two elements.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly positive (this includes NaN). A
        negative threshold would flag every bar and NaN would flag none, so
        both are rejected instead of silently producing a meaningless result.

    Notes
    -----
    A NaN return resets both running sums, because ``max(0.0, nan)`` and
    ``min(0.0, nan)`` both evaluate to ``0.0``.
    """
    # `not threshold > 0` (rather than `threshold <= 0`) also rejects NaN.
    if not threshold > 0:
        raise ValueError(f"threshold must be a positive number, got {threshold!r}")

    # Drop the first diff element: it has no predecessor and is always NaN.
    log_returns = np.log(closes).diff().iloc[1:]
    values = log_returns.values
    timestamps = log_returns.index

    t_events_mask = np.zeros_like(values, dtype=bool)

    steps = range(len(values))
    if show_progress:
        steps = tqdm(steps)

    cum_pos, cum_neg = 0.0, 0.0

    for i in steps:
        cum_pos = max(0.0, cum_pos + values[i])
        cum_neg = min(0.0, cum_neg + values[i])

        # With a positive threshold at most one side can fire on a given bar,
        # so if/elif loses nothing.
        if cum_pos > threshold:
            t_events_mask[i] = True
            cum_pos = 0.0
        elif cum_neg < -threshold:
            t_events_mask[i] = True
            cum_neg = 0.0

    return timestamps[t_events_mask]


In [90]:
%%time

# Run the CUSUM filter on the close prices; t_events holds the timestamps of
# the bars where the filter fired.
t_events = cusum_filter(df['close'], threshold=THRESHOLD)

100%|████████████████████████████████████| 298163/298163 [00:00<00:00, 3616689.89it/s]

CPU times: user 84 ms, sys: 1.7 ms, total: 85.7 ms
Wall time: 85.2 ms





In [91]:
# Inspect the detected event timestamps (first/last entries and total count).
t_events

DatetimeIndex(['2021-01-07 05:45:00', '2021-01-07 07:05:00',
               '2021-01-07 08:05:00', '2021-01-07 09:10:00',
               '2021-01-07 09:50:00', '2021-01-07 10:35:00',
               '2021-01-07 12:00:00', '2021-01-07 12:35:00',
               '2021-01-07 14:20:00', '2021-01-07 14:35:00',
               ...
               '2024-12-30 14:45:00', '2024-12-30 15:00:00',
               '2024-12-30 15:20:00', '2024-12-30 15:35:00',
               '2024-12-30 15:40:00', '2024-12-30 16:40:00',
               '2024-12-30 18:00:00', '2024-12-30 19:00:00',
               '2024-12-30 20:30:00', '2024-12-30 23:05:00'],
              dtype='datetime64[ns]', name='timestamp', length=24493, freq=None)

In [92]:
# Show the event-set label and the file it will be written to, one per line.
print(EVENT_NAME, EVENT_PATH, sep="\n")

CUSUM_0.001
../data/interm/events/USDJPY-5m-20210101-20241231_CUSUM_0.001.pkl


In [93]:
# Persist the event timestamps as a Series whose index and values are both the
# event times.
# to_pickle does not create missing folders, so make sure the events directory
# exists first; otherwise a fresh checkout fails with FileNotFoundError.
EVENT_PATH.parent.mkdir(parents=True, exist_ok=True)
t_events.to_series().to_pickle(EVENT_PATH)