In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

import os

In [2]:
from pathlib import Path

# Configurable parts
SYMBOL = "USDJPY"
SAMPLE_TYPE = "time"
MINUTES = 5
EVENT_NAME = 'CUSUM_0.001'
LABEL_NAME = "FH"

START_DATE = "20210101"
END_DATE = "20241231"

RESAMPLED_NAME = f"{SYMBOL}-{MINUTES}m-{START_DATE}-{END_DATE}"

# Base directories
BASE_DIR = Path("../data")
RESAMPLED_DIR = BASE_DIR / "resampled"
PROCESSED_DIR = BASE_DIR / "processed"
EVENTS_DIR = BASE_DIR / "interm/events"

# Final paths
RESAMPLED_FILE_PATH = RESAMPLED_DIR / f"{RESAMPLED_NAME}.pkl"
PROCESSED_FILE_PATH = PROCESSED_DIR / f"{RESAMPLED_NAME}_FEATURES.pkl"
EVENT_FILE_PATH = EVENTS_DIR / f"{RESAMPLED_NAME}_{EVENT_NAME}.pkl"

In [3]:
DIRECTION_LABEL_DIR = BASE_DIR / "labels/direction_labels"
DIRECTION_LABEL_DIR.mkdir(parents=True, exist_ok=True)
DIRECTION_LABEL_FILE_PATH = DIRECTION_LABEL_DIR / f"{RESAMPLED_NAME}-{EVENT_NAME}-{LABEL_NAME}.pkl"

In [4]:
%%time
df = pd.read_pickle(PROCESSED_FILE_PATH)
t_events = pd.read_pickle(EVENT_FILE_PATH)

CPU times: user 1.11 ms, sys: 27.7 ms, total: 28.8 ms
Wall time: 39.8 ms


In [5]:
df.loc[t_events].head()

Unnamed: 0_level_0,open,high,low,close,volume,spread,close_pct_return,close_return,close_log_return,close_fd_return,...,dom,month,hour_sin,hour_cos,dow_sin,dow_cos,dom_sin,dom_cos,month_sin,month_cos
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-07 05:45:00,103.1925,103.226,103.189,103.2135,704760000000.0,0.00241,0.000199,0.0205,0.000199,1.954618,...,7,1,0.965926,0.258819,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025
2021-01-07 07:05:00,103.2515,103.3085,103.25,103.2995,877410000000.0,0.002593,0.00046,0.0475,0.00046,1.993861,...,7,1,0.965926,-0.258819,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025
2021-01-07 08:05:00,103.359,103.3945,103.356,103.3795,1817860000000.0,0.002563,0.000189,0.0195,0.000189,1.972264,...,7,1,0.866025,-0.5,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025
2021-01-07 09:10:00,103.422,103.473,103.416,103.466,1227620000000.0,0.002118,0.000421,0.0435,0.000421,2.0029,...,7,1,0.707107,-0.707107,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025
2021-01-07 09:50:00,103.51,103.5775,103.5085,103.5755,2519750000000.0,0.001983,0.000623,0.0645,0.000623,2.044747,...,7,1,0.707107,-0.707107,0.433884,-0.900969,0.988468,0.151428,0.5,0.866025


In [15]:
df['next_return'] = df['close_log_return'].shift(-1)

In [22]:
return_threshold = df['next_return'].abs().quantile(.3)
return_threshold

np.float64(6.894801876440301e-05)

In [27]:
def bin_ret(ret: float, threshold: float):
    if abs(ret) < threshold: 
        return 0
    elif ret > 0:
        return 1
    else:
        return -1

In [28]:
df['bin'] = df['next_return'].apply(lambda x: bin_ret(x, threshold=return_threshold))

In [29]:
df['bin'].value_counts()

bin
 1    106649
-1    102066
 0     89449
Name: count, dtype: int64

In [35]:
labels = df['bin'].reindex(t_events)

In [36]:
labels

timestamp
2021-01-07 05:45:00   -1
2021-01-07 07:05:00   -1
2021-01-07 08:05:00    0
2021-01-07 09:10:00   -1
2021-01-07 09:50:00    0
                      ..
2024-12-30 16:40:00   -1
2024-12-30 18:00:00   -1
2024-12-30 19:00:00    0
2024-12-30 20:30:00   -1
2024-12-30 23:05:00    1
Name: bin, Length: 24493, dtype: int64

In [37]:
labels["bin_class"] = labels["bin"] + 1

KeyError: 'bin'

In [333]:
labeled_events = events.join(labels, how='inner')

In [334]:
labeled_events.head()

Unnamed: 0_level_0,t1,trgt,ret,bin,bin_class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-01-07 09:10:00,2021-01-07 09:45:00,0.000432,0.000435,1.0,2.0
2021-01-07 09:50:00,2021-01-07 10:30:00,0.000422,0.000531,1.0,2.0
2021-01-07 10:35:00,2021-01-07 12:00:00,0.000411,0.00069,1.0,2.0
2021-01-07 12:00:00,2021-01-07 12:20:00,0.000446,-0.000603,-1.0,0.0
2021-01-07 12:35:00,2021-01-07 13:10:00,0.000422,-0.00054,-1.0,0.0


## Save events to disk

In [335]:
labeled_events.to_pickle(DIRECTION_LABEL_FILE_PATH)

In [336]:
DIRECTION_LABEL_FILE_PATH

PosixPath('../data/labels/direction_labels/USDJPY-5m-20210101-20241231-CUSUM_0.001-TB.pkl')