In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

import os

In [2]:
from pathlib import Path

# Configurable parts
SYMBOL = "USDJPY"
SAMPLE_TYPE = "time"
MINUTES = 15

START_DATE = "20240101"
END_DATE = "20241231"

# Build base name
RESAMPLED_NAME = f"{SYMBOL}-{MINUTES}m-{START_DATE}-{END_DATE}"

# Base directories
BASE_DIR = Path("../data")
RESAMPLED_DIR = BASE_DIR / "resampled"
PROCESSED_DIR = BASE_DIR / "processed"
EVENTS_DIR = BASE_DIR / "events"

# Final paths
RESAMPLED_FILE_PATH = RESAMPLED_DIR / f"{RESAMPLED_NAME}.pkl"
PROCESSED_FILE_PATH = PROCESSED_DIR / f"{RESAMPLED_NAME}_processed.pkl"

## Load Data

In [3]:
df = pd.read_pickle(PROCESSED_FILE_PATH)

In [4]:
FAST = 8
SLOW = 34
EMACROSS_EVENT_PATH = EVENTS_DIR / f"{RESAMPLED_NAME}_EMA_CROSS_{FAST}_{SLOW}.pkl"

In [5]:
ema_cross_events = pd.read_pickle(EMACROSS_EVENT_PATH)

In [6]:
LEGS = 5
THRESHOLD = .1
ZIGZAG_EVENT_PATH = EVENTS_DIR / f"{RESAMPLED_NAME}_ZIGZAGs_{THRESHOLD}%_{LEGS}.pkl"

In [7]:
zigzag_events = pd.read_pickle(ZIGZAG_EVENT_PATH)

In [8]:
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,spread,return,log_return,fd_return,log_fd_return
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-02-18 23:00:00,1.07757,1.07803,1.077555,1.078015,1784510000000.0,6.9e-05,0.000435,0.000404,0.167941,0.011969
2024-02-18 23:15:00,1.07801,1.078175,1.077955,1.07817,927200000000.0,3.2e-05,0.000155,0.000144,0.168,0.012025
2024-02-18 23:30:00,1.078155,1.07823,1.078055,1.07823,1681190000000.0,3.1e-05,6e-05,5.6e-05,0.167989,0.012014
2024-02-18 23:45:00,1.078225,1.078515,1.078195,1.078415,1629270000000.0,3e-05,0.000185,0.000172,0.168124,0.012139
2024-02-19 00:00:00,1.078395,1.07878,1.07838,1.078775,2773220000000.0,3e-05,0.00036,0.000334,0.168415,0.012409


In [9]:
ema_cross_events

timestamp
2024-02-19 07:15:00   2024-02-19 07:15:00
2024-02-19 09:00:00   2024-02-19 09:00:00
2024-02-19 18:00:00   2024-02-19 18:00:00
2024-02-20 00:00:00   2024-02-20 00:00:00
2024-02-20 07:00:00   2024-02-20 07:00:00
                              ...        
2024-12-30 06:15:00   2024-12-30 06:15:00
2024-12-30 08:00:00   2024-12-30 08:00:00
2024-12-30 10:00:00   2024-12-30 10:00:00
2024-12-30 13:45:00   2024-12-30 13:45:00
2024-12-30 23:45:00   2024-12-30 23:45:00
Name: timestamp, Length: 813, dtype: datetime64[ns]

In [10]:
zigzag_events

timestamp
2024-02-19 00:30:00   2024-02-19 00:30:00
2024-02-19 07:15:00   2024-02-19 07:15:00
2024-02-19 18:15:00   2024-02-19 18:15:00
2024-02-20 10:30:00   2024-02-20 10:30:00
2024-02-20 14:30:00   2024-02-20 14:30:00
                              ...        
2024-12-30 12:00:00   2024-12-30 12:00:00
2024-12-30 13:00:00   2024-12-30 13:00:00
2024-12-30 14:45:00   2024-12-30 14:45:00
2024-12-30 18:45:00   2024-12-30 18:45:00
2024-12-30 23:00:00   2024-12-30 23:00:00
Name: timestamp, Length: 927, dtype: datetime64[ns]

## Locate ZigZag prices

In [11]:
df['next_zigzag'] = df['close'].loc[zigzag_events]

In [12]:
df['next_zigzag']

timestamp
2024-02-18 23:00:00        NaN
2024-02-18 23:15:00        NaN
2024-02-18 23:30:00        NaN
2024-02-18 23:45:00        NaN
2024-02-19 00:00:00        NaN
                        ...   
2024-12-30 22:45:00        NaN
2024-12-30 23:00:00    1.04013
2024-12-30 23:15:00        NaN
2024-12-30 23:30:00        NaN
2024-12-30 23:45:00        NaN
Name: next_zigzag, Length: 21642, dtype: float64

We've find the price at each zigzag point, and filled into **next_zigzag** column.

Now we need to backfill price, so the price at each timestamp represents the next zigzag price

In [13]:
df['next_zigzag'].bfill(inplace=True)

## Labeling for regression task

We can use next ZigZag price as a target to train regressor models to predict where the price might go.

In [14]:
label = pd.DataFrame(df['next_zigzag'], index=ema_cross_events)

In [15]:
label.dropna(inplace=True)

## Labeling for classfication task

In [16]:
label = pd.DataFrame(df[['close', 'next_zigzag']], index=ema_cross_events)

In [17]:
label

Unnamed: 0_level_0,close,next_zigzag
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-02-19 07:15:00,1.078685,1.078685
2024-02-19 09:00:00,1.077720,1.078060
2024-02-19 18:00:00,1.077980,1.078060
2024-02-20 00:00:00,1.077310,1.080525
2024-02-20 07:00:00,1.077360,1.080525
...,...,...
2024-12-30 06:15:00,1.042935,1.042855
2024-12-30 08:00:00,1.042380,1.045740
2024-12-30 10:00:00,1.044170,1.045740
2024-12-30 13:45:00,1.038800,1.040535


In [18]:
conditions = [
    label['next_zigzag'] < label['close'],  # 跌
    label['next_zigzag'] == label['close'],  # 坪
    label['next_zigzag'] > label['close']  # 漲
]

choices = [0, 1, 2]
label['class'] = np.select(conditions, choices)

In [19]:
label.drop(columns=['close', 'next_zigzag'], inplace=True)

In [20]:
label

Unnamed: 0_level_0,class
timestamp,Unnamed: 1_level_1
2024-02-19 07:15:00,1
2024-02-19 09:00:00,2
2024-02-19 18:00:00,2
2024-02-20 00:00:00,2
2024-02-20 07:00:00,2
...,...
2024-12-30 06:15:00,0
2024-12-30 08:00:00,2
2024-12-30 10:00:00,2
2024-12-30 13:45:00,2


In [21]:
label.value_counts()

class
2        569
0        196
1         48
Name: count, dtype: int64