In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

import os

In [80]:
from pathlib import Path

# Configurable parts
SYMBOL = "USDJPY"
SAMPLE_TYPE = "time"
MINUTES = 5

START_DATE = "20240101"
END_DATE = "20241231"

RESAMPLED_NAME = f"{SYMBOL}-{MINUTES}m-{START_DATE}-{END_DATE}"

# Base directories
BASE_DIR = Path("../data")
RESAMPLED_DIR = BASE_DIR / "resampled"
PROCESSED_DIR = BASE_DIR / "processed"
SIDES_DIR = BASE_DIR / "interm/sides"
META_LABELS_DIR = BASE_DIR / "labels/meta_labels"

# Final paths
RESAMPLED_FILE_PATH = RESAMPLED_DIR / f"{RESAMPLED_NAME}.pkl"
PROCESSED_FILE_PATH = PROCESSED_DIR / f"{RESAMPLED_NAME}_FEATURES.pkl"

In [81]:
SIDE_NAME = "MACDCross_9_20_9"
SIDE_FILE_PATH = SIDES_DIR / f"{RESAMPLED_NAME}-{SIDE_NAME}.pkl"

In [82]:
%%time
df = pd.read_pickle(PROCESSED_FILE_PATH)
sides = pd.read_pickle(SIDE_FILE_PATH)

CPU times: user 2.48 ms, sys: 23.6 ms, total: 26.1 ms
Wall time: 27.8 ms


In [83]:
t_events = sides.index
df.loc[t_events].head()

Unnamed: 0_level_0,open,high,low,close,volume,spread,close_pct_return,close_return,close_log_return,close_fd_return,...,dom,month,hour_sin,hour_cos,dow_sin,dow_cos,dom_sin,dom_cos,month_sin,month_cos
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-01-05 03:25:00,144.896,144.9415,144.894,144.9105,2780660000000.0,0.006927,0.0001,0.0145,0.0001,2.788711,...,5,1,0.707107,0.707107,-0.433884,-0.900969,0.848644,0.528964,0.5,0.866025
2024-01-05 05:55:00,144.715,144.7455,144.7065,144.732,4156960000000.0,0.006613,0.000114,0.0165,0.000114,2.725506,...,5,1,0.965926,0.258819,-0.433884,-0.900969,0.848644,0.528964,0.5,0.866025
2024-01-05 07:10:00,145.1235,145.1265,145.0005,145.034,6889840000000.0,0.006496,-0.00062,-0.09,-0.00062,2.656937,...,5,1,0.965926,-0.258819,-0.433884,-0.900969,0.848644,0.528964,0.5,0.866025
2024-01-05 08:40:00,145.026,145.07,144.995,145.065,3996390000000.0,0.00554,0.000265,0.0385,0.000265,2.777666,...,5,1,0.866025,-0.5,-0.433884,-0.900969,0.848644,0.528964,0.5,0.866025
2024-01-05 10:40:00,145.246,145.251,145.2065,145.244,4032040000000.0,0.005936,-1.7e-05,-0.0025,-1.7e-05,2.753565,...,5,1,0.5,-0.866025,-0.433884,-0.900969,0.848644,0.528964,0.5,0.866025


## Calculate Target Volatility (trgt)
This will be used for the calculation of dynamic TP/SL

In [84]:
import pandas_ta as ta

In [85]:
df.ta.atr(length=9, append=True)
df['ATRr_9_PERC'] = df.ta.atr(length=0, percent=True)
df.dropna(inplace=True)
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,spread,close_pct_return,close_return,close_log_return,close_fd_return,...,hour_sin,hour_cos,dow_sin,dow_cos,dom_sin,dom_cos,month_sin,month_cos,ATRr_9,ATRr_9_PERC
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-01-05 04:35:00,144.823,144.8405,144.807,144.8135,1595700000000.0,0.006927,-6.9e-05,-0.01,-6.9e-05,2.698221,...,0.866025,0.5,-0.433884,-0.900969,0.848644,0.528964,0.5,0.866025,0.044902,0.030705
2024-01-05 04:40:00,144.815,144.8465,144.8035,144.836,2861150000000.0,0.006798,0.000155,0.0225,0.000155,2.731859,...,0.866025,0.5,-0.433884,-0.900969,0.848644,0.528964,0.5,0.866025,0.04469,0.030628
2024-01-05 04:45:00,144.837,144.8565,144.8325,144.8475,3667890000000.0,0.006293,7.9e-05,0.0115,7.9e-05,2.736636,...,0.866025,0.5,-0.433884,-0.900969,0.848644,0.528964,0.5,0.866025,0.042392,0.029621
2024-01-05 04:50:00,144.848,144.861,144.84,144.8555,2710540000000.0,0.006701,5.5e-05,0.008,5.5e-05,2.738716,...,0.866025,0.5,-0.433884,-0.900969,0.848644,0.528964,0.5,0.866025,0.040015,0.028539
2024-01-05 04:55:00,144.855,144.8575,144.833,144.8505,2943220000000.0,0.006557,-3.5e-05,-0.005,-3.5e-05,2.728594,...,0.866025,0.5,-0.433884,-0.900969,0.848644,0.528964,0.5,0.866025,0.038291,0.02771


In [86]:
trgt = df['ATRr_9_PERC'].reindex(t_events) / 100

In [87]:
trgt.describe()

count    6858.000000
mean        0.000422
std         0.000272
min         0.000049
25%         0.000240
50%         0.000361
75%         0.000529
max         0.003888
Name: ATRr_9_PERC, dtype: float64

## Apply Triple Barrier
We use daily volatility to calculate TP/SL, and we set our T1 as 1day

In [88]:
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import pandas as pd

def applyPtSlOnT1(close,events,ptSl,molecule):
    # apply stop loss/profit taking, if it takes place before t1 (end of event)
    events_=events.loc[molecule]
    out=events_[['t1']].copy(deep=True)
    
    pt = ptSl[0] * events_['trgt'] if ptSl[0] > 0 else pd.Series(index=events.index)
    sl = -ptSl[1] * events_['trgt'] if ptSl[1] > 0 else pd.Series(index=events.index)

        
    for loc, t1 in events_['t1'].fillna(close.index[-1]).items():
        df0=close[loc:t1] # path prices, i.e. price[t0:t1]
        df0=(df0/close[loc]-1)*events_.at[loc,'side'] # path returns
        out.loc[loc,'sl']=df0[df0<sl[loc]].index.min() # earliest stop loss
        out.loc[loc,'pt']=df0[df0>pt[loc]].index.min() # earliest profit taking
    return out


def parallel_apply(func, items, num_threads=4, **kwargs):
    
    def worker(molecule):
        return func(molecule=molecule, **kwargs)

    chunks = np.array_split(items, num_threads)
    
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        results = list(executor.map(worker, chunks))

    return pd.concat(results).sort_index()


def getEvents(close, tEvents, ptSl, trgt, minTrgt, numThreads=4, t1=False, side=None):
    # Step 1: Filter targets
    trgt = trgt.loc[tEvents]
    trgt = trgt[trgt > minTrgt]

    # Step 2: Set vertical barrier (t1)
    if t1 is False:
        t1 = pd.Series(pd.NaT, index=tEvents)

    # Step 3: Build events DataFrame
    if side is None:
        side_, ptSl_ = pd.Series(1., index=trgt.index), [ptSl[0], ptSl[0]]
    else:
        side_, ptSl_ = side.loc[trgt.index], ptSl[:2]

    events = pd.concat({'t1': t1, 'trgt': trgt, 'side': side_}, axis=1).dropna(subset=['trgt'])

    # Step 4: Apply barriers in parallel
    df0 = parallel_apply(
        func=applyPtSlOnT1,
        items=events.index,
        num_threads=numThreads,
        close=close,
        events=events,
        ptSl=ptSl_
    )

    # Step 5: Choose the first touched barrier
    events['t1'] = df0.dropna(how='all').min(axis=1)
    
    if side is None:
        events = events.drop('side', axis=1)
        
    return events


In [89]:
def get_vertical_barrier(t_events, close, delta=pd.Timedelta(minutes=60)):
    """
    For each event in t_events, find the timestamp in close.index 
    that is at least delta later. Returns a pd.Series of t1.
    """
    barrier_times = t_events + delta
    t1_idx = close.index.searchsorted(barrier_times)
    valid_idx = t1_idx[t1_idx < len(close)]
    t1 = pd.Series(close.index[valid_idx], index=t_events[:len(valid_idx)])
    return t1

In [90]:
t1 = get_vertical_barrier(t_events, df['close'], delta=pd.Timedelta(minutes=60))

In [91]:
t1

timestamp
2024-01-05 03:25:00   2024-01-05 04:35:00
2024-01-05 05:55:00   2024-01-05 06:55:00
2024-01-05 07:10:00   2024-01-05 08:10:00
2024-01-05 08:40:00   2024-01-05 09:40:00
2024-01-05 10:40:00   2024-01-05 11:40:00
                              ...        
2024-12-30 15:40:00   2024-12-30 16:40:00
2024-12-30 17:25:00   2024-12-30 18:25:00
2024-12-30 19:20:00   2024-12-30 20:20:00
2024-12-30 20:25:00   2024-12-30 21:25:00
2024-12-30 21:35:00   2024-12-30 22:35:00
Name: timestamp, Length: 6858, dtype: datetime64[ns]

In [92]:
trgt.describe()

count    6858.000000
mean        0.000422
std         0.000272
min         0.000049
25%         0.000240
50%         0.000361
75%         0.000529
max         0.003888
Name: ATRr_9_PERC, dtype: float64

In [93]:
minTrgt = trgt.quantile(.3)
minTrgt

np.float64(0.0002630614833321803)

In [94]:
%%time

events = getEvents(
    close=df['close'],
    tEvents=t_events, 
    ptSl=[.5,.5], 
    trgt=trgt, 
    minTrgt=minTrgt, 
    numThreads=20, 
    t1=t1, 
    side=sides
)

CPU times: user 1.14 s, sys: 29.7 ms, total: 1.17 s
Wall time: 1.15 s


In [95]:
events.head()

Unnamed: 0_level_0,t1,trgt,side
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-05 05:55:00,2024-01-05 06:00:00,0.000325,1.0
2024-01-05 07:10:00,2024-01-05 07:20:00,0.000612,-1.0
2024-01-05 08:40:00,2024-01-05 09:00:00,0.000632,1.0
2024-01-05 10:40:00,2024-01-05 10:50:00,0.00047,-1.0
2024-01-05 11:20:00,2024-01-05 11:30:00,0.000431,1.0


In [96]:
events = events.dropna().copy()

# Keep only intraday events

In [97]:
events = events[events.index.date==events.t1.dt.date]

## Create classification labels

In [98]:
def getBins(events, close, minRet=0.0, t1=None):
    '''
    Compute event's outcome (including side information, if provided).
    events is a DataFrame where:
    -events.index is event's starttime
    -events['t1'] is event's endtime
    -events['trgt'] is event's target
    -events['side'] (optional) implies the algo's position side
    -t1 is original vertical barrier series
    Case 1: ('side' not in events): bin in (-1,1) <-label by price action
    Case 2: ('side' in events): bin in (0,1) <-label by pnl (meta-labeling)
    '''
    # 1) prices aligned with events
    events_ = events.dropna(subset=['t1'])
    px = events_.index.union(events_['t1'].values).drop_duplicates()
    px = close.reindex(px, method='bfill')
    # 2) create out object
    out = pd.DataFrame(index=events_.index)
    out['ret'] = px.loc[events_['t1'].values].values / px.loc[
        events_.index] - 1
    if 'side' in events_: out['ret'] *= events_['side']  # meta-labeling
    out['bin'] = np.sign(out['ret'])

    if 'side' not in events_:
        # only applies when not meta-labeling.
        # to update bin to 0 when vertical barrier is touched, we need the
        # original vertical barrier series since the events['t1'] is the time
        # of first touch of any barrier and not the vertical barrier
        # specifically. The index of the intersection of the vertical barrier
        # values and the events['t1'] values indicate which bin labels needs
        # to be turned to 0.
        vtouch_first_idx = events[events['t1'].isin(t1.values)].index
        out.loc[vtouch_first_idx, 'bin'] = 0.

    if 'side' in events_: out.loc[out['ret'] <= 0, 'bin'] = 0  # meta-labeling
    # out.loc[out['ret'].abs() < minRet, 'bin'] = 0. # 不做獲利過少的交易 (設為 0)
    # out = out.loc[out['ret'].abs() >= minRet] # 過濾掉獲利過少的交易 (刪掉)
    return out

In [99]:
labels = getBins(events, df['close'], minRet=0, t1=t1)

In [100]:
labels.bin.value_counts()

bin
1.0    2420
0.0    2355
Name: count, dtype: int64

In [101]:
labels = labels.join(events)

In [102]:
LABELS_FILE_PATH = META_LABELS_DIR / f"{RESAMPLED_NAME}-{SIDE_NAME}-meta.pkl"

In [103]:
labels.to_pickle(LABELS_FILE_PATH)