In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

import os

In [2]:
from pathlib import Path

# Configurable parts: data vendor, instrument, bar size in minutes, and the
# date range. These values are only used to build the file names below.
SOURCE = "dukascopy"
SYMBOL = "usdjpy"
MINUTES = 1
START_DATE = "2020-01-01"
END_DATE = "2024-12-31"

# Build base names (file stems without extension).
# NOTE(review): BASE_NAME is not referenced by any later cell visible in this
# notebook — confirm whether it is still needed.
BASE_NAME = f"{SOURCE}-{SYMBOL}-tick-{START_DATE}-{END_DATE}"
RESAMPLED_NAME = f"{SOURCE}-{SYMBOL}-{MINUTES}m-{START_DATE}-{END_DATE}"
# Base directories (relative to the notebook's working directory)
BASE_DIR = Path("../data")
RESAMPLED_DIR = BASE_DIR / "resampled"

# Final path of the resampled bars pickle that is loaded further down
RESAMPLED_FILE_PATH = RESAMPLED_DIR / f"{RESAMPLED_NAME}.pkl"

In [3]:
# Tags identifying how events and sides were generated; they only feed the
# output file name.
EVENT_NAME = 'cusum_filter'
SIDE_NAME = 'trend_line_same'
# Output directory for labelled events; created on the fly if missing.
LABEL_DIR = BASE_DIR / "labels"
LABEL_DIR.mkdir(parents=True, exist_ok=True)
# Destination of the final labelled-events pickle written by the last cell.
LABEL_FILE_PATH = LABEL_DIR / f"{RESAMPLED_NAME}-{EVENT_NAME}-{SIDE_NAME}.pkl"

In [4]:
%%time
# Load the resampled bars. Unpickling can execute arbitrary code, so only
# read pickle files that were produced by a trusted pipeline.
df = pd.read_pickle(RESAMPLED_FILE_PATH)

CPU times: total: 46.9 ms
Wall time: 54.6 ms


In [5]:
df.shape

(1851808, 7)

In [6]:
df.head()

Unnamed: 0,timestamp,open,high,low,close,volume,spread
0,2020-01-01 22:00:00,108.758,108.76,108.758,108.759,9179.999948,0.060667
1,2020-01-01 22:01:00,108.757,108.759,108.7495,108.7495,13300.000012,0.060333
2,2020-01-01 22:02:00,108.7495,108.7535,108.7495,108.7535,4500.0,0.058667
3,2020-01-01 22:03:00,108.754,108.7555,108.7535,108.7555,10490.00001,0.059857
4,2020-01-01 22:04:00,108.7575,108.765,108.7555,108.765,11600.000024,0.0551


In [7]:
# Make sure `timestamp` is a datetime column.
# NOTE(review): the head() output above already renders `timestamp` as
# datetimes; `unit='ms'` only matters for numeric epoch input — confirm
# whether this conversion is still needed.
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')

In [8]:
# Index the bars by time so later cells can use label-based lookups
# (searchsorted / reindex / .loc on timestamps).
# Not idempotent: re-running this cell fails because `timestamp` is no longer
# a column after the first run.
df = df.set_index('timestamp')

In [9]:
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,spread
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-01 22:00:00,108.758,108.76,108.758,108.759,9179.999948,0.060667
2020-01-01 22:01:00,108.757,108.759,108.7495,108.7495,13300.000012,0.060333
2020-01-01 22:02:00,108.7495,108.7535,108.7495,108.7535,4500.0,0.058667
2020-01-01 22:03:00,108.754,108.7555,108.7535,108.7555,10490.00001,0.059857
2020-01-01 22:04:00,108.7575,108.765,108.7555,108.765,11600.000024,0.0551


In [10]:
import numpy as np

# One-bar log return of the close; the first row has no previous close and
# therefore becomes NaN.
df['log_return'] = np.log(df['close'] / df['close'].shift(1))
# log(1 + volume): compresses the volume scale and is defined for volume == 0.
df['log_volume'] = np.log1p(df['volume'])
# Drop every row containing a NaN in any column (at least the first bar,
# whose log_return is undefined).
df = df.dropna()

In [11]:
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,spread,log_return,log_volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-01 22:01:00,108.757,108.759,108.7495,108.7495,13300.000012,0.060333,-8.7e-05,9.495595
2020-01-01 22:02:00,108.7495,108.7535,108.7495,108.7535,4500.0,0.058667,3.7e-05,8.412055
2020-01-01 22:03:00,108.754,108.7555,108.7535,108.7555,10490.00001,0.059857,1.8e-05,9.258273
2020-01-01 22:04:00,108.7575,108.765,108.7555,108.765,11600.000024,0.0551,8.7e-05,9.358847
2020-01-01 22:05:00,108.77,108.77,108.769,108.77,1059.999987,0.021333,4.6e-05,6.966967


In [12]:
import numpy as np
import pandas as pd
from tqdm import tqdm

def get_tevents_optimized(data: pd.Series, threshold: float) -> pd.DatetimeIndex:
    """Select event timestamps with a symmetric CUSUM filter.

    Two running sums are maintained over ``data``: an upward sum floored at
    zero and a downward sum capped at zero. Whenever the upward sum exceeds
    ``threshold`` or the downward sum drops below ``-threshold``, the current
    timestamp is flagged as an event and only the sum that fired is reset.

    Args:
        data: Series of increments (the notebook passes log returns); its
            index supplies the timestamps that are returned.
        threshold: Positive magnitude a running sum must strictly exceed to
            trigger an event.

    Returns:
        The subset of ``data.index`` at which an event was triggered, in the
        original order. Empty when ``data`` is empty or nothing fires.
    """
    values = data.values
    timestamps = data.index

    # One flag per observation; True marks an event bar.
    t_events_mask = np.zeros_like(values, dtype=bool)

    cum_pos, cum_neg = 0.0, 0.0

    for i in tqdm(range(len(values))):
        value = values[i]
        # Floor/cap at zero so each sum only tracks a run in its own direction.
        cum_pos = max(0.0, cum_pos + value)
        cum_neg = min(0.0, cum_neg + value)

        if cum_pos > threshold:
            t_events_mask[i] = True
            cum_pos = 0.0
        if cum_neg < -threshold:
            t_events_mask[i] = True
            cum_neg = 0.0

    return timestamps[t_events_mask]


In [13]:
%%time

# Run the CUSUM filter on the log returns with a threshold of 5e-3.
# NOTE(review): `df` already had its NaN rows dropped when log_return was
# added, so `.iloc[1:]` skips one additional bar — confirm this is intended.
t_events = get_tevents_optimized(df['log_return'].iloc[1:], 5e-3)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 1851806/1851806 [00:01<00:00, 1180810.32it/s]

CPU times: total: 1.58 s
Wall time: 1.59 s





In [14]:
t_events

DatetimeIndex(['2020-01-02 15:31:00', '2020-01-03 02:27:00',
               '2020-01-06 15:48:00', '2020-01-07 23:27:00',
               '2020-01-08 02:38:00', '2020-01-08 11:49:00',
               '2020-01-08 18:44:00', '2020-01-09 15:38:00',
               '2020-01-13 23:47:00', '2020-01-22 23:32:00',
               ...
               '2024-12-19 07:10:00', '2024-12-19 09:06:00',
               '2024-12-19 15:14:00', '2024-12-20 02:16:00',
               '2024-12-20 13:30:00', '2024-12-20 17:17:00',
               '2024-12-23 07:12:00', '2024-12-24 00:15:00',
               '2024-12-26 13:31:00', '2024-12-30 14:44:00'],
              dtype='datetime64[ns]', name='timestamp', length=2792, freq=None)

In [15]:
df.loc[t_events]

Unnamed: 0_level_0,open,high,low,close,volume,spread,log_return,log_volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-02 15:31:00,108.3515,108.3515,108.3095,108.3135,5.426700e+05,0.002007,-0.000355,13.204259
2020-01-03 02:27:00,108.1110,108.1275,108.0830,108.0835,8.869400e+05,0.002467,-0.000264,13.695534
2020-01-06 15:48:00,108.3215,108.3340,108.3210,108.3340,3.674300e+05,0.002109,0.000129,12.814291
2020-01-07 23:27:00,108.1215,108.1345,108.0580,108.0785,7.303600e+05,0.002598,-0.000398,13.501294
2020-01-08 02:38:00,108.1775,108.2960,108.1775,108.2640,1.358470e+06,0.003059,0.000785,14.121870
...,...,...,...,...,...,...,...,...
2024-12-20 17:17:00,156.0365,156.0365,155.9735,155.9770,8.590200e+05,0.007137,-0.000385,13.663549
2024-12-23 07:12:00,156.7390,156.7590,156.7390,156.7590,6.459300e+05,0.006100,0.000121,13.378448
2024-12-24 00:15:00,157.2405,157.2910,157.2400,157.2655,6.881700e+05,0.009516,0.000156,13.441793
2024-12-26 13:31:00,157.6495,157.7020,157.6495,157.7000,6.484200e+05,0.007121,0.000336,13.382295


## Calculate Daily Volatility
The daily volatility estimate computed here is used below to size the dynamic take-profit / stop-loss (TP/SL) barriers.

In [16]:
def getDailyVol(close,span0=100):
    """Exponentially weighted volatility of roughly one-day returns.

    For every timestamp ``t`` in ``close.index`` the reference bar is the bar
    located just before the insertion position of ``t - 1 day`` in the index.
    Timestamps whose insertion position is 0 (no earlier bar available) are
    left out. The return between each timestamp and its reference bar is then
    smoothed with an exponentially weighted standard deviation.

    Args:
        close: Price series indexed by timestamp.
        span0: Span passed to ``ewm`` for the standard deviation.

    Returns:
        Series named ``'dailyVol'`` indexed by the timestamps that have a
        reference bar.
    """
    # Insertion position of "one day earlier" for every bar.
    one_day_back = close.index - pd.Timedelta(days=1)
    prev_pos = close.index.searchsorted(one_day_back)
    prev_pos = prev_pos[prev_pos > 0]

    # Map each of the trailing bars to the timestamp of its reference bar.
    n_kept = prev_pos.shape[0]
    prev_ts = pd.Series(
        close.index[prev_pos - 1],
        index=close.index[close.shape[0] - n_kept:],
    )

    try:
        rets = close.loc[prev_ts.index] / close.loc[prev_ts.values].values - 1
    except Exception as e:
        # Fallback for a length mismatch between the two lookups: trim the
        # numerator so both sides have the same number of rows.
        print(e)
        print('adjusting shape of close.loc[df0.index]')
        cut = close.loc[prev_ts.index].shape[0] - close.loc[prev_ts.values].shape[0]
        rets = close.loc[prev_ts.index].iloc[:-cut] / close.loc[prev_ts.values].values - 1

    return rets.ewm(span=span0).std().rename('dailyVol')

In [17]:
%%time

# Volatility estimate for every bar that has a reference bar about one day
# earlier (default EWM span of 100).
daily_vol = getDailyVol(df['close'])

CPU times: total: 641 ms
Wall time: 680 ms


In [18]:
daily_vol.describe()

count    1.850372e+06
mean     8.050931e-04
std      7.732581e-04
min      2.596404e-05
25%      3.693544e-04
50%      5.935283e-04
75%      9.740889e-04
max      2.287419e-02
Name: dailyVol, dtype: float64

In [19]:
# Volatility target per event: take the value at the event timestamp, or the
# most recent earlier value when the exact timestamp is absent. Events that
# precede the first volatility estimate get NaN.
trgt = daily_vol.reindex(t_events, method='ffill')

## Get Side from Trend Line

In [20]:
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np

def get_side_from_trendline(close: pd.Series, t_events: pd.Index, lookback: int = 50):
    """
    Create a 'side' signal using linear regression trendline logic.

    For each event a straight line is fitted to the last ``lookback`` closes
    (the event bar included). The side is +1 when the event's close lies
    strictly above the fitted value at the event bar, and -1 otherwise
    (a close exactly on the line therefore yields -1).

    Events are skipped, and absent from the result, when:
      * the timestamp is not present in ``close.index``;
      * fewer than ``lookback`` bars are available up to the event;
      * fitting or lookup raises an ordinary exception.

    Args:
        close (pd.Series): Close price series (indexed by datetime)
        t_events (pd.Index): Event timestamps (e.g., from MACD or CUSUM)
        lookback (int): Number of past bars to fit linear trend

    Returns:
        pd.Series: float32 side signal indexed by the retained events
        (1 = long, -1 = short)
    """
    side = pd.Series(index=t_events, dtype='float32')

    for t in t_events:
        if t not in close.index:
            continue
        try:
            end_loc = close.index.get_loc(t)
            start_loc = end_loc - lookback + 1
            if start_loc < 0:
                continue  # not enough lookback

            window = close.iloc[start_loc:end_loc + 1]
            x = np.arange(len(window)).reshape(-1, 1)  # time steps
            y = window.values.reshape(-1, 1)

            model = LinearRegression().fit(x, y)
            # Fitted trend value at the event bar (last point of the window).
            trend_value = model.predict([[len(window) - 1]])[0][0]
            actual_price = y[-1][0]

            side[t] = 1 if actual_price > trend_value else -1
        except Exception:
            # Best effort: skip events whose window cannot be fitted. Only
            # ordinary exceptions are swallowed, so KeyboardInterrupt and
            # SystemExit still stop the loop.
            continue

    return side.dropna()


In [23]:
# Side per event from a trend line fitted over the last 24 bars (event bar
# included).
sides = get_side_from_trendline(df['close'], t_events, lookback=24)

In [24]:
sides

timestamp
2020-01-02 15:31:00   -1.0
2020-01-03 02:27:00   -1.0
2020-01-06 15:48:00    1.0
2020-01-07 23:27:00   -1.0
2020-01-08 02:38:00    1.0
                      ... 
2024-12-20 17:17:00   -1.0
2024-12-23 07:12:00    1.0
2024-12-24 00:15:00    1.0
2024-12-26 13:31:00    1.0
2024-12-30 14:44:00   -1.0
Length: 2792, dtype: float32

## Apply Triple Barrier
We use the daily volatility to scale the take-profit / stop-loss (TP/SL) barriers, and we set the vertical barrier (T1) to 1 day after each event.

In [25]:
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import pandas as pd

def applyPtSlOnT1(close,events,ptSl,molecule):
    """Find the first profit-taking / stop-loss touch for a subset of events.

    Args:
        close: Price series indexed by timestamp.
        events: DataFrame indexed by event start with columns ``'t1'``
            (end of the event, may be NaT), ``'trgt'`` (barrier unit width)
            and ``'side'`` (position sign applied to the path returns).
        ptSl: Two non-negative multipliers ``[pt, sl]`` applied to ``trgt``;
            a multiplier of 0 (or less) disables that barrier.
        molecule: Labels of the events this call should process.

    Returns:
        DataFrame indexed by the processed events with the original ``'t1'``
        plus ``'sl'`` and ``'pt'``: the earliest timestamp at which the
        respective barrier was crossed, or NaT if it never was. When
        ``molecule`` is empty only the ``'t1'`` column is present.
    """
    # apply stop loss/profit taking, if it takes place before t1 (end of event)
    events_ = events.loc[molecule]
    out = events_[['t1']].copy(deep=True)

    # A disabled barrier is an all-NaN float series: comparisons against NaN
    # are always False, so that barrier can never be touched.
    if ptSl[0] > 0:
        pt = ptSl[0] * events_['trgt']
    else:
        pt = pd.Series(index=events_.index, dtype=float)
    if ptSl[1] > 0:
        sl = -ptSl[1] * events_['trgt']
    else:
        sl = pd.Series(index=events_.index, dtype=float)

    # Events without a vertical barrier are evaluated up to the last bar.
    last_bar = close.index[-1]
    for loc, t1 in events_['t1'].fillna(last_bar).items():
        path = close.loc[loc:t1]  # path prices
        path = (path / close.loc[loc] - 1) * events_.at[loc, 'side']  # path returns
        out.loc[loc, 'sl'] = path[path < sl.loc[loc]].index.min()  # earliest stop loss
        out.loc[loc, 'pt'] = path[path > pt.loc[loc]].index.min()  # earliest profit taking
    return out


def parallel_apply(func, items, num_threads=4, **kwargs):
    """Run ``func`` over chunks of ``items`` on a thread pool and merge the results.

    ``items`` is split into ``num_threads`` nearly equal chunks; each chunk is
    handed to ``func`` as the keyword argument ``molecule`` together with
    ``kwargs``. The pandas objects returned by the calls are concatenated and
    sorted by index.
    """
    pieces = np.array_split(items, num_threads)

    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        pending = [pool.submit(func, molecule=piece, **kwargs) for piece in pieces]
        # Collect in submission order so the concatenation order is stable.
        frames = [task.result() for task in pending]

    return pd.concat(frames).sort_index()


def getEvents(close, tEvents, ptSl, trgt, minRet, numThreads=4, t1=False, side=None):
    """Build the triple-barrier events table and find each event's first touch.

    Args:
        close: Price series indexed by timestamp.
        tEvents: Index of candidate event timestamps.
        ptSl: Barrier multipliers applied to ``trgt``. With ``side`` given,
            ``ptSl[0]`` scales profit taking and ``ptSl[1]`` the stop loss;
            without ``side`` both barriers use ``ptSl[0]``.
        trgt: Series of barrier unit widths (the notebook passes volatility).
        minRet: Events whose target is not strictly above this are discarded.
        numThreads: Number of chunks / worker threads for the barrier search.
        t1: Series of vertical-barrier timestamps indexed by event, or
            ``False`` to disable the vertical barrier.
        side: Optional Series of position signs indexed by event.

    Returns:
        DataFrame indexed by the retained events with ``'t1'`` (timestamp of
        the first barrier touched, NaT if none), ``'trgt'`` and, when ``side``
        was supplied, ``'side'``.

    Events missing from ``trgt`` or from ``side`` are dropped instead of
    raising ``KeyError``; a side series produced upstream may legitimately
    omit events that lacked enough history.
    """
    # Step 1: Filter targets (missing targets become NaN and fail the test)
    trgt = trgt.reindex(tEvents)
    trgt = trgt[trgt > minRet]

    # Step 2: Set vertical barrier (t1)
    if t1 is False:
        t1 = pd.Series(pd.NaT, index=tEvents)

    # Step 3: Build events DataFrame
    if side is None:
        # No side information: assume long and use symmetric barriers.
        side_, ptSl_ = pd.Series(1., index=trgt.index), [ptSl[0], ptSl[0]]
    else:
        side_, ptSl_ = side.reindex(trgt.index), ptSl[:2]

    events = pd.concat({'t1': t1, 'trgt': trgt, 'side': side_}, axis=1)
    # Keep only events that have both a target and a side.
    events = events.dropna(subset=['trgt', 'side'])

    # Step 4: Apply barriers in parallel
    df0 = parallel_apply(
        func=applyPtSlOnT1,
        items=events.index,
        num_threads=numThreads,
        close=close,
        events=events,
        ptSl=ptSl_
    )

    # Step 5: Choose the first touched barrier
    events['t1'] = df0.dropna(how='all').min(axis=1)
    if side is None:
        events = events.drop('side', axis=1)
    return events


In [26]:
close = df['close']

# Vertical barrier: for every event, the position of the first bar at or
# after "event time + 1 day".
t1 = close.index.searchsorted(t_events + pd.Timedelta(days=1))
# Positions equal to len(close) fall past the last bar; drop them.
t1 = t1[t1<close.shape[0]]
# Map the surviving positions back to timestamps, indexed by their event.
# Trailing events whose barrier fell past the data get no entry.
t1 = pd.Series(close.index[t1], index=t_events[:t1.shape[0]])

In [27]:
%%time

# Positional arguments after `trgt`: minRet=0 and numThreads=8.
# NOTE(review): ptSl entries are multipliers of `trgt` (applyPtSlOnT1 computes
# ptSl[0] * trgt), so [5e-3, 5e-3] with targets around 1e-3 gives barriers of
# roughly 5e-6. The displayed events all end one bar after they start —
# confirm these multipliers are intended (values such as [1, 1] are typical).
events = getEvents(df['close'], t_events, [5e-3, 5e-3], trgt, 0, 8, t1=t1, side=sides)

CPU times: total: 2.03 s
Wall time: 1.85 s


In [28]:
events

Unnamed: 0_level_0,t1,trgt,side
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-03 02:27:00,2020-01-03 02:28:00,0.001237,-1.0
2020-01-06 15:48:00,2020-01-06 15:49:00,0.000648,1.0
2020-01-07 23:27:00,2020-01-07 23:28:00,0.001216,-1.0
2020-01-08 02:38:00,2020-01-08 02:39:00,0.001244,1.0
2020-01-08 11:49:00,2020-01-08 11:50:00,0.001082,1.0
...,...,...,...
2024-12-20 17:17:00,2024-12-20 17:18:00,0.002112,-1.0
2024-12-23 07:12:00,2024-12-23 07:13:00,0.000289,1.0
2024-12-24 00:15:00,2024-12-24 00:16:00,0.000317,1.0
2024-12-26 13:31:00,2024-12-26 13:32:00,0.000363,1.0


In [29]:
def getBins(events, close, t1=None):
    '''
    Compute event's outcome (including side information, if provided).
    events is a DataFrame where:
    -events.index is event's starttime
    -events['t1'] is event's endtime
    -events['trgt'] is event's target
    -events['side'] (optional) implies the algo's position side
    -t1 is original vertical barrier series (optional; only used when
     'side' is absent)
    Case 1: ('side' not in events): bin in (-1,1) <-label by price action,
            set to 0 where the event ended on the vertical barrier
            (only when t1 is supplied)
    Case 2: ('side' in events): bin in (0,1) <-label by pnl (meta-labeling)

    Events whose 't1' is missing are excluded from the result.
    Returns a DataFrame indexed by event start with columns 'ret' and 'bin'.
    '''
    # 1) prices aligned with events
    events_ = events.dropna(subset=['t1'])
    px = events_.index.union(events_['t1'].values).drop_duplicates()
    # bfill: a timestamp missing from close takes the next available price
    px = close.reindex(px, method='bfill')

    # 2) create out object
    out = pd.DataFrame(index=events_.index)
    out['ret'] = px.loc[events_['t1'].values].values / px.loc[events_.index] - 1
    meta_labeling = 'side' in events_
    if meta_labeling:
        out['ret'] *= events_['side']  # return of the position, not the price
    out['bin'] = np.sign(out['ret'])

    if meta_labeling:
        # A non-profitable position is labelled 0.
        out.loc[out['ret'] <= 0, 'bin'] = 0
    elif t1 is not None:
        # events['t1'] is the time of the first touch of any barrier. Where
        # it coincides with a value of the original vertical barrier series,
        # the vertical barrier was hit first and the label becomes 0. The
        # mask is built from the filtered events so it always lines up with
        # the rows of `out`.
        vertical_hit = events_['t1'].isin(t1.values)
        out.loc[vertical_hit.values, 'bin'] = 0.
    return out

In [30]:
# `events` carries a 'side' column, so getBins takes the meta-labeling path:
# bin is 1 for a profitable position and 0 otherwise; `t1` is not consulted
# on that path.
labels = getBins(events, close, t1=t1)
labels

Unnamed: 0_level_0,ret,bin
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-03 02:27:00,-0.000056,0.0
2020-01-06 15:48:00,-0.000111,0.0
2020-01-07 23:27:00,0.000023,1.0
2020-01-08 02:38:00,0.000462,1.0
2020-01-08 11:49:00,0.000175,1.0
...,...,...
2024-12-20 17:17:00,-0.000170,0.0
2024-12-23 07:12:00,0.000038,1.0
2024-12-24 00:15:00,0.000197,1.0
2024-12-26 13:31:00,0.000216,1.0


In [32]:
# Attach return and label to each event; the inner join keeps only events
# that received a label.
labeled_events = events.join(labels, how='inner')
labeled_events

Unnamed: 0_level_0,t1,trgt,side,ret,bin
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-03 02:27:00,2020-01-03 02:28:00,0.001237,-1.0,-0.000056,0.0
2020-01-06 15:48:00,2020-01-06 15:49:00,0.000648,1.0,-0.000111,0.0
2020-01-07 23:27:00,2020-01-07 23:28:00,0.001216,-1.0,0.000023,1.0
2020-01-08 02:38:00,2020-01-08 02:39:00,0.001244,1.0,0.000462,1.0
2020-01-08 11:49:00,2020-01-08 11:50:00,0.001082,1.0,0.000175,1.0
...,...,...,...,...,...
2024-12-20 17:17:00,2024-12-20 17:18:00,0.002112,-1.0,-0.000170,0.0
2024-12-23 07:12:00,2024-12-23 07:13:00,0.000289,1.0,0.000038,1.0
2024-12-24 00:15:00,2024-12-24 00:16:00,0.000317,1.0,0.000197,1.0
2024-12-26 13:31:00,2024-12-26 13:32:00,0.000363,1.0,0.000216,1.0


## Save events to disk

In [None]:
# Persist the labelled events; an existing file at this path is overwritten.
labeled_events.to_pickle(LABEL_FILE_PATH)