In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
MODEL_NAME = "t2v+transformer"
V_NUM = 22
SOURCE = "dukascopy"
SYMBOL = "usdjpy"
MINUTES = 1
EVENT = '58m-dollar'
START_DATE = "2020-01-01"
END_DATE = "2024-12-31"
EVENT_NAME = "cusum_filter"
SEQUENCE_LENGTH = 120
TIME_COLS = [
    # 'timestamp',
    "hour",
    "dow",
    "dom",
    "month",
    # "open",
    # "high",
    # "low",
    # "close",
]
FEATURES_COLS = [
    # Basic Data
    "close_log_return",
    "ret_mean_5",
    "ret_mean_10",
    "log_volume",
    "ema5_slope",
    "ema20_slope",
    "atr20",
    "vol_adj_return",
    "close_to_atr",
    "macd_diff",
    "bb_width",
    "bb_position",
    "donchian_width"
]
TARGET_COL = "bin_class"

In [4]:
from pathlib import Path

# Build base name
BASE_NAME = f"{SOURCE}-{SYMBOL}-tick-{START_DATE}-{END_DATE}"
RESAMPLED_NAME = f"{SOURCE}-{SYMBOL}-{MINUTES}m-{START_DATE}-{END_DATE}"
RESAMPLED_NAME = f"{SOURCE}-{SYMBOL}-{EVENT}-{START_DATE}-{END_DATE}"
LABEL_NAME = f"{RESAMPLED_NAME}-{EVENT_NAME}"
# Base directories
BASE_DIR = Path("../data")
RESAMPLED_DIR = BASE_DIR / "resampled"
LABEL_DIR = BASE_DIR / "labels"
PROCESSED_DIR = BASE_DIR / "processed"
NORMALIZED_DIR = BASE_DIR / "normalized"
DIRECTION_LABEL_DIR = BASE_DIR / "direction_labels"

# Final paths
PROCESSED_FILE_PATH = PROCESSED_DIR / f"{RESAMPLED_NAME}_processed.pkl"
NORMALIZED_FILE_PATH = NORMALIZED_DIR / f"{RESAMPLED_NAME}_normalized.pkl"
DIRECTION_LABEL_FILE_PATH = DIRECTION_LABEL_DIR / f"{RESAMPLED_NAME}-{EVENT_NAME}.pkl"

In [5]:
MODEL_PATH = f'../lightning_logs/{MODEL_NAME}-{EVENT}-{EVENT_NAME}/version_{V_NUM}/checkpoints/best_checkpoint.ckpt'

In [6]:
SIDE_NAME = 'transformer_sides'

META_LABEL_DIR = BASE_DIR / "meta_labels"
META_LABEL_DIR.mkdir(parents=True, exist_ok=True)
META_LABEL_FILE_PATH = META_LABEL_DIR / f"{RESAMPLED_NAME}-{EVENT_NAME}-{SIDE_NAME}.pkl"

In [7]:
%%time
df = pd.read_pickle(NORMALIZED_FILE_PATH)
labels_df = pd.read_pickle(DIRECTION_LABEL_FILE_PATH)

CPU times: user 3.31 ms, sys: 124 ms, total: 128 ms
Wall time: 198 ms


In [8]:
t_events = labels_df.index

In [9]:
missing = t_events.difference(df.index)
missing

DatetimeIndex([], dtype='datetime64[ns]', name='timestamp', freq=None)

### Load Model

In [10]:
from models.classification.t2v_transformer_model import T2VTransformerModule

In [11]:
model = T2VTransformerModule.load_from_checkpoint(MODEL_PATH)

## Get Side from Trained Model

In [12]:
features = df[TIME_COLS + FEATURES_COLS]

In [13]:
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np
import torch

def get_side_from_model(model, data: pd.DataFrame, labels: pd.DataFrame, lookback: int = 24, device: str = 'cpu'):
    """
    Create a 'side' signal using linear regression trendline logic.
    
    Args:
        close (pd.Series): Close price series (indexed by datetime)
        t_events (pd.Index): Event timestamps (e.g., from MACD or CUSUM)
        lookback (int): Number of past bars to fit linear trend
    
    Returns:
        pd.Series: side signal indexed by t_events (1 = long, -1 = short)
    """
    
    model.eval()
    model.to(device)
    t_events = labels.index
    side = pd.Series(index=t_events, dtype='float32')

    for t in t_events:
        if t not in data.index:
            print(f"{t} is not in data index")
            continue
        try:
            end_loc = data.index.get_loc(t)
            start_loc = end_loc - lookback
            if start_loc < 0:
                print(f"{start_loc}, {end_loc} is out of range")
                continue  # not enough lookback
            seq = data.iloc[start_loc:end_loc].values.astype(np.float32)
            if seq.shape[0] != lookback:
                print('sequence length wrong')
                continue
            x_tensor = torch.tensor(seq).unsqueeze(0).to(device)
            with torch.no_grad():
                logits = model(x_tensor)
                
                pred_class = int(torch.argmax(logits, dim=1).cpu().item())
                probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
                # print(f"Probabilities: {probs} → Predicted class: {pred_class}")

            side[t] = pred_class

        except Exception as e:
            print(e)
            continue
        

    return side.dropna()


In [14]:
%%time
predicted_sides = get_side_from_model(model=model, data=features,labels=labels_df, lookback=SEQUENCE_LENGTH, device='cpu')
predicted_sides.value_counts(), predicted_sides.isna().sum()

-53, 67 is out of range
-11, 109 is out of range
CPU times: user 1min 13s, sys: 1min 5s, total: 2min 18s
Wall time: 1min 1s


(2.0    24918
 1.0    14246
 0.0    12662
 Name: count, dtype: int64,
 np.int64(0))

In [15]:
labels_df = labels_df.join(predicted_sides.rename("pred_class"))

In [16]:
labels_df.shape

(51828, 7)

In [17]:
labels_df.isna().sum()

t1            0
trgt          0
duration      0
ret           0
bin           0
bin_class     0
pred_class    2
dtype: int64

In [18]:
labels_df.loc[labels_df['pred_class'].isna()]

Unnamed: 0_level_0,t1,trgt,duration,ret,bin,bin_class,pred_class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-02 09:39:45.692,2020-01-02 10:12:00.594,0.000523,0 days 00:32:14.902000,0.000193,0.0,1.0,
2020-01-02 13:20:20.277,2020-01-02 13:40:53.939,0.000486,0 days 00:20:33.662000,-0.000598,-1.0,0.0,


In [19]:
labels_df = labels_df.dropna()

In [21]:
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import pandas as pd

def applyPtSlOnT1(close,events,ptSl,molecule):
    # apply stop loss/profit taking, if it takes place before t1 (end of event)
    events_=events.loc[molecule]
    out=events_[['t1']].copy(deep=True)
    if ptSl[0]>0: pt=ptSl[0]*events_['trgt']
    else: pt=pd.Series(index=events.index) # NaNs
    if ptSl[1]>0: sl=-ptSl[1]*events_['trgt']
    else: sl=pd.Series(index=events.index) # NaNs
    for loc, t1 in events_['t1'].fillna(close.index[-1]).items():
        df0=close[loc:t1] # path prices
        df0=(df0/close[loc]-1)*events_.at[loc,'side'] # path returns
        out.loc[loc,'sl']=df0[df0<sl[loc]].index.min() # earliest stop loss
        out.loc[loc,'pt']=df0[df0>pt[loc]].index.min() # earliest profit taking
    return out


def parallel_apply(func, items, num_threads=4, **kwargs):
    def worker(molecule):
        return func(molecule=molecule, **kwargs)

    chunks = np.array_split(items, num_threads)
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        results = list(executor.map(worker, chunks))

    return pd.concat(results).sort_index()


def getEvents(close, tEvents, ptSl, trgt, minRet, numThreads=4, t1=False, side=None):
    # Step 1: Filter targets
    trgt = trgt.loc[tEvents]
    trgt = trgt[trgt > minRet]

    # Step 2: Set vertical barrier (t1)
    if t1 is False:
        t1 = pd.Series(pd.NaT, index=tEvents)

    # Step 3: Build events DataFrame
    if side is None:
        side_, ptSl_ = pd.Series(1., index=trgt.index), [ptSl[0], ptSl[0]]
    else:
        side_, ptSl_ = side.loc[trgt.index], ptSl[:2]

    events = pd.concat({'t1': t1, 'trgt': trgt, 'side': side_}, axis=1).dropna(subset=['trgt'])

    # Step 4: Apply barriers in parallel
    df0 = parallel_apply(
        func=applyPtSlOnT1,
        items=events.index,
        num_threads=numThreads,
        close=close,
        events=events,
        ptSl=ptSl_
    )

    # Step 5: Choose the first touched barrier
    events['t1'] = df0.dropna(how='all').min(axis=1)
    if side is None:
        events = events.drop('side', axis=1)
    return events


In [27]:
%%time

events = getEvents(df['close'], tEvents=labels_df.index, ptSl=[1,1], trgt=labels_df.trgt, minRet=0, numThreads=8, t1=labels_df.t1, side=predicted_sides)

CPU times: user 13.9 s, sys: 1.29 s, total: 15.2 s
Wall time: 14.2 s


In [28]:
events

Unnamed: 0_level_0,t1,trgt,side
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-02 15:12:25.452,2020-01-02 15:13:50.102,0.000352,2.0
2020-01-02 15:20:27.753,2020-01-02 15:23:24.339,0.000375,2.0
2020-01-02 15:27:08.082,2020-01-02 15:30:51.020,0.000456,2.0
2020-01-02 15:33:18.152,2020-01-02 15:36:26.386,0.000481,2.0
2020-01-02 16:14:52.893,2020-01-02 16:39:57.573,0.000419,2.0
...,...,...,...
2024-12-30 18:36:06.406,2024-12-30 18:39:30.082,0.000345,2.0
2024-12-30 19:58:40.522,2024-12-30 20:04:11.854,0.000382,1.0
2024-12-30 20:33:36.881,2024-12-30 20:36:57.321,0.000330,1.0
2024-12-30 21:24:11.892,2024-12-30 21:46:12.738,0.000417,1.0


In [29]:
def getBins(events, close, t1=None):
    '''
    Compute event's outcome (including side information, if provided).
    events is a DataFrame where:
    -events.index is event's starttime
    -events['t1'] is event's endtime
    -events['trgt'] is event's target
    -events['side'] (optional) implies the algo's position side
    -t1 is original vertical barrier series
    Case 1: ('side' not in events): bin in (-1,1) <-label by price action
    Case 2: ('side' in events): bin in (0,1) <-label by pnl (meta-labeling)
    '''
    # 1) prices aligned with events
    events_ = events.dropna(subset=['t1'])
    px = events_.index.union(events_['t1'].values).drop_duplicates()
    px = close.reindex(px, method='bfill')
    # 2) create out object
    out = pd.DataFrame(index=events_.index)
    out['ret'] = px.loc[events_['t1'].values].values / px.loc[
        events_.index] - 1
    if 'side' in events_: out['ret'] *= events_['side']  # meta-labeling
    out['bin'] = np.sign(out['ret'])

    if 'side' not in events_:
        # only applies when not meta-labeling.
        # to update bin to 0 when vertical barrier is touched, we need the
        # original vertical barrier series since the events['t1'] is the time
        # of first touch of any barrier and not the vertical barrier
        # specifically. The index of the intersection of the vertical barrier
        # values and the events['t1'] values indicate which bin labels needs
        # to be turned to 0.
        vtouch_first_idx = events[events['t1'].isin(t1.values)].index
        out.loc[vtouch_first_idx, 'bin'] = 0.

    if 'side' in events_: out.loc[out['ret'] <= 0, 'bin'] = 0  # meta-labeling
    return out

In [31]:
labels = getBins(events, df['close'], t1=labels_df.t1)
labels

Unnamed: 0_level_0,ret,bin
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-02 15:12:25.452,-0.000644,0.0
2020-01-02 15:20:27.753,-0.000461,0.0
2020-01-02 15:27:08.082,-0.000618,0.0
2020-01-02 15:33:18.152,0.000489,1.0
2020-01-02 16:14:52.893,0.000618,1.0
...,...,...
2024-12-30 18:36:06.406,0.000357,1.0
2024-12-30 19:58:40.522,-0.000391,0.0
2024-12-30 20:33:36.881,-0.000338,0.0
2024-12-30 21:24:11.892,0.000593,1.0


In [32]:
labels.bin.value_counts()

bin
0.0    31708
1.0    20118
Name: count, dtype: int64

In [33]:
labeled_events = events.join(labels, how='inner')

In [34]:
labeled_events

Unnamed: 0_level_0,t1,trgt,side,ret,bin
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02 15:12:25.452,2020-01-02 15:13:50.102,0.000352,2.0,-0.000644,0.0
2020-01-02 15:20:27.753,2020-01-02 15:23:24.339,0.000375,2.0,-0.000461,0.0
2020-01-02 15:27:08.082,2020-01-02 15:30:51.020,0.000456,2.0,-0.000618,0.0
2020-01-02 15:33:18.152,2020-01-02 15:36:26.386,0.000481,2.0,0.000489,1.0
2020-01-02 16:14:52.893,2020-01-02 16:39:57.573,0.000419,2.0,0.000618,1.0
...,...,...,...,...,...
2024-12-30 18:36:06.406,2024-12-30 18:39:30.082,0.000345,2.0,0.000357,1.0
2024-12-30 19:58:40.522,2024-12-30 20:04:11.854,0.000382,1.0,-0.000391,0.0
2024-12-30 20:33:36.881,2024-12-30 20:36:57.321,0.000330,1.0,-0.000338,0.0
2024-12-30 21:24:11.892,2024-12-30 21:46:12.738,0.000417,1.0,0.000593,1.0


In [35]:
labeled_events.to_pickle(META_LABEL_FILE_PATH)