In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
MODEL_NAME = "simple_transformer"
V_NUM = 0
SYMBOL = "USDJPY"
MINUTES = 1
EVENT = '115009542m-dollar'
START_DATE = "20210101"
END_DATE = "20241231"
EVENT_NAME = "cusum_filter"
SEQUENCE_LENGTH = 120
TIME_COLS = [
    # 'timestamp',
    "hour",
    "dow",
    "dom",
    "month",
    # "open",
    # "high",
    # "low",
    # "close",
]
FEATURES_COLS = [
    "hour_cos",
    "dow_cos",
    "dom_cos",
    "month_cos",
    # Basic Data
    "close_log_return",
    "ret_mean_5",
    "ret_mean_10",
    "log_volume",
    "ema5_slope",
    "ema20_slope",
    "atr20",
    "vol_adj_return",
    "close_to_atr",
    "macd_diff",
    "bb_width",
    "bb_position",
    "donchian_width",
]

TARGET_COL = "bin_class"

In [3]:
from pathlib import Path

# Build base name
RESAMPLED_NAME = f"{SYMBOL}-{MINUTES}m-{START_DATE}-{END_DATE}"
RESAMPLED_NAME = f"{SYMBOL}-{EVENT}-{START_DATE}-{END_DATE}"
LABEL_NAME = f"{RESAMPLED_NAME}-{EVENT_NAME}"
# Base directories
BASE_DIR = Path("../data")
RESAMPLED_DIR = BASE_DIR / "resampled"
LABEL_DIR = BASE_DIR / "labels"
PROCESSED_DIR = BASE_DIR / "processed"
NORMALIZED_DIR = BASE_DIR / "normalized"
DIRECTION_LABEL_DIR = BASE_DIR / "direction_labels"
PREDICTION_DIR = BASE_DIR / "predictions"

# Final paths
PROCESSED_FILE_PATH = PROCESSED_DIR / f"{RESAMPLED_NAME}-processed.pkl"
NORMALIZED_FILE_PATH = NORMALIZED_DIR / f"{RESAMPLED_NAME}-normalized.pkl"
DIRECTION_LABEL_FILE_PATH = DIRECTION_LABEL_DIR / f"{RESAMPLED_NAME}-{EVENT_NAME}.pkl"
PREDICTION_FILE_PATH = PREDICTION_DIR / f"{RESAMPLED_NAME}-{EVENT_NAME}.pkl"

In [4]:
MODEL_PATH = f'../lightning_logs/{MODEL_NAME}-{EVENT}-{EVENT_NAME}/version_{V_NUM}/checkpoints/best_checkpoint.ckpt'

In [5]:
SIDE_NAME = 'transformer_sides'

META_LABEL_DIR = BASE_DIR / "meta_labels"
META_LABEL_DIR.mkdir(parents=True, exist_ok=True)
META_LABEL_FILE_PATH = META_LABEL_DIR / f"{RESAMPLED_NAME}-{EVENT_NAME}-{SIDE_NAME}.pkl"

In [6]:
%%time
df = pd.read_pickle(NORMALIZED_FILE_PATH)
labels_df = pd.read_pickle(DIRECTION_LABEL_FILE_PATH)

CPU times: user 1.03 ms, sys: 37.8 ms, total: 38.8 ms
Wall time: 39.6 ms


In [7]:
t_events = labels_df.index

In [8]:
missing = t_events.difference(df.index)
missing

DatetimeIndex([], dtype='datetime64[ns]', name='timestamp', freq=None)

### Load Model

In [9]:
from models.classification.t2v_transformer_model import T2VTransformerModule
from models.classification.simple_transformer_model import SimpleTransformerModule

In [10]:
model = SimpleTransformerModule.load_from_checkpoint(MODEL_PATH)

## Get Side from Trained Model

In [11]:
# features = df[TIME_COLS + FEATURES_COLS]
features = df[FEATURES_COLS]

In [12]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm


def extract_sequences(data: pd.DataFrame, t_events, lookback: int):
    """Extract valid sequences and events for batch processing."""
    sequences, valid_events = [], []
    data_values, data_index = data.values.astype(np.float32), data.index
    
    for t in t_events:
        try:
            if t in data_index:
                end_loc = data_index.get_loc(t)
                start_loc = end_loc - lookback
                if start_loc >= 0:
                    seq = data_values[start_loc:end_loc]
                    if len(seq) == lookback:
                        sequences.append(seq)
                        valid_events.append(t)
        except:
            continue
    
    return (np.stack(sequences), valid_events) if sequences else (None, [])

def get_side_from_model_batch(model, data: pd.DataFrame, labels: pd.DataFrame, 
                             lookback: int = 24, device: str = 'cpu', batch_size: int = 64):
    """Batch process model predictions for time series events."""
    model.eval().to(device)
    
    sequences, valid_events = extract_sequences(data, labels.index, lookback)
    if sequences is None:
        return pd.DataFrame()
    
    predictions, probabilities = [], []
    n_batches = (len(sequences) + batch_size - 1) // batch_size
    
    for i in tqdm(range(0, len(sequences), batch_size), total=n_batches, desc="Processing batches"):
        batch = sequences[i:i + batch_size]
        x_tensor = torch.tensor(batch, device=device)
        with torch.no_grad():
            logits = model(x_tensor)
            probs = torch.softmax(logits, dim=1)
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            probabilities.extend(probs.cpu().numpy())

    result_df = pd.DataFrame(probabilities, index=valid_events)
    result_df.columns = [f'prob_{i}' for i in range(len(result_df.columns))]
    result_df['prediction'] = predictions
    
    return result_df

In [13]:
%%time
predicted_sides = get_side_from_model_batch(model=model, data=features,labels=labels_df, lookback=SEQUENCE_LENGTH, device='cpu')
predicted_sides.value_counts(), predicted_sides.isna().sum()

Processing batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4961/4961 [00:55<00:00, 88.96it/s]


CPU times: user 2min 1s, sys: 54.9 s, total: 2min 56s
Wall time: 57.5 s


(prob_0    prob_1    prob_2    prediction
 0.053344  0.885991  0.060665  1             1
 0.375512  0.242052  0.382436  2             1
 0.375521  0.211768  0.412711  2             1
 0.375520  0.227462  0.397018  2             1
 0.375519  0.233570  0.390911  2             1
                                            ..
 0.287560  0.436840  0.275600  1             1
 0.287559  0.425607  0.286833  1             1
           0.424638  0.287803  1             1
           0.399896  0.312545  1             1
 0.517639  0.053666  0.428695  0             1
 Name: count, Length: 317460, dtype: int64,
 prob_0        0
 prob_1        0
 prob_2        0
 prediction    0
 dtype: int64)

In [15]:
labels_df = labels_df.join(predicted_sides)

In [16]:
labels_df.shape

(317488, 10)

In [17]:
labels_df.isna().sum()

t1             0
trgt           0
duration       0
ret            0
bin            0
bin_class      0
prob_0        28
prob_1        28
prob_2        28
prediction    28
dtype: int64

In [41]:
labels_df = labels_df.dropna().copy()

In [42]:
labels_df['side'] = labels_df['prediction'] - 1

In [25]:
def getBins(events, close, t1=None):
    '''
    Compute event's outcome (including side information, if provided).
    events is a DataFrame where:
    -events.index is event's starttime
    -events['t1'] is event's endtime
    -events['trgt'] is event's target
    -events['side'] (optional) implies the algo's position side
    -t1 is original vertical barrier series
    Case 1: ('side' not in events): bin in (-1,1) <-label by price action
    Case 2: ('side' in events): bin in (0,1) <-label by pnl (meta-labeling)
    '''
    # 1) prices aligned with events
    events_ = events.dropna(subset=['t1'])
    px = events_.index.union(events_['t1'].values).drop_duplicates()
    px = close.reindex(px, method='bfill')
    # 2) create out object
    out = pd.DataFrame(index=events_.index)
    out['ret'] = px.loc[events_['t1'].values].values / px.loc[
        events_.index] - 1
    if 'side' in events_: out['ret'] *= events_['side']  # meta-labeling
    out['bin'] = np.sign(out['ret'])

    if 'side' not in events_:
        # only applies when not meta-labeling.
        # to update bin to 0 when vertical barrier is touched, we need the
        # original vertical barrier series since the events['t1'] is the time
        # of first touch of any barrier and not the vertical barrier
        # specifically. The index of the intersection of the vertical barrier
        # values and the events['t1'] values indicate which bin labels needs
        # to be turned to 0.
        vtouch_first_idx = events[events['t1'].isin(t1.values)].index
        out.loc[vtouch_first_idx, 'bin'] = 0.

    if 'side' in events_: out.loc[out['ret'] <= 0, 'bin'] = 0  # meta-labeling
    return out

In [44]:
labels = getBins(labels_df, df['close'], t1=labels_df.t1)
labels

Unnamed: 0_level_0,ret,bin
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-01-04 15:05:41.463,0.000000,0.0
2021-01-04 15:08:16.396,0.000000,0.0
2021-01-04 15:21:52.633,0.000000,0.0
2021-01-04 15:29:04.720,-0.000000,0.0
2021-01-04 15:34:33.499,0.000000,0.0
...,...,...
2024-12-30 22:57:51.659,-0.000121,0.0
2024-12-30 23:10:19.118,-0.000322,0.0
2024-12-30 23:15:58.082,-0.000118,0.0
2024-12-30 23:36:09.864,-0.000150,0.0


In [45]:
labels.bin.value_counts()

bin
0.0    232431
1.0     85029
Name: count, dtype: int64

In [48]:
labeled_events = labels_df.drop(columns=['ret', 'bin']).join(labels, how='inner')

In [49]:
labeled_events

Unnamed: 0_level_0,t1,trgt,duration,bin_class,prob_0,prob_1,prob_2,prediction,side,ret,bin
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-01-04 15:05:41.463,2021-01-04 15:08:16.396,0.000213,0 days 00:02:34.933000,2.0,0.145125,0.687308,0.167567,1.0,0.0,0.000000,0.0
2021-01-04 15:08:16.396,2021-01-04 15:21:52.633,0.000214,0 days 00:13:36.237000,2.0,0.143740,0.690801,0.165459,1.0,0.0,0.000000,0.0
2021-01-04 15:21:52.633,2021-01-04 15:29:04.720,0.000217,0 days 00:07:12.087000,2.0,0.144605,0.691683,0.163712,1.0,0.0,0.000000,0.0
2021-01-04 15:29:04.720,2021-01-04 15:34:33.499,0.000218,0 days 00:05:28.779000,0.0,0.143102,0.694681,0.162217,1.0,0.0,-0.000000,0.0
2021-01-04 15:34:33.499,2021-01-04 15:45:39.081,0.000219,0 days 00:11:05.582000,1.0,0.141421,0.698880,0.159699,1.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2024-12-30 22:57:51.659,2024-12-30 23:06:02.364,0.000114,0 days 00:08:10.705000,2.0,0.349527,0.315080,0.335393,0.0,-1.0,-0.000121,0.0
2024-12-30 23:10:19.118,2024-12-30 23:15:58.082,0.000114,0 days 00:05:38.964000,2.0,0.365897,0.286249,0.347854,0.0,-1.0,-0.000322,0.0
2024-12-30 23:15:58.082,2024-12-30 23:33:37.983,0.000115,0 days 00:17:39.901000,1.0,0.368173,0.282428,0.349398,0.0,-1.0,-0.000118,0.0
2024-12-30 23:36:09.864,2024-12-30 23:37:33.033,0.000115,0 days 00:01:23.169000,2.0,0.429030,0.176818,0.394151,0.0,-1.0,-0.000150,0.0


In [50]:
labeled_events.columns

Index(['t1', 'trgt', 'duration', 'bin_class', 'prob_0', 'prob_1', 'prob_2',
       'prediction', 'side', 'ret', 'bin'],
      dtype='object')

In [51]:
labeled_events.bin.value_counts()

bin
0.0    232431
1.0     85029
Name: count, dtype: int64

In [52]:
labeled_events.to_pickle(META_LABEL_FILE_PATH)