In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

import os

In [2]:
from pathlib import Path

# Configurable parts
SOURCE = "dukascopy"
SYMBOL = "usdjpy"
MINUTES = 15
START_DATE = "2020-01-01"
END_DATE = "2024-12-31"

# Build base name
RESAMPLED_NAME = f"{SOURCE}-{SYMBOL}-{MINUTES}m-{START_DATE}-{END_DATE}"

# Base directories
BASE_DIR = Path("../data")
RESAMPLED_DIR = BASE_DIR / "resampled"
PROCESSED_DIR = BASE_DIR / "processed"

# Final paths
RESAMPLED_FILE_PATH = RESAMPLED_DIR / f"{RESAMPLED_NAME}.pkl"
PROCESSED_FILE_PATH = PROCESSED_DIR / f"{RESAMPLED_NAME}_processed.pkl"

In [3]:
df = pd.read_pickle(PROCESSED_FILE_PATH)
df.columns

Index(['timestamp', 'open', 'high', 'low', 'close', 'volume', 'spread',
       'log_volume', 'close_delta', 'close_return', 'close_log_return',
       'ret_mean_5', 'ret_mean_10', 'ema_21', 'sma_50', 'atr_14', 'bb_upper',
       'bb_lower', 'bb_mavg', 'bb_width', 'donchian_upper', 'donchian_lower',
       'donchian_mid', 'stoch_k', 'stoch_d', 'rsi_14', 'macd', 'macd_signal',
       'macd_diff', 'unix_time', 'hour', 'hour_sin', 'hour_cos', 'dow',
       'dow_sin', 'dow_cos', 'dom', 'dom_sin', 'dom_cos', 'month', 'month_sin',
       'month_cos', 'label', 'train_label', '3b_label'],
      dtype='object')

## Classification Labeling

### Simple Labeling Techniques

In [4]:
threshold = 0.00015  # 0.03%
df['label'] = 0
df.loc[df['close_return'] > threshold, 'label'] = 1
df.loc[df['close_return'] < -threshold, 'label'] = -1


In [5]:
df['label'].value_counts()

label
 0    45488
 1    40314
-1    38173
Name: count, dtype: int64

### Triple Barrier Labeling

In [6]:
import numpy as np

In [7]:
def apply_triple_barrier_labels(df, window_size, barrier_pct):
    """
    Apply the Triple Barrier Method to assign directional labels
    based on price movements within a future window.

    Args:
        df (pd.DataFrame): Must include a 'close' column.
        window_size (int): Number of future bars to evaluate (vertical barrier).
        barrier_pct (float): Barrier threshold as percentage (for TP/SL).

    Returns:
        pd.DataFrame: Copy of input with new 'label' column.
        float: Label balance score (0 = imbalance, 1 = perfect balance).
    """
    result_df = df.copy()
    close_prices = df["close"].values
    labels = []

    for i in range(len(close_prices) - window_size):
        future_window = close_prices[i:i + window_size]
        start_price = close_prices[i]
        upper_threshold = start_price * (1 + barrier_pct / 100)
        lower_threshold = start_price * (1 - barrier_pct / 100)

        label_series = np.where(future_window >= upper_threshold, 1,
                        np.where(future_window <= lower_threshold, -1, 0))

        first_signal = label_series[label_series != 0]
        labels.append(first_signal[0] if first_signal.size > 0 else 0)

    # Align output with valid input range
    result_df = result_df.iloc[:len(labels)].copy()
    result_df["3b_label"] = labels

    # Compute label distribution balance score
    label_dist = result_df["3b_label"].value_counts(normalize=True)
    if set(label_dist.index) == {-1, 0, 1}:
        p_up, p_neutral, p_down = label_dist[1], label_dist[0], label_dist[-1]
        balance_score = (p_up * p_neutral * p_down) / ((1 / 3) ** 3)
    else:
        balance_score = 0.0

    return result_df, balance_score


In [26]:
import numpy as np
import pandas as pd

def apply_triple_barrier_return(df, window_size, barrier_pct, use_log_return=True):
    """
    Apply Triple Barrier Labeling for regression tasks by recording
    the return at the first barrier hit (upper, lower, or vertical).

    Args:
        df (pd.DataFrame): Must contain 'close' column.
        window_size (int): Number of future bars to evaluate (vertical barrier).
        barrier_pct (float): Barrier threshold as percentage.
        use_log_return (bool): If True, use log return; else, use % return.

    Returns:
        pd.DataFrame: Copy of input with new '3b_return' column.
    """
    result_df = df.copy()
    close_prices = df["close"].values
    returns = []

    for i in range(len(close_prices) - window_size):
        start_price = close_prices[i]
        future_prices = close_prices[i+1:i+1+window_size]  # exclude current bar

        upper = start_price * (1 + barrier_pct / 100)
        lower = start_price * (1 - barrier_pct / 100)

        hit_index = None
        for j, price in enumerate(future_prices):
            if price >= upper or price <= lower:
                hit_index = j
                break

        # If a barrier was hit, use that price
        if hit_index is not None:
            exit_price = future_prices[hit_index]
        else:
            # Otherwise, use the last price in the window
            exit_price = future_prices[-1]

        if use_log_return:
            ret = np.log(exit_price / start_price)
        else:
            ret = (exit_price - start_price) / start_price

        returns.append(ret)

    result_df = result_df.iloc[:len(returns)].copy()
    result_df["3b_return"] = returns
    return result_df


In [8]:
def evaluate_label_balance_grid(df, barrier_pct_list, window_size_list):
    """
    Perform grid search over barrier % and window sizes to evaluate label balance.

    Args:
        df (pd.DataFrame): DataFrame with price series ('close' column).
        barrier_pct_list (list of float): List of TP/SL percentage thresholds.
        window_size_list (list of int): List of window sizes for lookahead.

    Returns:
        pd.DataFrame: Grid DataFrame of balance scores for each (window, barrier) pair.
    """
    balance_grid = pd.DataFrame(index=window_size_list, columns=barrier_pct_list)
    balance_grid.index.name = "Window Size"
    balance_grid.columns.name = "Barrier %"

    for window_size in window_size_list:
        for barrier_pct in barrier_pct_list:
            try:
                _, balance = apply_triple_barrier_labels(df, window_size, barrier_pct)
                balance_grid.at[window_size, barrier_pct] = balance
            except Exception as e:
                balance_grid.at[window_size, barrier_pct] = np.nan
                print(f"Error at window={window_size}, barrier={barrier_pct}: {e}")

    return balance_grid


In [9]:
# Define ranges to test
barrier_pct_list = [0.05, 0.1, 0.15, 0.2, 0.3]      # Try multiple horizontal barrier levels
window_size_list = [6, 12, 18, 24, 48, 96]            # Try different vertical barrier sizes

# Run the grid search
balance_df = evaluate_label_balance_grid(df, barrier_pct_list, window_size_list)

# View result
print(balance_df)

Barrier %        0.05      0.10      0.15      0.20      0.30
Window Size                                                  
6            0.998268  0.541493  0.199691  0.074463  0.012646
12           0.646299  0.968435  0.634182  0.344614  0.094576
18             0.3399  0.965825   0.89874  0.618402  0.230151
24           0.172989   0.82065  0.992788   0.81897  0.385419
48           0.016569  0.311892  0.743586  0.973199  0.852511
96           0.001578  0.052904  0.237934  0.507236  0.942416


In [18]:
df, balance = apply_triple_barrier_labels(df, 24, .15)

In [19]:
df['3b_label'].value_counts()

3b_label
 0    45010
 1    40951
-1    37984
Name: count, dtype: int64

In [20]:
df['3b_train_label'] = df['3b_label'].apply(lambda x: x + 1)

In [21]:
df['ttrain_label'] = df['label'].apply(lambda x: x + 1)

In [22]:
df

Unnamed: 0,timestamp,open,high,low,close,volume,spread,log_volume,close_delta,close_return,...,dom_sin,dom_cos,month,month_sin,month_cos,label,train_label,3b_label,3b_train_label,ttrain_label
0,2020-01-02 12:45:00+00:00,108.7890,108.7955,108.7635,108.7870,1.398160e+06,0.001986,14.150668,-0.0010,-0.000009,...,0.394356,0.918958,1,5.000000e-01,0.866025,0,0,-1,0,1
1,2020-01-02 13:00:00+00:00,108.7870,108.7900,108.7720,108.7780,1.224630e+06,0.001864,14.018150,-0.0090,-0.000083,...,0.394356,0.918958,1,5.000000e-01,0.866025,0,0,-1,0,1
2,2020-01-02 13:15:00+00:00,108.7770,108.7770,108.7090,108.7270,2.424340e+06,0.001921,14.701070,-0.0510,-0.000469,...,0.394356,0.918958,1,5.000000e-01,0.866025,-1,0,-1,0,0
3,2020-01-02 13:30:00+00:00,108.7260,108.7390,108.6570,108.6945,2.223350e+06,0.002018,14.614526,-0.0325,-0.000299,...,0.394356,0.918958,1,5.000000e-01,0.866025,-1,1,-1,0,0
4,2020-01-02 13:45:00+00:00,108.6940,108.7245,108.6885,108.6940,6.457450e+06,0.002147,15.680745,-0.0005,-0.000005,...,0.394356,0.918958,1,5.000000e-01,0.866025,0,0,-1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123940,2024-12-30 12:15:00+00:00,157.5975,157.6525,157.5970,157.6045,8.624280e+06,0.008132,15.970092,0.0075,0.000048,...,-0.201299,0.979530,12,-2.449294e-16,1.000000,0,0,-1,0,1
123941,2024-12-30 12:30:00+00:00,157.6025,157.6025,157.5210,157.5315,9.999940e+06,0.008564,16.118090,-0.0730,-0.000463,...,-0.201299,0.979530,12,-2.449294e-16,1.000000,-1,2,-1,0,0
123942,2024-12-30 12:45:00+00:00,157.5290,157.5590,157.4885,157.5575,1.129775e+07,0.007739,16.240114,0.0260,0.000165,...,-0.201299,0.979530,12,-2.449294e-16,1.000000,1,0,-1,0,2
123943,2024-12-30 13:00:00+00:00,157.5560,157.5600,157.5065,157.5425,1.025755e+07,0.007714,16.143525,-0.0150,-0.000095,...,-0.201299,0.979530,12,-2.449294e-16,1.000000,0,2,-1,0,1


In [23]:
df = df.reset_index(drop=True)

In [24]:
df.shape

(123945, 47)

# Saving the file

In [25]:
df.to_pickle(PROCESSED_FILE_PATH)