# Random Forest: Triple-Barrier Labels, Feature Alignment, and `predict_proba` Mapping


This notebook shows **exactly** how to:
- Build **aligned** features/labels per (ticker, date) without leakage
- Create **triple-barrier** labels (+1 = TP/Buy, 0 = Neutral, -1 = SL/Sell)
- Train a **RandomForestClassifier**
- Interpret `predict_proba` using **`rf.classes_`** to map columns → labels
- Run **sanity checks** that catch common misalignment bugs


In [2]:

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
np.random.seed(42)


## Simulate OHLCV for a couple of tickers

In [3]:

def simulate_ohlcv(n=800, tickers=('AAPL','MSFT')):
    idx = pd.date_range('2018-01-01', periods=n, freq='B')
    frames = []
    for tkr in tickers:
        ret = np.random.normal(loc=0.0003, scale=0.01, size=n)
        price = 100 * np.exp(np.cumsum(ret))
        close = pd.Series(price, index=idx)
        open_ = close.shift(1).fillna(close.iloc[0]) * (1 + np.random.normal(0, 0.0005, n))
        high = pd.concat([open_, close], axis=1).max(axis=1) * (1 + np.abs(np.random.normal(0, 0.001, n)))
        low  = pd.concat([open_, close], axis=1).min(axis=1) * (1 - np.abs(np.random.normal(0, 0.001, n)))
        vol  = (np.random.lognormal(mean=12, sigma=0.3, size=n)).astype(int)
        df = pd.DataFrame({'ticker': tkr, 'open': open_, 'high': high, 'low': low, 'close': close, 'volume': vol})
        frames.append(df)
    data = pd.concat(frames).reset_index().rename(columns={'index':'date'})
    data = data.set_index(['ticker','date']).sort_index()
    return data

data = simulate_ohlcv()
data.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,volume
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,2018-01-01,100.575266,100.593661,100.455549,100.528104,172052
AAPL,2018-01-02,100.502165,100.640343,100.40157,100.419327,184911
AAPL,2018-01-03,100.424153,101.167476,100.369253,101.102168,173978
AAPL,2018-01-04,101.078799,102.766635,101.051341,102.684571,238866
AAPL,2018-01-05,102.662263,102.711822,102.303663,102.47515,122308


## Feature Engineering (lagged to avoid lookahead)

In [4]:

def make_features(df):
    out = df.copy()
    out['ret_1']  = out['close'].groupby(level=0).pct_change(1)
    out['ret_5']  = out['close'].groupby(level=0).pct_change(5)
    out['ret_20'] = out['close'].groupby(level=0).pct_change(20)
    out['vol_20'] = out['ret_1'].groupby(level=0).rolling(20).std().values
    out['ma_10']  = out['close'].groupby(level=0).rolling(10).mean().values
    out['dist_ma10'] = (out['close'] - out['ma_10'])/(out['ma_10']+1e-8)
    out['vol_norm_20'] = out['volume']/out['volume'].groupby(level=0).rolling(20).mean().values
    feature_cols = ['ret_1','ret_5','ret_20','vol_20','dist_ma10','vol_norm_20']
    out[feature_cols] = out.groupby(level=0)[feature_cols].shift(1)
    return out

feat = make_features(data)
feat.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,volume,ret_1,ret_5,ret_20,vol_20,ma_10,dist_ma10,vol_norm_20
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AAPL,2018-01-01,100.575266,100.593661,100.455549,100.528104,172052,,,,,,,
AAPL,2018-01-02,100.502165,100.640343,100.40157,100.419327,184911,,,,,,,
AAPL,2018-01-03,100.424153,101.167476,100.369253,101.102168,173978,-0.001082,,,,,,
AAPL,2018-01-04,101.078799,102.766635,101.051341,102.684571,238866,0.0068,,,,,,
AAPL,2018-01-05,102.662263,102.711822,102.303663,102.47515,122308,0.015652,,,,,,


## Triple-barrier labeling (+1 TP, -1 SL, 0 neutral)

In [5]:

def triple_barrier_labels(close, ub=0.02, lb=-0.02, max_h=10):
    idx = close.index
    labels = pd.Series(index=idx, dtype=float)
    for i, t in enumerate(idx):
        if i == len(idx)-1:
            labels[t] = np.nan
            continue
        p0 = close.iloc[i]
        end = min(i+max_h, len(idx)-1)
        path = close.iloc[i+1:end+1]/p0 - 1.0
        hit_up = (path >= ub).idxmax() if (path >= ub).any() else None
        hit_dn = (path <= lb).idxmax() if (path <= lb).any() else None
        if hit_up is not None and hit_dn is not None:
            labels[t] = 1.0 if hit_up < hit_dn else -1.0
        elif hit_up is not None:
            labels[t] = 1.0
        elif hit_dn is not None:
            labels[t] = -1.0
        else:
            labels[t] = 0.0
    return labels

def apply_triple_barrier(df, ub=0.02, lb=-0.02, max_h=10):
    frames = []
    for tkr, g in df.groupby(level=0):
        close = g['close']
        lbl = triple_barrier_labels(close, ub=ub, lb=lb, max_h=max_h)
        frames.append(pd.DataFrame({'label': lbl}).set_index(g.index.names))
    y = pd.concat(frames).sort_index()
    return y

y = apply_triple_barrier(data, ub=0.02, lb=-0.02, max_h=10)
y.head()


KeyError: "None of ['ticker', 'date'] are in the columns"

## Align features + labels and train RF

In [None]:

feature_cols = ['ret_1','ret_5','ret_20','vol_20','dist_ma10','vol_norm_20']
df = feat.join(y, how='inner').dropna(subset=feature_cols + ['label'])

# time-based split per ticker
def time_split_per_ticker(df, split_ratio=0.7):
    train_idx = []
    test_idx = []
    for tkr, g in df.groupby(level=0):
        n = len(g); k = int(n*split_ratio)
        train_idx.extend(g.index[:k]); test_idx.extend(g.index[k:])
    return df.loc[train_idx], df.loc[test_idx]

train_df, test_df = time_split_per_ticker(df, 0.7)

X_train = train_df[feature_cols].values
y_train = train_df['label'].values.astype(int)
X_test  = test_df[feature_cols].values
y_test  = test_df['label'].values.astype(int)

rf = RandomForestClassifier(n_estimators=300, random_state=123, min_samples_leaf=5)
rf.fit(X_train, y_train)
print("rf.classes_:", rf.classes_)

proba = rf.predict_proba(X_test)
proba[:3]


## Map `predict_proba` columns using `rf.classes_`

In [None]:

class_order = list(rf.classes_)  # e.g., [-1, 0, 1]
proba_df = pd.DataFrame(proba, index=test_df.index, columns=[f"P({c})" for c in class_order])

P_sell = proba_df[f"P({-1})"]
P_hold = proba_df[f"P({0})"]
P_buy  = proba_df[f"P({1})"]

proba_df.head()


## Quick sanity checks

In [None]:

from sklearn.metrics import classification_report, confusion_matrix
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred, digits=3))
print(confusion_matrix(y_test, y_pred, labels=class_order))

peek = pd.DataFrame({
    'label_true': y_test[:10],
    'label_pred': y_pred[:10],
    'P_sell(-1)': proba[:10, class_order.index(-1)],
    'P_hold(0)':  proba[:10, class_order.index(0)],
    'P_buy(1)':   proba[:10, class_order.index(1)],
}, index=test_df.index[:10])
peek
