In [14]:
import numpy as np
import pandas as pd
import yfinance as yf

data = yf.download('AAPL')['Close']
def get_daily_vol(close, span=100):
    """
    Calculate exponentially weighted moving daily volatility.

    Parameters:
        close (pd.Series): Series of close prices indexed by datetime.
        span (int): Span for EWM volatility calculation.

    Returns:
        pd.Series: Daily volatility (standard deviation of returns).
    """
    # Compute daily log returns
    daily_returns = close.pct_change()

    # Exponentially weighted standard deviation
    daily_vol = daily_returns.ewm(span=span).std()

    return daily_vol


  data = yf.download('AAPL')['Close']
[*********************100%***********************]  1 of 1 completed


In [20]:
import pandas as pd
import numpy as np

# Simulate realistic 'close' price data
np.random.seed(42)
dates = pd.date_range(start='2023-01-01', periods=200, freq='B')  # 200 business days
close_prices = pd.Series(np.cumprod(1 + np.random.normal(0, 0.01, size=len(dates))), index=dates)

# Simulate realistic 'events' DataFrame
event_indices = np.random.choice(dates[10:-10], size=10, replace=False)
event_indices.sort()
t1 = [dates[dates.get_loc(d) + np.random.randint(5, 15)] for d in event_indices]
trgt = np.random.uniform(0.01, 0.05, size=len(event_indices))
side = np.random.choice([1, -1], size=len(event_indices))

events = pd.DataFrame({
    't1': t1,
    'trgt': trgt,
    'side': side
}, index=event_indices)

# Define profit-taking and stop-loss thresholds
pt_sl = (1, 1)  # 1x target for both

# Molecule (subset of event indices)
molecule = events.index[:5]

# Function from above
def apply_pt_sl_on_t1(close, events, pt_sl, molecule):
    events = events.loc[molecule]
    out = events[['t1']].copy(deep=True)

    pt = pt_sl[0] * events['trgt'] if pt_sl[0] > 0 else pd.Series(index=events.index)
    sl = -pt_sl[1] * events['trgt'] if pt_sl[1] > 0 else pd.Series(index=events.index)

    for loc, t1 in events['t1'].fillna(close.index[-1]).items():
        price_path = close[loc:t1]
        rel_returns = (price_path / close[loc]) * events.at[loc, 'side']

        out.loc[loc, 'sl'] = rel_returns[rel_returns < sl[loc]].index.min()
        out.loc[loc, 'pt'] = rel_returns[rel_returns > pt[loc]].index.min()

    return out

# Apply the function
result = apply_pt_sl_on_t1(close_prices, events, pt_sl, molecule)
result # we get time that we out of the game

Unnamed: 0,t1,sl,pt
2023-02-02,2023-02-17,2023-02-02,NaT
2023-02-17,2023-03-07,2023-02-17,NaT
2023-03-13,2023-03-28,NaT,2023-03-13
2023-03-14,2023-03-27,2023-03-14,NaT
2023-05-16,2023-05-23,2023-05-16,NaT


r

In [21]:
def get_events(close, t_events, trgt, pt_sl, min_ret, num_threads, t1=False):
    """
    Filter events and apply pt/sl thresholds using multiprocessing.

    Parameters:
        close (pd.Series): Close prices.
        t_events (pd.Index): Event indices (timestamps).
        trgt (pd.Series): Target returns.
        pt_sl (float): Profit-taking and stop-loss multiple.
        min_ret (float): Minimum target return required.
        num_threads (int): Number of threads for multiprocessing.
        t1 (pd.Series or bool): Optional vertical barrier.

    Returns:
        pd.DataFrame: Filtered events with updated t1 based on first trigger (pt/sl/barrier).
    """
    # 1. Filter by minimum return
    trgt = trgt.loc[t_events]
    trgt = trgt[trgt > min_ret]

    # 2. Handle t1
    if t1 is False:
        t1 = pd.Series(pd.NaT, index=t_events)

    # 3. Create side and events dataframe
    side = pd.Series(1.0, index=trgt.index)  # long-only
    events = pd.concat({'t1': t1, 'trgt': trgt, 'side': side}, axis=1).dropna(subset=['trgt'])

    # 4. Apply stop-loss / profit-taking logic
    df0 = mpPandasObj(func=apply_pt_sl_on_t1,
                      pdObj=('molecule', events.index),
                      numThreads=num_threads,
                      close=close,
                      events=events,
                      pt_sl=[pt_sl, pt_sl])

    # 5. Update t1 with earliest of pt/sl hit
    events['t1'] = df0.dropna(how='all').min(axis=1)

    # 6. Drop side if not used later
    events = events.drop('side', axis=1)

    return events
## Basicly we do it again

In [22]:
def get_bins(events, close):
    """
    Compute event returns and bin them into {-1, 0, 1} labels.

    Parameters:
        events (pd.DataFrame): Must contain ['t1'] as the exit time.
        close (pd.Series): Price series.

    Returns:
        pd.DataFrame: Contains 'ret' and 'bin' columns.
    """
    # Only events with known t1
    events = events.dropna(subset=['t1'])

    # Union of event start and end times for accurate price reindexing
    idx = events.index.union(events['t1']).drop_duplicates()
    px = close.reindex(idx, method='bfill')

    # Calculate returns
    out = pd.DataFrame(index=events.index)
    start_px = px.loc[events.index]
    end_px = px.loc[events['t1']]
    out['ret'] = end_px.values / start_px.values - 1

    # Assign bins: +1 if positive return, -1 if negative, 0 if unchanged
    out['bin'] = np.sign(out['ret'])

    return out


In [25]:
def get_events(close, t_events, pt_sl, trgt, min_ret, num_threads, t1=False, side=None):
    """
    Create event structure and apply the triple-barrier labeling.

    Parameters:
        close (pd.Series): Close prices.
        t_events (pd.Index): Event timestamps.
        pt_sl (tuple): Tuple (pt, sl) as multiples of trgt.
        trgt (pd.Series): Target returns.
        min_ret (float): Minimum return threshold.
        num_threads (int): Number of threads to use.
        t1 (pd.Series or bool): Vertical barrier timestamps.
        side (pd.Series or None): Optional side information (for meta-labeling).

    Returns:
        pd.DataFrame: Events with updated 't1', 'trgt', and (optional) 'side'.
    """
    # 1) Filter out small targets
    trgt = trgt.loc[t_events]
    trgt = trgt[trgt > min_ret]

    # 2) Set vertical barrier if not provided
    if t1 is False:
        t1 = pd.Series(pd.NaT, index=t_events)

    # 3) Create event dataframe
    if side is None:
        side_, pt_sl_ = pd.Series(1.0, index=trgt.index), [pt_sl[0], pt_sl[0]]
    else:
        side_, pt_sl_ = side.loc[trgt.index], pt_sl[:2]

    events = pd.concat({'t1': t1, 'trgt': trgt, 'side': side_}, axis=1)
    events = events.dropna(subset=['trgt'])

    # 4) Apply stop-loss/profit-taking on events
    df0 = mpPandasObj(func=apply_pt_sl_on_t1,
                      pdObj=('molecule', events.index),
                      numThreads=num_threads,
                      close=close,
                      events=events,
                      pt_sl=pt_sl_)

    # 5) Replace t1 with earliest of pt/sl/vertical barrier
    events['t1'] = df0.dropna(how='all').min(axis=1)

    # 6) Drop 'side' if not doing meta-labeling
    if side is None:
        events = events.drop('side', axis=1)

    return events


In [26]:
def get_bins(events, close):
    """
    Compute event returns and assign classification labels (bins).

    Parameters:
        events (pd.DataFrame): Must have columns 't1' (mandatory) and optionally 'side' (for meta-labeling).
        close (pd.Series): Close price series.

    Returns:
        pd.DataFrame: DataFrame with:
            - 'ret': return from event start to t1
            - 'bin': label (+1, -1, or 0 depending on return and side)
    """
    # 1) Drop events without an end time (t1)
    events_ = events.dropna(subset=['t1'])

    # 2) Reindex prices for start and end times
    px = events_.index.union(events_['t1']).drop_duplicates()
    px = close.reindex(px, method='bfill')

    # 3) Calculate returns
    out = pd.DataFrame(index=events_.index)
    start_px = px.loc[events_.index]
    end_px = px.loc[events_['t1']]
    out['ret'] = end_px.values / start_px.values - 1

    # 4) Apply meta-labeling adjustment if 'side' is available
    if 'side' in events_:
        out['ret'] *= events_['side']

    # 5) Assign label
    out['bin'] = np.sign(out['ret'])

    # 6) In meta-labeling: only positive returns are considered correct
    if 'side' in events_:
        out.loc[out['ret'] <= 0, 'bin'] = 0  # discard false positives

    return out


In [28]:
def drop_labels(events, min_pct=0.05):
    """
    Drop minority classes until all bins meet the minimum proportion threshold.

    Parameters:
        events (pd.DataFrame): Must have a 'bin' column.
        min_pct (float): Minimum percentage for any class (e.g., 0.05 = 5%).

    Returns:
        pd.DataFrame: Events with underrepresented classes removed.
    """
    while True:
        label_pct = events['bin'].value_counts(normalize=True)
        if label_pct.min() >= min_pct or label_pct.shape[0] < 2:
            break  # All classes are balanced enough or only one remains

        min_label = label_pct.idxmin()
        print(f'Dropped label {min_label}, proportion was {label_pct[min_label]:.2%}')
        events = events[events['bin'] != min_label]

    return events
