<a href="https://colab.research.google.com/github/dgalassi99/quant-trading-self-study/blob/main/more/Meta_Labelling_%26_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Main Code

This exercise comes from studying "Machine Learning for Asset Managers". Due to the size of the exercise, I created a separate notebook file.

The full book study and exercise is here: https://github.com/dgalassi99/quant-trading-self-study/blob/main/00_books/Machine_Learning_for_Asset_Managers.ipynb

## Labelling with Triple Barrier Method

In [1]:
from google.colab import drive
import pandas as pd, numpy as np
import matplotlib.pyplot as plt

# Make Google Drive visible to the Colab runtime; the dataset is read from MyDrive.
drive.mount('/content/drive')

# CSV input (the file name indicates 1-hour BTC data covering 2018 to 2025).
file_path = '/content/drive/MyDrive/QUANT/DATA/btc_1h_data_2018_to_2025.csv'
# NOTE(review): `date_format` is passed without `parse_dates`, so it presumably
# does not convert 'Open time' at read time -- the explicit conversion below
# is what produces the datetime column. Confirm before relying on the format.
df = pd.read_csv(file_path, date_format='%Y%m%d %H%M%S')

# Keep only timestamp, close and volume, with the timestamp parsed to datetime.
df = df[['Open time', 'Close', 'Volume']].assign(
    **{'Open time': lambda frame: pd.to_datetime(frame['Open time'])}
)

# Volume-bar threshold: the mean of the Volume column.
target_vol = df['Volume'].mean()
# Running total of traded volume.
df['cum_vol'] = df['Volume'].cumsum()
# Bar index = how many whole `target_vol` units have accumulated so far
# (the ratio is truncated to an integer).
df['vol_bar_num'] = (df['cum_vol'] / target_vol).astype(int)
# Collapse to one row per bar index, taking the first value of each column
# within that bar.
df_vol = df.groupby('vol_bar_num').first()

# Bar-to-bar percentage change of the volume-bar closes, and its standard deviation.
df_vol['returns'] = df_vol['Close'].pct_change()
std_returns = df_vol['returns'].std()

# Horizontal barrier width: twice the standard deviation of returns.
hrz_barrier = std_returns * 2

print(f"Standard deviation of returns: {std_returns:.6f}")
print(f"Horizontal barrier (2 * std): {hrz_barrier:.6f}")

# The maximum holding period is defined as the average number of bars per day.

# Move the 'vol_bar_num' index back into a regular column so the frame has a
# plain positional index again.
df_vol = df_vol.reset_index()

# Calendar day (date only, no time) of each bar's 'Open time'.
df_vol['date'] = df_vol['Open time'].dt.date

# Number of bars on each day, then the mean across days truncated to an integer.
bars_per_day = df_vol.groupby('date').size()
max_holding_period = int(bars_per_day.mean())

print(f"Max holding period (avg bars/day): {max_holding_period}")

def triple_barrier_labels(prices, barrier, max_holding_period):
    """Label every observation by the first barrier touched within a fixed horizon.

    For observation ``i`` the return of each of the next ``max_holding_period``
    prices relative to ``prices[i]`` is inspected in order:

    * ``1``  if a return ``>= barrier`` is reached first,
    * ``-1`` if a return ``<= -barrier`` is reached first,
    * ``0``  if neither happens within the horizon.

    If both conditions hold at the same step, the upper barrier wins.

    Parameters
    ----------
    prices : array-like of float
        Price series, ordered in time.
    barrier : float
        Symmetric return threshold used for the upper and lower barriers.
    max_holding_period : int
        Number of forward observations to inspect (vertical barrier).

    Returns
    -------
    numpy.ndarray
        Float array with the same length as ``prices``. The last
        ``max_holding_period`` entries are NaN because their forward window
        is incomplete; if the series is not longer than the horizon, every
        entry is NaN.

    Raises
    ------
    ValueError
        If ``max_holding_period`` is negative.
    """
    prices = np.asarray(prices, dtype=float)
    horizon = int(max_holding_period)
    if horizon < 0:
        raise ValueError("max_holding_period must be non-negative")

    n_obs = prices.size
    out = np.full(n_obs, np.nan)
    n_labelled = n_obs - horizon
    if n_labelled <= 0:
        # Not a single complete forward window: nothing can be labelled.
        return out

    start = prices[:n_labelled]
    result = np.zeros(n_labelled)             # 0 = no horizontal barrier touched
    decided = np.zeros(n_labelled, dtype=bool)

    # Vectorised over rows; the only Python loop is over the horizon steps.
    for step in range(1, horizon + 1):
        ret = (prices[step:step + n_labelled] - start) / start
        # Only rows that have not touched a barrier yet can be labelled now,
        # so the earliest touch is the one that sticks.
        hit_up = ~decided & (ret >= barrier)
        hit_down = ~decided & ~hit_up & (ret <= -barrier)
        result[hit_up] = 1.0
        result[hit_down] = -1.0
        decided |= hit_up | hit_down
        if decided.all():
            break

    out[:n_labelled] = result
    return out


# Labels aligned with df_vol; the trailing max_holding_period rows are NaN.
# `labels` is now a float ndarray instead of a Python list.
labels = triple_barrier_labels(df_vol["Close"], hrz_barrier, max_holding_period)
df_vol["label"] = labels
print(df_vol['label'].value_counts(dropna=True))


Mounted at /content/drive
Standard deviation of returns: 0.009658
Horizontal barrier (2 * std): 0.019315
Max holding period (avg bars/day): 14
label
 1.0    13132
 0.0    13017
-1.0    12436
Name: count, dtype: int64


In [2]:
# Share of each label over the whole sample (value_counts leaves out NaN rows
# by default).
overall_dist = df_vol['label'].value_counts(normalize=True)
print("Overall label distribution (Triple Barrier):")
print(overall_dist)

# Hour of day taken from each bar's 'Open time'.
df_vol['hour'] = df_vol['Open time'].dt.hour

# Label shares within each hour, pivoted to one column per label;
# hour/label combinations that never occur are filled with 0.
hourly_dist = (
    df_vol
    .groupby('hour')['label']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
)
print("Hourly label distribution (Triple Barrier):")
print(hourly_dist)

Overall label distribution (Triple Barrier):
label
 1.0    0.340340
 0.0    0.337359
-1.0    0.322301
Name: proportion, dtype: float64
Hourly label distribution (Triple Barrier):
label      -1.0       0.0       1.0
hour                               
0      0.343131  0.328134  0.328734
1      0.327804  0.332449  0.339748
2      0.330812  0.323934  0.345254
3      0.325747  0.327881  0.346373
4      0.317073  0.335725  0.347202
5      0.313584  0.317919  0.368497
6      0.327610  0.340659  0.331731
7      0.325032  0.332695  0.342273
8      0.324040  0.332714  0.343247
9      0.338054  0.327586  0.334360
10     0.307359  0.333333  0.359307
11     0.319436  0.321888  0.358676
12     0.328054  0.328054  0.343891
13     0.329570  0.318817  0.351613
14     0.330474  0.346115  0.323411
15     0.315628  0.356997  0.327375
16     0.315817  0.356280  0.327903
17     0.305035  0.353630  0.341335
18     0.330055  0.344806  0.325138
19     0.315231  0.347066  0.337703
20     0.311746  0.333333  0.