In [15]:
# =========================
# Build 5-class trading dataset (2h horizon) + leak-free features
# =========================
# Input  : data/combined_hourly_dataset.csv  (must include a time column + 'close'; others optional)
# Output : data/training/strategies_5cls_2h.csv
# =========================

from pathlib import Path
import numpy as np
import pandas as pd

# ------------ CONFIG ------------
# Input location: prefer the original local copy when it is present, otherwise
# fall back to the repo-relative path documented in the header so the notebook
# still runs on another machine after Restart & Run All.
_PATH_IN_LOCAL    = Path("/Users/danilokacanski/Desktop/master-rad-clean/forecasting/1st try/combined_hourly_dataset.csv")
_PATH_IN_FALLBACK = Path("data/combined_hourly_dataset.csv")
PATH_IN   = str(_PATH_IN_LOCAL if _PATH_IN_LOCAL.exists() else _PATH_IN_FALLBACK)
PATH_OUT  = Path("data/training/strategies_5cls_2h.csv")
VAL_HOURS = 24 * 30     # trailing 30 days are kept out of the threshold estimation
H         = 2           # forward horizon (hours) used to build the label

# Make sure the output folder exists before anything is written to it.
PATH_OUT.parent.mkdir(parents=True, exist_ok=True)

# ------------ Helpers ------------
def to_datetime_utc_naive(series: pd.Series) -> pd.Series:
    """
    Convert a Series of timestamps to timezone-naive values expressed in UTC.

    The values are first parsed as UTC-aware datetimes (anything unparseable
    becomes NaT because of errors="coerce"), then the timezone information is
    removed through the Series `.dt` accessor.
    """
    return (
        pd.to_datetime(series, utc=True, errors="coerce")
        .dt.tz_localize(None)  # values are already UTC; only the tz tag is dropped
    )

def rsi(series: pd.Series, window=14) -> pd.Series:
    """
    Relative Strength Index computed with exponentially smoothed gains/losses.

    Parameters
    ----------
    series : pd.Series
        Price (or any level) series.
    window : int, default 14
        Smoothing length; the ewm uses alpha = 1 / window with adjust=False.

    Returns
    -------
    pd.Series
        RSI in [0, 100], aligned with `series`. Positions where the value is
        undefined (the first element, or no gains and no losses so far) are 50.
    """
    delta = series.diff()
    gain = delta.clip(lower=0.0)
    loss = -delta.clip(upper=0.0)
    avg_gain = gain.ewm(alpha=1/window, adjust=False).mean()
    avg_loss = loss.ewm(alpha=1/window, adjust=False).mean()

    # Avoid dividing by zero: a zero average loss is handled explicitly below.
    rs = avg_gain / avg_loss.where(avg_loss > 0)
    out = 100 - (100 / (1 + rs))

    # Gains without any losses is the strongest possible reading (RS -> inf),
    # so the index saturates at 100 instead of being reported as neutral.
    out = out.mask((avg_loss == 0) & (avg_gain > 0), 100.0)

    # Whatever is still undefined (warm-up, perfectly flat series) is neutral.
    return out.fillna(50.0)

def ema(series: pd.Series, span: int) -> pd.Series:
    """Exponential moving average of `series` for the given span (recursive form, adjust=False)."""
    smoother = series.ewm(span=span, adjust=False)
    return smoother.mean()

# ------------ 1) Load & hygiene ------------
# Reads the raw CSV and turns it into an hourly, time-indexed frame with a
# numeric, gap-filled 'close' column.
df = pd.read_csv(PATH_IN)

# Find/normalize time column (case-insensitive match on the usual names)
time_candidates = [c for c in df.columns if c.lower() in ("timestamp", "time", "date")]
if not time_candidates and "imestamp" in df.columns:  # handle common typo
    df = df.rename(columns={"imestamp": "timestamp"})
    time_candidates = ["timestamp"]

# Explicit raise instead of assert so the check also runs under `python -O`.
if not time_candidates:
    raise ValueError("No time column found (expected one of: timestamp/time/date or 'imestamp').")
ts_col = time_candidates[0]

# Parse to tz-naive UTC, then clean the time axis before forcing the hourly grid:
#  - rows whose timestamp could not be parsed (NaT) carry no usable time,
#  - duplicated timestamps make asfreq() fail (it cannot reindex duplicate labels),
#    so only the last row seen for each timestamp is kept.
df[ts_col] = to_datetime_utc_naive(df[ts_col])
df = (
    df.dropna(subset=[ts_col])
      .sort_values(ts_col, kind="stable")   # stable: "last" below means last in file order
      .drop_duplicates(subset=ts_col, keep="last")
      .set_index(ts_col)
)
df = df.asfreq("h")

# Coerce object columns to numeric where possible. A column that is purely
# textual is left untouched: coercing it would yield an all-NaN numeric column
# that later becomes a "feature" and makes the final dropna() remove every row.
for c in df.columns:
    if df[c].dtype == "O":
        coerced = pd.to_numeric(df[c], errors="coerce")
        if coerced.notna().any() or df[c].isna().all():
            df[c] = coerced

# Require 'close'
if "close" not in df.columns:
    raise ValueError("'close' column is required in the input CSV.")

# Fill gaps with the last known price only. No backward fill: copying a later
# price into earlier rows would be look-ahead; leading gaps stay NaN and those
# rows are dropped together with the other warm-up rows.
df["close"] = pd.to_numeric(df["close"], errors="coerce").ffill()

# ------------ 2) Label (5 classes from future 2h log-returns) ------------
# fwd_ret_2h at row t is log(close[t+H]) - log(close[t]); on the hourly grid a
# shift of H rows is H hours. The last H rows have no future price and stay NaN.
logp = np.log(df["close"])
df["fwd_ret_2h"] = logp.shift(-H) - logp  # future return aligned at t

# Use only the pre-validation history to estimate thresholds (no peeking).
# A return stamped at t is realised at t + H, so the estimation window has to
# stop H hours before split_time; otherwise its last rows would already
# contain prices from the hold-out window.
split_time = df.index[-1] - pd.Timedelta(hours=VAL_HOURS)
threshold_end = split_time - pd.Timedelta(hours=H)
r_train = df.loc[:threshold_end, "fwd_ret_2h"].dropna()

# Quantile thresholds for 5 classes with neutral middle bucket
# (5% / 30% / 30% / 30% / 5% of the estimation sample)
q = r_train.quantile([0.05, 0.35, 0.65, 0.95]).values
if not np.all(np.diff(q) > 0):
    # Covers both an empty estimation sample (NaN quantiles) and tied quantiles,
    # which pd.cut would reject with a less helpful message.
    raise ValueError(
        "Cannot build 5 classes: return quantiles are missing or not strictly increasing "
        "(pre-validation history too short or too flat)."
    )
bins   = [-np.inf, q[0], q[1], q[2], q[3], np.inf]
labels = ["crash↓↓", "down↓", "flat", "up↑", "moon↑↑"]

df["class"] = pd.cut(df["fwd_ret_2h"], bins=bins, labels=labels, include_lowest=True)

# Optional: position template (signed position size and a short note per class)
ACTION_MAP = {
    "crash↓↓": {"pos": -1.0, "note": "short hard"},
    "down↓":   {"pos": -0.5, "note": "short"},
    "flat":    {"pos":  0.0, "note": "no-trade"},
    "up↑":     {"pos":  0.5, "note": "long"},
    "moon↑↑":  {"pos":  1.0, "note": "long hard"},
}
# Unknown labels fall back to a zero position / empty note.
df["action_pos"]  = df["class"].map(lambda c: ACTION_MAP.get(str(c), {"pos": 0})["pos"])
df["action_note"] = df["class"].map(lambda c: ACTION_MAP.get(str(c), {"note": ""})["note"])

# ------------ 3) Leak-free features (past-only) ------------
# Every feature at row t is built with diff / rolling / ewm, which only read
# values at or before t.

# Returns / momentum: log-price change over the trailing k hours
for k in (1, 2, 4, 12, 24):
    df[f"ret_{k}h"] = logp.diff(k)

# Rolling mean/std of 1h returns (a window needs at least half its length, min 2)
for w in [6, 12, 24, 72, 168]:  # 6h, 12h, 1d, 3d, 7d
    df[f"roll_mean_ret_{w}h"] = df["ret_1h"].rolling(w, min_periods=max(2, w//2)).mean()
    df[f"roll_std_ret_{w}h"]  = df["ret_1h"].rolling(w, min_periods=max(2, w//2)).std()

# Volatility proxies: std of hourly returns scaled by sqrt(24)
df["vol_24h"]  = df["ret_1h"].rolling(24,  min_periods=12).std() * np.sqrt(24)
df["vol_168h"] = df["ret_1h"].rolling(168, min_periods=84).std() * np.sqrt(24)

# Price-based indicators
df["ema_12h"]  = ema(df["close"], 12)
df["ema_26h"]  = ema(df["close"], 26)
df["macd"]     = df["ema_12h"] - df["ema_26h"]
df["macd_sig"] = ema(df["macd"], 9)
df["rsi_14"]   = rsi(df["close"], 14)

# Z-score of close (rolling); a zero rolling std gives NaN instead of +/-inf
df["roll_mean_close_24h"] = df["close"].rolling(24, min_periods=12).mean()
df["roll_std_close_24h"]  = df["close"].rolling(24, min_periods=12).std()
df["z_close_24h"] = (df["close"] - df["roll_mean_close_24h"]) / (df["roll_std_close_24h"].replace(0, np.nan))

# Time/cyclical features
idx = df.index
df["hour_sin"] = np.sin(2 * np.pi * idx.hour / 24.0)
df["hour_cos"] = np.cos(2 * np.pi * idx.hour / 24.0)
df["dow_sin"]  = np.sin(2 * np.pi * idx.dayofweek / 7.0)
df["dow_cos"]  = np.cos(2 * np.pi * idx.dayofweek / 7.0)

# Clip numeric outliers gently (optional but stabilizes training).
# The 0.1% / 99.9% bounds are estimated on the pre-split history only
# (rows up to split_time); using full-sample quantiles would let the hold-out
# window influence every feature value, which is a look-ahead leak.
# The label source and the position template are not features and are left alone.
# NOTE(review): level columns such as 'close' can exceed the historical bounds
# in the hold-out window and will then be capped - confirm this is acceptable.
_clip_skip = {"fwd_ret_2h", "action_pos"}
_clip_ref = df.loc[:split_time]
for c in df.columns:
    if c in _clip_skip or not pd.api.types.is_numeric_dtype(df[c]):
        continue
    lo, hi = _clip_ref[c].quantile([0.001, 0.999])
    df[c] = df[c].clip(lo, hi)

# ------------ 4) Select features + target & drop NaNs ------------
TARGET_COL = "class"

# Engineered features, generated in the same order as the columns were built
base_feats = [
    "close",
    *[f"ret_{h}h" for h in (1, 2, 4, 12, 24)],
    *[f"roll_{stat}_ret_{w}h" for w in (6, 12, 24, 72, 168) for stat in ("mean", "std")],
    "vol_24h", "vol_168h",
    "ema_12h", "ema_26h", "macd", "macd_sig", "rsi_14",
    "z_close_24h",
    "hour_sin", "hour_cos", "dow_sin", "dow_cos",
]

# Every remaining numeric column (volume, blockchain metrics, etc.) is also a
# feature, except the label source, the position template and the target.
_not_extra = set(base_feats) | {"fwd_ret_2h", "action_pos", "action_note", TARGET_COL}
other_numeric = []
for col in df.columns:
    if col in _not_extra:
        continue
    if pd.api.types.is_numeric_dtype(df[col]):
        other_numeric.append(col)

feature_cols = base_feats + other_numeric

# Keep only complete rows: warm-up rows and the unlabeled tail contain NaNs.
output_cols = [*feature_cols, TARGET_COL, "action_pos", "action_note"]
final = df.loc[:, output_cols].dropna().copy()
final[TARGET_COL] = final[TARGET_COL].astype("category")

# ------------ 5) Save & sanity prints ------------
final.to_csv(PATH_OUT, index=True)
print(f"✅ Saved training table → {PATH_OUT}")
print(f"Rows: {len(final):,} | Features: {len(feature_cols)} | Classes: {list(final[TARGET_COL].cat.categories)}\n")

# Class balance, absolute and relative, listed in category order
class_counts = final[TARGET_COL].value_counts().sort_index()
class_share  = final[TARGET_COL].value_counts(normalize=True).sort_index()

print("Class counts:")
print(class_counts, "\n")

print("Class ratios (%):")
print((class_share * 100).round(2))

  df = df.asfreq("H")


✅ Saved training table → data/training/strategies_5cls_2h.csv
Rows: 49,277 | Features: 42 | Classes: ['crash↓↓', 'down↓', 'flat', 'up↑', 'moon↑↑']

Class counts:
class
crash↓↓     2436
down↓      14781
flat       14854
up↑        14774
moon↑↑      2432
Name: count, dtype: int64 

Class ratios (%):
class
crash↓↓     4.94
down↓      30.00
flat       30.14
up↑        29.98
moon↑↑      4.94
Name: proportion, dtype: float64


In [16]:
# Give every class label a stable integer id (0..4) next to its readable name
ORDERED_NAMES = ["crash↓↓", "down↓", "flat", "up↑", "moon↑↑"]  # intended order, lowest to highest
NAME2ID = dict(zip(ORDERED_NAMES, range(len(ORDERED_NAMES))))

# Work from the plain-string label so the ids come from NAME2ID, not from the
# category codes (which could be ordered differently).
_label_text = final["class"].astype(str)

# Swap the categorical 'class' column for its string version and append the id;
# both end up as the last two columns of the table.
final = final.drop(columns=["class"]).assign(
    **{
        "class": _label_text,
        "class_id": _label_text.map(NAME2ID).astype("int64"),
    }
)

# (Optional) sanity check: all five labels are present and the ids span 0..4
assert set(final["class"].unique()) == set(ORDERED_NAMES)
assert final["class_id"].min() == 0 and final["class_id"].max() == 4

# Write the numeric-target version of the table next to the first export
final.to_csv("data/training/strategies_5cls_2h_numeric.csv", index=True)
print("Saved → data/training/strategies_5cls_2h_numeric.csv")

Saved → data/training/strategies_5cls_2h_numeric.csv


In [17]:
# Print the table summary: index range, per-column non-null counts and dtypes
final.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 49277 entries, 2020-01-04 12:00:00 to 2025-08-18 16:00:00
Freq: h
Data columns (total 46 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   close                    49277 non-null  float64
 1   ret_1h                   49277 non-null  float64
 2   ret_2h                   49277 non-null  float64
 3   ret_4h                   49277 non-null  float64
 4   ret_12h                  49277 non-null  float64
 5   ret_24h                  49277 non-null  float64
 6   roll_mean_ret_6h         49277 non-null  float64
 7   roll_std_ret_6h          49277 non-null  float64
 8   roll_mean_ret_12h        49277 non-null  float64
 9   roll_std_ret_12h         49277 non-null  float64
 10  roll_mean_ret_24h        49277 non-null  float64
 11  roll_std_ret_24h         49277 non-null  float64
 12  roll_mean_ret_72h        49277 non-null  float64
 13  roll_std_ret_72h         49277 no