SETUP & DATA LOADING

In [187]:
import numpy as np
import pandas as pd
import os 

from pathlib import Path

# root folder
PROJECT_ROOT = Path("ETHUSDT_Strategy.jpynb").resolve().parent.parent
print(PROJECT_ROOT) 
DATA_DIR = os.path.join(PROJECT_ROOT, "data/ETHUSDT.csv")
print(DATA_DIR)

/home/jovyan/quant
/home/jovyan/quant/data/ETHUSDT.csv


In [188]:
df_raw = pd.read_csv(DATA_DIR, parse_dates=['timestamp'], index_col='timestamp')

In [189]:
df_raw.head()

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-11-27 07:30:00,146.0,146.0,146.0,146.0,0.01
2019-11-27 08:00:00,125.03,125.03,125.03,125.03,0.01
2019-11-27 08:30:00,145.0,145.01,133.0,133.0,0.07
2019-11-27 09:00:00,133.0,133.0,133.0,133.0,0.0
2019-11-27 09:30:00,133.0,133.0,133.0,133.0,0.0


In [190]:
print(df_raw.columns)

Index(['open', 'high', 'low', 'close', 'volume'], dtype='object')


DATA EXPLORATION 

In [191]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 100222 entries, 2019-11-27 07:30:00 to 2025-08-15 06:00:00
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   open    100222 non-null  float64
 1   high    100222 non-null  float64
 2   low     100222 non-null  float64
 3   close   100222 non-null  float64
 4   volume  100222 non-null  float64
dtypes: float64(5)
memory usage: 4.6 MB


In [192]:
df_raw.describe()

Unnamed: 0,open,high,low,close,volume
count,100222.0,100222.0,100222.0,100222.0,100222.0
mean,2020.850894,2028.743724,2012.538468,2020.89524,69531.05
std,1142.978576,1147.338935,1138.329111,1142.993254,86547.58
min,95.73,98.44,84.23,95.73,0.0
25%,1293.55,1298.2925,1288.55,1293.56,23932.19
50%,1909.37,1915.32,1902.85,1909.395,43401.88
75%,2905.0675,2916.99,2890.315,2905.1325,81267.71
max,4852.0,4877.54,4843.58,4852.08,1915609.0


In [193]:
print(df_raw.isnull().sum)

<bound method DataFrame.sum of                       open   high    low  close  volume
timestamp                                              
2019-11-27 07:30:00  False  False  False  False   False
2019-11-27 08:00:00  False  False  False  False   False
2019-11-27 08:30:00  False  False  False  False   False
2019-11-27 09:00:00  False  False  False  False   False
2019-11-27 09:30:00  False  False  False  False   False
...                    ...    ...    ...    ...     ...
2025-08-15 04:00:00  False  False  False  False   False
2025-08-15 04:30:00  False  False  False  False   False
2025-08-15 05:00:00  False  False  False  False   False
2025-08-15 05:30:00  False  False  False  False   False
2025-08-15 06:00:00  False  False  False  False   False

[100222 rows x 5 columns]>


In [194]:
print(f"From: {df_raw.index.min()}")
print(f"To: {df_raw.index.max()}")

From: 2019-11-27 07:30:00
To: 2025-08-15 06:00:00


FEATURE ENGINEERING

In [469]:
def generate_features(df_features):
    # copy
    df_features = df_raw.copy()
    
    # RETURN
    df_features['return_1'] = df_features['close'].pct_change()       # 30 minute return  
    df_features['return_4'] = df_features['close'].pct_change(4)      # 2 hours return 
    df_features['return_48'] = df_features['close'].pct_change(48)    # 24 hours return 

    # VOLATILITY
    df_features['volatility_24'] = df_features['return_1'].rolling(48).std()       # 24 hours rolling volatility
    
    # MOMENTUM - RSI
    delta = df_features['close'].diff()
    gain = delta.where(delta > 0, 0).rolling(14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
    df_features['rsi'] = 100 - (100 / (1 + gain / loss)) 
    
    # SMA
    df_features['sma_20'] = df_features['close'].rolling(20).mean()
    df_features['sma_50'] = df_features['close'].rolling(50).mean()
    df_features['price_to_sma20'] = df_features['close'] / df_features['sma_20'] - 1
    df_features['price_to_sma50'] = df_features['close'] / df_features['sma_50'] - 1
    
    df_features['sma_cross'] = df_features['sma_20'] / df_features['sma_50'] - 1  # SMA crossover signal

    #EMA
    df_features['ema_20'] = df_features['close'].ewm(span=20).mean()
    df_features['price_to_ema20'] = df_features['close'] / df_features['ema_20'] - 1
    
    # VOLUME
    df_features['volume_sma'] = df_features['volume'].rolling(20).mean()
    df_features['volume_ratio'] = df_features['volume'] / df_features['volume_sma'] - 1
    
    # Drop NaN
    df_features.dropna(inplace=True)

    return df_features

In [470]:
def prepare_data(df_features, N, feature_cols, split_ratio):
    # Predict future return after N bars
    N = 4    # N * 0.5 hours
    df_features['target'] = df_features['close'].pct_change(N).shift(N)
    df_features['target'].dropna(inplace=True)

    X = df_features[feature_cols]
    y = df_features['target']
    
    split_idx = int(len(df) * split_ratio)
    
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
    
    return X_train, X_test, y_train, y_test

In [471]:
df_features = generate_features(df_raw)
feature_cols = ['return_1', 'return_4', 'return_48', 'volatility_24',
                 'rsi', 'price_to_sma20', 'price_to_sma50', 'sma_cross',
                'price_to_ema20', 'volume_ratio']
X_train, X_test, y_train, y_test = prepare_data(df_features, 
                                                N=4, 
                                                feature_cols=feature_cols,
                                                split_ratio=0.8)
print("train samples:", len(X_train), "\ntest_samples:", len(X_test))

train samples: 80177 
test_samples: 19996


In [472]:
X_train.head()

Unnamed: 0_level_0,return_1,return_4,return_48,volatility_24,rsi,price_to_sma20,price_to_sma50,sma_cross,price_to_ema20,volume_ratio
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-11-28 08:00:00,0.000263,0.001185,0.215868,0.01597,59.342561,0.002734,0.020707,0.017924,0.002542,0.298826
2019-11-28 08:30:00,0.004473,0.003483,0.14812,0.013344,83.777778,0.007196,0.024351,0.017032,0.006349,0.431827
2019-11-28 09:00:00,-0.003143,-0.000263,0.144511,0.013367,69.6875,0.004328,0.01742,0.013035,0.00288,0.27293
2019-11-28 09:30:00,-6.6e-05,0.001513,0.144436,0.013367,64.480874,0.004365,0.014747,0.010337,0.002544,0.214676
2019-11-28 10:00:00,-0.012089,-0.010854,0.130602,0.013536,33.391916,-0.007072,0.000164,0.007287,-0.008667,0.340323


MODEL TRAINING

In [473]:
import lightgbm as lgb

model = lgb.LGBMRegressor(
      n_estimators=100,
      learning_rate=0.05,
      max_depth=5,
      min_child_samples=100,      # Lower from default 20
      min_split_gain=0.0,         # Allow any gain
      reg_alpha=0.1,              # L1 regularization
      reg_lambda=0.1,             # L2 regularization
      random_state=42,
      verbose=-1                  # Suppress warnings
)

model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

In [474]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate_regression(y_true, y_pred):
  y_true = np.asarray(y_true, dtype=float)
  y_pred = np.asarray(y_pred, dtype=float)

  mask = ~np.isnan(y_true) & ~np.isnan(y_pred)
  y_true, y_pred = y_true[mask], y_pred[mask]

  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  rmse = np.sqrt(mse)
  r2 = r2_score(y_true, y_pred)

  return {
      "MAE": mae,
      "MSE": mse,
      "RMSE": rmse,
      "R2": r2
  }

print(evaluate_regression(y_test, y_pred))


{'MAE': 0.0045413378266565664, 'MSE': 4.463553532278399e-05, 'RMSE': 0.00668098311049983, 'R2': 0.6072558681521294}


BACKTESTING

In [475]:
def generate_signal(X_test, y_pred):
    # Create signals based on predicted returns
    threshold = 0.002  # Only trade if predicted return > 0.2%
    
    signals = pd.Series(0, index=X_test.index)
    signals[y_pred > threshold] = 1    # Long
    signals[y_pred < -threshold] = -1  # Short (optional, use 0 if long-only)
    
    print(f"Long signals:  {(signals == 1).sum()}")
    print(f"Short signals: {(signals == -1).sum()}")
    print(f"Flat:          {(signals == 0).sum()}")

    return signals

In [476]:
def backtesting(df_features, X_test, signals):
    # Get actual returns for test period
    actual_returns = df_features.loc[X_test.index, 'return_1']
    
    # Strategy returns (shift signals to avoid look-ahead)
    strategy_returns = signals.shift(1) * actual_returns
    strategy_returns.dropna(inplace=True)

    # Buy and Hold baseline
    buyhold_returns = actual_returns.loc[strategy_returns.index]
    
    return strategy_returns, buyhold_returns

In [477]:
def evaluate(strategy_returns, buyhold_returns):
    # Equity curve
    equity_curve = (1 + strategy_returns).cumprod()
    # Buy and hold curve
    buyhold_curve = (1 + buyhold_returns).cumprod()
    
    # Constants
    bars_per_year = 365 * 24 * 2  # 17,520 for 30-min bars
    
    # Sharpe Ratio
    sharpe = strategy_returns.mean() / strategy_returns.std() * np.sqrt(bars_per_year)
    
    # Annual Return
    n_bars = len(strategy_returns)
    total_return = equity_curve.iloc[-1] - 1
    n_years = n_bars / bars_per_year
    annual_return = (1 + total_return) ** (1 / n_years) - 1
    
    # Max Drawdown
    cummax = equity_curve.cummax()
    drawdown = (equity_curve - cummax) / cummax
    max_drawdown = drawdown.min()
    
    # Win Rate
    winning_trades = (strategy_returns > 0).sum()
    total_trades = (strategy_returns != 0).sum()
    win_rate = winning_trades / total_trades
    
    # Buy-Hold metrics
    bh_total_return = buyhold_curve.iloc[-1] - 1
    bh_annual_return = (1 + bh_total_return) ** (1 / n_years) - 1
    bh_sharpe = buyhold_returns.mean() / buyhold_returns.std() * np.sqrt(bars_per_year)
    
    print(f"{'Metric':<20} {'Strategy':>12} {'Buy-Hold':>12}")
    print("-" * 46)
    print(f"{'Sharpe Ratio':<20} {sharpe:>12.2f} {bh_sharpe:>12.2f}")
    print(f"{'Annual Return':<20} {annual_return:>11.1%} {bh_annual_return:>11.1%}")
    print(f"{'Max Drawdown':<20} {max_drawdown:>11.1%} {'N/A':>12}")
    print(f"{'Win Rate':<20} {win_rate:>11.1%} {'N/A':>12}")


In [478]:
signals = generate_signal(X_test, y_pred)

Long signals:  6576
Short signals: 5751
Flat:          7669


In [479]:
strategy_returns, buyhold_returns = backtesting(df_features, X_test, signals)

In [480]:
evaluate(strategy_returns, buyhold_returns)

Metric                   Strategy     Buy-Hold
----------------------------------------------
Sharpe Ratio                 1.80         0.79
Annual Return             147.7%       36.3%
Max Drawdown              -38.7%          N/A
Win Rate                   49.4%          N/A
