Import libraries

In [1]:
!pip install pandas numpy xgboost scikit-learn jupyter talib

Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)


ERROR: Could not find a version that satisfies the requirement talib (from versions: none)
ERROR: No matching distribution found for talib


In [5]:
# Install TA-Lib separately: the earlier `pip install ... talib` found no
# matching distribution, and the import cell below needs the `talib` module.
!python -m pip install TA-Lib





In [6]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import talib
import warnings
warnings.filterwarnings('ignore')

In [7]:
class AdvancedBitcoinStrategy:
    def __init__(self, fee_rate=0.001):
        """Set up the strategy's fee and feature scaler.

        Args:
            fee_rate: Fraction subtracted from the return of each executed
                trade in ``rolling_predict`` (0.001 = 0.1%).
        """
        # Scaler is re-fitted on every training window in rolling_predict.
        self.scaler = StandardScaler()
        self.fee_rate = fee_rate
        
    def add_technical_indicators(self, df):
        """Return a copy of ``df`` extended with technical-indicator columns.

        The input is not modified. Indicators are computed with TA-Lib (using
        its default parameters wherever none are passed explicitly) and with
        pandas rolling/shift operations. Leading rows contain NaN for every
        indicator that needs a warm-up window.

        Args:
            df: Frame with at least ``High``, ``Low``, ``Close`` and
                ``Volume`` columns, ordered oldest row first.

        Returns:
            A new DataFrame with the original columns plus trend, momentum,
            volatility, volume, ratio and lag features.

        Raises:
            KeyError: If one of the required price/volume columns is missing.
        """
        df = df.copy()

        # TA-Lib only accepts float64 ("double") arrays; casting up front keeps
        # integer-typed columns (e.g. a volume column parsed as int) from raising.
        high = df['High'].to_numpy(dtype=np.float64)
        low = df['Low'].to_numpy(dtype=np.float64)
        close = df['Close'].to_numpy(dtype=np.float64)
        volume = df['Volume'].to_numpy(dtype=np.float64)

        # === TREND INDICATORS ===
        # Moving averages (column insertion order is kept stable because
        # select_features picks its "remaining" features by column position).
        for period in (5, 10, 20, 50, 100):
            df[f'sma_{period}'] = talib.SMA(close, timeperiod=period)
        for period in (5, 10, 20, 50):
            df[f'ema_{period}'] = talib.EMA(close, timeperiod=period)

        # MACD
        df['macd'], df['macd_signal'], df['macd_hist'] = talib.MACD(close)

        # ADX (trend strength) and directional indicators
        df['adx'] = talib.ADX(high, low, close, timeperiod=14)
        df['plus_di'] = talib.PLUS_DI(high, low, close, timeperiod=14)
        df['minus_di'] = talib.MINUS_DI(high, low, close, timeperiod=14)

        # === MOMENTUM INDICATORS ===
        df['rsi_14'] = talib.RSI(close, timeperiod=14)
        df['rsi_21'] = talib.RSI(close, timeperiod=21)

        df['stoch_k'], df['stoch_d'] = talib.STOCH(high, low, close)

        df['williams_r'] = talib.WILLR(high, low, close, timeperiod=14)

        df['cci'] = talib.CCI(high, low, close, timeperiod=14)

        # Rate of change; TA-Lib reports ROC in percent, not as a fraction.
        for period in (5, 10, 20):
            df[f'roc_{period}'] = talib.ROC(close, timeperiod=period)

        # === VOLATILITY INDICATORS ===
        # Bollinger Bands, plus band width and the close's position in the band
        df['bb_upper'], df['bb_middle'], df['bb_lower'] = talib.BBANDS(close)
        band_range = df['bb_upper'] - df['bb_lower']
        df['bb_width'] = band_range / df['bb_middle']
        # A zero band range yields inf/NaN here; prepare_features_targets
        # converts inf to NaN and drops those rows.
        df['bb_position'] = (close - df['bb_lower']) / band_range

        # Average True Range
        df['atr'] = talib.ATR(high, low, close, timeperiod=14)

        # === VOLUME INDICATORS ===
        df['volume_sma_10'] = talib.SMA(volume, timeperiod=10)
        df['volume_sma_20'] = talib.SMA(volume, timeperiod=20)

        # On Balance Volume
        df['obv'] = talib.OBV(close, volume)

        # Volume rate of change (percent, like the price ROC columns)
        df['volume_roc'] = talib.ROC(volume, timeperiod=10)

        # === PRICE PATTERNS ===
        # Price / moving-average ratios
        df['price_sma20_ratio'] = close / df['sma_20']
        df['price_sma50_ratio'] = close / df['sma_50']
        df['sma20_sma50_ratio'] = df['sma_20'] / df['sma_50']
        df['ema10_ema20_ratio'] = df['ema_10'] / df['ema_20']

        # Intrabar range ratios
        df['hl_ratio'] = high / low
        df['close_high_ratio'] = close / high
        df['close_low_ratio'] = close / low

        # === CUSTOM INDICATORS ===
        # Volatility as the rolling standard deviation of bar-to-bar RETURNS.
        # generate_signals compares volatility_10 against 0.1 ("10%"), which
        # is only meaningful for a fractional value; the std of the raw price
        # is in quote-currency units and would trip that filter on every bar.
        bar_returns = df['Close'].pct_change()
        for period in (5, 10, 20):
            df[f'volatility_{period}'] = bar_returns.rolling(period).std()

        # Price momentum: fractional change over the last N bars
        for period in (5, 10, 20):
            df[f'momentum_{period}'] = close / df['Close'].shift(period) - 1

        # Volume relative to its 10-bar average
        df['volume_momentum'] = volume / df['volume_sma_10']

        # === LAG FEATURES ===
        # Past values of the key indicators
        for lag in [1, 2, 3, 5, 7]:
            df[f'rsi_lag_{lag}'] = df['rsi_14'].shift(lag)
            df[f'macd_lag_{lag}'] = df['macd'].shift(lag)
            df[f'momentum_lag_{lag}'] = df['momentum_5'].shift(lag)

        return df
    
    def prepare_features_targets(self, df):
        """Prepare features and targets"""
        df = df.copy()
        df['Open time'] = pd.to_datetime(df['Open time'])
        df = df.sort_values('Open time')
        df.set_index('Open time', inplace=True)
        
        # Drop unnecessary columns
        df.drop(columns=['Close time', 'Ignore'], inplace=True, errors='ignore')
        
        # Add technical indicators
        df = self.add_technical_indicators(df)
        
        # Create target returns
        df['target_return_1'] = df['Close'].pct_change(-1)
        df['target_return_3'] = (df['Close'].shift(-3) / df['Close']) - 1
        df['target_return_5'] = (df['Close'].shift(-5) / df['Close']) - 1
        df['target_return_7'] = (df['Close'].shift(-7) / df['Close']) - 1
        df['target_return_14'] = (df['Close'].shift(-14) / df['Close']) - 1
        
        # Clean data
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.dropna()
        
        # Cap extreme values
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            if 'return' in col or 'roc' in col or 'momentum' in col:
                df[col] = df[col].clip(-1, 1)  # Cap at ±100%
        
        return df
    
    def select_features(self, df):
        """Select the most important features"""
        # Exclude target columns and raw OHLCV
        exclude_cols = [
            'Open', 'High', 'Low', 'Close', 'Volume', 
            'Number of trades', 'Taker buy base asset volume', 
            'Taker buy quote asset volume'
        ] + [col for col in df.columns if col.startswith('target_')]
        
        features = [col for col in df.columns if col not in exclude_cols]
        
        # Prioritize most important technical indicators
        priority_features = [
            'rsi_14', 'macd', 'macd_signal', 'bb_position', 'bb_width',
            'adx', 'atr', 'cci', 'williams_r', 'stoch_k', 'stoch_d',
            'price_sma20_ratio', 'price_sma50_ratio', 'sma20_sma50_ratio',
            'ema10_ema20_ratio', 'momentum_5', 'momentum_10', 'volatility_10',
            'volume_momentum', 'obv', 'plus_di', 'minus_di'
        ]
        
        # Add lag features
        lag_features = [col for col in features if 'lag_' in col]
        
        # Combine priority and lag features
        selected_features = []
        for feat in priority_features + lag_features:
            if feat in features:
                selected_features.append(feat)
        
        # Add remaining features up to a limit
        remaining_features = [f for f in features if f not in selected_features]
        selected_features.extend(remaining_features[:20])  # Limit total features
        
        return selected_features
    
    def generate_signals(self, predictions, current_indicators):
        """Advanced signal generation with multiple filters"""
        signals = []
        
        for i, pred in enumerate(predictions):
            # Get current market conditions
            rsi = current_indicators.get('rsi_14', 50)
            bb_pos = current_indicators.get('bb_position', 0.5)
            adx = current_indicators.get('adx', 25)
            macd = current_indicators.get('macd', 0)
            macd_signal = current_indicators.get('macd_signal', 0)
            volatility = current_indicators.get('volatility_10', 0.02)
            
            # Signal strength based on prediction
            signal_strength = abs(pred)
            
            # Market condition filters
            trend_filter = True
            momentum_filter = True
            volatility_filter = True
            
            # 1. Trend Filter (ADX > 25 for trending markets)
            if adx < 20:
                trend_filter = False
            
            # 2. Momentum Filter (RSI not in extreme zones)
            if rsi > 80 and pred > 0:  # Don't buy when overbought
                momentum_filter = False
            elif rsi < 20 and pred < 0:  # Don't sell when oversold
                momentum_filter = False
            
            # 3. MACD Confirmation
            if pred > 0 and macd < macd_signal:  # Don't buy if MACD bearish
                momentum_filter = False
            elif pred < 0 and macd > macd_signal:  # Don't sell if MACD bullish
                momentum_filter = False
            
            # 4. Volatility Filter (avoid trading in extreme volatility)
            if volatility > 0.1:  # More than 10% daily volatility
                volatility_filter = False
            
            # 5. Bollinger Band Filter
            if bb_pos > 0.8 and pred > 0:  # Don't buy near upper band
                momentum_filter = False
            elif bb_pos < 0.2 and pred < 0:  # Don't sell near lower band
                momentum_filter = False
            
            # Generate final signal
            if (signal_strength > 0.02 and  # Minimum 2% predicted return
                trend_filter and momentum_filter and volatility_filter):
                signal = np.sign(pred)
            else:
                signal = 0  # No trade
            
            signals.append(signal)
        
        return signals
    
    def rolling_predict(self, df, horizon=5, window_size=500):
        """Walk-forward backtest with an ensemble re-fitted at every bar.

        For each bar ``i`` the three models are fitted on a trailing window,
        their averaged prediction (capped at +/-20%) is passed through
        ``generate_signals``, and the trade is scored against the realised
        ``horizon``-bar forward return of that same bar.

        The training window ends at row ``i - horizon``: that is the last row
        whose target (``Close[j + horizon] / Close[j] - 1``) is already known
        when bar ``i`` closes. Training on later rows would leak future
        prices into the model.

        Args:
            df: Frame produced by ``prepare_features_targets``; must contain
                ``target_return_{horizon}``.
            horizon: Holding period in bars; selects the target column.
            window_size: Maximum number of training rows per fit.

        Returns:
            DataFrame with one row per evaluated bar and the columns ``date``,
            ``predicted_return``, ``actual_return``, ``net_return``,
            ``signal``, ``rsi``, ``macd`` and ``bb_position``. Empty if no
            bar had at least 100 usable training rows.

        Raises:
            KeyError: If ``target_return_{horizon}`` is not a column of ``df``.
        """
        # NOTE(review): a prediction is emitted on every bar, so for
        # horizon > 1 consecutive rows hold overlapping multi-bar returns.
        # compute_metrics compounds them as if they were sequential, which
        # overstates both return and risk -- confirm whether trades should
        # instead be spaced `horizon` bars apart.
        target_col = f'target_return_{horizon}'

        features = self.select_features(df)
        print(f"Using {len(features)} features for prediction")

        # Ensemble: two differently tuned XGBoost models and a random forest
        models = [
            XGBRegressor(
                n_estimators=100, max_depth=4, learning_rate=0.1,
                subsample=0.8, colsample_bytree=0.8, random_state=42,
                reg_alpha=0.1, reg_lambda=0.1
            ),
            XGBRegressor(
                n_estimators=150, max_depth=3, learning_rate=0.05,
                subsample=0.9, colsample_bytree=0.9, random_state=123,
                reg_alpha=0.05, reg_lambda=0.05
            ),
            RandomForestRegressor(
                n_estimators=100, max_depth=5, random_state=42,
                min_samples_split=10, min_samples_leaf=5
            )
        ]

        predictions = []
        last_index = len(df) - horizon
        for i in range(window_size, last_index):
            if i % 100 == 0:
                print(f"Processing sample {i}/{last_index}")

            # Purged training window: only rows whose target is observable at
            # bar i. Clamping to 0 keeps a negative bound from being read as
            # "count from the end" by iloc.
            train_end = max(i - horizon + 1, 0)
            train_df = df.iloc[max(train_end - window_size, 0):train_end]
            test_row = df.iloc[i]

            X_train = train_df[features]
            y_train = train_df[target_col]

            # Drop training rows with any missing feature or target
            valid_idx = ~(X_train.isna().any(axis=1) | y_train.isna())
            X_train = X_train[valid_idx]
            y_train = y_train[valid_idx]

            if len(X_train) < 100:
                continue

            x_test = test_row[features].to_numpy(dtype=float).reshape(1, -1)

            # The scaler is fitted on the training window only
            X_train_scaled = self.scaler.fit_transform(X_train)
            X_test_scaled = self.scaler.transform(x_test)

            # XGBoost models use the unscaled features, the random forest the
            # standardized ones.
            ensemble_predictions = []
            for model in models:
                if isinstance(model, XGBRegressor):
                    model.fit(X_train, y_train)
                    pred = model.predict(x_test)[0]
                else:
                    model.fit(X_train_scaled, y_train)
                    pred = model.predict(X_test_scaled)[0]
                ensemble_predictions.append(pred)

            predicted_return = np.mean(ensemble_predictions)
            predicted_return = np.clip(predicted_return, -0.2, 0.2)  # Cap at ±20%

            # Realised outcome of THIS bar's decision: the forward return that
            # starts at bar i, i.e. exactly the quantity that was predicted.
            actual_return = test_row[target_col]

            current_indicators = test_row[features].to_dict()
            signal = self.generate_signals([predicted_return], current_indicators)[0]

            # A trade earns the signed forward return minus the fee; staying
            # flat earns nothing.
            if signal != 0:
                net_return = signal * actual_return - self.fee_rate
            else:
                net_return = 0

            predictions.append({
                'date': df.index[i],
                'predicted_return': predicted_return,
                'actual_return': actual_return,
                'net_return': net_return,
                'signal': signal,
                'rsi': current_indicators.get('rsi_14', 50),
                'macd': current_indicators.get('macd', 0),
                'bb_position': current_indicators.get('bb_position', 0.5)
            })

        return pd.DataFrame(predictions)
    
    def compute_metrics(self, results_df):
        """Comprehensive metrics calculation"""
        if len(results_df) == 0:
            return {"Error": "No predictions generated"}
        
        # Calculate cumulative returns
        results_df['cumulative_return'] = (1 + results_df['net_return']).cumprod()
        total_return = results_df['cumulative_return'].iloc[-1] - 1
        
        # Time-based metrics
        days = (results_df['date'].iloc[-1] - results_df['date'].iloc[0]).days
        annual_return = (1 + total_return) ** (365 / days) - 1
        
        # Risk metrics
        daily_returns = results_df['net_return']
        volatility = daily_returns.std() * np.sqrt(365)
        sharpe_ratio = (annual_return - 0.02) / volatility if volatility > 0 else 0  # Assume 2% risk-free rate
        
        # Drawdown
        cumulative = results_df['cumulative_return']
        running_max = cumulative.expanding().max()
        drawdown = (cumulative - running_max) / running_max
        max_drawdown = drawdown.min()
        
        # Trading statistics
        win_rate = (daily_returns > 0).sum() / len(daily_returns)
        num_trades = (results_df['signal'] != 0).sum()
        
        # Profit factor
        winning_trades = daily_returns[daily_returns > 0].sum()
        losing_trades = abs(daily_returns[daily_returns < 0].sum())
        profit_factor = winning_trades / losing_trades if losing_trades > 0 else np.inf
        
        # Sortino ratio (downside deviation)
        downside_returns = daily_returns[daily_returns < 0]
        downside_std = downside_returns.std() * np.sqrt(365) if len(downside_returns) > 0 else 0
        sortino_ratio = (annual_return - 0.02) / downside_std if downside_std > 0 else 0
        
        # Calmar ratio
        calmar_ratio = annual_return / abs(max_drawdown) if max_drawdown < 0 else 0
        
        return {
            "Total Return": f"{total_return*100:.2f}%",
            "Annualized Return": f"{annual_return*100:.2f}%",
            "Volatility (Annual)": f"{volatility*100:.2f}%",
            "Sharpe Ratio": round(sharpe_ratio, 3),
            "Sortino Ratio": round(sortino_ratio, 3),
            "Calmar Ratio": round(calmar_ratio, 3),
            "Max Drawdown": f"{max_drawdown*100:.2f}%",
            "Win Rate": f"{win_rate*100:.2f}%",
            "Total Trades": int(num_trades),
            "Trades per Month": round(num_trades / (days/30), 1),
            "Profit Factor": round(profit_factor, 2),
            "Best Day": f"{daily_returns.max()*100:.2f}%",
            "Worst Day": f"{daily_returns.min()*100:.2f}%",
        }


In [9]:
def run_advanced_strategy(df_path, horizons=(3, 5, 7, 14), fee_rate=0.001):
    """Backtest the strategy on a candle CSV for several holding periods.

    Args:
        df_path: Path of the CSV file to load with ``pd.read_csv``.
        horizons: Holding periods (in bars) to test. Each needs a matching
            ``target_return_{h}`` column, which ``prepare_features_targets``
            creates for 1, 3, 5, 7 and 14; any other value is reported as an
            error for that horizon.
        fee_rate: Per-trade fee passed to ``AdvancedBitcoinStrategy``.

    Returns:
        Dict keyed by ``"{h}_day"``. A successful horizon maps to
        ``{'metrics': dict, 'predictions': DataFrame}``; a failed one (an
        exception, or no predictions generated) maps to ``{'Error': str}``,
        so consumers can rely on ``'metrics'`` always being a full metrics
        dict.
    """
    raw_df = pd.read_csv(df_path)

    strategy = AdvancedBitcoinStrategy(fee_rate=fee_rate)

    print("Preparing features and technical indicators...")
    features_df = strategy.prepare_features_targets(raw_df)
    print(f"Data shape after feature engineering: {features_df.shape}")

    results = {}
    for horizon in horizons:
        key = f"{horizon}_day"
        print(f"\n=== Testing {horizon}-day horizon ===")
        try:
            predictions = strategy.rolling_predict(features_df, horizon=horizon)
            metrics = strategy.compute_metrics(predictions)
        except Exception as e:
            # Best effort: one failing horizon must not abort the others.
            print(f"Error with {horizon}-day horizon: {e}")
            results[key] = {"Error": str(e)}
            continue

        # compute_metrics signals "nothing to evaluate" with an Error entry;
        # record it like any other failure instead of as real metrics.
        if "Error" in metrics:
            print(f"Error with {horizon}-day horizon: {metrics['Error']}")
            results[key] = {"Error": metrics["Error"]}
            continue

        results[key] = {
            'metrics': metrics,
            'predictions': predictions
        }

        print(f"Results for {horizon}-day strategy:")
        for name, value in metrics.items():
            print(f"  {name}: {value}")

    return results

In [None]:
# Run the backtest for every horizon on the daily BTC candle file. The models
# are re-fitted at each step of the rolling loop, so this cell runs for a while.
results = run_advanced_strategy("btc_1d_data_2018_to_2025.csv")

Preparing features and technical indicators...
Data shape after feature engineering: (2592, 77)

=== Testing 3-day horizon ===
Using 57 features for prediction
Processing sample 500/2589
Processing sample 600/2589
Processing sample 700/2589
Processing sample 800/2589
Processing sample 900/2589
Processing sample 1000/2589
Processing sample 1100/2589
Processing sample 1200/2589


In [None]:
# Pick the horizon with the highest total return. Entries without usable
# metrics (failed horizons, or runs that produced no predictions) are skipped
# instead of raising a KeyError.
best_strategy = None
best_return = float('-inf')
for strategy_name, result in results.items():
    metrics = result.get('metrics')
    if not metrics or 'Total Return' not in metrics:
        continue
    # "Total Return" is stored as a formatted string such as "12.34%"
    total_return = float(metrics['Total Return'].replace('%', ''))
    if total_return > best_return:
        best_return = total_return
        best_strategy = strategy_name

if best_strategy is None:
    print("\nNo strategy produced usable metrics")
else:
    print(f"\nBest performing strategy: {best_strategy} with {best_return:.2f}% return")