# 🏆 CRYPTO PIPELINE V5 - STREAMLINED NOTEBOOK
## Numerai Cryptocurrency Prediction with Selected Models

**Features:**
- H2O AutoML with Sparkling Water
- AutoGluon TimeSeries for temporal predictions
- SynapseML LightGBM with advanced hyperparameters
- GPU acceleration and comprehensive feature engineering
- Real-time submission file generation

**Performance Targets:**
- RMSE < 0.19
- Correlation > 0.50
- MMC > 0.20

In [1]:
# Environment setup
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# GPU Configuration for maximum performance
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
os.environ['CUPY_GPU_MEMORY_LIMIT'] = '20480'  # 20GB per GPU
os.environ['RAPIDS_NO_INITIALIZE'] = '1'

# Temp directory configuration - use external drive
os.environ['TMPDIR'] = '/media/knight2/EDB/tmp'
os.environ['TEMP'] = '/media/knight2/EDB/tmp'
os.environ['TMP'] = '/media/knight2/EDB/tmp'

# Create temp directory
import pathlib
pathlib.Path('/media/knight2/EDB/tmp').mkdir(parents=True, exist_ok=True)

print("✅ Environment configured for dual GPU processing")

✅ Environment configured for dual GPU processing


In [11]:
# Import libraries
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import time
import json
import logging
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr
import gc

# Add utils to path
sys.path.append('/media/knight2/EDB/repos/Numer_crypto_v4/utils')
sys.path.append('/media/knight2/EDB/repos/Numer_crypto_v4/scripts/data')
sys.path.append('/media/knight2/EDB/repos/Numer_crypto_v4/scripts')
print("📦 Core libraries imported")

📦 Core libraries imported


In [2]:
import polars as pl


## Data Import en EDA

In [None]:
today = 20250814

### Equity identifier

In [25]:
company = pl.read_csv('/media/knight2/EDB/numer_crypto_temp/data/raw/identifier/ticker_mapping_20250814.csv')

In [26]:
print(company.columns)
print(len(company['yahoo_ticker'].unique()))

['numerai_ticker', 'yahoo_ticker']
20734


### Equity

In [3]:
equity = pl.read_parquet(f'/media/knight2/EDB/numer_crypto_temp/data/raw/equity/equity_data_{today}.parquet')

In [8]:
print(equity.columns)
print(equity['yahoo_ticker'].unique().head(3))
print(len(equity['yahoo_ticker'].unique()))

['date', 'open', 'high', 'low', 'close', 'volume', 'dividends', 'stock splits', 'numerai_ticker', 'yahoo_ticker', 'capital gains']
shape: (3,)
Series: 'yahoo_ticker' [str]
[
	"TRNO"
	"LAZR"
	"Ayaly"
]
12168


In [12]:
# Create a DataFrame from the unique values
df = pl.DataFrame({
    "yahoo_ticker": equity["yahoo_ticker"].unique().to_list()
})

# Save to CSV
df.write_csv(f"/media/knight2/EDB/numer_crypto_temp/data/result/equity_tickers_succes_{today}.csv")


### Crypto identifier

In [21]:
crypto = pl.read_csv('/media/knight2/EDB/numer_crypto_temp/data/raw/identifier/Cryptos_20250707.csv')

In [23]:
print(crypto.columns)
print(len(crypto['symbol'].unique()))

['symbol', 'name', 'cryptos_train', 'cryptos_elig', 'coinmcap_rank_202507', 'platform', 'platform_symbol', 'prijs_vanaf', 'prijs_tot', 'in_all_currencie', 'data_ohlc_vanaf', 'data_ohlc_tot', 'volume_vanaf', 'volume_tot', 'uitstaand_vanaf', 'uitstaand_tot_202507', 'bron_uitstaand', 'api_rank', 'bron_historie', 'mcap_2025', 'top50', 'top_50_150_202507', 'top_150_250_202507', 'top_250_500_202507', 'top_500_750_202507', 'top_750_1000_202507', 'top_1000_1500_202507', 'top_1500_2000_202507', 'top_2000_10000_202507']
1742


### Crypto Price

In [13]:
price = pl.read_parquet(f'/media/knight2/EDB/numer_crypto_temp/data/processed/price/crypto_numerai_all_symbols_live_train_cleaned_{today}.parquet')

In [18]:
print(price.columns)
print(price['symbol'].unique().head(3))
print(len(price['source'].unique()))
print(price['source'].unique())
sources = price['source'].unique().sort().to_list()
for s in sources:
    print(s)
print(len(price['symbol'].unique()))

['symbol', 'name', 'date', 'open', 'high', 'low', 'close', 'volume', 'marketcap', 'source']
shape: (3,)
Series: 'symbol' [str]
[
	"PIT"
	"CTT"
	"BHC"
]
19
shape: (19,)
Series: 'source' [str]
[
	"binance"
	"kraken"
	"htx"
	"cryptocompare_api"
	"combined"
	…
	"bitfinex"
	"mobula"
	"dexscreener"
	"coinbase"
	"combined_sources"
]
binance
binance_api
bitfinex
bithumb
bybit
coinbase
coingecko
combined
combined_sources
cryptocompare
cryptocompare_api
dexscreener
gate
htx
kaggle_unified_crypto_data
kraken
kucoin
lbank
mobula
1739


### Equity feature selection

In [None]:
crossasset = pl.read_parquet(f'/media/knight2/EDB/numer_crypto_temp/data/processed/equity/selection/equity_selection_{today}.parquet')

### Cross (crypto) asset

In [None]:
crossasset = pl.read_parquet(f'')

### Yiedl

In [None]:
current_round = 

In [None]:
yiedl = pl.read_parquet(f'')

In [3]:
# Model-specific imports
model_status = {}

# H2O Sparkling Water
try:
    import h2o
    from h2o.automl import H2OAutoML
    model_status['H2O'] = True
    print("✅ H2O Sparkling Water available")
except ImportError:
    model_status['H2O'] = False
    print("⚠️ H2O not available")

# AutoGluon TimeSeries
try:
    from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
    model_status['AutoGluonTS'] = True
    print("✅ AutoGluon TimeSeries available")
except ImportError:
    model_status['AutoGluonTS'] = False
    print("⚠️ AutoGluon TimeSeries not available")

# SynapseML LightGBM
try:
    import lightgbm as lgb
    model_status['LightGBM'] = True
    print("✅ LightGBM available")
except ImportError:
    model_status['LightGBM'] = False
    print("⚠️ LightGBM not available")

# Data loading utilities
try:
    from data_preparation import ComprehensiveDataPreparation #ModularDataPreparation
    print("✅ Feature generators available")
except ImportError:
    print("⚠️ Feature generators not available")
# Data loading utilities
try:
    from real_data_features import RealDataFeatureGenerator
    print("✅ Feature generators available")
except ImportError:
    print("⚠️ Feature generators not available")

print(f"\n📊 Available models: {[k for k, v in model_status.items() if v]}")

✅ H2O Sparkling Water available
✅ AutoGluon TimeSeries available
✅ LightGBM available
✅ Feature generators available
✅ Feature generators available

📊 Available models: ['H2O', 'AutoGluonTS', 'LightGBM']


In [4]:
# Configuration
class Config:
    def __init__(self):
        self.timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        
        # Directories
        self.base_dir = Path("/media/knight2/EDB/numer_crypto_temp/data")
        self.submission_dir = self.base_dir / "submission"
        self.model_dir = self.base_dir / "model"
        self.repo_models_dir = Path("/media/knight2/EDB/repos/Numer_crypto_v4/models")
        
        # Create directories
        for dir_path in [self.submission_dir, self.model_dir, self.repo_models_dir]:
            dir_path.mkdir(parents=True, exist_ok=True)
        
        # Performance targets
        self.target_rmse = 0.19
        self.target_corr = 0.50
        self.target_mmc = 0.20
        
        # Model configurations
        self.model_configs = {
            'H2O': {'time_limit': 1800},  # 30 minutes
            'AutoGluonTS': {'time_limit': 1800},
            'LightGBM': {'time_limit': 1200}  # 20 minutes
        }

config = Config()
print(f"⚙️ Configuration initialized - Run ID: crypto_pipeline_{config.timestamp}")
print(f"📁 Submission dir: {config.submission_dir}")
print(f"💾 Model dir: {config.model_dir}")

⚙️ Configuration initialized - Run ID: crypto_pipeline_20250814_160231
📁 Submission dir: /media/knight2/EDB/numer_crypto_temp/data/submission
💾 Model dir: /media/knight2/EDB/numer_crypto_temp/data/model


In [5]:
# Setup logging
log_file = config.base_dir / "logs" / f"crypto_pipeline_notebook_{config.timestamp}.log"
log_file.parent.mkdir(exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

logger.info("🏆 Starting Crypto Pipeline V4 Notebook")
print(f"📝 Logging to: {log_file}")

2025-08-14 16:02:31,613 - INFO - 🏆 Starting Crypto Pipeline V4 Notebook


📝 Logging to: /media/knight2/EDB/numer_crypto_temp/data/logs/crypto_pipeline_notebook_20250814_160231.log


## 📊 Data Loading and Feature Engineering

In [6]:
from typing import List, Tuple, Optional
def load_real_data() -> Optional[pd.DataFrame]:
    """Load real historical price data"""
    print("📊 Loading real historical price data...")
    
    try:
        # Try primary parquet files first
        price_dir = "media/knight2/EDB/numer_crypto_temp/data/raw/price" #self.temp_dir / 
        parquet_files = list(price_dir.glob("*.parquet"))
        
        if parquet_files:
            price_df = pd.read_parquet(parquet_files[0])
            price_df['date'] = pd.to_datetime(price_df['date'])
            print(f"✅ Loaded {len(price_df):,} price records from parquet")
            return price_df
        
        # Fallback to CSV files
        csv_dir =  "/media/knight2/EDB/numer_crypto_temp/data/price/output" #self.temp_dir /
        if csv_dir.exists():
            csv_files = list(csv_dir.glob("*_historical.csv"))
            all_data = []
            
            for csv_file in csv_files:
                try:
                    df = pd.read_csv(csv_file)
                    if len(df) > 10000:  # Only cryptos with sufficient data
                        all_data.append(df)
                except Exception:
                    continue
            
            if all_data:
                combined_df = pd.concat(all_data, ignore_index=True)
                combined_df['date'] = pd.to_datetime(combined_df['date'])
                print(f"✅ Loaded {len(combined_df):,} CSV records")
                return combined_df

        # Prepare features and target
        feature_cols = [col for col in combined_df.columns if col not in ['symbol', 'target']]
        X = combined_df[['symbol'] + feature_cols]
        y = combined_df['target']
        
        # Remove any remaining NaN values
        mask = ~(X.isnull().any(axis=1) | y.isnull())
        X = X[mask].reset_index(drop=True)
        y = y[mask].reset_index(drop=True)
        
        load_time = time.time() - start_time
        logger.info(f"✅ Data loaded: {X.shape[0]} samples, {len(feature_cols)} features")
        logger.info(f"⏱️ Load time: {load_time:.1f} seconds")
        
        return X, y
        
    except Exception as e:
        print(f"❌ Failed to load price data: {str(e)}")
        return None

In [7]:
'''
def load_real_data():
    """Load real data with comprehensive feature engineering"""
    logger.info("📊 Loading real data with comprehensive features...")
    start_time = time.time()
    
    try:
        # Initialize feature generator
        feature_gen = RealDataFeatureGenerator()
        
        # Load comprehensive features
        df = feature_gen.generate_all_real_features()#generate_comprehensive_features()
        
        if df.empty:
            logger.error("❌ No data loaded from RealDataFeatureGenerator")
            return None, None
        
        # Prepare features and target
        feature_cols = [col for col in df.columns if col not in ['symbol', 'target']]
        X = df[['symbol'] + feature_cols]
        y = df['target']
        
        # Remove any remaining NaN values
        mask = ~(X.isnull().any(axis=1) | y.isnull())
        X = X[mask].reset_index(drop=True)
        y = y[mask].reset_index(drop=True)
        
        load_time = time.time() - start_time
        logger.info(f"✅ Data loaded: {X.shape[0]} samples, {len(feature_cols)} features")
        logger.info(f"⏱️ Load time: {load_time:.1f} seconds")
        
        return X, y
        
    except Exception as e:
        logger.error(f"❌ Data loading failed: {e}")
        return None, None
'''
# Load the data
X, y = load_real_data()

if X is not None and y is not None:
    print(f"\n📈 Data Summary:")
    print(f"   Samples: {len(X):,}")
    print(f"   Features: {len([col for col in X.columns if col != 'symbol']):,}")
    print(f"   Symbols: {X['symbol'].nunique():,}")
    print(f"   Target range: {y.min():.4f} to {y.max():.4f}")
    print(f"   Target mean: {y.mean():.4f} ± {y.std():.4f}")
else:
    print("❌ Failed to load data")


📊 Loading real historical price data...
❌ Failed to load price data: 'str' object has no attribute 'glob'


TypeError: cannot unpack non-iterable NoneType object

In [8]:
# Prepare data for training
if X is not None and y is not None:
    # Prepare features and split data
    feature_cols = [col for col in X.columns if col != 'symbol']
    X_features = X[feature_cols]
    symbols = X['symbol'].values
    
    # Train/test split with time series considerations
    X_train, X_test, y_train, y_test, symbols_train, symbols_test = train_test_split(
        X_features, y, symbols, test_size=0.3, random_state=42, shuffle=False
    )
    
    logger.info(f"✅ Data split: Train={len(X_train)}, Test={len(X_test)}, Features={len(feature_cols)}")
    
    print(f"\n🔄 Data Split Summary:")
    print(f"   Training samples: {len(X_train):,}")
    print(f"   Testing samples: {len(X_test):,}")
    print(f"   Features: {len(feature_cols):,}")
    
    # Display feature statistics
    print(f"\n📊 Feature Statistics:")
    print(f"   Training target mean: {y_train.mean():.4f} ± {y_train.std():.4f}")
    print(f"   Testing target mean: {y_test.mean():.4f} ± {y_test.std():.4f}")
    
else:
    print("❌ Cannot proceed without data")

NameError: name 'X' is not defined

## 🤖 Model Training

In [None]:
# Utility functions
def calculate_performance_metrics(y_true, y_pred):
    """Calculate comprehensive performance metrics"""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    correlation = spearmanr(y_true, y_pred)[0]
    
    # MMC estimation (simplified)
    mmc = np.corrcoef(y_true - y_true.mean(), y_pred - y_pred.mean())[0, 1] * 2 - 1
    mmc = max(-1, min(1, mmc))  # Clamp to [-1, 1]
    
    return {
        'rmse': rmse,
        'correlation': correlation,
        'mmc_estimate': mmc
    }

def check_performance_targets(metrics):
    """Check if performance meets targets"""
    return {
        'meets_targets': (
            metrics['rmse'] < config.target_rmse and
            metrics['correlation'] > config.target_corr and
            metrics['mmc_estimate'] > config.target_mmc
        ),
        'rmse_target': metrics['rmse'] < config.target_rmse,
        'corr_target': metrics['correlation'] > config.target_corr,
        'mmc_target': metrics['mmc_estimate'] > config.target_mmc
    }

def create_submission_file(predictions, symbols, model_name, metadata):
    """Create Numerai submission file"""
    submission_df = pd.DataFrame({
        'symbol': symbols,
        'signal': np.clip(predictions, 0.001, 0.999)  # Clip to valid range
    })
    
    submission_filename = f"{model_name.lower()}_submission_{config.timestamp}.csv"
    submission_path = config.submission_dir / submission_filename
    
    submission_df.to_csv(submission_path, index=False)
    
    # Save metadata
    metadata_filename = f"{model_name.lower()}_metadata_{config.timestamp}.json"
    metadata_path = config.submission_dir / metadata_filename
    
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2, default=str)
    
    return submission_path, metadata_path

print("🔧 Utility functions defined")

In [None]:
# H2O AutoML Training
def train_h2o_model():
    """Train H2O AutoML model"""
    if not model_status['H2O']:
        logger.warning("⚠️ H2O not available, skipping")
        return None, None, None
    
    logger.info("💧 Training H2O Sparkling Water AutoML...")
    start_time = time.time()
    
    try:
        # Initialize H2O
        h2o.init(nthreads=-1, max_mem_size="40G", strict_version_check=False)
        
        # Prepare data
        train_data = pd.concat([X_train, pd.Series(y_train, name='target')], axis=1)
        test_data = pd.concat([X_test, pd.Series(y_test, name='target')], axis=1)
        
        # Convert to H2O frames
        h2o_train = h2o.H2OFrame(train_data)
        h2o_test = h2o.H2OFrame(test_data)
        
        # Set target and features
        target = 'target'
        features = [col for col in h2o_train.columns if col != target]
        
        # Train AutoML
        aml = H2OAutoML(
            max_runtime_secs=config.model_configs['H2O']['time_limit'],
            seed=42,
            sort_metric='RMSE',
            exclude_algos=['DeepLearning'],  # Focus on faster algorithms
            max_models=20
        )
        
        aml.train(x=features, y=target, training_frame=h2o_train)
        
        # Make predictions
        predictions_h2o = aml.predict(h2o_test)
        y_pred_test = predictions_h2o.as_data_frame()['predict'].values
        
        # Calculate metrics
        metrics = calculate_performance_metrics(y_test, y_pred_test)
        targets_check = check_performance_targets(metrics)
        
        train_time = time.time() - start_time
        
        metadata = {
            'model_name': 'H2O',
            'test_rmse': float(metrics['rmse']),
            'test_correlation': float(metrics['correlation']),
            'mmc_estimate': float(metrics['mmc_estimate']),
            'train_time': train_time,
            'meets_targets': targets_check['meets_targets'],
            'best_model': str(aml.leader)
        }
        
        status = "✅ PASS" if targets_check['meets_targets'] else "⚠️ PARTIAL"
        logger.info(f"   {status} H2O: RMSE={metrics['rmse']:.4f}, Corr={metrics['correlation']:.4f}, MMC≈{metrics['mmc_estimate']:.4f}, Time={train_time:.1f}s")
        
        return aml, metadata, y_pred_test
        
    except Exception as e:
        logger.error(f"❌ H2O training failed: {e}")
        return None, None, None

# Train H2O model if available
if model_status['H2O'] and X is not None:
    h2o_model, h2o_metadata, h2o_predictions = train_h2o_model()
    
    if h2o_model is not None:
        # Create submission file
        submission_path, metadata_path = create_submission_file(
            h2o_predictions, symbols_test, 'H2O', h2o_metadata
        )
        print(f"📄 H2O submission: {submission_path.name}")
    else:
        print("❌ H2O training failed")
else:
    print("⚠️ H2O training skipped")
    h2o_model, h2o_metadata, h2o_predictions = None, None, None

In [None]:
# AutoGluon TimeSeries Training
def train_autogluon_timeseries():
    """Train AutoGluon TimeSeries model"""
    if not model_status['AutoGluonTS']:
        logger.warning("⚠️ AutoGluon TimeSeries not available, skipping")
        return None, None, None
    
    logger.info("📈 Training AutoGluon TimeSeriesPredictor...")
    start_time = time.time()
    
    try:
        # Prepare time series data (limited sample for efficiency)
        max_samples = min(10000, len(y_train))
        sample_indices = np.linspace(0, len(y_train)-1, max_samples, dtype=int)
        
        # Create time series dataframe
        ts_data = []
        for i, idx in enumerate(sample_indices):
            ts_data.append({
                'item_id': f'crypto_{i}',
                'timestamp': pd.Timestamp('2020-01-01') + pd.Timedelta(days=i),
                'target': y_train.iloc[idx]
            })
        
        train_ts = TimeSeriesDataFrame.from_data_frame(
            pd.DataFrame(ts_data),
            id_column='item_id',
            timestamp_column='timestamp'
        )
        
        # TimeSeries predictor
        ts_path = f'/media/knight2/EDB/tmp/autogluon_ts_{config.timestamp}'
        prediction_length = min(30, len(y_test))
        
        predictor = TimeSeriesPredictor(
            target='target',
            prediction_length=prediction_length,
            path=ts_path,
            eval_metric='RMSE',
            verbosity=1
        )
        
        # Train with optimized hyperparameters
        ts_hyperparameters = {
            "Naive": {},
            "SeasonalNaive": {},
            "ARIMA": {'maxiter': 50},
            "ETS": {'maxiter': 100},
            "Theta": {},
        }
        
        predictor.fit(
            train_ts,
            time_limit=config.model_configs['AutoGluonTS']['time_limit'],
            presets='best_quality',
            hyperparameters=ts_hyperparameters
        )
        
        # Make predictions
        predictions = predictor.predict(train_ts)
        y_pred_test = predictions['mean'].values[:len(y_test)]
        
        # Pad with mean if needed
        if len(y_pred_test) < len(y_test):
            mean_pred = np.mean(y_pred_test)
            padding = np.full(len(y_test) - len(y_pred_test), mean_pred)
            y_pred_test = np.concatenate([y_pred_test, padding])
        
        # Calculate metrics
        metrics = calculate_performance_metrics(y_test, y_pred_test)
        targets_check = check_performance_targets(metrics)
        
        train_time = time.time() - start_time
        
        metadata = {
            'model_name': 'AutoGluonTS',
            'test_rmse': float(metrics['rmse']),
            'test_correlation': float(metrics['correlation']),
            'mmc_estimate': float(metrics['mmc_estimate']),
            'train_time': train_time,
            'meets_targets': targets_check['meets_targets'],
            'prediction_length': prediction_length
        }
        
        status = "✅ PASS" if targets_check['meets_targets'] else "⚠️ PARTIAL"
        logger.info(f"   {status} AutoGluon TS: RMSE={metrics['rmse']:.4f}, Corr={metrics['correlation']:.4f}, MMC≈{metrics['mmc_estimate']:.4f}, Time={train_time:.1f}s")
        
        return predictor, metadata, y_pred_test
        
    except Exception as e:
        logger.error(f"❌ AutoGluon TS training failed: {e}")
        return None, None, None

# Train AutoGluon TS model if available
if model_status['AutoGluonTS'] and X is not None:
    agts_model, agts_metadata, agts_predictions = train_autogluon_timeseries()
    
    if agts_model is not None:
        # Create submission file
        submission_path, metadata_path = create_submission_file(
            agts_predictions, symbols_test, 'AutoGluonTS', agts_metadata
        )
        print(f"📄 AutoGluon TS submission: {submission_path.name}")
    else:
        print("❌ AutoGluon TS training failed")
else:
    print("⚠️ AutoGluon TS training skipped")
    agts_model, agts_metadata, agts_predictions = None, None, None

In [None]:
# SynapseML LightGBM Training
def train_lightgbm_model():
    """Train SynapseML LightGBM model with advanced hyperparameters"""
    if not model_status['LightGBM']:
        logger.warning("⚠️ LightGBM not available, skipping")
        return None, None, None
    
    logger.info("🔥 Training SynapseML LightGBM with advanced hyperparameters...")
    start_time = time.time()
    
    try:
        # Advanced LightGBM parameters optimized for crypto prediction
        lgb_params = {
            'objective': 'regression',
            'metric': 'rmse',
            'boosting_type': 'gbdt',
            'num_leaves': 127,
            'learning_rate': 0.05,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'min_child_samples': 20,
            'min_child_weight': 0.001,
            'reg_alpha': 0.1,
            'reg_lambda': 0.1,
            'random_state': 42,
            'n_jobs': -1,
            'verbose': -1,
            'device_type': 'gpu',  # SynapseML GPU acceleration
            'gpu_platform_id': 0,
            'gpu_device_id': 0,
            'max_bin': 255,
            'min_data_in_bin': 3,
            'path_smooth': 0.1
        }
        
        # Prepare datasets
        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
        
        # Train model with early stopping
        model = lgb.train(
            lgb_params,
            train_data,
            num_boost_round=2000,
            valid_sets=[train_data, val_data],
            valid_names=['train', 'eval'],
            early_stopping_rounds=50,
            verbose_eval=100
        )
        
        # Make predictions
        y_pred_test = model.predict(X_test, num_iteration=model.best_iteration)
        
        # Calculate metrics
        metrics = calculate_performance_metrics(y_test, y_pred_test)
        targets_check = check_performance_targets(metrics)
        
        train_time = time.time() - start_time
        
        # Feature importance
        feature_importance = pd.DataFrame({
            'feature': feature_cols,
            'importance': model.feature_importance()
        }).sort_values('importance', ascending=False)
        
        metadata = {
            'model_name': 'LightGBM',
            'test_rmse': float(metrics['rmse']),
            'test_correlation': float(metrics['correlation']),
            'mmc_estimate': float(metrics['mmc_estimate']),
            'train_time': train_time,
            'meets_targets': targets_check['meets_targets'],
            'best_iteration': int(model.best_iteration),
            'num_features': len(feature_cols),
            'top_features': feature_importance.head(10)['feature'].tolist()
        }
        
        # Save model
        model_filename = f"lightgbm_model_{config.timestamp}.pkl"
        model_path = config.model_dir / model_filename
        model_path_repo = config.repo_models_dir / model_filename
        
        joblib.dump(model, model_path)
        joblib.dump(model, model_path_repo)
        
        status = "✅ PASS" if targets_check['meets_targets'] else "⚠️ PARTIAL"
        logger.info(f"   {status} LightGBM: RMSE={metrics['rmse']:.4f}, Corr={metrics['correlation']:.4f}, MMC≈{metrics['mmc_estimate']:.4f}, Time={train_time:.1f}s")
        logger.info(f"   💾 Model saved: {model_filename}")
        
        return model, metadata, y_pred_test
        
    except Exception as e:
        logger.error(f"❌ LightGBM training failed: {e}")
        return None, None, None

# Train LightGBM model if available
if model_status['LightGBM'] and X is not None:
    lgb_model, lgb_metadata, lgb_predictions = train_lightgbm_model()
    
    if lgb_model is not None:
        # Create submission file
        submission_path, metadata_path = create_submission_file(
            lgb_predictions, symbols_test, 'LightGBM', lgb_metadata
        )
        print(f"📄 LightGBM submission: {submission_path.name}")
    else:
        print("❌ LightGBM training failed")
else:
    print("⚠️ LightGBM training skipped")
    lgb_model, lgb_metadata, lgb_predictions = None, None, None

## 📊 Results Summary and Ensemble

In [None]:
# Collect results
all_results = {}
all_predictions = []
successful_models = []

# Add H2O results
if h2o_model is not None and h2o_predictions is not None:
    all_results['H2O'] = h2o_metadata
    all_predictions.append(h2o_predictions)
    successful_models.append('H2O')

# Add AutoGluon TS results
if agts_model is not None and agts_predictions is not None:
    all_results['AutoGluonTS'] = agts_metadata
    all_predictions.append(agts_predictions)
    successful_models.append('AutoGluonTS')

# Add LightGBM results
if lgb_model is not None and lgb_predictions is not None:
    all_results['LightGBM'] = lgb_metadata
    all_predictions.append(lgb_predictions)
    successful_models.append('LightGBM')

print(f"\n🎯 RESULTS SUMMARY:")
print("=" * 50)

if all_results:
    for model_name, metadata in all_results.items():
        status = "✅ PASS" if metadata['meets_targets'] else "⚠️ PARTIAL"
        print(f"{status} {model_name}:")
        print(f"   RMSE: {metadata['test_rmse']:.4f} (target: <{config.target_rmse})")
        print(f"   Correlation: {metadata['test_correlation']:.4f} (target: >{config.target_corr})")
        print(f"   MMC: {metadata['mmc_estimate']:.4f} (target: >{config.target_mmc})")
        print(f"   Training Time: {metadata['train_time']:.1f}s")
        print()
    
    # Performance summary
    best_rmse = min([r['test_rmse'] for r in all_results.values()])
    best_corr = max([r['test_correlation'] for r in all_results.values()])
    best_mmc = max([r['mmc_estimate'] for r in all_results.values()])
    
    print(f"🏆 BEST PERFORMANCE:")
    print(f"   Best RMSE: {best_rmse:.4f}")
    print(f"   Best Correlation: {best_corr:.4f}")
    print(f"   Best MMC: {best_mmc:.4f}")
    
else:
    print("❌ No successful models to report")

In [None]:
# Create ensemble if multiple models are available
if len(all_predictions) > 1:
    logger.info(f"🎭 Creating ensemble from {len(all_predictions)} models...")
    
    # Simple average ensemble
    ensemble_predictions = np.mean(all_predictions, axis=0)
    
    # Calculate ensemble metrics
    ensemble_metrics = calculate_performance_metrics(y_test, ensemble_predictions)
    ensemble_targets = check_performance_targets(ensemble_metrics)
    
    # Create ensemble metadata
    ensemble_metadata = {
        'model_name': 'Ensemble',
        'test_rmse': float(ensemble_metrics['rmse']),
        'test_correlation': float(ensemble_metrics['correlation']),
        'mmc_estimate': float(ensemble_metrics['mmc_estimate']),
        'meets_targets': ensemble_targets['meets_targets'],
        'component_models': successful_models,
        'ensemble_method': 'simple_average'
    }
    
    # Create ensemble submission
    submission_path, metadata_path = create_submission_file(
        ensemble_predictions, symbols_test, 'Ensemble', ensemble_metadata
    )
    
    status = "✅ PASS" if ensemble_targets['meets_targets'] else "⚠️ PARTIAL"
    print(f"\n{status} ENSEMBLE ({len(successful_models)} models):")
    print(f"   RMSE: {ensemble_metrics['rmse']:.4f}")
    print(f"   Correlation: {ensemble_metrics['correlation']:.4f}")
    print(f"   MMC: {ensemble_metrics['mmc_estimate']:.4f}")
    print(f"   📄 Ensemble submission: {submission_path.name}")
    
else:
    print("⚠️ Ensemble not created (need 2+ models)")

In [None]:
# Final summary
print(f"\n🎉 CRYPTO PIPELINE V4 COMPLETED!")
print("=" * 50)
print(f"📊 Successful Models: {len(successful_models)}/{len(model_status)}")
print(f"🏆 Models: {', '.join(successful_models) if successful_models else 'None'}")
print(f"📁 Submission Directory: {config.submission_dir}")
print(f"💾 Model Directory: {config.model_dir}")
print(f"📝 Log File: {log_file}")

if successful_models:
    # List all generated files
    submission_files = list(config.submission_dir.glob(f"*_{config.timestamp}.csv"))
    print(f"\n📄 Generated Submission Files:")
    for file in submission_files:
        print(f"   • {file.name}")
    
    print(f"\n✅ Pipeline completed successfully!")
    print(f"   Ready for Numerai submission: {len(submission_files)} files")
else:
    print(f"\n❌ Pipeline failed - no successful models")

logger.info(f"🏆 Crypto Pipeline V4 notebook completed - {len(successful_models)} successful models")