<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/crypto_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==================== COMPLETE MODEL RETRAINING SCRIPT WITH EARLY STOPPING ====================
import sqlite3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from ta.trend import MACD
from ta.volatility import BollingerBands, AverageTrueRange
from ta.volume import on_balance_volume
from ta.momentum import RSIIndicator
import warnings
warnings.filterwarnings("ignore")

# Import Keras Callbacks
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Import imblearn for SMOTE and RUS
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter

# Database configurations for NEW models (data from 2023 onwards).
# Every entry shares the same Drive folder and naming scheme; only the ticker
# and the SQLite file name vary (the ETH database has no ticker suffix).
_LOG_DIR = '/content/gdrive/MyDrive/TradingBotLogs'
_RECENT_DATA_SOURCES = (
    ('SOL', 'ohlcv_data_SOL.db'),
    ('LDO', 'ohlcv_data_LDO.db'),
    ('TAO', 'ohlcv_data_TAO.db'),
    ('ETH', 'ohlcv_data.db'),
    ('BTC', 'ohlcv_data_BTC.db'),
)
db_configs_recent_data = [
    {
        'db_path': f'{_LOG_DIR}/{db_file}',
        'table_name': f'{ticker.lower()}usd_1h_data',
        'symbol': f'{ticker}/USD',
        'model_path': f'{_LOG_DIR}/crypto_model_retrained_recent_data_{ticker}.keras',
    }
    for ticker, db_file in _RECENT_DATA_SOURCES
]

def load_sqlite_data(symbol_config):
    """Load one symbol's table from SQLite as a timestamp-indexed DataFrame.

    Args:
        symbol_config: Mapping with at least ``'db_path'`` (path of the SQLite
            file) and ``'table_name'`` (table to read). The table must have a
            ``timestamp`` column.

    Returns:
        A DataFrame holding every column of the table, sorted by ``timestamp``
        and indexed by it. The index is timezone-aware in America/New_York
        (naive timestamps are interpreted as UTC first), and rows sharing a
        timestamp are collapsed to the last one.

    Raises:
        ValueError: If the table contains no rows.
    """
    db_path = symbol_config['db_path']
    table_name = symbol_config['table_name']

    conn = sqlite3.connect(db_path)
    try:
        # The table name comes from the in-file configuration; SQL identifiers
        # cannot be bound as query parameters, hence the f-string.
        query = f"SELECT * FROM {table_name} ORDER BY timestamp"
        df = pd.read_sql_query(query, conn, parse_dates=['timestamp'])
    finally:
        # Release the connection even when the query itself fails.
        conn.close()

    if df.empty:
        raise ValueError(f"No data in {db_path}/{table_name}")

    timestamps = df['timestamp']
    if timestamps.dt.tz is None:
        # Naive values are treated as UTC before converting to local time.
        timestamps = timestamps.dt.tz_localize('UTC')
    df['timestamp'] = timestamps.dt.tz_convert('America/New_York')

    df.set_index('timestamp', inplace=True)
    df = df[~df.index.duplicated(keep='last')]
    return df

def calculate_technical_indicators(df):
    """Return a copy of ``df`` with indicator columns added and NaN rows dropped.

    The OHLCV columns are forward-filled first. The columns ``RSI``, ``MACD``,
    ``MACD_Signal``, ``BB_Upper``, ``BB_Lower``, ``OBV`` and ``ATR`` are then
    appended in that order, and every row that still contains a NaN is removed.
    The input frame itself is left untouched.
    """
    ohlcv_cols = ['open', 'high', 'low', 'close', 'volume']

    enriched = df.copy()
    enriched[ohlcv_cols] = enriched[ohlcv_cols].ffill()

    close = enriched['close']

    # Momentum: 14-period RSI.
    rsi_calc = RSIIndicator(close, window=14)
    enriched['RSI'] = rsi_calc.rsi()

    # Trend: MACD line and its signal line (12/26/9).
    macd_calc = MACD(close, window_slow=26, window_fast=12, window_sign=9)
    enriched['MACD'] = macd_calc.macd()
    enriched['MACD_Signal'] = macd_calc.macd_signal()

    # Volatility envelope: 20-period Bollinger Bands, 2 standard deviations.
    bands = BollingerBands(close, window=20, window_dev=2)
    enriched['BB_Upper'] = bands.bollinger_hband()
    enriched['BB_Lower'] = bands.bollinger_lband()

    # Volume flow and 14-period average true range.
    enriched['OBV'] = on_balance_volume(close, enriched['volume'])
    atr_calc = AverageTrueRange(enriched['high'], enriched['low'], close, window=14)
    enriched['ATR'] = atr_calc.average_true_range()

    return enriched.dropna()

def create_model(input_shape):
    """Build and compile the CNN-LSTM classifier.

    Args:
        input_shape: Shape of one input sample, passed to the first
            convolution layer (the caller supplies ``(n_steps, n_features)``).

    Returns:
        A compiled ``Sequential`` model: two Conv1D/MaxPooling1D stages, two
        stacked LSTM layers, a dense layer, and a 3-way softmax output, with a
        0.3 dropout after each recurrent/dense stage. It is compiled with Adam
        (learning rate 0.001), categorical cross-entropy and accuracy.
    """
    net = Sequential()

    # Convolutional feature extraction over the time axis.
    net.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape))
    net.add(MaxPooling1D(pool_size=2))
    net.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
    net.add(MaxPooling1D(pool_size=2))

    # Recurrent stack; the first LSTM keeps the sequence for the second one.
    net.add(LSTM(100, return_sequences=True))
    net.add(Dropout(0.3))
    net.add(LSTM(50))
    net.add(Dropout(0.3))

    # Classification head.
    net.add(Dense(50, activation='relu'))
    net.add(Dropout(0.3))
    net.add(Dense(3, activation='softmax'))  # 3 classes: Hold, Buy, Sell

    optimizer = Adam(learning_rate=0.001)
    net.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return net

def prepare_training_data(df, look_back=72):
    """Turn an OHLCV frame into flattened sliding windows and one-hot labels.

    Indicators are computed first. Each sample is the ``look_back`` rows of
    features that precede row ``i``; its label is derived from the close-to-close
    return between the last row of the window (``i - 1``) and row ``i``:
    above +0.2% is Buy, below -0.2% is Sell, anything else is Hold.

    Args:
        df: Frame with ``open``, ``high``, ``low``, ``close`` and ``volume``
            columns.
        look_back: Number of consecutive rows per window; must be at least 1.

    Returns:
        A 4-tuple ``(X_2d, y, n_steps, n_features)``:
        ``X_2d`` has shape ``(n_samples, look_back * n_features)`` (flattened so
        that resamplers needing 2-D input can consume it), ``y`` has shape
        ``(n_samples, 3)`` with columns ``[Hold, Buy, Sell]``, and
        ``n_steps``/``n_features`` are what is needed to reshape ``X_2d`` back
        to 3-D. When there are not enough rows for a single window,
        ``n_samples`` is 0 and the arrays are empty but correctly shaped.

    Raises:
        ValueError: If ``look_back`` is smaller than 1.
    """
    if look_back < 1:
        raise ValueError(f"look_back must be >= 1, got {look_back}")

    features = ['open', 'high', 'low', 'close', 'volume', 'RSI', 'MACD', 'MACD_Signal', 'BB_Upper', 'BB_Lower', 'OBV', 'ATR']
    n_features = len(features)

    # Calculate indicators
    df = calculate_technical_indicators(df)

    # Extract the arrays once; selecting the feature columns inside the loop
    # would rebuild a DataFrame for every single window.
    feature_values = df[features].values
    closes = df['close'].values
    n_rows = len(df)

    if n_rows <= look_back:
        # No complete window exists. Return shaped empties so that callers can
        # test len(X) == 0 instead of hitting a reshape/shape-index error on a
        # 1-D empty array.
        return (
            np.empty((0, look_back * n_features)),
            np.empty((0, 3), dtype=int),
            look_back,
            n_features,
        )

    # Window for target row i covers rows [i - look_back, i).
    X = np.stack([feature_values[i - look_back:i] for i in range(look_back, n_rows)])

    # Return from the last bar inside each window to the bar right after it.
    prev_close = closes[look_back - 1:n_rows - 1]
    next_close = closes[look_back:n_rows]
    next_return = (next_close - prev_close) / prev_close

    # Class ids: 0 = Hold, 1 = Buy, 2 = Sell.
    class_ids = np.where(next_return > 0.002, 1, np.where(next_return < -0.002, 2, 0))
    y = np.eye(3, dtype=int)[class_ids]

    # Reshape X to 2D for SMOTE (SMOTE-RUS needs 2D data)
    X_reshaped = X.reshape(X.shape[0], -1)

    return X_reshaped, y, X.shape[1], X.shape[2]

def retrain_single_model(symbol_config, epochs=100, look_back=72):
    """Retrain model for a single symbol with 1:1:1 SMOTE-RUS and early stopping.

    Loads the symbol's data from 2023-01-01 onwards, builds sliding-window
    samples, splits them 80/20 (stratified), oversamples the training split so
    every class matches the majority class count, and fits a fresh CNN-LSTM
    model. The best epoch by validation loss is written to the configured
    ``model_path`` by ``ModelCheckpoint``.

    Args:
        symbol_config: Mapping with ``'symbol'``, ``'model_path'`` and the keys
            required by ``load_sqlite_data``.
        epochs: Maximum number of training epochs (early stopping may end
            training sooner).
        look_back: Window length passed to ``prepare_training_data``.

    Returns:
        ``True`` when training finished, ``False`` when the symbol was skipped
        (too little data, no sequences) or any exception occurred; exceptions
        are reported with ``print`` and not re-raised so that one failing
        symbol does not stop the others.
    """
    symbol = symbol_config['symbol']
    model_path = symbol_config['model_path']

    print(f"\n🎯 Retraining {symbol} model with a perfect 1:1:1 class balance...")

    try:
        # Load and prepare data
        df = load_sqlite_data(symbol_config)
        df = df[df.index >= '2023-01-01']

        if len(df) < 5000:
            print(f"⚠️ Insufficient data for {symbol} ({len(df)} rows), skipping...")
            return False

        print(f"📊 Training on {len(df)} rows from {df.index[0].date()} to {df.index[-1].date()}")

        # Prepare training data (returns reshaped X for resampling)
        X_2d, y, n_steps, n_features = prepare_training_data(df, look_back)

        if len(X_2d) == 0:
            print(f"❌ No training sequences generated for {symbol}")
            return False

        print(f"✅ Generated {len(X_2d)} training sequences. Initial class distribution: {Counter(np.argmax(y, axis=1))}")

        # Split data before resampling so synthetic samples never reach the
        # validation set.
        # NOTE(review): this is a shuffled split over windows that slide one
        # row at a time, so neighbouring train/validation windows share most of
        # their rows; validation metrics may be optimistic. Consider a
        # chronological split - confirm with the model owner.
        X_train, X_val, y_train, y_val = train_test_split(
            X_2d, y, test_size=0.2, random_state=42, stratify=y
        )

        # --- SMOTE-RUS: bring every class to the majority class count ---
        train_counts = Counter(np.argmax(y_train, axis=1))
        target_count = max(train_counts.values())

        sampling_strategy = {
            0: target_count,  # Hold
            1: target_count,  # Buy
            2: target_count   # Sell
        }

        # Seed both samplers (same seed as the split) so a retraining run can
        # be reproduced.
        pipeline = Pipeline(steps=[
            ('o', SMOTE(sampling_strategy=sampling_strategy, random_state=42)),
            ('u', RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42))
        ])

        X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

        print(f"✅ Training data resampled. New class distribution: {Counter(np.argmax(y_train_resampled, axis=1))}")

        # Restore the (samples, steps, features) layout the model expects.
        X_train_final = X_train_resampled.reshape(-1, n_steps, n_features)
        X_val_final = X_val.reshape(-1, n_steps, n_features)

        # Create and train model
        model = create_model((n_steps, n_features))

        # --- CALLBACKS FOR EARLY STOPPING ---
        early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
        model_checkpoint = ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True, mode='min')
        callbacks_list = [early_stopping, model_checkpoint]

        print(f"🔄 Training {symbol} model for {epochs} epochs with early stopping...")
        history = model.fit(
            X_train_final, y_train_resampled,
            validation_data=(X_val_final, y_val),
            epochs=epochs,
            batch_size=32,
            verbose=1,
            callbacks=callbacks_list
        )

        # The best model has already been saved by ModelCheckpoint, so we don't need model.save()
        print(f"💾 Best model saved to {model_path} via ModelCheckpoint.")

        # Report the epoch with the lowest validation loss: that is the epoch
        # the checkpoint kept, whereas the last epoch may be up to `patience`
        # epochs past it.
        best_epoch = int(np.argmin(history.history['val_loss']))
        train_acc = history.history['accuracy'][best_epoch]
        val_acc = history.history['val_accuracy'][best_epoch]
        print(f"✅ {symbol} training complete! Best epoch {best_epoch + 1} accuracy: Train={train_acc:.4f}, Val={val_acc:.4f}")

        return True

    except Exception as e:
        # Per-symbol boundary: report and move on to the next symbol.
        print(f"❌ Error retraining {symbol}: {e}")
        return False

# ==================== EXECUTE RETRAINING ====================
# Script entry point: retrain every configured symbol in turn and report how
# many of them completed.
if __name__ == "__main__":
    print("🚀 Starting model retraining with a perfect 1:1:1 class balance and early stopping...")
    print("=" * 70)

    total_models = len(db_configs_recent_data)
    successful_retrains = 0
    for config in db_configs_recent_data:
        retrained = retrain_single_model(config, epochs=100, look_back=72)
        successful_retrains += 1 if retrained else 0

    print(f"\n🎉 Retraining completed! {successful_retrains}/{total_models} models successfully retrained!")
    print("✅ Your new models, with a perfect 33.33% balance, are ready for backtesting.")