# Fit the small model

In [1]:
# mr_hydra_multi_stock_training.py

import os
import sys
import gc
import glob
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeClassifierCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import TimeSeriesSplit
from aeon.transformations.collection.convolution_based import MultiRocket
from aeon.transformations.collection.convolution_based import HydraTransformer
from aeon.utils.validation import check_n_jobs

def load_and_preprocess(filename):
    if not os.path.exists(filename):
        raise FileNotFoundError(f"No file at {filename}")
    df = pd.read_csv(filename, parse_dates=["Date"])
    df["Direction"] = (df["Close"].shift(-1) > df["Close"]).astype(int)
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    return df

def rolling_windows(df, window_size=10, drop_cols=None):
    if drop_cols is None:
        drop_cols = ["Date", "Close", "Target", "Direction",
                    "High", "Low", "Open", "High_lag1", "Low_lag1", "Open_lag1"]
    Xs, ys = [], []
    features = df.drop(columns=drop_cols, errors='ignore')  # Added errors='ignore'
    y = df["Direction"].values
    for i in range(window_size, len(df)):
        Xs.append(features.iloc[i-window_size : i].values)
        ys.append(y[i])
    return np.array(Xs), np.array(ys)

def reshape_for_aeon(X_3d):
    return np.transpose(X_3d, (0, 2, 1))

def mr_hydra_train_eval(df, stock_name, window_size=10,
                        mr_kernels=500,
                        hydra_groups=8, hydra_kernels=4,
                        test_ratio=0.2,
                        n_jobs=None,
                        random_state=42,
                        save_path="models/"):
    
    print(f"\n{'='*60}")
    print(f"Training model for: {stock_name}")
    print(f"{'='*60}")
    
    try:
        Xw, yw = rolling_windows(df, window_size=window_size)
        print(f"Data shape after rolling windows: {Xw.shape}")
        
        if len(Xw) < 100:  # Minimum samples check
            print(f"⚠️  Warning: Only {len(Xw)} samples for {stock_name}. Skipping.")
            return False
            
        scaler = StandardScaler()
        ns, ts, nf = Xw.shape
        X_flat = Xw.reshape(ns * ts, nf)
        X_flat = scaler.fit_transform(X_flat).astype("float32")
        X_scaled = X_flat.reshape(ns, ts, nf)
        X = reshape_for_aeon(X_scaled)
        y = yw

        split_idx = int((1 - test_ratio) * ns)
        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]
        
        print(f"Train samples: {len(X_train)}, Test samples: {len(X_test)}")

        if n_jobs is None:
            n_jobs = check_n_jobs(-1)

        print("Training Hydra transformer...")
        hydra = HydraTransformer(n_groups=hydra_groups,
                      n_kernels=hydra_kernels,
                      random_state=random_state)

        X_hydra_train = hydra.fit_transform(X_train, y_train)
        X_hydra_test = hydra.transform(X_test)

        print("Training MultiRocket transformer...")
        mr = MultiRocket(n_kernels=mr_kernels,
                         n_jobs=n_jobs,
                         random_state=random_state)
        X_mr_train = mr.fit_transform(X_train, y_train)
        X_mr_test = mr.transform(X_test)

        X_train_full = np.hstack([X_hydra_train, X_mr_train])
        X_test_full = np.hstack([X_hydra_test, X_mr_test])

        print("Training classifier...")
        clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
        clf.fit(X_train_full, y_train)

        y_pred = clf.predict(X_test_full)
        acc = accuracy_score(y_test, y_pred)
        print(f"Test accuracy: {acc:.4f}")
        print(classification_report(y_test, y_pred, target_names=["Down", "Up"]))

        # Save model components
        os.makedirs(save_path, exist_ok=True)
        model_data = {
            'scaler': scaler,
            'hydra': hydra,
            'multirocket': mr,
            'classifier': clf,
            'window_size': window_size,
            'n_features': nf,
            'stock_name': stock_name,
            'test_accuracy': acc,
            'drop_cols': ["Date", "Close", "Target", "Direction",
                         "High", "Low", "Open", "High_lag1", "Low_lag1", "Open_lag1"]
        }
        
        model_filename = f"{save_path}/{stock_name}_mr_hydra_model.pkl"
        joblib.dump(model_data, model_filename)
        print(f"✓ Model saved to {model_filename}")
        
        # Clean up memory
        del hydra, mr, clf, X_train_full, X_test_full
        gc.collect()
        
        return True
        
    except Exception as e:
        print(f"❌ Error training {stock_name}: {str(e)}")
        return False

def train_all_stocks(data_dir="data/processed/stock_data/", save_path="models-small/", **kwargs):
    """Train models for all stock datasets in the directory"""
    
    # Find all CSV files
    csv_files = glob.glob(os.path.join(data_dir, "*.csv"))
    
    if not csv_files:
        print(f"No CSV files found in {data_dir}")
        return
    
    print(f"Found {len(csv_files)} datasets to process")
    
    results = []
    successful = 0
    failed = 0
    
    for i, filepath in enumerate(csv_files, 1):
        # Extract stock name from filename
        filename = os.path.basename(filepath)
        stock_name = filename.replace('_daily_features.csv', '').replace('.csv', '')
        
        print(f"\n[{i}/{len(csv_files)}] Processing {stock_name}...")
        
        try:
            df = load_and_preprocess(filepath)
            print(f"Loaded {len(df)} rows for {stock_name}")
            
            success = mr_hydra_train_eval(df, stock_name, save_path=save_path, **kwargs)
            
            if success:
                successful += 1
                results.append({'stock': stock_name, 'status': 'success', 'rows': len(df)})
            else:
                failed += 1
                results.append({'stock': stock_name, 'status': 'failed', 'rows': len(df)})
                
        except Exception as e:
            print(f"❌ Failed to process {stock_name}: {str(e)}")
            failed += 1
            results.append({'stock': stock_name, 'status': 'error', 'error': str(e)})
    
    # Summary
    print(f"\n{'='*60}")
    print("TRAINING SUMMARY")
    print(f"{'='*60}")
    print(f"Total datasets: {len(csv_files)}")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")
    
    # Save results summary
    results_df = pd.DataFrame(results)
    summary_path = f"{save_path}/training_summary-small.csv"
    results_df.to_csv(summary_path, index=False)
    print(f"\nDetailed results saved to: {summary_path}")
    
    # Show successful models
    successful_models = [r['stock'] for r in results if r['status'] == 'success']
    if successful_models:
        print(f"\nSuccessfully trained models for:")
        for stock in successful_models:
            print(f"  - {stock}")

if __name__ == "__main__":
    train_all_stocks(
        data_dir="data/processed/stock_data/",
        save_path="models-small/",
        window_size=10,
        mr_kernels=500,
        hydra_groups=8,
        hydra_kernels=4,
        test_ratio=0.2
    )

Found 5 datasets to process

[1/5] Processing AAPL...
Loaded 11224 rows for AAPL

Training model for: AAPL
Data shape after rolling windows: (11214, 10, 17)
Train samples: 8971, Test samples: 2243
Training Hydra transformer...
Training MultiRocket transformer...
Training classifier...
Test accuracy: 0.4695
              precision    recall  f1-score   support

        Down       0.46      0.90      0.61      1039
          Up       0.53      0.10      0.17      1204

    accuracy                           0.47      2243
   macro avg       0.50      0.50      0.39      2243
weighted avg       0.50      0.47      0.37      2243

✓ Model saved to models-small//AAPL_mr_hydra_model.pkl

[2/5] Processing GE...
Loaded 15977 rows for GE

Training model for: GE
Data shape after rolling windows: (15967, 10, 17)
Train samples: 12773, Test samples: 3194
Training Hydra transformer...
Training MultiRocket transformer...
Training classifier...
Test accuracy: 0.4975
              precision    recall  