In [1]:
"""
Market Regime Classification Model Training Script
Generates a simple model that classifies market conditions into regimes.
"""
import json
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report


def generate_synthetic_data(n_samples=5000, random_state=42):
    """Generate synthetic market data with rule-based regime labels.

    Three features are drawn independently (returns, volatility,
    volume_ratio) and each row is labelled by the first matching rule:

    * 0 (Bull):     returns > 0.01 and volatility < 0.25
    * 1 (Bear):     returns < -0.01 and volatility < 0.25
    * 3 (High vol): volatility > 0.30
    * 2 (Sideways): everything else

    Args:
        n_samples: Number of rows to generate.
        random_state: Seed for the random stream; the same seed always
            produces the same frame.

    Returns:
        pandas.DataFrame with columns ``returns``, ``volatility``,
        ``volume_ratio`` and the integer label column ``regime``.
    """
    # A private RandomState yields the same legacy stream that
    # np.random.seed(random_state) would, but does not reseed the global
    # generator that other code in the process may be relying on.
    rng = np.random.RandomState(random_state)

    # Draw order matters for reproducibility: returns, volatility, volume.
    returns = rng.randn(n_samples) * 0.02  # Daily returns
    volatility = np.abs(rng.randn(n_samples) * 0.15 + 0.20)  # Volatility
    volume_ratio = rng.gamma(2, 0.5, n_samples)  # Volume relative to average

    # np.select picks the first condition that holds for each row, which
    # mirrors an if/elif chain; rows matching nothing fall back to Sideways.
    calm = volatility < 0.25
    regimes = np.select(
        [
            (returns > 0.01) & calm,   # Bull market
            (returns < -0.01) & calm,  # Bear market
            volatility > 0.30,         # High volatility
        ],
        [0, 1, 3],
        default=2,                     # Sideways/neutral
    ).astype(int)

    return pd.DataFrame({
        'returns': returns,
        'volatility': volatility,
        'volume_ratio': volume_ratio,
        'regime': regimes,
    })


def train_model(df):
    """Train and evaluate a RandomForest regime classifier.

    The frame is split 80/20 (stratified on the label, fixed seed), a
    100-tree forest with depth limit 10 is fitted on the training part, and
    accuracy, weighted F1 and a per-class report are printed for the
    held-out part.

    Args:
        df: DataFrame with feature columns ``returns``, ``volatility``,
            ``volume_ratio`` and the integer label column ``regime``.

    Returns:
        Tuple ``(model, accuracy, f1)``: the fitted classifier and its
        accuracy and weighted F1 score on the test split.
    """
    # Regime codes and their display names, kept side by side so the
    # report rows are always aligned with the right label.
    class_ids = [0, 1, 2, 3]
    class_names = ['Bull', 'Bear', 'Sideways', 'High Vol']

    X = df[['returns', 'volatility', 'volume_ratio']]
    y = df['regime']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    )

    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    print("\nModel Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nClassification Report:")
    # Passing `labels` explicitly keeps the four target names valid even
    # when a regime is absent from the test split; without it the report
    # raises because the number of observed classes and names differ.
    # zero_division=0 reports 0 for such an empty class without warning.
    print(classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    ))

    return model, accuracy, f1


def save_model(model, filename='model.pkl'):
    """Pickle ``model`` into ``filename`` and print a confirmation line.

    Args:
        model: Any picklable object (here, the fitted classifier).
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'wb') as sink:
        pickler = pickle.Pickler(sink)
        pickler.dump(model)

    print(f"\n✓ Saved {filename}")


def create_requirements():
    """Write the pinned dependency list to ``requirements.txt``.

    The file is created in the current working directory, one pin per
    line, with no trailing newline.
    """
    pinned = (
        "scikit-learn==1.3.2",
        "pandas==2.1.0",
        "numpy==1.24.3",
        "joblib==1.3.2",
    )

    with open('requirements.txt', 'w') as out:
        out.write("\n".join(pinned))

    print("✓ Saved requirements.txt")


def create_metadata(accuracy, f1, n_samples=5000):
    """Write ``metadata.json`` describing the trained model.

    The file is created in the current working directory and records the
    model identity, input/output schema, training details (including the
    supplied metrics and a UTC timestamp), source and governance fields.

    Args:
        accuracy: Test-set accuracy; stored rounded to 4 decimals.
        f1: Test-set F1 score; stored rounded to 4 decimals.
        n_samples: Number of rows the model was trained from. Defaults to
            5000, the default size of the synthetic data set.
    """
    # Imported locally because the module only imports `datetime` itself.
    from datetime import timezone

    # Timezone-aware replacement for datetime.utcnow(), which is deprecated
    # since Python 3.12; the formatted string is unchanged.
    training_date = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    metadata = {
        "model_name": "Market_Regime_Classifier",
        "version": "1.0.0",
        "description": "Market regime classification model that predicts Bull, Bear, Sideways, or High Volatility market conditions based on returns, volatility, and volume patterns.",
        "framework": {
            "name": "scikit-learn",
            "version": "1.3.2"
        },
        "artifact": "model.pkl",
        "entrypoint": "inference.py",
        "input_schema": {
            "type": "object",
            "properties": {
                "returns": {
                    "type": "number",
                    "description": "Daily return as decimal (e.g., 0.02 for 2%)"
                },
                "volatility": {
                    "type": "number",
                    "description": "Volatility measure (0.0 to 1.0)"
                },
                "volume_ratio": {
                    "type": "number",
                    "description": "Volume relative to average (typically 0.5 to 2.0)"
                }
            },
            "required": ["returns", "volatility", "volume_ratio"]
        },
        "output_schema": {
            "type": "object",
            "properties": {
                "regime": {
                    "type": "integer",
                    "description": "0=Bull, 1=Bear, 2=Sideways, 3=High Volatility"
                },
                "regime_name": {
                    "type": "string"
                },
                "probability": {
                    "type": "array",
                    "description": "Probability for each regime class"
                }
            }
        },
        "training_metadata": {
            "dataset": "synthetic_market_data",
            "target_column": "regime",
            "n_samples": int(n_samples),
            "regime_labels": {
                "0": "Bull Market",
                "1": "Bear Market",
                "2": "Sideways Market",
                "3": "High Volatility"
            },
            "metrics": {
                # float() first: rounding a numpy scalar such as float32
                # returns a numpy scalar that json cannot serialize.
                "accuracy": round(float(accuracy), 4),
                "f1_score": round(float(f1), 4)
            },
            "training_date": training_date,
            "trained_by": "data.science@example.com"
        },
        "source": {
            "environment": "local development",
            "tool": "Python script",
            "repository": "https://github.com/example/regime-classifier"
        },
        "governance": {
            "model_owner": "Data Science Team",
            "risk_tier": "Medium",
            "validation_required": True,
            "external_upload": True
        }
    }

    with open('metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)
    print("✓ Saved metadata.json")


def create_inference():
    """Write the ``inference.py`` entrypoint to the working directory.

    The generated module loads ``model.pkl`` once at import time and
    exposes ``load_model()`` and ``predict(data)``. The artifact path is
    resolved relative to the generated file itself, so the module can be
    imported from any working directory as long as ``model.pkl`` sits next
    to it.
    """
    inference_code = '''"""
Inference script for Market Regime Classification Model
"""
from pathlib import Path

import joblib
import pandas as pd

# Resolve the artifact next to this file so loading does not depend on the
# caller's current working directory.
MODEL_PATH = Path(__file__).resolve().parent / "model.pkl"


def load_model():
    """Load the trained model."""
    # NOTE: unpickling can execute arbitrary code; only load artifacts
    # that come from a trusted training run.
    return joblib.load(MODEL_PATH)


# Load model at module level for efficiency
model = load_model()

# Regime mapping
REGIME_NAMES = {
    0: "Bull Market",
    1: "Bear Market",
    2: "Sideways Market",
    3: "High Volatility"
}


def predict(data):
    """
    Make predictions on input data.

    Args:
        data: dict or list of dicts with keys: returns, volatility, volume_ratio

    Returns:
        list of dicts with regime, regime_name, and probability

    Raises:
        KeyError: if any required input field is missing
    """
    # Convert to DataFrame
    if isinstance(data, dict):
        data = [data]
    df = pd.DataFrame(data)

    # Ensure correct column order
    required_cols = ['returns', 'volatility', 'volume_ratio']
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required input fields: {missing}")
    df = df[required_cols]

    # Get predictions and probabilities
    predictions = model.predict(df)
    probabilities = model.predict_proba(df)

    # Format results
    results = []
    for pred, proba in zip(predictions, probabilities):
        # Convert the numpy label to a plain int once, for both the JSON
        # friendly output and the name lookup.
        regime = int(pred)
        results.append({
            'regime': regime,
            'regime_name': REGIME_NAMES[regime],
            'probability': proba.tolist()
        })

    return results
'''

    # Explicit encoding so the written source does not depend on the
    # platform's default locale.
    with open('inference.py', 'w', encoding='utf-8') as f:
        f.write(inference_code)
    print("✓ Saved inference.py")


def main():
    """Run the full pipeline: generate data, train, save artifacts, smoke-test.

    Steps, in order:
      1. Generate the synthetic data set and print its regime distribution.
      2. Train and evaluate the classifier.
      3. Write model.pkl, requirements.txt, metadata.json and inference.py
         to the current working directory.
      4. Load the freshly written inference.py and run one sample
         prediction through it.
    """
    # Local imports: only this smoke test needs them.
    import importlib.util
    import os

    print("=" * 60)
    print("Market Regime Classification Model Training")
    print("=" * 60)

    # Generate data
    print("\n1. Generating synthetic market data...")
    df = generate_synthetic_data()
    print(f"   Generated {len(df)} samples")
    print(f"   Regime distribution:\n{df['regime'].value_counts().sort_index()}")

    # Train model
    print("\n2. Training model...")
    model, accuracy, f1 = train_model(df)

    # Save all files
    print("\n3. Saving model artifacts...")
    save_model(model)
    create_requirements()
    create_metadata(accuracy, f1)
    create_inference()

    print("\n" + "=" * 60)
    print("✓ Training complete! Generated files:")
    print("  - model.pkl")
    print("  - requirements.txt")
    print("  - metadata.json")
    print("  - inference.py")
    print("=" * 60)

    # Test inference
    print("\n4. Testing inference...")
    # Load the file that was just written, by path. A plain
    # `from inference import predict` goes through sys.path and the
    # sys.modules cache, so it can miss the file when the working directory
    # is not on sys.path, and on a re-run in the same interpreter it returns
    # the previously imported module (still holding the old model).
    inference_path = os.path.join(os.getcwd(), "inference.py")
    spec = importlib.util.spec_from_file_location("inference", inference_path)
    inference = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(inference)
    predict = inference.predict

    test_data = {
        'returns': 0.015,
        'volatility': 0.18,
        'volume_ratio': 1.2
    }
    result = predict(test_data)
    print(f"   Input: {test_data}")
    print(f"   Prediction: {result[0]['regime_name']}")
    print(f"   Confidence: {max(result[0]['probability']):.2%}")


# Run the pipeline only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

Market Regime Classification Model Training

1. Generating synthetic market data...
   Generated 5000 samples
   Regime distribution:
regime
0    1010
1     978
2    1743
3    1269
Name: count, dtype: int64

2. Training model...

Model Performance:
Accuracy: 1.0000
F1 Score: 1.0000

Classification Report:
              precision    recall  f1-score   support

        Bull       1.00      1.00      1.00       202
        Bear       1.00      1.00      1.00       196
    Sideways       1.00      1.00      1.00       348
    High Vol       1.00      1.00      1.00       254

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000


3. Saving model artifacts...

✓ Saved model.pkl
✓ Saved requirements.txt
✓ Saved metadata.json
✓ Saved inference.py

✓ Training complete! Generated files:
  - model.pkl
  - requirements.txt
  - metadata.json
  - inference.py

4. Testing inference...
   Inpu