In [None]:
# Import required libraries
import json
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import mutual_info_classif, SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
# Blanket filter: every Python warning is suppressed for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Set plotting style
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# matplotlib (3.6+) — TODO confirm against the pinned environment.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
# IPython line magic (not plain Python): render figures inline in cell output.
%matplotlib inline

## 1. Data Loading and Preparation

In [None]:
# Configuration: ticker universe, history window and forecast horizon
SYMBOLS = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'NVDA']
LOOKBACK_DAYS = 252  # 1 year of trading days
PREDICTION_HORIZON = 5  # Predict 5-day returns

# Feature categories — each list is assembled from its thematic sub-groups
TECHNICAL_FEATURES = (
    # Momentum
    ['rsi_14', 'rsi_7', 'stochastic_k', 'stochastic_d', 'williams_r',
     'roc_10', 'roc_20', 'momentum_10']
    # Trend
    + ['macd_line', 'macd_signal', 'macd_histogram', 'adx_14',
       'plus_di', 'minus_di', 'aroon_oscillator', 'cci_20']
    # Moving Averages
    + ['price_vs_sma_20', 'price_vs_sma_50', 'price_vs_sma_200']
    # Volatility
    + ['bb_width', 'bb_percent_b', 'atr_percent', 'volatility_20']
    # Volume
    + ['mfi_14', 'volume_ratio']
)

FUNDAMENTAL_FEATURES = (
    ['pe_ratio', 'forward_pe', 'peg_ratio', 'pb_ratio', 'ps_ratio']
    + ['roe', 'roa', 'roic', 'gross_margin', 'operating_margin', 'net_margin']
    + ['debt_to_equity', 'current_ratio', 'quick_ratio']
    + ['revenue_growth', 'earnings_growth', 'dividend_yield']
)

MARKET_FEATURES = (
    ['spy_correlation_20', 'spy_correlation_60', 'spy_beta']
    + ['sector_relative_strength', 'market_regime']
    + ['vix_level', 'put_call_ratio']
)

# Full catalogue, in technical -> fundamental -> market order
ALL_FEATURES = [*TECHNICAL_FEATURES, *FUNDAMENTAL_FEATURES, *MARKET_FEATURES]
print(f"Total features: {len(ALL_FEATURES)}")

In [None]:
def generate_synthetic_features(n_samples=1000, n_features=50, seed=42):
    """
    Generate synthetic feature data for demonstration.
    In production, this would load from the FeatureStore.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    n_features : int
        Accepted for backward compatibility only; it is not used. The
        function always builds the same fixed set of 22 columns.
    seed : int
        Seed for the private random generator. The default (42) reproduces
        the values this function produced when it seeded the global NumPy
        RNG with 42.

    Returns
    -------
    X : pandas.DataFrame
        One column per synthetic feature, ``n_samples`` rows.
    y : pandas.Series
        Binary (0/1) target whose probability of being 1 depends weakly on
        ``macd_histogram``, ``momentum_10``, ``rsi_14`` and
        ``sector_relative_strength``.
    """
    # A private legacy RandomState yields the same stream as
    # np.random.seed(seed) + np.random.* calls, without clobbering the
    # global RNG state that other cells may rely on.
    rng = np.random.RandomState(seed)

    # Generate base features (draw order matters for reproducibility)
    X = pd.DataFrame()

    # Technical features (some correlated with target)
    X['rsi_14'] = rng.uniform(20, 80, n_samples)
    # rsi_7 is rsi_14 plus noise, so the two are strongly correlated by design
    X['rsi_7'] = X['rsi_14'] + rng.normal(0, 5, n_samples)
    X['macd_histogram'] = rng.normal(0, 1, n_samples)
    X['adx_14'] = rng.uniform(10, 50, n_samples)
    X['bb_percent_b'] = rng.uniform(0, 1, n_samples)
    X['atr_percent'] = rng.uniform(1, 5, n_samples)
    X['volume_ratio'] = rng.lognormal(0, 0.5, n_samples)
    X['mfi_14'] = rng.uniform(20, 80, n_samples)
    X['price_vs_sma_20'] = rng.normal(0, 3, n_samples)
    X['price_vs_sma_50'] = rng.normal(0, 5, n_samples)
    X['momentum_10'] = rng.normal(0, 2, n_samples)
    X['stochastic_k'] = rng.uniform(0, 100, n_samples)

    # Fundamental features
    X['pe_ratio'] = rng.lognormal(3, 0.5, n_samples)
    X['pb_ratio'] = rng.lognormal(1, 0.5, n_samples)
    X['roe'] = rng.uniform(-10, 40, n_samples)
    X['debt_to_equity'] = rng.lognormal(0, 0.8, n_samples)
    X['revenue_growth'] = rng.normal(10, 20, n_samples)
    X['earnings_growth'] = rng.normal(15, 30, n_samples)

    # Market features
    X['spy_correlation_20'] = rng.uniform(0.3, 0.95, n_samples)
    X['spy_beta'] = rng.uniform(0.5, 2.0, n_samples)
    X['sector_relative_strength'] = rng.normal(0, 1, n_samples)
    X['vix_level'] = rng.lognormal(2.8, 0.3, n_samples)

    # Generate target (price direction) with some feature dependency
    prob = (
        0.5
        + 0.01 * (X['macd_histogram'] + X['momentum_10'])
        + 0.005 * (X['rsi_14'] - 50)
        + 0.002 * X['sector_relative_strength']
    )
    # Keep probabilities away from 0/1 so both classes always remain possible
    prob = np.clip(prob, 0.1, 0.9)
    y = (rng.random_sample(n_samples) < prob).astype(int)

    return X, y

# Build the demo dataset, then report its size and class balance
X, y = generate_synthetic_features(n_samples=2000)

dataset_shape = X.shape
class_counts = np.bincount(y)
print(f"Dataset shape: {dataset_shape}")
print(f"Target distribution: {class_counts}")

# Preview the first rows
X.head()

## 2. Feature Distribution Analysis

In [None]:
# Per-feature summary: describe() statistics (one row per feature) extended
# with missing-value counts and distribution-shape measures
feature_stats = X.describe().T.assign(
    missing=X.isnull().sum(),
    skewness=X.skew(),
    kurtosis=X.kurtosis(),
)
feature_stats

In [None]:
# Plot feature distributions
# The grid is sized from the number of features so that no column is silently
# dropped (a fixed 5x4 grid only has room for 20 of them).
n_grid_cols = 4
n_grid_rows = max(1, int(np.ceil(len(X.columns) / n_grid_cols)))
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(16, 4 * n_grid_rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, X.columns):
    # Overlay one histogram per target class on the same axes
    for label, class_name in ((0, 'Down'), (1, 'Up')):
        X.loc[y == label, col].hist(ax=ax, alpha=0.5, bins=30, label=class_name)
    ax.set_title(col)
    ax.legend()

# Hide any leftover panels in the last row instead of showing empty frames
for unused_ax in axes[len(X.columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.suptitle('Feature Distributions by Target Class', y=1.02, fontsize=14)
plt.show()

## 3. Correlation Analysis

In [None]:
# Pairwise correlation between all features
correlation_matrix = X.corr()

# Boolean mask covering the upper triangle (diagonal included), so each
# feature pair is drawn only once in the lower triangle
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

heatmap_options = dict(
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=0.5,
)

plt.figure(figsize=(14, 12))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix', fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Identify highly correlated feature pairs
high_corr_threshold = 0.7

# Scan only the strict upper triangle so each unordered pair appears once.
# np.triu_indices_from walks it in row-major order, matching a nested i<j loop.
corr_values = correlation_matrix.to_numpy()
row_idx, col_idx = np.triu_indices_from(corr_values, k=1)
is_strong = np.abs(corr_values[row_idx, col_idx]) > high_corr_threshold

feature_names = correlation_matrix.columns
high_correlations = [
    {
        'feature_1': feature_names[i],
        'feature_2': feature_names[j],
        'correlation': corr_values[i, j],
    }
    for i, j in zip(row_idx[is_strong], col_idx[is_strong])
]

# Explicit columns keep the frame well-formed (and sortable) even when no
# pair exceeds the threshold; otherwise sort_values would raise a KeyError.
high_corr_df = pd.DataFrame(
    high_correlations,
    columns=['feature_1', 'feature_2', 'correlation'],
).sort_values('correlation', key=abs, ascending=False)

print(f"Highly correlated feature pairs (|r| > {high_corr_threshold}):")
high_corr_df

In [None]:
# Correlation of every feature with the binary target
X_with_target = X.assign(target=y)

target_correlations = (
    X_with_target.corr()['target']
    .drop('target')
    .sort_values(key=abs, ascending=False)
)

# Green bars for positive correlation, red otherwise
colors = np.where(target_correlations.values > 0, 'green', 'red').tolist()
bar_positions = range(len(target_correlations))

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(bar_positions, target_correlations.values, color=colors)
ax.set_yticks(bar_positions)
ax.set_yticklabels(target_correlations.index)
ax.set_xlabel('Correlation with Target')
ax.set_title('Feature Correlation with Target (Price Direction)')
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
plt.tight_layout()
plt.show()

print("\nTop 10 features by absolute correlation with target:")
target_correlations.head(10)

## 4. Feature Importance Methods

In [None]:
# Prepare data
# Split FIRST, then fit the scaler on the training rows only. Fitting the
# scaler on the full dataset would leak test-set statistics (mean/std) into
# the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
scaler.fit(X_train_raw)

X_train = pd.DataFrame(scaler.transform(X_train_raw),
                       columns=X.columns, index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw),
                      columns=X.columns, index=X_test_raw.index)

# Full dataset scaled with the train-fitted scaler; later cells use this
# frame for whole-sample statistics and resampling.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

In [None]:
# Method 1: impurity-based importances from a random forest
rf_params = dict(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

# One importance value per feature, largest first
rf_importance = (
    pd.Series(rf_model.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)

rf_accuracy = rf_model.score(X_test, y_test)
print("Random Forest Accuracy:", rf_accuracy)
print("\nTop 10 features by RF importance:")
rf_importance.head(10)

In [None]:
# Method 2: impurity-based importances from gradient boosting
gb_params = dict(n_estimators=100, max_depth=5, random_state=42)
gb_model = GradientBoostingClassifier(**gb_params)
gb_model.fit(X_train, y_train)

# One importance value per feature, largest first
gb_importance = (
    pd.Series(gb_model.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)

gb_accuracy = gb_model.score(X_test, y_test)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("\nTop 10 features by GB importance:")
gb_importance.head(10)

In [None]:
# Method 3: mutual information between each feature and the target,
# estimated on the full scaled dataset
mi_scores = mutual_info_classif(X_scaled, y, random_state=42)

mi_importance = (
    pd.Series(data=mi_scores, index=X.columns)
    .sort_values(ascending=False)
)

print("Top 10 features by Mutual Information:")
mi_importance.head(10)

In [None]:
# Method 4: one-way ANOVA F-statistic per feature (p-values are kept
# alongside the scores but not used in the ranking below)
f_scores, p_values = f_classif(X_scaled, y)

f_importance = (
    pd.Series(data=f_scores, index=X.columns)
    .sort_values(ascending=False)
)

print("Top 10 features by F-statistic:")
f_importance.head(10)

## 5. Combined Feature Importance Ranking

In [None]:
# Normalize importance scores to 0-1 range
def normalize_series(s):
    """Min-max scale a pandas Series to the [0, 1] range.

    Parameters
    ----------
    s : pandas.Series
        Numeric scores to rescale.

    Returns
    -------
    pandas.Series
        Same index as ``s``; the minimum maps to 0 and the maximum to 1.
        A constant series (max == min) maps to all zeros instead of the
        NaNs that a 0/0 division would produce.
    """
    lowest = s.min()
    value_range = s.max() - lowest
    if value_range == 0:
        # Every value is identical: no score stands out, so report 0 for all
        # (subtracting keeps the index, the name and any NaN entries).
        return (s - lowest).astype(float)
    return (s - lowest) / value_range

# Gather the raw scores of every method, then min-max normalise each one so
# they are comparable; the DataFrame constructor aligns them on feature name
raw_scores_by_method = {
    'random_forest': rf_importance,
    'gradient_boosting': gb_importance,
    'mutual_info': mi_importance,
    'f_statistic': f_importance,
    'target_correlation': target_correlations.abs(),
}
importance_df = pd.DataFrame({
    method: normalize_series(scores)
    for method, scores in raw_scores_by_method.items()
})

# Calculate weighted average (tree-based methods weighted higher)
weights = dict(
    random_forest=0.25,
    gradient_boosting=0.25,
    mutual_info=0.20,
    f_statistic=0.15,
    target_correlation=0.15,
)

# Accumulate the weighted columns one method at a time
weighted_total = 0
for method, weight in weights.items():
    weighted_total = weighted_total + importance_df[method] * weight
importance_df['combined_score'] = weighted_total

importance_df = importance_df.sort_values('combined_score', ascending=False)
importance_df

In [None]:
# Two-panel figure: combined ranking (left) and per-method breakdown (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 8))
ax1, ax2 = axes

# Left panel: combined score bar chart. Values, labels and colours are all
# reversed so the best feature ends up at the top of the horizontal chart.
top_features = importance_df.head(15)
n_top = len(top_features)
bar_positions = range(n_top)
colors = plt.cm.viridis(np.linspace(0.2, 0.8, n_top))
bars = ax1.barh(
    bar_positions,
    top_features['combined_score'].values[::-1],
    color=colors[::-1],
)
ax1.set_yticks(bar_positions)
ax1.set_yticklabels(top_features.index[::-1])
ax1.set_xlabel('Combined Importance Score')
ax1.set_title('Top 15 Features by Combined Importance')

# Right panel: heatmap of the normalised score from each individual method
heatmap_data = importance_df.drop(columns='combined_score').head(15)
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt='.2f',
    cmap='YlOrRd',
    ax=ax2,
    cbar_kws={'label': 'Normalized Score'},
)
ax2.set_title('Feature Importance by Method (Top 15)')
ax2.set_xlabel('Method')

plt.tight_layout()
plt.show()

## 6. Feature Selection Recommendations

In [None]:
# Keep the N best features according to the combined score
TOP_N_FEATURES = 10

top_rows = importance_df.head(TOP_N_FEATURES)
selected_features = top_rows.index.tolist()

print(f"Recommended Top {TOP_N_FEATURES} Features for ML Models:")
print("=" * 50)
for rank, feature in enumerate(selected_features, start=1):
    score = importance_df.loc[feature, 'combined_score']
    print(f"{rank:2d}. {feature:25s} (score: {score:.4f})")

In [None]:
# Categorize features by type
def categorize_feature(feature):
    """Map a feature name to 'Technical', 'Fundamental', 'Market' or 'Other'.

    Matching is case-insensitive and substring-based. Categories are tried in
    the order Technical -> Market -> Fundamental and the first hit wins.
    Market is checked before Fundamental so that market-wide ratios such as
    ``put_call_ratio`` are not captured by the generic 'ratio' keyword.

    Parameters
    ----------
    feature : str
        Feature (column) name.

    Returns
    -------
    str
        The category label, or 'Other' when no keyword matches.
    """
    keyword_rules = (
        ('Technical', ('rsi', 'macd', 'bb_', 'atr', 'adx', 'momentum', 'volume',
                       'mfi', 'stochastic', 'price_vs_sma', 'cci', 'obv', 'roc',
                       # indicators that previously fell through to 'Other'
                       'williams', 'plus_di', 'minus_di', 'aroon', 'volatility')),
        ('Market', ('spy', 'beta', 'correlation', 'sector', 'vix', 'regime',
                    'put_call')),
        ('Fundamental', ('pe_', 'pb_', 'ps_', 'roe', 'roa', 'margin', 'debt',
                         'growth', 'dividend', 'ratio', 'roic',
                         # 'pe_' cannot match a name that ends in '_pe'
                         'forward_pe')),
    )

    feature_lower = feature.lower()
    for category, keywords in keyword_rules:
        if any(keyword in feature_lower for keyword in keywords):
            return category
    return 'Other'

# Tag every feature with its category, then aggregate scores per category
importance_df['category'] = [
    categorize_feature(feature_name) for feature_name in importance_df.index
]

category_aggregations = {
    'combined_score': ['mean', 'count'],
    'random_forest': 'mean',
    'gradient_boosting': 'mean',
}
category_summary = (
    importance_df
    .groupby('category')
    .agg(category_aggregations)
    .round(4)
)

print("\nFeature Importance by Category:")
category_summary

In [None]:
# How the ten best features split across categories
category_counts = importance_df.head(10)['category'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
pie_ax, box_ax = axes

# Left panel: pie chart with one fixed colour per category (grey fallback)
colors = {
    'Technical': '#2ecc71',
    'Fundamental': '#3498db',
    'Market': '#e74c3c',
    'Other': '#95a5a6',
}
pie_colors = [colors.get(category, '#95a5a6') for category in category_counts.index]
pie_ax.pie(
    category_counts.values,
    labels=category_counts.index,
    autopct='%1.1f%%',
    colors=pie_colors,
    startangle=90,
)
pie_ax.set_title('Top 10 Features by Category')

# Right panel: spread of the combined score within each category
importance_df.boxplot(column='combined_score', by='category', ax=box_ax)
box_ax.set_title('Importance Score Distribution by Category')
box_ax.set_xlabel('Category')
box_ax.set_ylabel('Combined Score')
# Clear the automatic "Boxplot grouped by ..." super-title added by pandas
plt.suptitle('')

plt.tight_layout()
plt.show()

## 7. Feature Stability Analysis

In [None]:
# Re-estimate random-forest importances on several random 70% subsamples to
# see how much each feature's importance moves between runs
n_iterations = 5
stability_results = []

for seed in range(n_iterations):
    # The seed drives both the subsample and the forest itself
    split_parts = train_test_split(X_scaled, y, test_size=0.3, random_state=seed)
    X_sample, y_sample = split_parts[0], split_parts[2]

    rf = RandomForestClassifier(n_estimators=50, max_depth=8, random_state=seed)
    rf.fit(X_sample, y_sample)

    stability_results.append(pd.Series(rf.feature_importances_, index=X.columns))

# Rows = runs, columns = features
stability_df = pd.DataFrame(stability_results)
importance_mean = stability_df.mean()
importance_std = stability_df.std()

stability_stats = pd.DataFrame({
    'mean_importance': importance_mean,
    'std_importance': importance_std,
    'cv': importance_std / importance_mean,  # Coefficient of variation
}).sort_values('mean_importance', ascending=False)

print("Feature Stability Analysis (lower CV = more stable):")
stability_stats.head(10)

In [None]:
# Bar chart of mean importance for the ten leading features, with the
# run-to-run standard deviation drawn as error bars
top_stable = stability_stats.head(10)
x = range(len(top_stable))

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(
    x,
    top_stable['mean_importance'],
    yerr=top_stable['std_importance'],
    capsize=5,
    color='steelblue',
    alpha=0.8,
)
ax.set_xticks(x)
ax.set_xticklabels(top_stable.index, rotation=45, ha='right')
ax.set_ylabel('Feature Importance')
ax.set_title('Top 10 Features with Stability (Error Bars = Std Dev)')
plt.tight_layout()
plt.show()

## 8. Final Recommendations

In [None]:
# Generate final feature recommendations
# The three inputs are aligned on feature name by the DataFrame constructor.
final_recommendations = pd.DataFrame({
    'combined_importance': importance_df['combined_score'],
    'stability_cv': stability_stats['cv'],
    'category': importance_df['category']
})

# Calculate final score (importance weighted by stability).
# CV is clipped to [0, 1], so a feature with CV >= 1 gets a final score of 0.
final_recommendations['final_score'] = (
    final_recommendations['combined_importance'] *
    (1 - final_recommendations['stability_cv'].clip(0, 1))  # Penalize unstable features
)

final_recommendations = final_recommendations.sort_values('final_score', ascending=False)

print("\n" + "="*60)
print("FINAL FEATURE RECOMMENDATIONS")
print("="*60)
print("\nTop 10 Features (considering importance + stability):")
print("-"*60)

for i, (feature, row) in enumerate(final_recommendations.head(10).iterrows(), 1):
    print(f"{i:2d}. {feature:25s} [{row['category']:12s}] Score: {row['final_score']:.4f}")

print("\n" + "="*60)
print("RECOMMENDATIONS BY USE CASE")
print("="*60)

# The emoji and bullet characters below were previously stored as mojibake
# (UTF-8 bytes decoded with the wrong code page); they are restored here.
print("\n🎯 For Price Direction Prediction:")
direction_features = final_recommendations[final_recommendations['category'].isin(['Technical', 'Market'])].head(5)
for feature_name in direction_features.index:
    print(f"   • {feature_name}")

print("\n📊 For Value Assessment:")
value_features = final_recommendations[final_recommendations['category'] == 'Fundamental'].head(5)
for feature_name in value_features.index:
    print(f"   • {feature_name}")

print("\n🌐 For Market Context:")
market_features = final_recommendations[final_recommendations['category'] == 'Market'].head(3)
for feature_name in market_features.index:
    print(f"   • {feature_name}")

In [None]:
# Export recommendations
# 'selected_features' / 'feature_scores' hold the overall top 15; the
# per-category lists are drawn from ALL ranked features of that category,
# so they are not necessarily subsets of the top 15.
export_data = {
    'selected_features': final_recommendations.head(15).index.tolist(),
    'feature_scores': final_recommendations.head(15)['final_score'].to_dict(),
    'technical_features': final_recommendations[final_recommendations['category'] == 'Technical'].head(8).index.tolist(),
    'fundamental_features': final_recommendations[final_recommendations['category'] == 'Fundamental'].head(5).index.tolist(),
    'market_features': final_recommendations[final_recommendations['category'] == 'Market'].head(3).index.tolist()
}

# The emoji below were previously stored as mojibake; restored here.
print("\n📁 Feature Selection Summary:")
print(f"   Total features analyzed: {len(X.columns)}")
print(f"   Selected features: {len(export_data['selected_features'])}")
print(f"   - Technical: {len(export_data['technical_features'])}")
print(f"   - Fundamental: {len(export_data['fundamental_features'])}")
print(f"   - Market: {len(export_data['market_features'])}")

# Save to JSON for use in ML pipeline
output_path = Path('../src/selected_features.json')
# Create the target directory if needed so the write does not fail on a
# fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open('w', encoding='utf-8') as f:
    json.dump(export_data, f, indent=2)
print("\n✅ Feature selection saved to selected_features.json")

## Summary

### Key Findings:
1. **Most Important Technical Features**: Momentum indicators (MACD, RSI) and volatility measures (ATR, BB)
2. **Most Important Fundamental Features**: Valuation ratios (P/E, P/B) and profitability (ROE)
3. **Most Important Market Features**: Market correlation and sector relative strength

### Recommendations:
- Use top 10-15 features to avoid overfitting
- Include a mix of technical, fundamental, and market features
- Monitor feature stability over time
- Re-run analysis periodically to capture regime changes