# Feature Importance Analysis

**Purpose**: Understand which features drive model predictions

This notebook provides:
- LightGBM built-in feature importance
- SHAP value analysis
- Permutation importance
- Partial dependence plots
- Feature interaction detection

## Setup

In [None]:
import sys
# Put the parent directory first on the module search path. Presumably this is
# what makes `packages` (and possibly `notebook_utils`) importable when the
# notebook is run from its own folder — confirm against the repo layout.
sys.path.insert(0, '../')

# Project-local modules: data extraction, feature building, model training,
# and storage client construction.
from packages.training import FeatureExtractor, FeatureBuilder, ModelTrainer
from packages.storage import ClientFactory, get_connection_params
# NOTE(review): wildcard import hides where names come from. `setup_plotting`
# (called below) is presumably provided here — consider importing it by name.
from notebook_utils import *

# Third-party analysis stack.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from loguru import logger
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance, partial_dependence
import shap

# Apply the shared plotting configuration before any figure is created.
setup_plotting()

## Configuration

In [None]:
# --- Data selection ---------------------------------------------------------
NETWORK = 'ethereum'
START_DATE, END_DATE = '2024-01-01', '2024-02-29'
WINDOW_DAYS = 7

# --- Train/test split -------------------------------------------------------
TEST_SIZE = 0.2      # fraction of rows held out for evaluation
RANDOM_STATE = 42    # shared seed for splitting, sampling and permutation

# Echo the run parameters so the rendered notebook records what was analysed.
for summary_line in (
    f"Network: {NETWORK}",
    f"Analysis Period: {START_DATE} to {END_DATE}",
):
    print(summary_line)

## Load Data and Train Model

In [None]:
# Build a storage client factory for the configured network.
connection_params = get_connection_params(NETWORK)
client_factory = ClientFactory(connection_params)

# Keep the client open only for the duration of the extraction.
extraction_window = dict(
    start_date=START_DATE,
    end_date=END_DATE,
    window_days=WINDOW_DAYS,
)
with client_factory.client_context() as client:
    extractor = FeatureExtractor(client)
    data = extractor.extract_training_data(**extraction_window)

# Turn the extracted data into a feature matrix and a target.
builder = FeatureBuilder()
X, y = builder.build_training_features(data)

# Stratify on the target so both partitions keep the same class proportions.
split_parts = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

print(f"Data loaded: {X.shape}")
print(f"Features: {len(X.columns)}")

In [None]:
# Fit the alert-scoring model on the training partition using 5 CV folds.
trainer = ModelTrainer(model_type='alert_scorer')
training_output = trainer.train(X_train, y_train, cv_folds=5)

# `train` returns the fitted model wrapper together with a metrics mapping.
model, metrics = training_output

print(f"Model trained with AUC: {metrics['test_auc']:.4f}")

## LightGBM Built-in Feature Importance

In [None]:
# Rank every feature by the importance reported by the fitted estimator.
feature_importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': model.model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 20 Most Important Features:")
print(feature_importance_df.head(20).to_string(index=False))

# Clamp to the number of available features: with fewer than 20 columns,
# range(20) and the importance values would have different lengths and
# barh would raise a shape-mismatch error.
top_n = min(20, len(feature_importance_df))
top_features = feature_importance_df.head(top_n)

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(range(top_n), top_features['importance'])
ax.set_yticks(range(top_n))
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Importance')
ax.set_title(f'Top {top_n} Features by LightGBM Importance')
ax.invert_yaxis()  # most important feature at the top
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

## Permutation Importance

In [None]:
print("Calculating permutation importance (this may take a while)...")

# Shuffle each feature 10 times on the held-out set and score with ROC AUC.
perm_importance = permutation_importance(
    model.model,
    X_test,
    y_test,
    n_repeats=10,
    random_state=RANDOM_STATE,
    scoring='roc_auc',
)

# One row per feature: mean and spread of the score change across repeats.
perm_columns = {
    'feature': X.columns,
    'importance_mean': perm_importance.importances_mean,
    'importance_std': perm_importance.importances_std,
}
perm_importance_df = pd.DataFrame(perm_columns).sort_values(
    'importance_mean', ascending=False
)

print("\nTop 20 Features by Permutation Importance:")
print(perm_importance_df.head(20).to_string(index=False))

In [None]:
# Clamp to the number of available rows: with fewer than 20 features,
# range(20) and the plotted values would have different lengths and barh
# would raise a shape-mismatch error.
top_n = min(20, len(perm_importance_df))
top_perm = perm_importance_df.head(top_n)

fig, ax = plt.subplots(figsize=(10, 8))
# Error bars show the standard deviation across the permutation repeats.
ax.barh(range(top_n), top_perm['importance_mean'],
        xerr=top_perm['importance_std'], capsize=3)
ax.set_yticks(range(top_n))
ax.set_yticklabels(top_perm['feature'])
ax.set_xlabel('Permutation Importance')
ax.set_title(f'Top {top_n} Features by Permutation Importance')
ax.invert_yaxis()  # most important feature at the top
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

## SHAP Analysis - Initialization

In [None]:
print("Initializing SHAP explainer...")
explainer = shap.TreeExplainer(model.model)

# Cap the explained rows at 1000; the sample is seeded so it is reproducible.
sample_size = min(1000, len(X_test))
X_test_sample = X_test.sample(
    n=sample_size,
    random_state=RANDOM_STATE,
)

print(f"Calculating SHAP values for {sample_size} samples...")
# NOTE(review): later cells index shap_values as a single 2-D array; confirm
# that is what this explainer returns for the trained model.
shap_values = explainer.shap_values(X_test_sample)
print("SHAP values calculated")

## SHAP Summary Plot

In [None]:
# Bar-style SHAP summary; show=False defers rendering so a title can be added
# before the figure is displayed.
plt.figure(figsize=(10, 8))
shap.summary_plot(
    shap_values,
    X_test_sample,
    plot_type="bar",
    show=False,
)
plt.gca().set_title('SHAP Feature Importance')
plt.tight_layout()
plt.show()

In [None]:
# Default SHAP summary plot; show=False defers rendering so a title can be
# added before the figure is displayed.
plt.figure(figsize=(10, 10))
shap.summary_plot(
    shap_values,
    X_test_sample,
    show=False,
)
plt.gca().set_title('SHAP Summary Plot (Feature Impact)')
plt.tight_layout()
plt.show()

## SHAP Dependence Plots

In [None]:
# Dependence plots for the six features ranked highest by built-in importance.
top_features_for_shap = feature_importance_df.head(6)['feature'].tolist()

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

drawn_panels = set()
for idx, feature in enumerate(top_features_for_shap):
    if feature not in X_test_sample.columns:
        continue
    shap.dependence_plot(
        feature, shap_values, X_test_sample,
        ax=axes[idx], show=False
    )
    axes[idx].set_title(f'SHAP Dependence: {feature}')
    drawn_panels.add(idx)

# Hide panels that received no plot (fewer than six features, or a feature
# missing from the sample) so the grid does not show empty axes.
for idx, panel in enumerate(axes):
    if idx not in drawn_panels:
        panel.set_visible(False)

plt.tight_layout()
plt.show()

## SHAP Force Plot - Individual Predictions

In [None]:
shap.initjs()

# `y_test_sample` is not defined by any earlier cell, so this cell used to
# fail with NameError. Derive the labels for the sampled rows from y_test.
# Assumes y_test is a pandas Series sharing X_test's index — TODO confirm.
y_test_sample = y_test.loc[X_test_sample.index]

print("High risk prediction example:")
high_risk_candidates = y_test_sample[y_test_sample == 1].index
if len(high_risk_candidates) == 0:
    # Guard against a sample without positives instead of raising IndexError.
    print("No rows with label 1 in the SHAP sample; skipping force plot.")
    high_risk_plot = None
else:
    high_risk_idx = high_risk_candidates[0]
    high_risk_plot = shap.force_plot(
        explainer.expected_value,
        # shap_values is positional, so translate the index label to a row position.
        shap_values[X_test_sample.index.get_loc(high_risk_idx)],
        X_test_sample.loc[high_risk_idx]
    )

# Keep the plot as the cell's last expression so the notebook renders it.
high_risk_plot

In [None]:
# `y_test_sample` is not defined by any earlier cell in the original notebook;
# derive the labels for the sampled rows from y_test so this cell runs on its own.
# Assumes y_test is a pandas Series sharing X_test's index — TODO confirm.
y_test_sample = y_test.loc[X_test_sample.index]

print("Low risk prediction example:")
low_risk_candidates = y_test_sample[y_test_sample == 0].index
if len(low_risk_candidates) == 0:
    # Guard against a sample without negatives instead of raising IndexError.
    print("No rows with label 0 in the SHAP sample; skipping force plot.")
    low_risk_plot = None
else:
    low_risk_idx = low_risk_candidates[0]
    low_risk_plot = shap.force_plot(
        explainer.expected_value,
        # shap_values is positional, so translate the index label to a row position.
        shap_values[X_test_sample.index.get_loc(low_risk_idx)],
        X_test_sample.loc[low_risk_idx]
    )

# Keep the plot as the cell's last expression so the notebook renders it.
low_risk_plot

## Feature Interaction Analysis

In [None]:
# Interaction between the two features ranked highest by built-in importance.
top_2_features = feature_importance_df.head(2)['feature'].tolist()

if len(top_2_features) >= 2:
    print(f"Analyzing interaction between: {top_2_features[0]} and {top_2_features[1]}")

    shap_interaction_values = explainer.shap_interaction_values(X_test_sample)

    # Pass an explicit Axes: without `ax`, dependence_plot opens its own
    # figure, which left the figure created here blank and unused.
    fig, ax = plt.subplots(figsize=(10, 8))
    shap.dependence_plot(
        (top_2_features[0], top_2_features[1]),
        shap_interaction_values,
        X_test_sample,
        ax=ax,
        show=False
    )
    ax.set_title(f'Feature Interaction: {top_2_features[0]} × {top_2_features[1]}')
    plt.tight_layout()
    plt.show()

## Partial Dependence Plots

In [None]:
# Partial dependence for the four features ranked highest by built-in importance.
top_features_for_pd = feature_importance_df.head(4)['feature'].tolist()

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for idx, feature in enumerate(top_features_for_pd):
    feature_idx = X.columns.get_loc(feature)
    pd_result = partial_dependence(
        model.model, X_test, [feature_idx],
        grid_resolution=50
    )

    # scikit-learn 1.3 renamed the grid key from 'values' to 'grid_values'
    # and 1.5 removed the old key; support both so the cell runs on either.
    grid_key = 'grid_values' if 'grid_values' in pd_result else 'values'

    axes[idx].plot(pd_result[grid_key][0], pd_result['average'][0], linewidth=2)
    axes[idx].set_xlabel(feature)
    axes[idx].set_ylabel('Partial Dependence')
    axes[idx].set_title(f'Partial Dependence: {feature}')
    axes[idx].grid(True, alpha=0.3)

# Hide panels left over when fewer than four features are available.
for unused_panel in axes[len(top_features_for_pd):]:
    unused_panel.set_visible(False)

plt.tight_layout()
plt.show()

## Compare Importance Methods

In [None]:
# Mean absolute SHAP value per feature over the explained sample.
mean_abs_shap = np.abs(shap_values).mean(axis=0)
shap_importance = pd.DataFrame({
    'feature': X_test_sample.columns,
    'shap_importance': mean_abs_shap
}).sort_values('shap_importance', ascending=False)

# Join the three importance measures on the feature name, renaming each score
# column explicitly instead of relying on column position.
lightgbm_scores = feature_importance_df[['feature', 'importance']].rename(
    columns={'importance': 'lightgbm_importance'}
)
perm_scores = perm_importance_df[['feature', 'importance_mean']].rename(
    columns={'importance_mean': 'perm_importance'}
)
comparison_df = (
    lightgbm_scores
    .merge(perm_scores, on='feature')
    .merge(shap_importance, on='feature')
)

# The raw scores are on different scales, so compare ranks (1 = most important).
score_columns = ['lightgbm_importance', 'perm_importance', 'shap_importance']
rank_columns = {
    f'{col}_rank': comparison_df[col].rank(ascending=False)
    for col in score_columns
}
comparison_df = comparison_df.assign(**rank_columns)

comparison_df = comparison_df.sort_values('lightgbm_importance', ascending=False)

print("\nTop 15 Features - Comparison Across Methods:")
display_cols = ['feature', 'lightgbm_importance_rank', 'perm_importance_rank', 'shap_importance_rank']
print(comparison_df[display_cols].head(15).to_string(index=False))

In [None]:
top_features_comparison = comparison_df.head(15)

fig, ax = plt.subplots(figsize=(12, 8))
x = np.arange(len(top_features_comparison))
width = 0.25

# One bar group per feature: (horizontal offset, rank column, legend label).
rank_series = (
    (-width, 'lightgbm_importance_rank', 'LightGBM'),
    (0, 'perm_importance_rank', 'Permutation'),
    (width, 'shap_importance_rank', 'SHAP'),
)
for offset, rank_column, legend_label in rank_series:
    ax.bar(
        x + offset,
        top_features_comparison[rank_column],
        width,
        label=legend_label,
        alpha=0.8,
    )

ax.set(
    xlabel='Feature',
    ylabel='Rank (lower is better)',
    title='Feature Importance Ranking Comparison',
)
ax.set_xticks(x)
ax.set_xticklabels(top_features_comparison['feature'], rotation=45, ha='right')
ax.legend()
ax.invert_yaxis()  # flip the axis so rank 1 sits at the top
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

## Feature Groups Analysis

In [None]:
def categorize_feature(feature_name):
    """Assign a feature name to a coarse category by keyword.

    Matching is a case-insensitive substring test, and the groups are tried
    in the order listed below; the first group with a matching keyword wins.
    Because it is a substring test, a name such as 'account_age' matches
    'count'.

    Args:
        feature_name: Feature (column) name as a string.

    Returns:
        One of 'Volume', 'Count', 'Temporal', 'Ratio', 'Severity', or
        'Other' when no keyword matches.
    """
    keyword_groups = (
        ('Volume', ('volume', 'amount')),
        ('Count', ('count', 'number')),
        ('Temporal', ('time', 'date', 'duration')),
        ('Ratio', ('ratio', 'rate')),
        ('Severity', ('severity',)),
    )

    lowered = feature_name.lower()
    for category, keywords in keyword_groups:
        if any(keyword in lowered for keyword in keywords):
            return category
    return 'Other'

# Tag each feature with its category; the summary cell further down reads
# this column, so it is stored on the shared frame.
feature_importance_df['category'] = feature_importance_df['feature'].map(categorize_feature)

# Total built-in importance per category, largest first.
category_importance = (
    feature_importance_df
    .groupby('category')['importance']
    .sum()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
category_importance.plot(kind='bar', ax=ax)
ax.set_title('Feature Importance by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Total Importance')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

print("\nImportance by Feature Category:")
print(category_importance)

## Top Features Summary

In [None]:
top_n = 10
top_features_summary = feature_importance_df.head(top_n)

# Text table: feature name, built-in importance, category.
# Relies on the 'category' column added in the Feature Groups Analysis cell.
print("\n" + "="*70)
print(f"TOP {top_n} MOST IMPORTANT FEATURES")
print("="*70)
for row in top_features_summary.to_dict('records'):
    print(f"{row['feature']:40s} {row['importance']:>10.4f} ({row['category']})")
print("="*70)

# Share of the summed importance captured by the listed features.
top_n_importance = top_features_summary['importance'].sum()
total_importance = feature_importance_df['importance'].sum()
coverage = (top_n_importance / total_importance * 100)

print(f"\nTop {top_n} features account for {coverage:.1f}% of total importance")

## Conclusions

**Feature Importance Insights**:

1. **Top Features**: Review the most impactful features
2. **Method Consistency**: Compare rankings across different methods
3. **Feature Categories**: Understand which types of features matter most
4. **Interactions**: Identify important feature relationships

**Key Observations**:
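- SHAP provides directional impact: it shows whether a feature value pushes an individual prediction up (positive) or down (negative)
- Permutation importance measures how much the test-set ROC AUC drops when a feature's values are shuffled, so it reflects impact on held-out performance
- LightGBM's built-in importance is fast to compute but less interpretable: it has no direction and is not on the same scale as the other two methods, which is why the comparison above uses ranks
- Partial dependence shows how the average prediction changes across a feature's range, which reveals non-linear relationships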

**Next Steps**:
- Focus data collection on important features
- Consider feature engineering based on interactions
- Review Error Analysis to understand prediction failures
- Use insights for model refinement