# Feature Analysis

**Purpose**: Analyze engineered features for model training

This notebook helps you:
- Build features using FeatureBuilder
- Analyze feature distributions
- Detect correlations and multicollinearity
- Identify outliers
- Validate feature engineering

## Setup

In [None]:
# Put the parent directory (relative to the working directory) first on the
# import path so the project-local imports below can be resolved.
import sys
sys.path.insert(0, '../')

# Project-local modules: training pipeline classes and storage client helpers.
from packages.training import FeatureExtractor, FeatureBuilder, ModelTrainer
from packages.storage import ClientFactory, get_connection_params
# NOTE(review): wildcard import. The helpers used later in this notebook
# (setup_plotting, plot_feature_distributions, plot_correlation_matrix) are
# not imported by name anywhere, so they presumably come from here — confirm
# against notebook_utils.
from notebook_utils import *

# Third-party analysis and plotting stack.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from loguru import logger
from scipy import stats

# Plotting setup helper; called once before any figure is drawn
# (see notebook_utils for what it actually configures).
setup_plotting()

## Configuration

In [None]:
# Analysis parameters consumed by the extraction cell further down:
# which network to connect to, the date range to pull, and the window length.
NETWORK = 'ethereum'
START_DATE, END_DATE = '2024-01-01', '2024-01-31'
WINDOW_DAYS = 7

# Echo the active configuration, one setting per line.
print(
    f"Network: {NETWORK}",
    f"Date Range: {START_DATE} to {END_DATE}",
    f"Window: {WINDOW_DAYS} days",
    sep="\n",
)

## Extract Training Data

In [None]:
# Resolve the connection settings for the configured network and build the
# factory whose context manager yields a client.
connection_params = get_connection_params(NETWORK)
client_factory = ClientFactory(connection_params)

# Extraction arguments, gathered in one place.
extraction_kwargs = {
    'start_date': START_DATE,
    'end_date': END_DATE,
    'window_days': WINDOW_DAYS,
}

# The client is only needed while extracting, so it lives inside the
# context manager; only the extracted data is kept afterwards.
with client_factory.client_context() as client:
    extractor = FeatureExtractor(client)
    data = extractor.extract_training_data(**extraction_kwargs)

# Quick sanity check: overall shape plus a preview of the first ten columns.
leading_columns = data.columns.tolist()[:10]
print(f"Extracted data shape: {data.shape}")
print(f"Columns: {leading_columns}...")

## Build Features

In [None]:
# Turn the extracted data into a feature matrix (X) and a target (y).
builder = FeatureBuilder()
X, y = builder.build_training_features(data)

# Shapes first, then a preview of the first fifteen feature names.
feature_preview = X.columns.tolist()[:15]
print(f"Feature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeatures: {feature_preview}...")

# Number of samples per target value, to spot class imbalance early.
print("\nClass distribution:")
print(y.value_counts())

## Feature Summary Statistics

In [None]:
# Descriptive statistics for the feature matrix. As the last expression in
# the cell, the resulting table is rendered as the cell output.
X.describe()

## Feature Distributions

In [None]:
# Column names of the feature matrix; reused by the analysis cells below.
feature_cols = X.columns.tolist()
print(f"Total features: {len(feature_cols)}")

# First batch of six features. The slice has six entries only when at least
# six features exist; with fewer, nothing is plotted.
leading_batch = feature_cols[:6]
if len(leading_batch) == 6:
    plot_feature_distributions(X, leading_batch)
    plt.show()

In [None]:
# Second batch: features 7 through 12. The slice is full (six entries) only
# when at least twelve features exist.
# NOTE(review): with 7-11 features this cell draws nothing, so those features
# never get a distribution plot — confirm whether plot_feature_distributions
# accepts a shorter list before relaxing the guard.
trailing_batch = feature_cols[6:12]
if len(trailing_batch) == 6:
    plot_feature_distributions(X, trailing_batch)
    plt.show()

## Check for Missing Values

In [None]:
# Per-feature count of missing entries and the same figure as a percentage
# of all rows (two decimals).
missing = X.isna().sum()
missing_pct = missing.div(len(X)).mul(100).round(2)

missing_df = pd.DataFrame({'Missing Count': missing, 'Percentage': missing_pct})

# Keep only the features that actually have gaps, worst offenders first.
has_gaps = missing_df['Missing Count'] > 0
missing_df = missing_df.loc[has_gaps].sort_values('Missing Count', ascending=False)

if missing_df.empty:
    print("No missing values found")
else:
    print("Features with missing values:")
    print(missing_df)

## Feature Correlation Matrix

In [None]:
# A heatmap over more than 30 features is unreadable, so fall back to the
# first 20 columns in that case; otherwise plot the whole matrix.
if len(feature_cols) > 30:
    print(f"Too many features ({len(feature_cols)}) for full correlation matrix")
    print("Showing correlation for first 20 features:")
    plot_correlation_matrix(X[feature_cols[:20]], figsize=(14, 12))
else:
    plot_correlation_matrix(X, figsize=(14, 12))

# Either branch produced exactly one figure; display it.
plt.show()

## High Correlation Detection

In [None]:
# Single source of truth for the cut-off, so the filter and the messages
# below cannot drift apart.
CORR_THRESHOLD = 0.8

# Absolute pairwise correlations: for redundancy detection the sign of the
# relationship does not matter.
corr_matrix = X.corr().abs()

# Strict upper triangle (k=1 excludes the diagonal): every feature pair is
# considered exactly once and self-correlations are ignored.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_triangle = corr_matrix.where(upper_mask)

# Find qualifying cells with one vectorized, positional comparison rather
# than two label-based scalar lookups per cell. Positional access also keeps
# working when column labels are duplicated, where a label lookup would
# return a non-scalar. NaN compares False, so undefined correlations are
# skipped. Transposing before nonzero() yields (column, row) positions in
# column-then-row order, i.e. the order of a nested column/row scan.
corr_values = corr_matrix.to_numpy()
col_positions, row_positions = np.nonzero(
    (upper_mask & (corr_values > CORR_THRESHOLD)).T
)

# Each entry: (column label, row label, absolute correlation).
high_corr = [
    (corr_matrix.columns[c], corr_matrix.index[r], corr_values[r, c])
    for c, r in zip(col_positions, row_positions)
]

if high_corr:
    print(f"Found {len(high_corr)} feature pairs with correlation > {CORR_THRESHOLD}:")
    # Only the ten strongest pairs are listed; the count above covers all.
    strongest_first = sorted(high_corr, key=lambda pair: pair[2], reverse=True)
    for feat1, feat2, corr_val in strongest_first[:10]:
        print(f"  {feat1} <-> {feat2}: {corr_val:.3f}")
else:
    print(f"No highly correlated features found (threshold: {CORR_THRESHOLD})")

## Outlier Detection

In [None]:
outlier_summary = []

# IQR fences on the first ten features: a value is an outlier when it lies
# more than 1.5 x IQR below the first quartile or above the third quartile.
for col in feature_cols[:10]:
    values = X[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    reach = 1.5 * (q3 - q1)

    too_low = values < q1 - reach
    too_high = values > q3 + reach
    outliers = (too_low | too_high).sum()

    outlier_summary.append({
        'Feature': col,
        'Outliers': outliers,
        # Share of all rows, in percent, rounded to two decimals.
        'Percentage': (outliers / len(X) * 100).round(2),
    })

# Features with the largest share of outliers come first.
outlier_df = pd.DataFrame(outlier_summary).sort_values('Percentage', ascending=False)
print("Outlier Summary (first 10 features):")
print(outlier_df)

## Box Plots for Outlier Visualization

In [None]:
# One box plot per feature on a 2x3 grid; drawn only when at least six
# features are available to fill the grid.
boxplot_cols = feature_cols[:6]
if len(boxplot_cols) == 6:
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.flatten()

    for ax, col in zip(axes, boxplot_cols):
        # Missing values are dropped before the series is handed to boxplot.
        ax.boxplot(X[col].dropna())
        ax.set_title(f'{col}')
        ax.set_ylabel('Value')

    plt.tight_layout()
    plt.show()

## Feature Scaling Analysis

In [None]:
# Range and spread of the first ten features, to judge whether their scales
# differ enough to matter.
scaling_cols = feature_cols[:10]

feature_ranges = pd.DataFrame({
    'Feature': scaling_cols,
    'Min': [X[name].min() for name in scaling_cols],
    'Max': [X[name].max() for name in scaling_cols],
    'Mean': [X[name].mean() for name in scaling_cols],
    'Std': [X[name].std() for name in scaling_cols],
})

print("Feature Scaling Summary (first 10 features):")
print(feature_ranges.round(4))

## Feature Distribution by Target Class

In [None]:
# Overlaid histograms of the first four features, one colour per target
# class, on a 2x2 grid. Only the first two distinct target values are shown.
if len(feature_cols) >= 4:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    axes = axes.flatten()
    shown_classes = y.unique()[:2]

    for ax, col in zip(axes, feature_cols[:4]):
        for class_val in shown_classes:
            in_class = y == class_val
            ax.hist(X.loc[in_class, col], bins=30, alpha=0.6, label=f'Class {class_val}')

        ax.set_title(f'{col} by Target Class')
        ax.set_xlabel('Value')
        ax.set_ylabel('Frequency')
        ax.legend()

    plt.tight_layout()
    plt.show()

## Feature Importance Preview (Correlation with Target)

In [None]:
# Work on a copy that carries the target as an extra column, so X itself
# stays untouched.
X_with_target = X.assign(target=y)

# Correlation of every feature with the target: take the 'target' column of
# the correlation matrix, drop its self-correlation, and rank by magnitude.
full_corr = X_with_target.corr()
target_corr = (
    full_corr['target']
    .drop('target')
    .abs()
    .sort_values(ascending=False)
)
top_ranked = target_corr.head(15)

print("Top 15 features by correlation with target:")
print(top_ranked.round(4))

# Horizontal bars; the y-axis is inverted so the strongest feature is on top.
plt.figure(figsize=(10, 6))
top_ranked.plot(kind='barh')
plt.title('Top 15 Features by Absolute Correlation with Target')
plt.xlabel('Absolute Correlation')
plt.ylabel('Feature')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## Scatter Plots for Top Correlated Features

In [None]:
# The four features most correlated with the target (head(4) returns at most
# four, so the grid is drawn only when all four slots can be filled).
top_features = target_corr.head(4).index.tolist()

if len(top_features) == 4:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    axes = axes.flatten()
    shown_classes = y.unique()[:2]

    for ax, feat in zip(axes, top_features):
        # One scatter series per class: feature value against target value.
        for class_val in shown_classes:
            in_class = y == class_val
            ax.scatter(X.loc[in_class, feat], y[in_class], alpha=0.5, label=f'Class {class_val}')

        ax.set_title(f'{feat} vs Target')
        ax.set_xlabel(feat)
        ax.set_ylabel('Target')
        ax.legend()

    plt.tight_layout()
    plt.show()

## Conclusions

**Key Findings** (areas to review after running the cells above):

1. **Feature Quality**: Review missing values and outliers
2. **Multicollinearity**: Check for highly correlated features
3. **Scaling**: Understand feature value ranges
4. **Predictive Power**: Identify features correlated with target
5. **Class Separation**: Analyze feature distributions by class

**Next Steps**:
- Remove or combine highly correlated features
- Handle outliers if necessary
- Proceed to Model Training notebook
- Consider feature selection strategies