# 02 - Feature Analysis

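A deep dive into feature engineering and distributions: this notebook builds the feature set from the cleaned BTC data, then examines pairwise feature correlations and the distributions of key features.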

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.features.build_features import build_and_scale_features
from src.features.feature_config import FEATURE_CONFIG

# Apply seaborn's 'whitegrid' style.
sns.set_style('whitegrid')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Build Features

In [None]:
# Read the processed BTC CSV and convert its 'date' column to datetime
# in a single chained expression.
df_clean = pd.read_csv('../data/processed/btc_clean.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Run the project's feature pipeline with the shared configuration. It hands
# back three objects, unpacked in order.
# NOTE(review): judging by the names these are the raw features, their scaled
# counterpart, and the fitted scaler - confirm against src.features.build_features.
df_features, df_scaled, scaler = build_and_scale_features(df_clean, FEATURE_CONFIG)

## Feature Correlations

In [None]:
# Upper bound on the number of columns shown, so the annotated cells stay legible.
MAX_HEATMAP_FEATURES = 15

# Keep numeric columns only, drop anything whose name mentions 'date',
# and cap the selection at MAX_HEATMAP_FEATURES (column order is preserved).
numeric_cols = df_features.select_dtypes(include=[np.number]).columns
feature_cols = [c for c in numeric_cols if 'date' not in c.lower()][:MAX_HEATMAP_FEATURES]

# Fail with a readable message instead of an opaque error from inside seaborn.
assert feature_cols, "No numeric feature columns available for the correlation heatmap"

corr_matrix = df_features[feature_cols].corr()

fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    # Pin the colour scale to the full correlation range so that zero sits at
    # the neutral midpoint of the diverging colormap and colours are comparable
    # between runs, rather than being stretched to the data's min/max.
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title('Feature Correlation Matrix', fontsize=16, fontweight='bold')
fig.tight_layout()
plt.show()

## Feature Distributions

In [None]:
# Features of interest; keep only those actually present in df_features.
key_features = ['return', 'volatility_5d', 'volatility_20d', 'momentum_10d', 'rsi', 'volume_ratio']
key_features = [f for f in key_features if f in df_features.columns]

# Fixed 2x3 grid: one panel per candidate feature.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

# zip pairs each surviving feature with its own panel.
for ax, feature in zip(axes, key_features):
    # Drop missing values before binning the histogram.
    ax.hist(df_features[feature].dropna(), bins=50, edgecolor='black')
    ax.set_title(f'{feature} Distribution', fontweight='bold')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

# Hide panels that received no feature (some candidates may have been
# filtered out above) so the figure does not show empty axes.
for unused_ax in axes[len(key_features):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()