# 02 - Feature Analysis

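Exploratory analysis of trading features: computes returns, RSI, Bollinger Bands, ATR and volume ratios from OHLCV bars, visualizes them, and saves the resulting feature table.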

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import yaml

# Apply matplotlib's 'dark_background' style sheet to the figures drawn below.
plt.style.use('dark_background')
# Use seaborn's 'husl' palette as the default colour cycle.
# NOTE(review): presumably this has to stay after style.use() so the style
# sheet does not reset the colour cycle — confirm before reordering.
sns.set_palette('husl')

In [None]:
# Load configuration
# The path is relative, so it is resolved against the current working
# directory of the kernel running this notebook.
config_path = Path('../config/default.yaml')
# Decode explicitly as UTF-8: without an encoding argument open() uses the
# platform's locale encoding, which can mis-read non-ASCII characters in the
# YAML file on some systems.
with open(config_path, encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Market symbol from the config; used to build the input and output file names.
SYMBOL = config['market']['symbol']

In [None]:
# Load data
# Raw bars for SYMBOL are read from a parquet file under ../data/raw.
data_path = Path('../data/raw') / f'{SYMBOL}_1m.parquet'

if data_path.exists():
    data = pd.read_parquet(data_path)
    print(f"Loaded {len(data)} rows")
else:
    # Create synthetic data
    # No file on disk: fall back to a seeded geometric random walk so the rest
    # of the notebook still runs and produces the same numbers on every run.
    print("Creating synthetic data...")
    n = 10000
    np.random.seed(42)
    returns = np.random.randn(n) * 0.001
    prices = 50000 * np.exp(np.cumsum(returns))

    # Random numbers are drawn in the order open, high, low, volume.
    open_prices = prices * (1 + np.random.randn(n) * 0.0001)
    # Anchor the wicks on the candle body (max/min of open and close) so that
    # low <= open, close <= high holds for every bar. Anchoring them on the
    # close alone lets the open fall outside the [low, high] range.
    body_top = np.maximum(open_prices, prices)
    body_bottom = np.minimum(open_prices, prices)
    high_prices = body_top * (1 + np.abs(np.random.randn(n) * 0.0005))
    low_prices = body_bottom * (1 - np.abs(np.random.randn(n) * 0.0005))

    data = pd.DataFrame({
        'open': open_prices,
        'high': high_prices,
        'low': low_prices,
        'close': prices,
        'volume': np.random.randint(10, 100, n) * 0.1,
    }, index=pd.date_range(start='2024-01-01', periods=n, freq='1min'))

# Last expression of the cell: shows the first rows as the cell output.
data.head()

## Calculate Features

In [None]:
# Indicator settings are read from the features.technical section of the config.
technical_cfg = config['features']['technical']

rsi_period = technical_cfg['rsi']['period']
bollinger_cfg = technical_cfg['bollinger']
bb_period = bollinger_cfg['period']
bb_std = bollinger_cfg['std_dev']
atr_period = technical_cfg['atr']['period']

# Echo the values so the notebook output records which settings were used.
for summary_line in (
    f"RSI period: {rsi_period}",
    f"Bollinger period: {bb_period}, std: {bb_std}",
    f"ATR period: {atr_period}",
):
    print(summary_line)

In [None]:
# Calculate returns
# fill_method=None: do not forward-fill gaps in 'close' before differencing,
# so a missing bar yields NaN (dropped below) instead of a fabricated return.
# It also avoids relying on pct_change's deprecated implicit forward-fill.
data['returns'] = data['close'].pct_change(fill_method=None)

# RSI (rolling simple-mean variant)
delta = data['close'].diff()
# clip() keeps the leading NaN produced by diff(), so the first window is not
# averaged over a made-up zero change (where(..., 0) turns that NaN into 0).
gain = delta.clip(lower=0).rolling(rsi_period).mean()
loss = (-delta).clip(lower=0).rolling(rsi_period).mean()
# loss == 0 with gain > 0 gives rs = inf and therefore RSI = 100;
# gain == loss == 0 gives NaN, and that row is removed by dropna() below.
rs = gain / loss
data['rsi'] = 100 - (100 / (1 + rs))

# Bollinger Bands
# Note: the 'bb_std' column is the rolling standard deviation, while the
# bb_std variable is the band-width multiplier read from the config.
data['bb_middle'] = data['close'].rolling(bb_period).mean()
data['bb_std'] = data['close'].rolling(bb_period).std()
data['bb_upper'] = data['bb_middle'] + bb_std * data['bb_std']
data['bb_lower'] = data['bb_middle'] - bb_std * data['bb_std']
# Band width relative to the middle band.
data['bb_width'] = (data['bb_upper'] - data['bb_lower']) / data['bb_middle']

# ATR
# True range = largest of (high - low), |high - previous close| and
# |low - previous close|. max(axis=1) skips NaN, so the first row (which has
# no previous close) falls back to high - low.
high_low = data['high'] - data['low']
high_close = (data['high'] - data['close'].shift()).abs()
low_close = (data['low'] - data['close'].shift()).abs()
tr = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
data['atr'] = tr.rolling(atr_period).mean()

# Volume features
# Window of the volume moving average, in bars (not part of the config).
volume_sma_period = 20
data['volume_sma'] = data['volume'].rolling(volume_sma_period).mean()
# Current volume relative to its rolling average.
data['volume_ratio'] = data['volume'] / data['volume_sma']

# Drop NaN
# Removes the warm-up rows of the rolling windows, and any other row that has
# a NaN in any column; dropna() returns a new frame and leaves `data` as is.
features = data.dropna()
print(f"Features shape: {features.shape}")

## Feature Visualization

In [None]:
# Plot price and features
# Every panel shows only the latest 500 rows of the feature table.
recent = features.iloc[-500:]
timestamps = recent.index

fig, axes = plt.subplots(4, 1, figsize=(14, 12), sharex=True)
price_ax, rsi_ax, atr_ax, volume_ax = axes

# Price with Bollinger Bands
price_ax.plot(timestamps, recent['close'], label='Close', linewidth=1)
price_ax.fill_between(
    timestamps,
    recent['bb_lower'],
    recent['bb_upper'],
    alpha=0.2,
    label='BB',
)
price_ax.set_title('Price with Bollinger Bands')
price_ax.legend()

# RSI, with dashed guide lines at 70 (red) and 30 (green)
rsi_ax.plot(timestamps, recent['rsi'], color='yellow')
for level, colour in ((70, 'r'), (30, 'g')):
    rsi_ax.axhline(y=level, color=colour, linestyle='--', alpha=0.5)
rsi_ax.set_title('RSI')
rsi_ax.set_ylim(0, 100)

# ATR
atr_ax.plot(timestamps, recent['atr'], color='cyan')
atr_ax.set_title('ATR')

# Volume bars with the rolling average drawn on top
volume_ax.bar(timestamps, recent['volume'], width=0.0005, alpha=0.7)
volume_ax.plot(timestamps, recent['volume_sma'], color='orange', label='SMA')
volume_ax.set_title('Volume')
volume_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Feature correlations
# Pairwise correlation between the derived features, drawn as an annotated
# heatmap with the colour scale centred on zero.
feature_cols = ['returns', 'rsi', 'bb_width', 'atr', 'volume_ratio']
selected = features.loc[:, feature_cols]
correlation = selected.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
)
plt.title('Feature Correlation Matrix')
plt.show()

In [None]:
# Feature distributions
# One 50-bin histogram per feature, laid out row by row on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(14, 8))

histogram_panels = [
    ('returns', 'Returns Distribution'),
    ('rsi', 'RSI Distribution'),
    ('bb_width', 'BB Width Distribution'),
    ('atr', 'ATR Distribution'),
    ('volume_ratio', 'Volume Ratio Distribution'),
]

# axes.flat walks the grid in row-major order, matching the table above.
for ax, (column, title) in zip(axes.flat, histogram_panels):
    ax.hist(features[column].dropna(), bins=50, alpha=0.7, edgecolor='white')
    ax.set_title(title)

# Five features on six axes: hide the unused bottom-right panel.
axes[1, 2].axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Save processed features
# Write the NaN-free feature table as parquet under ../data/processed.
output_path = Path('../data/processed', f'{SYMBOL}_features.parquet')

# Create the destination directory from the final path's parent, so a SYMBOL
# containing a path separator still gets its directory created.
target_dir = output_path.parent
target_dir.mkdir(exist_ok=True, parents=True)

features.to_parquet(output_path)
print(f"Saved features to {output_path}")