# 01. Feature Exploration

This notebook explores the features generated by the analytics pipeline.
It connects to the `analyzers_features` table and provides statistical overviews and outlier analysis.

In [4]:
# Standard library.
import sys
# Helpers used below to open the database client, run a query into `df`, and
# set up plotting.
# NOTE(review): the saved output of this cell is
# "ModuleNotFoundError: No module named 'analytics_utils'" — presumably a
# project-local module; make sure it is installed or on sys.path before running.
from analytics_utils import get_db_client, query_to_df, setup_plotting
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Presumably applies the shared plot styling before any figure is drawn —
# TODO confirm against analytics_utils.
setup_plotting()

ModuleNotFoundError: No module named 'analytics_utils'

## Configuration
Set the target network, the pipeline processing date, and the feature window size.

In [1]:
# Target network; passed to get_db_client() to obtain the database client.
NETWORK = 'torus'  # e.g., 'torus', 'ethereum', 'bitcoin'
# Pipeline run date as 'YYYY-MM-DD'; matched against `processing_date` in the query.
PROCESSING_DATE = '2025-04-18' # DATE of the pipeline run
# Feature window in days; matched against `window_days` in the query.
WINDOW_DAYS = 120 # Window size used for feature calculation

## Data Loading
Fetch features from ClickHouse.

In [None]:
# Open a database client for the configured network.
client = get_db_client(NETWORK)

# Select every column of analyzers_features for a single processing date and
# window size. The values are interpolated straight into the SQL text, so keep
# PROCESSING_DATE a 'YYYY-MM-DD' string and WINDOW_DAYS an integer.
query = f"""
SELECT *
FROM analyzers_features
WHERE processing_date = toDate('{PROCESSING_DATE}')
AND window_days = {WINDOW_DAYS}
"""

# Run the query and report how many rows came back.
# NOTE(review): presumably returns a pandas DataFrame (later cells use
# df.columns and df.describe()) — confirm against analytics_utils.
df = query_to_df(client, query)
print(f"Loaded {len(df)} rows")
# Preview the first three rows.
df.head(3)

## Feature Statistics
Review basic statistics for numerical features.

In [None]:
# Numerical features reviewed in this and the following cells.
features_to_analyze = [
    'total_volume_usd', 'degree_total', 'pagerank',
    'flow_concentration', 'reciprocity_ratio', 'centrality_score'
]

# Work only with the requested features that the loaded frame actually has:
# selecting a missing column from df raises KeyError, so the same filter must
# cover both the cast and the describe() call.
features_present = [col for col in features_to_analyze if col in df.columns]
features_missing = [col for col in features_to_analyze if col not in df.columns]
if features_missing:
    # Surface the gap instead of silently dropping columns from the summary.
    print(f"Skipping features not found in df: {features_missing}")

# Convert Decimal columns to float for analysis if needed.
# Re-running the cell is safe: casting an already-float column is a no-op.
for col in features_present:
    df[col] = df[col].astype(float)

# Summary statistics (count, mean, std, min, quartiles, max) per feature.
df[features_present].describe()

## Distributions
Visualizing key feature distributions.

In [None]:
# Plot only the features that exist in df so every panel drawn has data.
plot_cols = [col for col in features_to_analyze if col in df.columns]

# Size the grid from the number of features instead of assuming exactly six.
# Keep at least one row so plt.subplots() always receives a valid shape.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(plot_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols, figsize=(18, 5 * n_grid_rows), squeeze=False
)
axes = axes.flatten()

for ax, col in zip(axes, plot_cols):
    # log_scale=(False, True): linear feature axis, logarithmic count axis.
    sns.histplot(df[col], bins=50, ax=ax, log_scale=(False, True))
    ax.set_title(f'{col} Distribution (Log Scale)')

# Hide grid cells that have no feature assigned, rather than showing empty frames.
for ax in axes[len(plot_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## Correlations
Identify relationships between features.

In [None]:
# Correlate only the requested features that are present in df; selecting a
# missing column would raise KeyError.
corr_cols = [col for col in features_to_analyze if col in df.columns]

# Pairwise correlation between the selected features.
# NOTE: relies on these columns being numeric; the Feature Statistics cell
# casts them to float.
corr_matrix = df[corr_cols].corr()

# Explicit figure/axes so the heatmap and its title target the same axes.
fig, ax = plt.subplots(figsize=(10, 8))
# center=0 puts the midpoint of the diverging colormap at zero correlation.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.show()

## Top Risk Addresses
Rank addresses by `centrality_score` and list the top 20 as potential high-risk actors, with PageRank, volume, and degree shown for context.

In [None]:
# Columns shown for each ranked address.
display_cols = [
    'address',
    'centrality_score',
    'pagerank',
    'total_volume_usd',
    'degree_total',
]

# Order by centrality, highest first, and keep the 20 leading rows.
high_risk = df.sort_values(by='centrality_score', ascending=False).iloc[:20]

high_risk.loc[:, display_cols]