# Data EDA

This notebook profiles the generated raw, processed, and feature datasets: row counts, per-drone risk summaries, and distributions of the main ceiling-risk features.

In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Locate the data directory: use ./data when it exists relative to the current
# working directory, otherwise fall back to ../data.
data_here = Path('data')
base = data_here if data_here.exists() else Path('../data')
raw_dir = base / 'raw'
processed_dir = base / 'processed'

# Raw inputs.
raw_telemetry = pd.read_csv(raw_dir / 'telemetry.csv')
raw_weather = pd.read_csv(raw_dir / 'weather.csv')

# Derived tables; both are read from the processed directory.
processed = pd.read_csv(processed_dir / 'telemetry_processed.csv')
features = pd.read_csv(processed_dir / 'ceiling_risk_features.csv')

# Preview the first rows of the raw telemetry table.
raw_telemetry.head()

In [None]:
# Row count of every loaded table, followed by the number of distinct
# `drone_id` values found in the raw telemetry.
summary = dict(
    raw_telemetry_rows=len(raw_telemetry),
    raw_weather_rows=len(raw_weather),
    processed_rows=len(processed),
    feature_rows=len(features),
)
summary['drones'] = raw_telemetry['drone_id'].nunique()

# Render the counts as a labelled Series.
pd.Series(summary)

In [None]:
def _p95(scores):
    """Return the 95th percentile of a Series of scores."""
    return scores.quantile(0.95)


# Per-drone summary: mean and 95th-percentile risk score, plus the mean of the
# `ceiling_cross_within_8s` column.
grouped = (
    features
    .groupby('drone_id', as_index=False)
    .agg(
        avg_risk=pd.NamedAgg(column='risk_score', aggfunc='mean'),
        p95_risk=pd.NamedAgg(column='risk_score', aggfunc=_p95),
        alert_share=pd.NamedAgg(column='ceiling_cross_within_8s', aggfunc='mean'),
    )
)
grouped

In [None]:
# Parse the timestamps on a separate frame so `features` keeps its original
# `timestamp_iso` column untouched.
plot_df = features.assign(timestamp_iso=pd.to_datetime(features['timestamp_iso']))

fig, ax = plt.subplots(figsize=(10, 4))

# One line per drone, labelled with its id for the legend.
for drone_id, part in plot_df.groupby('drone_id'):
    times = part['timestamp_iso']
    scores = part['risk_score']
    ax.plot(times, scores, label=drone_id)

ax.set(
    title='Risk Score Over Time by Drone',
    xlabel='Timestamp',
    ylabel='Risk Score',
)
ax.legend()
ax.grid(alpha=0.3)
plt.tight_layout()

In [None]:
# Coerce the column to numeric (unparseable entries become NaN), then discard
# the NaNs so only real values reach the histogram.
ttc_column = features['time_to_ceiling_cross_sec']
ttc = pd.to_numeric(ttc_column, errors='coerce')
ttc = ttc.dropna()

fig, ax = plt.subplots(figsize=(7, 4))
ax.hist(ttc, bins=20)
ax.set(
    title='Time-to-Ceiling-Cross Distribution (<=8s windows)',
    xlabel='Seconds',
    ylabel='Count',
)
ax.grid(alpha=0.3)
plt.tight_layout()

## Feature Inputs for Altitude Warning

This section isolates the primary model inputs so readers can quickly understand which features drive near-term ceiling-risk indicators.

In [None]:
# Columns examined in this section; the last one is used below to split the
# scatter plot into groups.
feature_cols = [
    'altitude_margin_to_ceiling_ft',
    'vertical_speed_fps',
    'weather_stress_factor',
    'predicted_altitude_ft_8s',
    'ceiling_cross_within_8s',
]
plot_features = features[feature_cols].copy()

fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Three histogram panels described as (column, axes, title, x-axis label).
histogram_panels = [
    (
        'altitude_margin_to_ceiling_ft',
        axes[0, 0],
        'Altitude Margin to Ceiling (ft)',
        'Margin (ft)',
    ),
    (
        'vertical_speed_fps',
        axes[0, 1],
        'Vertical Speed (ft/s)',
        'Vertical Speed (ft/s)',
    ),
    (
        'weather_stress_factor',
        axes[1, 0],
        'Weather Stress Factor (0..1)',
        'Weather Stress',
    ),
]
for column, panel, heading, x_label in histogram_panels:
    plot_features[column].hist(ax=panel, bins=25)
    panel.set_title(heading)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Count')

# Fourth panel: predicted altitude against ceiling margin, one scatter series
# per distinct value of `ceiling_cross_within_8s`.
scatter_ax = axes[1, 1]
for label, group in plot_features.groupby('ceiling_cross_within_8s'):
    scatter_ax.scatter(
        group['predicted_altitude_ft_8s'],
        group['altitude_margin_to_ceiling_ft'],
        alpha=0.6,
        label=f'target={label}',
    )
scatter_ax.set_title('Projection vs Margin by Target')
scatter_ax.set_xlabel('Predicted Altitude in 8s (ft)')
scatter_ax.set_ylabel('Altitude Margin to Ceiling (ft)')
scatter_ax.legend()

# Apply the same light grid to every panel.
for ax in axes.flat:
    ax.grid(alpha=0.3)

plt.tight_layout()