# NTSB Aviation Accident Database Analysis

Quick starter notebook for analyzing NTSB aviation accident data.

## Setup

First, ensure you've extracted the tables from the MDB file to CSV (this notebook reads `data/events.csv` and `data/aircraft.csv`):
```fish
./scripts/extract_all_tables.fish datasets/avall.mdb
```

In [None]:
# Import required libraries
from pathlib import Path

import duckdb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Configure visualization
# Apply the 'seaborn-v0_8-darkgrid' Matplotlib style sheet to all figures below.
plt.style.use('seaborn-v0_8-darkgrid')
# Use seaborn's 'husl' palette as the default colour cycle.
sns.set_palette('husl')
# Render figures inline in the notebook output.
%matplotlib inline

## Load Data

Load the main events table (events from 2008 onward) using DuckDB, which queries the CSV file directly with SQL.

In [None]:
# Read the events CSV through DuckDB (SQL straight on the file),
# keeping only rows whose ev_year is 2008 or later.
events_sql = """
    SELECT *
    FROM 'data/events.csv'
    WHERE ev_year >= 2008
"""
events = duckdb.query(events_sql).df()

# Report the row count, then preview the first rows.
print(f"Loaded {len(events):,} events")
events.head()

In [None]:
# Basic statistics: print the DataFrame summary (columns, dtypes, non-null counts)
events.info()

## Exploratory Analysis

In [None]:
# Count of event rows per ev_year, drawn as a line chart with point markers.
accidents_by_year = events.groupby('ev_year').size()

fig, ax = plt.subplots(figsize=(12, 6))
accidents_by_year.plot.line(ax=ax, marker='o')
ax.set(
    title='Aviation Accidents by Year',
    xlabel='Year',
    ylabel='Number of Accidents',
)
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Ten most frequent ev_type values, drawn as horizontal bars.
event_types = events['ev_type'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(10, 6))
event_types.plot.barh(ax=ax)
ax.set(
    title='Top 10 Event Types',
    xlabel='Count',
    ylabel='Event Type',
)
fig.tight_layout()
plt.show()

In [None]:
# Sum of the inj_tot_f column per ev_year, drawn as vertical bars.
fatalities_by_year = events.groupby('ev_year')['inj_tot_f'].sum()

fig, ax = plt.subplots(figsize=(12, 6))
# rot=45 tilts the year labels on the x-axis.
fatalities_by_year.plot.bar(ax=ax, rot=45)
ax.set(
    title='Total Fatalities by Year',
    xlabel='Year',
    ylabel='Total Fatalities',
)
fig.tight_layout()
plt.show()

## Geographic Analysis

In [None]:
# Fifteen most frequent ev_state values, drawn as vertical bars.
state_counts = events['ev_state'].value_counts().head(15)

fig, ax = plt.subplots(figsize=(12, 6))
# rot=45 tilts the state labels on the x-axis.
state_counts.plot.bar(ax=ax, rot=45)
ax.set(
    title='Top 15 States by Accident Count',
    xlabel='State',
    ylabel='Number of Accidents',
)
fig.tight_layout()
plt.show()

## Join with Aircraft Data

In [None]:
# Read the aircraft table from CSV.
aircraft = pd.read_csv('data/aircraft.csv')

# Left-join on the shared ev_id key so every row of `events` is kept,
# whether or not a matching aircraft row exists.
events_aircraft = pd.merge(events, aircraft, how='left', on='ev_id')

# Report the joined row count, then preview the first rows.
print(f"Joined dataset: {len(events_aircraft):,} records")
events_aircraft.head()

In [None]:
# Top aircraft makes involved in accidents.
#
# Count DISTINCT ev_id values per make rather than rows: `events_aircraft`
# is a left join of events onto aircraft, so an event with more than one
# aircraft row of the same make would otherwise be counted several times
# under a title that promises an accident count.
# Rows with a missing acft_make are excluded (groupby drops NaN keys).
# NOTE(review): make strings are grouped verbatim; if the data mixes
# spellings or letter case for one manufacturer they are counted
# separately — verify against the extracted CSV.
top_makes = (
    events_aircraft
    .groupby('acft_make')['ev_id']
    .nunique()
    .nlargest(10)
)

fig, ax = plt.subplots(figsize=(10, 6))
top_makes.plot.barh(ax=ax)
ax.set(
    title='Top 10 Aircraft Makes by Accident Count',
    xlabel='Number of Accidents',
    ylabel='Aircraft Make',
)
fig.tight_layout()
plt.show()

## Advanced Queries with DuckDB

In [None]:
# Complex query: fatal events (inj_tot_f > 0) from 2020 onward, joined to
# their aircraft rows and ranked by fatality count (highest first), then
# by event date (latest first). Only the first 20 rows are kept.
#
# The trailing ORDER BY keys (ev_id, acft_make, acft_model) are
# tie-breakers: without them, rows sharing the same fatality count and
# date have no defined relative order, so which of them survive LIMIT 20
# could differ between runs.
# NOTE(review): `ev_date DESC` is chronological only if DuckDB infers a
# date/timestamp type for that CSV column — confirm against the file.
# NOTE(review): the LEFT JOIN yields one row per matching aircraft, so an
# event with several aircraft rows can appear more than once — presumably
# intended; verify.
fatal_query = """
    SELECT 
        e.ev_id,
        e.ev_date,
        e.ev_state,
        e.ev_city,
        e.inj_tot_f as fatalities,
        a.acft_make,
        a.acft_model
    FROM 'data/events.csv' e
    LEFT JOIN 'data/aircraft.csv' a
        ON e.ev_id = a.ev_id
    WHERE e.inj_tot_f > 0
        AND e.ev_year >= 2020
    ORDER BY
        e.inj_tot_f DESC,
        e.ev_date DESC,
        e.ev_id,
        a.acft_make,
        a.acft_model
    LIMIT 20
"""

fatal_accidents = duckdb.query(fatal_query).to_df()
fatal_accidents

## Export Results

In [None]:
# Save analysis results as CSV files under outputs/.
# Create the directory first: to_csv does not create missing parent
# directories, so on a fresh checkout the export would otherwise fail.
output_dir = Path('outputs')
output_dir.mkdir(parents=True, exist_ok=True)

# Top fatal events table; the positional index carries no information.
fatal_accidents.to_csv(output_dir / 'fatal_accidents_2020_plus.csv', index=False)

# groupby(...).size() returns an unnamed Series, which to_csv would write
# under the column header "0"; give the count column a meaningful name.
accidents_by_year.rename('accident_count').to_csv(output_dir / 'accidents_by_year.csv')

print("Analysis results saved to outputs/")