# NYC Collision Analysis

This notebook queries the `collision_summary` table created by the ETL pipeline.

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine, text
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Read the database URL from the environment and fail fast when it is missing,
# so the problem surfaces here rather than in a later query cell.
POSTGRES_URL = os.getenv("POSTGRES_URL")
if not POSTGRES_URL:
    raise RuntimeError("POSTGRES_URL not set in .env")

# create_engine() is lazy: it builds the engine without opening a connection.
# Run a trivial query so the success message below is only printed when the
# database is actually reachable with the configured URL.
engine = create_engine(POSTGRES_URL)
with engine.connect() as conn:
    conn.execute(text("SELECT 1"))
print("Connected to PostgreSQL!")

## Query collision_summary table

In [None]:
# Pull every row of collision_summary, newest crash day first and, within a
# day, the rows with the most collisions first.
query = text("SELECT * FROM collision_summary ORDER BY crash_day DESC, total_collisions DESC")

# The connection is only needed for the duration of the read.
with engine.connect() as connection:
    df = pd.read_sql(sql=query, con=connection)

row_count = df.shape[0]
print(f"Total rows: {row_count}")

# Preview the first ten rows as the cell output.
df.head(n=10)

## Summary Statistics

In [None]:
# Headline figures over the whole result set held in df.
print("\n=== Summary Statistics ===")

first_day = df['crash_day'].min()
last_day = df['crash_day'].max()
print(f"Date range: {first_day} to {last_day}")

# Grand totals: one line per count column, with thousands separators.
for label, column in (
    ("collisions", 'total_collisions'),
    ("injury incidents", 'injury_incidents'),
    ("fatal incidents", 'fatal_incidents'),
):
    print(f"Total {label}: {df[column].sum():,}")

print("\nCollisions by borough:")
# Cell output: total collisions per borough, largest first.
collisions_per_borough = df.groupby('borough')['total_collisions'].sum()
collisions_per_borough.sort_values(ascending=False)

## Collisions by Borough

In [None]:
# Sum each count column per borough and rank boroughs by total collisions.
metric_columns = ['total_collisions', 'injury_incidents', 'fatal_incidents']
borough_stats = (
    df.groupby('borough')[metric_columns]
    .sum()
    .sort_values('total_collisions', ascending=False)
)

print("\n=== Borough Statistics ===")
# Cell output: the aggregated table.
borough_stats

In [None]:
# Draw one bar chart per metric, side by side, from the borough_stats table.
# Each entry is (column in borough_stats, panel title, bar color).
panels = [
    ('total_collisions', 'Total Collisions by Borough', 'steelblue'),
    ('injury_incidents', 'Injury Incidents by Borough', 'orange'),
    ('fatal_incidents', 'Fatal Incidents by Borough', 'red'),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, (column, title, color) in zip(axes, panels):
    borough_stats[column].plot(kind='bar', ax=ax, color=color)
    ax.set_title(title)
    ax.set_ylabel('Count')
    ax.set_xlabel('Borough')
    # Slant the borough names on the x axis.
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Time Series Analysis

In [None]:
# Parse crash_day into a separate datetime Series instead of overwriting the
# column on df: earlier cells (e.g. the date range in Summary Statistics) then
# print the same thing whether they are re-run before or after this cell.
crash_dates = pd.to_datetime(df['crash_day'])

# Daily collision trends: sum collisions over all rows sharing a crash day,
# ordered from the earliest day to the latest.
daily_totals = df.groupby(crash_dates)['total_collisions'].sum().sort_index()

if daily_totals.empty:
    # idxmax()/idxmin() raise on an empty Series, so report and skip the plot.
    print("No collision data available for time series analysis.")
else:
    # Take the start date shown in the title from the data itself so the
    # caption cannot drift from what was actually loaded.
    first_day = daily_totals.index.min()
    since_label = f"{first_day:%b} {first_day.day}, {first_day.year}"

    fig, ax = plt.subplots(figsize=(14, 6))
    ax.plot(daily_totals.index, daily_totals.values, marker='o', linestyle='-', linewidth=1.5)
    ax.set_title(f'Daily Collision Trends (Since {since_label})', fontsize=14)
    ax.set_xlabel('Date')
    ax.set_ylabel('Total Collisions')
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    plt.show()

    print(f"Average daily collisions: {daily_totals.mean():.1f}")
    print(f"Highest collision day: {daily_totals.idxmax().date()} ({daily_totals.max()} collisions)")
    print(f"Lowest collision day: {daily_totals.idxmin().date()} ({daily_totals.min()} collisions)")

## Custom Query Example

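Query a subset of the table by editing the `WHERE` clause. The example below returns the most recent rows for Brooklyn and Queens; add a condition on `crash_day` to restrict the date range as well: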

In [None]:
# Example: the 20 most recent summary rows for Brooklyn and Queens.
#
# crash_day alone is not a unique sort key here, because both boroughs can
# have a row for the same day. With LIMIT, that would leave the order of tied
# rows, and which tied row gets cut off, up to the database. Sorting by
# borough as a tie-breaker makes the result repeatable.
# NOTE(review): assumes at most one row per (borough, crash_day) - confirm
# against the ETL that builds collision_summary.
custom_query = text("""
SELECT 
    borough,
    crash_day,
    total_collisions,
    injury_incidents,
    fatal_incidents
FROM collision_summary
WHERE borough IN ('BROOKLYN', 'QUEENS')
ORDER BY crash_day DESC, borough
LIMIT 20
""")

# The connection is only needed for the duration of the read.
with engine.connect() as conn:
    df_custom = pd.read_sql(custom_query, conn)

# Cell output: the filtered rows.
df_custom

## Injury Rate Analysis

In [None]:
# Row-level rate: injury incidents as a percentage of total collisions,
# rounded to two decimals and stored on df as 'injury_rate'.
df['injury_rate'] = (
    df['injury_incidents'].div(df['total_collisions']).mul(100).round(2)
)

# Borough-level rate: sum the counts per borough first, then take the ratio
# of the sums (not an average of the row-level rates).
rate_inputs = ['total_collisions', 'injury_incidents']
borough_injury_rates = df.groupby('borough')[rate_inputs].sum()
borough_injury_rates['injury_rate'] = (
    borough_injury_rates['injury_incidents']
    .div(borough_injury_rates['total_collisions'])
    .mul(100)
    .round(2)
)

print("\n=== Injury Rates by Borough ===")
# Cell output: boroughs ranked from highest to lowest injury rate.
borough_injury_rates.sort_values('injury_rate', ascending=False)