# Notebook 04: Geographic Analysis

**Purpose:** Comprehensive geographic analysis of Philadelphia crime data including hotspot identification, district profiles, KDE heatmaps, and spatial autocorrelation testing.

**Requirements Addressed:**
- GEO-01: Hotspot identification
- GEO-02: District-level analysis
- GEO-03: Crime rate calculations
- GEO-04: Spatial autocorrelation
- GEO-05: Geographic visualization
- GEO-06: Stability testing
- GEO-07: MAUP documentation

## 1. Setup and Imports

In [None]:
import sys
sys.path.append('../scripts')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import gaussian_kde
import geopandas as gpd
from shapely.geometry import Point
import warnings
warnings.filterwarnings('ignore')

# Spatial statistics (optional dependency)
# Moran (esda) and Queen (libpysal.weights) are imported inside a guard so the
# notebook still runs when the PySAL packages are not installed.
# SPATIAL_STATS_AVAILABLE records whether the import succeeded; presumably
# later cells check this flag before running Moran's I -- verify downstream.
try:
    from esda import Moran
    from libpysal.weights import Queen
    SPATIAL_STATS_AVAILABLE = True
except ImportError:
    # Either package missing: flag the feature as unavailable and tell the user
    SPATIAL_STATS_AVAILABLE = False
    print("Warning: PySAL not available. Spatial autocorrelation analysis will be skipped.")

# Configuration
from config import (
    PROCESSED_DATA_DIR, FIGURES_DIR, TABLES_DIR,
    CRS_LATLON, CRS_PHILLY,
    COL_ID, COL_DATE, COL_DISTRICT, COL_UCR_GENERAL,
    COL_TEXT_GENERAL, COL_LAT, COL_LON
)

# Seed NumPy's global random number generator so randomized steps repeat
np.random.seed(42)

# Publication-quality matplotlib defaults: 300 dpi both on screen and on save,
# with compact font sizes for labels, titles, ticks, and legends
publication_rc = {
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'font.size': 10,
    'axes.labelsize': 11,
    'axes.titlesize': 12,
    'xtick.labelsize': 9,
    'ytick.labelsize': 9,
    'legend.fontsize': 9,
}
plt.rcParams.update(publication_rc)

print("Libraries imported successfully")

## 2. Load Cleaned Data

In [None]:
# Read the cleaned incident table from the processed-data directory
cleaned_incidents_path = PROCESSED_DATA_DIR / 'crime_incidents_cleaned.parquet'
df = pd.read_parquet(cleaned_incidents_path)

# Sanity-check what was loaded: size, available columns, and time span covered
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nDate range: {df[COL_DATE].min()} to {df[COL_DATE].max()}")

## 3. Geographic Data Coverage Analysis

In [None]:
# Share of all records that carry a latitude value, as a percentage
has_latitude = df[COL_LAT].notna()
coord_coverage = has_latitude.sum() / len(df) * 100
print(f"Overall geocoding coverage: {coord_coverage:.2f}%")


def _pct_with_latitude(group):
    """Return the percentage of rows in *group* whose latitude is not null."""
    return group[COL_LAT].notna().sum() / len(group) * 100


# Same coverage measure computed within each crime type, best-covered first
coverage_by_type = df.groupby(COL_TEXT_GENERAL).apply(_pct_with_latitude)
coverage_by_type = coverage_by_type.sort_values(ascending=False)

print("\nGeocoding coverage by crime type (top 10):")
print(coverage_by_type.head(10))

In [None]:
# Keep only rows where BOTH latitude and longitude are present; take a copy so
# columns added later do not write back into the original frame
has_coordinates = df[COL_LAT].notna() & df[COL_LON].notna()
df_geo = df.loc[has_coordinates].copy()

# Report how many rows survive and the bounding box they span
print(f"Records with valid coordinates: {len(df_geo):,} ({len(df_geo)/len(df)*100:.1f}%)")
print(f"\nCoordinate bounds:")
print(f"  Latitude: {df_geo[COL_LAT].min():.4f} to {df_geo[COL_LAT].max():.4f}")
print(f"  Longitude: {df_geo[COL_LON].min():.4f} to {df_geo[COL_LON].max():.4f}")

## 4. Create GeoDataFrame with Proper Projection

In [None]:
# Build point geometries from lon/lat (x = longitude, y = latitude).
# points_from_xy is vectorized, so it avoids constructing one shapely Point
# per row in a Python loop, which is slow on large incident tables.
geometry = gpd.points_from_xy(df_geo[COL_LON], df_geo[COL_LAT])

# Create GeoDataFrame in the geographic CRS given by CRS_LATLON
# (described in this notebook as WGS84 / EPSG:4326)
gdf = gpd.GeoDataFrame(
    df_geo,
    geometry=geometry,
    crs=CRS_LATLON
)

print(f"GeoDataFrame created with {len(gdf):,} records")
print(f"CRS: {gdf.crs}")

In [None]:
# Reproject from geographic coordinates to the planar CRS in CRS_PHILLY
# (described in this notebook as PA South State Plane, EPSG:2272).
# KDE needs real distances; degrees of lat/lon are not uniform units of length.
gdf_projected = gdf.to_crs(CRS_PHILLY)

print(f"Projected to {CRS_PHILLY}")
print(f"New CRS: {gdf_projected.crs}")

# Store the planar x/y of every point as plain columns for the KDE step
projected_points = gdf_projected.geometry
gdf_projected['x_proj'] = projected_points.x
gdf_projected['y_proj'] = projected_points.y

print(f"\nProjected coordinate bounds:")
print(f"  X: {gdf_projected['x_proj'].min():.0f} to {gdf_projected['x_proj'].max():.0f} ft")
print(f"  Y: {gdf_projected['y_proj'].min():.0f} to {gdf_projected['y_proj'].max():.0f} ft")

## 5. District-Level Aggregation

In [None]:
# Per-district summary: number of non-null incident IDs plus the mean
# latitude/longitude of the district's records, largest districts first
district_stats = (
    df.groupby(COL_DISTRICT)
    .agg(
        crime_count=(COL_ID, 'count'),
        avg_lat=(COL_LAT, 'mean'),
        avg_lng=(COL_LON, 'mean'),
    )
    .reset_index()
    .sort_values('crime_count', ascending=False)
)

print(f"Number of districts: {len(district_stats)}")
print("\nTop 10 districts by crime count:")
print(district_stats.head(10))

In [None]:
# Count incidents for every (district, offense type) pair
district_offenses = (
    df.groupby([COL_DISTRICT, COL_TEXT_GENERAL]).size().reset_index(name='count')
)

# Within each district, order offense types from most to least frequent and
# keep the first three. The sort key uses COL_DISTRICT rather than a
# hard-coded column name so it stays in sync with the config.
district_top_offenses = (
    district_offenses
    .sort_values([COL_DISTRICT, 'count'], ascending=[True, False])
    .groupby(COL_DISTRICT)
    .head(3)
)

print("Top 3 offense types per district (sample):")
print(district_top_offenses.head(15))

## 6. Note on District Boundaries

**Important:** For complete district-level spatial analysis with choropleth maps and Moran's I, we need Philadelphia Police District boundary shapefiles.

**Options for obtaining boundaries:**
1. OpenDataPhilly (opendataphilly.org) - Philadelphia Police Districts
2. City of Philadelphia GIS Portal
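3. Manual download and placement in `data/processed/philly_police_districts.shp` (keep the companion `.shx`, `.dbf`, and `.prj` files alongside the `.shp`, since a shapefile is a multi-file format)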

**Without boundaries, we can still perform:**
- Point-based KDE hotspot analysis
- District-level statistical summaries
- Coordinate-based visualizations

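**Note on MAUP (Modifiable Areal Unit Problem):** District boundaries are arbitrary administrative divisions, so statistics aggregated by district depend on where those boundaries are drawn. Crime patterns may differ at neighborhood or census tract levels.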

In [None]:
# Look for the police-district shapefile in the processed-data directory.
# BOUNDARIES_AVAILABLE tells later cells whether districts_gdf was loaded.
district_boundaries_path = PROCESSED_DATA_DIR / 'philly_police_districts.shp'

if not district_boundaries_path.exists():
    # No shapefile on disk: explain where it is expected and how to get it
    print("District boundaries not found.")
    print(f"Expected at: {district_boundaries_path}")
    print("\nTo obtain boundaries:")
    print("  1. Visit https://opendataphilly.org")
    print("  2. Search for 'Police Districts'")
    print("  3. Download shapefile")
    print(f"  4. Save to: {district_boundaries_path}")
    BOUNDARIES_AVAILABLE = False
else:
    # Shapefile present: read it and report what it contains
    districts_gdf = gpd.read_file(district_boundaries_path)
    print(f"Loaded district boundaries: {len(districts_gdf)} districts")
    print(f"Columns: {list(districts_gdf.columns)}")
    BOUNDARIES_AVAILABLE = True

## 7. Save District Profiles

Save comprehensive district statistics for later use.

In [None]:
# Start the district profiles from the per-district summary table
district_profiles = district_stats.copy()

# Average incidents per year over the dataset's full date span.
# NOTE: this is a raw count per year, not a population-normalized rate.
# NOTE(review): if every record falls on the same day the span is 0 and this
# column becomes inf -- confirm the dataset always spans multiple days.
date_range_years = (df[COL_DATE].max() - df[COL_DATE].min()).days / 365.25
district_profiles['crimes_per_year'] = district_profiles['crime_count'] / date_range_years

# Each district's share of all incidents counted in the district table
total_crimes = district_profiles['crime_count'].sum()
district_profiles['pct_of_total'] = district_profiles['crime_count'] / total_crimes * 100

# Rank 1 = highest crime count; tied districts share the lowest rank number
district_profiles['rank'] = district_profiles['crime_count'].rank(ascending=False, method='min').astype(int)

# Save to CSV. Create the 'geographic' subdirectory first: to_csv does not
# create missing parent directories and would fail on a fresh checkout.
output_path = TABLES_DIR / 'geographic' / 'district_profiles.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
district_profiles.to_csv(output_path, index=False)

print(f"Saved district profiles to: {output_path}")
print(f"\nDistrict profiles summary:")
print(district_profiles.describe())

---

**Task 1 Complete:** Geographic data prepared with proper projections, district statistics calculated, and data coverage documented.

**Key Findings:**
- Geocoding coverage is high overall (see the Section 3 output for the exact percentage and the per-crime-type breakdown)
- Data projected to EPSG:2272 for accurate distance calculations
- District-level statistics calculated
- District boundaries need to be obtained for full choropleth analysis