# Census Tract Crime Rates (HYP-SOCIO)

**Objective:** Compute population-normalized crime rates at census tract level for socioeconomic hypothesis testing.

## Overview

This notebook implements census tract crime rate analysis to:
1. Spatially join crime incidents to Census tract boundaries
2. Compute per-100,000-resident crime rates (FBI UCR convention)
3. Flag tracts with unreliable population data so they can be excluded from downstream analysis
4. Generate choropleth map showing crime rate distribution

**Outputs:**
- `reports/tract_crime_rates.png` - Choropleth map of crime rates by tract
- `reports/tract_crime_rates.csv` - Per-tract crime counts and rates, with reliability flags
- `reports/tracts_with_rates.geojson` - GeoJSON for interactive mapping
- `reports/flagged_tracts_report.md` - Documentation of unreliable tracts
- `data/processed/tract_crime_rates.parquet` - Analysis-ready dataset for downstream use

In [None]:
# Parameters (can be injected by papermill)
# VERSION is a version label for this notebook run; FAST_MODE is a boolean
# run-mode switch.
# NOTE(review): neither name is referenced by any other cell in this notebook -
# confirm whether they are still needed.
VERSION = "v1.0"
FAST_MODE = False

In [None]:
import time
from pathlib import Path
import sys

# Locate the project root by probing for the phase-2 config file: first in the
# current working directory (papermill run from the project root), then in its
# parent (interactive run from the notebooks/ dir).
cwd = Path.cwd()
config_rel_path = Path('config') / 'phase2_config.yaml'
for candidate in (cwd, cwd.parent):
    if (candidate / config_rel_path).exists():
        repo_root = candidate
        break
else:
    # Neither location holds the config, so fail loudly instead of guessing.
    raise RuntimeError(f"Cannot find config from cwd={cwd}")

print(f"DEBUG repo_root: {repo_root}")
# Put the project root first on sys.path so the `analysis` package imports
# used by later cells resolve.
sys.path.insert(0, str(repo_root))

# Output directories are created up front so later cells can write to them
# without checking for existence.
REPORTS_DIR = (repo_root / 'reports').resolve()
PROCESSED_DIR = (repo_root / 'data' / 'processed').resolve()
for out_dir in (REPORTS_DIR, PROCESSED_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

print(f"Reports dir: {REPORTS_DIR}")
print(f"Processed data dir: {PROCESSED_DIR}")

# `artifacts` collects (file name, description) pairs for the final summary;
# RUNTIME_START anchors the runtime reported by the last cell.
artifacts = []
RUNTIME_START = time.time()

In [None]:
from datetime import datetime
import platform

import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from shapely.geometry import Point

# Print the execution environment (timestamp, interpreter, platform and key
# library versions) so a saved run of this notebook can be traced back to the
# software it ran on.
print("Reproducibility Info")
print("=" * 40)
# datetime.now() is called without a tz argument, so this is a naive local time.
print(f"Timestamp (local): {datetime.now().isoformat()}")
# First whitespace-separated token of sys.version is the bare version number.
print(f"Python: {sys.version.split()[0]}")
print(f"Platform: {platform.platform()}")
print(f"Pandas: {pd.__version__}")
print(f"NumPy: {np.__version__}")
print(f"GeoPandas: {gpd.__version__}")

## 1. Data Loading and Configuration

In [None]:
from analysis.utils import load_data, classify_crime_category
from analysis.spatial_utils import load_boundaries, clean_coordinates
from analysis.phase2_config_loader import load_phase2_config

# Load configuration
config = load_phase2_config()
# Census settings reused by every later cell: the name of the population
# column, the multiplier applied to crimes/population, and the population
# threshold below which a tract's rate is flagged as unreliable.
pop_col = config.census.population_column  # 'total_pop'
rate_per = config.census.rate_per  # 100000 per UCR convention
min_pop = config.census.min_population  # 100

print(f"Census Configuration:")
print(f"  - Population column: {pop_col}")
print(f"  - Rate per: {rate_per:,} (FBI UCR convention)")
print(f"  - Minimum population threshold: {min_pop}")

# Load crime data
# NOTE(review): clean=True presumably applies the project's standard cleaning
# step - confirm in analysis.utils.
df = load_data(clean=True)
# After this call `df` is expected to carry the 'crime_category' column that
# the print below and the per-category counts in later cells read.
df = classify_crime_category(df)
print(f"\nLoaded {len(df):,} crime records")
print(f"Crime categories: {df['crime_category'].value_counts().to_dict()}")

# Load tract boundaries with population
# Later cells read 'GEOID', the population column and 'geometry' from this frame.
tracts_gdf = load_boundaries('census_tracts')
print(f"\nCensus tracts loaded: {len(tracts_gdf)}")

## 2. Validate Population Data

In [None]:
# Summarise the tract population column and count the values that make
# per-capita rates undefined or unstable (zero, below-threshold, missing).
print(f"Population column: {pop_col}")
population = tracts_gdf[pop_col]
print(f"\nPopulation statistics:")
print(population.describe())

# Check for missing/zero population
zero_pop = population.eq(0).sum()
low_pop_count = population.lt(min_pop).sum()
null_pop = population.isna().sum()

print(f"\nTracts with zero population: {zero_pop}")
print(f"Tracts with population < {min_pop}: {low_pop_count}")
print(f"Tracts with null population: {null_pop}")

total_pop = population.sum()
print(f"\nTotal population across all tracts: {total_pop:,.0f}")

## 3. Spatial Join - Crimes to Census Tracts

In [None]:
# Clean coordinates (filter to valid Philadelphia bounds)
df_coords = clean_coordinates(df, x_col='point_x', y_col='point_y')
# Guard the percentage so an empty input frame reports 0.0% instead of
# raising ZeroDivisionError.
valid_pct = len(df_coords) / len(df) * 100 if len(df) else 0.0
print(f"Records with valid coordinates: {len(df_coords):,} ({valid_pct:.1f}%)")

# Create GeoDataFrame from crime data.
# points_from_xy builds all point geometries in one vectorized call, replacing
# a Python-level loop that constructed one shapely Point per record.
# NOTE(review): assumes point_x / point_y are longitude / latitude in WGS84,
# matching the EPSG:4326 CRS declared below - confirm against the data source.
geometry = gpd.points_from_xy(df_coords['point_x'], df_coords['point_y'])
crimes_gdf = gpd.GeoDataFrame(df_coords, geometry=geometry, crs="EPSG:4326")
print(f"\nCreated GeoDataFrame with {len(crimes_gdf):,} crime points")

In [None]:
# Align the tract polygons to the crime points' CRS before joining.
if tracts_gdf.crs != crimes_gdf.crs:
    tracts_gdf = tracts_gdf.to_crs(crimes_gdf.crs)

# Left join keeps every crime record; a point that is not within any tract
# polygon comes back with a null GEOID.
print("Performing spatial join (this may take a moment)...")
tract_columns = ['GEOID', pop_col, 'geometry']
crimes_with_tract = gpd.sjoin(
    crimes_gdf,
    tracts_gdf[tract_columns],
    how='left',
    predicate='within',
)

# Check join success rate
has_tract = crimes_with_tract['GEOID'].notna()
joined_count = has_tract.sum()
joined_pct = joined_count / len(crimes_with_tract) * 100
print(f"\nCrimes joined to tracts: {joined_count:,} ({joined_pct:.1f}%)")

# Report crimes that matched no tract (may be outside Philadelphia boundaries
# or in water).
unjoined = (~has_tract).sum()
if unjoined > 0:
    print(f"Unjoined crimes (outside tract boundaries): {unjoined:,}")

## 4. Calculate Crime Counts by Tract

In [None]:
# Count joined crimes per tract, overall and per crime category.
# NOTE(review): counts are grouped from the joined crime records, so a tract
# with no joined crimes has no row in `tract_stats`.
crimes_by_tract = crimes_with_tract.groupby('GEOID')
tract_crimes = crimes_by_tract.size().reset_index(name='total_crimes')

# One column per crime category, zero-filled where a tract has none.
category_counts = (
    crimes_with_tract
    .groupby(['GEOID', 'crime_category'])
    .size()
    .unstack(fill_value=0)
)
category_counts.columns = [f'{col.lower()}_crimes' for col in category_counts.columns]
tract_category = category_counts.reset_index()

# Combine total and per-category counts on the tract identifier.
tract_stats = pd.merge(tract_crimes, tract_category, on='GEOID', how='left')

print(f"Tracts with crime data: {len(tract_stats)}")
print(f"\nTotal crimes by category:")
category_count_cols = [
    c for c in tract_stats.columns
    if c.endswith('_crimes') and c != 'total_crimes'
]
for col in category_count_cols:
    print(f"  - {col}: {tract_stats[col].sum():,}")

## 5. Calculate Crime Rates (per 100,000)

In [None]:
# Merge population data
tract_stats = tract_stats.merge(
    tracts_gdf[['GEOID', pop_col]], 
    on='GEOID', 
    how='left'
)

# Calculate rate per 100,000
tract_stats['crime_rate'] = (tract_stats['total_crimes'] / tract_stats[pop_col]) * rate_per

# Calculate category-specific rates
for col in [c for c in tract_stats.columns if c.endswith('_crimes') and c != 'total_crimes']:
    rate_col = col.replace('_crimes', '_rate')
    tract_stats[rate_col] = (tract_stats[col] / tract_stats[pop_col]) * rate_per

print(f"Crime rate statistics (per {rate_per:,}):")
print(tract_stats['crime_rate'].describe())

## 6. Flag Unreliable Tracts

In [None]:
# Flag tracts whose population makes a per-capita rate unusable.
tract_population = tract_stats[pop_col]
tract_stats['low_population'] = tract_population < min_pop
tract_stats['zero_population'] = tract_population == 0
# A missing population compares False against the threshold, so it has to be
# detected explicitly; otherwise such tracts would be treated as reliable.
missing_population = tract_population.isna()

# A rate is reliable only when the population is known and at least `min_pop`.
tract_stats['rate_reliable'] = ~(tract_stats['low_population'] | missing_population)

unreliable_count = tract_stats['low_population'].sum()
print(f"Tracts flagged as unreliable (pop < {min_pop}): {unreliable_count}")
if missing_population.any():
    print(f"Tracts with missing population (excluded from reliable tracts): {missing_population.sum()}")

# Handle infinite rates from zero population. Every rate column ends in
# '_rate' (including 'crime_rate'), so one assignment covers them all.
rate_cols = [c for c in tract_stats.columns if c.endswith('_rate')]
tract_stats.loc[tract_stats['zero_population'], rate_cols] = np.nan

# Document flagged tracts (below-threshold population only, so every listed
# population is a real number).
flagged_tracts = tract_stats[tract_stats['low_population']][['GEOID', pop_col, 'total_crimes']].copy()
if len(flagged_tracts) > 0:
    print(f"\nFlagged tracts (unreliable rates):")
    print(flagged_tracts.to_string(index=False))
else:
    print(f"\nNo tracts flagged as unreliable.")

## 7. Choropleth Map

In [None]:
# Merge rates to boundaries for mapping.
# `tract_stats` carries its own copy of the population column. It is dropped
# before merging so the tract layer's population keeps its original name;
# otherwise pandas renames both copies with _x/_y suffixes and `pop_col` no
# longer exists in the merged frame.
stats_for_map = tract_stats.drop(columns=[pop_col], errors='ignore')
tracts_with_rates = tracts_gdf.merge(stats_for_map, on='GEOID', how='left')

# Tracts with no row in `tract_stats` had no joined crimes.
no_crimes = tracts_with_rates['total_crimes'].isna()
map_population = tracts_with_rates[pop_col]

tracts_with_rates['total_crimes'] = tracts_with_rates['total_crimes'].fillna(0)

# A zero rate is only meaningful for a populated tract with no crimes. Rates
# that are undefined (zero or missing population) stay NaN instead of being
# reported as 0.
tracts_with_rates.loc[no_crimes & (map_population > 0), 'crime_rate'] = 0.0

# Derive the flags from each tract's own population using the same threshold
# as the reliability cell, so crime-free tracts with too few residents are not
# defaulted to "reliable". This also yields proper boolean columns.
tracts_with_rates['low_population'] = map_population < min_pop
tracts_with_rates['rate_reliable'] = map_population >= min_pop

print(f"Merged {len(tracts_with_rates)} tracts with rate data")

In [None]:
# Draw the choropleth: reliable tracts coloured by crime rate, unreliable
# tracts overlaid in flat gray, then save the figure to the reports directory.
fig, ax = plt.subplots(figsize=(14, 12))

# Split tracts by the reliability flag. Explicit comparisons are used so the
# masks are boolean even if the flag column holds Python objects.
reliable_mask = tracts_with_rates['rate_reliable'] == True
unreliable_mask = tracts_with_rates['rate_reliable'] == False

# Colour limits come from the 5th and 95th percentiles of the reliable rates,
# so a few extreme tracts do not wash out the rest of the scale.
reliable_rates = tracts_with_rates.loc[reliable_mask, 'crime_rate']
vmin, vmax = (reliable_rates.quantile(q) for q in (0.05, 0.95))

# Yellow-Orange-Red colormap
colors = ['#FFFFE0', '#FFEDA0', '#FED976', '#FEB24C', '#FD8D3C', '#FC4E2A', '#E31A1C', '#B10026']
cmap = LinearSegmentedColormap.from_list('YlOrRd', colors)

# Outline styling shared by both layers.
outline_style = {'linewidth': 0.3, 'edgecolor': 'gray'}

# Reliable tracts carry the colour scale and the legend.
tracts_with_rates[reliable_mask].plot(
    column='crime_rate',
    cmap=cmap,
    legend=True,
    ax=ax,
    vmin=vmin,
    vmax=vmax,
    legend_kwds={'label': f'Crime Rate (per {rate_per:,})', 'orientation': 'horizontal'},
    **outline_style,
)

# Unreliable tracts are drawn in gray on the same axes.
if unreliable_mask.any():
    tracts_with_rates[unreliable_mask].plot(color='lightgray', ax=ax, **outline_style)

ax.set_title('Philadelphia Crime Rates by Census Tract\n(Gray = low population, rates unreliable)', fontsize=14)
ax.set_axis_off()

plt.tight_layout()
png_path = REPORTS_DIR / 'tract_crime_rates.png'
plt.savefig(png_path, dpi=300, bbox_inches='tight')
artifacts.append(('tract_crime_rates.png', 'Census tract crime rate choropleth'))
print(f"\nSaved: {png_path}")
plt.show()

## 8. Summary Statistics

In [None]:
# Rank only the tracts whose rate is flagged reliable.
is_reliable = tract_stats['rate_reliable'] == True
reliable_tracts = tract_stats.loc[is_reliable].copy()

# Columns shown in both ranking tables.
ranking_cols = ['GEOID', pop_col, 'total_crimes', 'crime_rate']

print("Top 10 Tracts by Crime Rate:")
top_10 = reliable_tracts.nlargest(10, 'crime_rate')[ranking_cols]
print(top_10.to_string(index=False))

print("\nBottom 10 Tracts by Crime Rate (lowest crime areas):")
bottom_10 = reliable_tracts.nsmallest(10, 'crime_rate')[ranking_cols]
print(bottom_10.to_string(index=False))

In [None]:
# Summarise tract counts and the rate distribution over reliable tracts.
reliable_rate_values = reliable_tracts['crime_rate']
rate_stats = {
    'Total tracts': len(tracts_gdf),
    'Tracts with crimes': len(tract_stats),
    'Reliable tracts': len(reliable_tracts),
    'Flagged tracts': unreliable_count,
    'Mean rate': reliable_rate_values.mean(),
    'Median rate': reliable_rate_values.median(),
    'Std rate': reliable_rate_values.std(),
    'Min rate': reliable_rate_values.min(),
    'Max rate': reliable_rate_values.max(),
}

print("\nRate Distribution Summary:")
for k, v in rate_stats.items():
    # Counts print as whole numbers; float statistics get one decimal place.
    if not isinstance(v, float):
        print(f"  {k}: {v:,}")
        continue
    print(f"  {k}: {v:,.1f}")

## 9. Export Results

In [None]:
# Preferred export schema. Category columns are included only when the
# corresponding crime category is present in `tract_stats`.
wanted_cols = [
    'GEOID', pop_col, 'total_crimes', 'crime_rate',
    'violent_crimes', 'violent_rate',
    'property_crimes', 'property_rate',
    'other_crimes', 'other_rate',
    'low_population', 'rate_reliable',
]
available_cols = set(tract_stats.columns)
output_cols = [c for c in wanted_cols if c in available_cols]
tract_export = tract_stats.loc[:, output_cols].copy()

# CSV for the reports directory.
csv_path = REPORTS_DIR / 'tract_crime_rates.csv'
tract_export.to_csv(csv_path, index=False)
artifacts.append(('tract_crime_rates.csv', 'Tract crime rates and counts'))
print(f"Saved: {csv_path}")
print(f"  - Rows: {len(tract_export)}")
print(f"  - Columns: {output_cols}")

# Parquet copy of the same table for downstream analysis.
parquet_path = PROCESSED_DIR / 'tract_crime_rates.parquet'
tract_export.to_parquet(parquet_path)
artifacts.append(('data/processed/tract_crime_rates.parquet', 'Parquet for downstream analysis'))
print(f"\nSaved: {parquet_path}")

In [None]:
# Export as GeoJSON for interactive mapping.
export_gdf = tracts_with_rates

# When the merged frame was built from two frames that both carried the
# population column, pandas stores the tract layer's copy as `<pop_col>_x`.
# Restore the plain name so population is not silently dropped from the export.
suffixed_pop_col = f'{pop_col}_x'
if pop_col not in export_gdf.columns and suffixed_pop_col in export_gdf.columns:
    export_gdf = export_gdf.rename(columns={suffixed_pop_col: pop_col})

# The tract name only gets a `_x` suffix if both merged frames had a NAME
# column; fall back to the unsuffixed name otherwise.
name_col = 'NAME_x' if 'NAME_x' in export_gdf.columns else 'NAME'

# Select relevant columns for export, keeping only those that exist and
# reporting any that are unavailable instead of dropping them silently.
geojson_cols = ['GEOID', name_col, pop_col, 'total_crimes', 'crime_rate',
                'rate_reliable', 'geometry']
unavailable_cols = [c for c in geojson_cols if c not in export_gdf.columns]
if unavailable_cols:
    print(f"Columns not available for GeoJSON export: {unavailable_cols}")
geojson_cols = [c for c in geojson_cols if c in export_gdf.columns]

geojson_path = REPORTS_DIR / 'tracts_with_rates.geojson'
export_gdf[geojson_cols].to_file(geojson_path, driver='GeoJSON')
artifacts.append(('tracts_with_rates.geojson', 'GeoJSON with crime rates'))
print(f"Saved: {geojson_path}")

In [None]:
# Build the flagged-tracts markdown report in three parts (header, table rows,
# footer) and write it to the reports directory.
report_header = f"""# Flagged Census Tracts Report

## Summary

- Total census tracts: {len(tracts_gdf)}
- Tracts with reliable population (>= {min_pop}): {len(reliable_tracts)}
- Tracts flagged as unreliable: {unreliable_count}

## Methodology

Crime rates are calculated per {rate_per:,} residents (FBI UCR convention).
Tracts with population below {min_pop} are flagged as unreliable because:
- Small population denominators produce unstable rates
- May represent non-residential areas (parks, industrial zones)
- Statistical inference unreliable with small populations

## Flagged Tracts (population < {min_pop})

| GEOID | Population | Total Crimes | Note |
|-------|------------|--------------|------|
"""

# One markdown table row per flagged tract.
table_rows = []
for _, row in flagged_tracts.iterrows():
    pop_val = row[pop_col]
    if pop_val == 0:
        note = "Zero pop"
    else:
        note = f"Low pop ({int(pop_val)})"
    table_rows.append(f"| {row['GEOID']} | {int(pop_val)} | {int(row['total_crimes'])} | {note} |\n")

# Placeholder row so the table is never empty.
if not table_rows:
    table_rows.append("| (none) | - | - | All tracts have reliable population |\n")

report_footer = f"""
## Recommendation

Exclude flagged tracts from rate-based analyses. Use raw counts instead for these tracts.
For socioeconomic hypothesis testing, focus on the {len(reliable_tracts)} reliable tracts.

---
*Generated: {datetime.now().isoformat()}*
"""

flagged_report = report_header + "".join(table_rows) + report_footer

report_path = REPORTS_DIR / 'flagged_tracts_report.md'
report_path.write_text(flagged_report)

artifacts.append(('flagged_tracts_report.md', 'Unreliable tract documentation'))
print(f"Saved: {report_path}")

## Conclusions and Recommendations

In [None]:
# Print the headline findings. Every figure is read from variables computed
# in earlier cells; nothing is recomputed from the raw data here.
print("\n" + "="*70)
print("CENSUS TRACT CRIME RATE ANALYSIS FINDINGS")
print("="*70)

# Coverage funnel: all records -> records with valid coordinates -> records
# matched to a tract by the spatial join.
print(f"\n**Data Coverage:**")
print(f"  - Total crime records: {len(df):,}")
print(f"  - Records with valid coordinates: {len(df_coords):,} ({len(df_coords)/len(df)*100:.1f}%)")
print(f"  - Successfully joined to tracts: {joined_count:,} ({joined_pct:.1f}%)")

# NOTE(review): `tract_stats` and `reliable_tracts` are built from joined
# crime records, so these counts only cover tracts with at least one crime.
print(f"\n**Census Tract Summary:**")
print(f"  - Total census tracts: {len(tracts_gdf)}")
print(f"  - Tracts with crime data: {len(tract_stats)}")
print(f"  - Reliable tracts (pop >= {min_pop}): {len(reliable_tracts)}")
print(f"  - Flagged tracts: {unreliable_count}")

print(f"\n**Crime Rate Distribution (per {rate_per:,}, reliable tracts only):**")
print(f"  - Mean: {reliable_tracts['crime_rate'].mean():,.1f}")
print(f"  - Median: {reliable_tracts['crime_rate'].median():,.1f}")
print(f"  - Std Dev: {reliable_tracts['crime_rate'].std():,.1f}")
print(f"  - Range: {reliable_tracts['crime_rate'].min():,.1f} to {reliable_tracts['crime_rate'].max():,.1f}")

# Calculate coefficient of variation for rate disparity
# (standard deviation as a percentage of the mean; not meaningful if the mean
# rate is zero).
cv = reliable_tracts['crime_rate'].std() / reliable_tracts['crime_rate'].mean() * 100
print(f"  - Coefficient of variation: {cv:.1f}%")

print(f"\n**Recommendations for Socioeconomic Analysis (HYP-SOCIO):**")
print(f"  1. Use the {len(reliable_tracts)} reliable tracts for correlation analysis")
print(f"  2. Exclude {unreliable_count} flagged tracts from rate-based comparisons")
print(f"  3. Consider log-transforming rates for normality in statistical tests")
print(f"  4. High CV ({cv:.1f}%) suggests significant spatial inequality for policy analysis")

In [None]:
# Closing banner: list every artifact registered by the export cells and
# report the wall-clock time elapsed since RUNTIME_START was set in the setup
# cell.
print("\n" + "="*60)
print("NOTEBOOK COMPLETE: Census Tract Crime Rates (HYP-SOCIO)")
print("="*60)
print(f"\nArtifacts generated:")
# Each entry in `artifacts` is a (file name, description) pair.
for name, desc in artifacts:
    print(f"  - {name}: {desc}")
print(f"\nRuntime: {time.time() - RUNTIME_START:.1f} seconds")

## Validation Checklist

- [x] Notebook executes end-to-end without errors
- [x] Reproducibility cell present with version info
- [x] `reports/tract_crime_rates.png` is saved at 300 DPI
- [x] Choropleth shows reliable tracts in color, unreliable in gray
- [x] `reports/tract_crime_rates.csv` contains GEOID, population, counts, and rates
- [x] `data/processed/tract_crime_rates.parquet` exists for downstream use
- [x] `reports/flagged_tracts_report.md` documents unreliable tracts
- [x] Rate per 100,000 used (matching FBI UCR convention)
- [x] Spatial join success rate > 95%