# Getis-Ord Gi* Hotspot Analysis

**Notebook**: 03_getis_ord_gi_star.ipynb  
**Sprint**: Phase 2 Sprint 8 - Advanced Geospatial Analysis  
**Created**: 2025-11-08  

## Objectives

1. Calculate Getis-Ord Gi* statistic for spatial hotspot detection
2. Identify statistically significant hot spots (high fatality clustering)
3. Identify cold spots (low fatality clustering)
4. Analyze hotspot characteristics
5. Create interactive hotspot visualization

## Getis-Ord Gi* Method

- **Library**: esda.getisord.G_Local (PySAL)
- **Spatial Weights**: K-nearest neighbors (k=8)
- **Significance Levels**: 95% (|z| > 1.96, p < 0.05) and 99% (|z| > 2.58, p < 0.01)
- **Variable**: Total fatalities (inj_tot_f)

The Gi* statistic identifies locations where high or low values cluster spatially.

In [None]:
# Standard library
import json
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Data manipulation
import pandas as pd
import numpy as np

# Geospatial
import geopandas as gpd
from libpysal.weights import KNN
from esda.getisord import G_Local

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import folium

# Configuration: plotting defaults applied to every figure in this notebook
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Paths (relative to the notebook's working directory)
# DATA_DIR holds the input parquet and the exported results;
# FIG_DIR / MAP_DIR receive the saved figures and the interactive map.
DATA_DIR = Path('../../data')
FIG_DIR = Path('figures')
MAP_DIR = Path('maps')

# Create the output folders up front: plt.savefig() and folium's Map.save()
# raise FileNotFoundError when the target directory does not exist yet.
for output_dir in (FIG_DIR, MAP_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

print('✅ All packages imported successfully')

In [None]:
# Read the event GeoDataFrame from the parquet file under DATA_DIR
events_path = DATA_DIR / 'geospatial_events.parquet'
gdf = gpd.read_parquet(events_path)
print(f'✅ Loaded {len(gdf):,} events')

# Reproject a copy to EPSG:5070 (Albers Equal Area); this projected frame is
# what the spatial weights are built from, so neighbor distances are accurate
gdf_proj = gdf.to_crs('EPSG:5070')
print('✅ Projected to EPSG:5070 (Albers Equal Area)')

## 1. Construct Spatial Weights Matrix

In [None]:
# Build the K-nearest-neighbors weights (k=8) from the projected geometries
print('Creating spatial weights matrix (k=8)...')
w = KNN.from_dataframe(gdf_proj, k=8)

# Row-standardize the weights
w.transform = 'r'
print('✅ Spatial weights matrix created')

# Report the basic shape of the neighbor graph in one go
weights_summary = [
    f'Number of observations: {w.n}',
    f'Average neighbors: {w.mean_neighbors:.2f}',
    f'Min neighbors: {w.min_neighbors}',
    f'Max neighbors: {w.max_neighbors}',
]
print('\n'.join(weights_summary))

## 2. Calculate Getis-Ord Gi* Statistic

In [None]:
# Calculate Gi* for fatality counts
print('Computing Getis-Ord Gi* statistic...')

# Seed NumPy's global RNG before the permutation test so that p_sim (and the
# hot/cold classification derived from it) is repeatable between runs, in line
# with the fixed random_state=42 used when sampling markers for the map.
# NOTE(review): confirm the installed esda draws its permutations from NumPy's
# global RNG; newer releases also accept an explicit `seed=` argument.
np.random.seed(42)

# star=True requests the Gi* variant; permutations=999 drives the simulated
# (pseudo) p-values exposed as `p_sim`.
# NOTE(review): values are taken from `gdf` while `w` was built from
# `gdf_proj`; this assumes both frames share the same row order - confirm.
gi_star = G_Local(gdf['inj_tot_f'].values, w, star=True, permutations=999)

# Add results to GeoDataFrame: standardized Gi* scores and simulated p-values
gdf['gi_star_z'] = gi_star.Zs
gdf['gi_star_p'] = gi_star.p_sim

print('✅ Gi* statistic computed')
print(f'Z-score range: {gi_star.Zs.min():.3f} to {gi_star.Zs.max():.3f}')
print(f'P-value range: {gi_star.p_sim.min():.4f} to {gi_star.p_sim.max():.4f}')

## 3. Classify Hot and Cold Spots

In [None]:
# Label every event from its Gi* z-score and simulated p-value
z_scores = gdf['gi_star_z']
p_values = gdf['gi_star_p']
significant_95 = p_values < 0.05
significant_99 = p_values < 0.01

# Applied in order, so a 99% label overwrites the matching 95% label
label_rules = [
    # Hot spots (high fatality clustering)
    ('Hot Spot (95%)', (z_scores > 1.96) & significant_95),
    ('Hot Spot (99%)', (z_scores > 2.58) & significant_99),
    # Cold spots (low fatality clustering)
    ('Cold Spot (95%)', (z_scores < -1.96) & significant_95),
    ('Cold Spot (99%)', (z_scores < -2.58) & significant_99),
]

gdf['hotspot_type'] = 'Not Significant'
for label, rule_mask in label_rules:
    gdf.loc[rule_mask, 'hotspot_type'] = label

# Per-class tallies (reused later for the map legend and the saved statistics)
hotspot_counts = gdf['hotspot_type'].value_counts()
print('\n=== Hotspot Classification ===')
print(hotspot_counts)

n_hot = gdf['hotspot_type'].str.contains('Hot Spot').sum()
n_cold = gdf['hotspot_type'].str.contains('Cold Spot').sum()
n_not_significant = (gdf['hotspot_type'] == 'Not Significant').sum()

print(f'\nTotal hot spots: {n_hot:,}')
print(f'Total cold spots: {n_cold:,}')
print(f'Not significant: {n_not_significant:,}')

## 4. Analyze Hotspot Characteristics

In [None]:
# Isolate the hot spots and list the ten with the largest z-scores
is_hot_spot = gdf['hotspot_type'].str.contains('Hot Spot')
hot_spots = gdf[is_hot_spot].copy()

report_columns = [
    'ev_id', 'ev_date', 'ev_state', 'dec_latitude', 'dec_longitude',
    'inj_tot_f', 'gi_star_z', 'gi_star_p', 'hotspot_type',
]
top_hot_spots = hot_spots.nlargest(10, 'gi_star_z')[report_columns]

print('\n=== Top 10 Hot Spots by Z-Score ===')
print(top_hot_spots)

# States contributing the most hot spots
hot_spot_states = hot_spots['ev_state'].value_counts().head(10)
print('\n=== Hot Spots by State (Top 10) ===')
print(hot_spot_states)

# Same breakdown for the cold spots
is_cold_spot = gdf['hotspot_type'].str.contains('Cold Spot')
cold_spots = gdf[is_cold_spot].copy()
cold_spot_states = cold_spots['ev_state'].value_counts().head(10)
print('\n=== Cold Spots by State (Top 10) ===')
print(cold_spot_states)

## 5. Visualizations

In [None]:
# Figure 1: Gi* z-score distribution (histogram on the left, box plot by class on the right)
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: histogram with the two-sided significance cut-offs overlaid
hist_ax.hist(gdf['gi_star_z'], bins=50, color='steelblue', edgecolor='black', alpha=0.7)
threshold_lines = (
    (1.96, 'red', '95% confidence (±1.96)'),
    (2.58, 'darkred', '99% confidence (±2.58)'),
)
for cutoff, line_color, legend_label in threshold_lines:
    # Only the positive line carries a label so each level appears once in the legend
    hist_ax.axvline(cutoff, color=line_color, linestyle='--', label=legend_label, linewidth=2)
    hist_ax.axvline(-cutoff, color=line_color, linestyle='--', linewidth=2)
hist_ax.set_title('Getis-Ord Gi* Z-Score Distribution', fontsize=14, fontweight='bold')
hist_ax.set_xlabel('Z-Score', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

# Right panel: z-scores grouped by hotspot class
gdf.boxplot(column='gi_star_z', by='hotspot_type', ax=box_ax)
box_ax.set_title('Z-Scores by Hotspot Type', fontsize=14, fontweight='bold')
box_ax.set_xlabel('Hotspot Type', fontsize=12)
box_ax.set_ylabel('Z-Score', fontsize=12)
# Blank out the figure-level title after the grouped box plot
plt.suptitle('')
plt.xticks(rotation=45)

plt.tight_layout()
figure_path = FIG_DIR / 'getis_ord_z_distribution.png'
plt.savefig(figure_path, dpi=150, bbox_inches='tight')
plt.show()
print('✅ Saved: getis_ord_z_distribution.png')

In [None]:
# Figure 2: top-10 state counts, hot spots (left) next to cold spots (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

panels = (
    (ax1, hot_spot_states, 'red', 'Hot Spots by State (Top 10)'),
    (ax2, cold_spot_states, 'blue', 'Cold Spots by State (Top 10)'),
)
for axis, state_counts, bar_color, panel_title in panels:
    state_counts.plot(kind='barh', ax=axis, color=bar_color, edgecolor='black', alpha=0.7)
    axis.set_title(panel_title, fontsize=12, fontweight='bold')
    axis.set_xlabel('Count', fontsize=10)
    axis.set_ylabel('State', fontsize=10)
    # Flip the axis so the first (largest) count is drawn at the top
    axis.invert_yaxis()

plt.tight_layout()
figure_path = FIG_DIR / 'getis_ord_hotspots_by_state.png'
plt.savefig(figure_path, dpi=150, bbox_inches='tight')
plt.show()
print('✅ Saved: getis_ord_hotspots_by_state.png')

## 6. Interactive Hotspot Map

In [None]:
# Base map with a fixed start location and zoom level
m = folium.Map(location=[39.8283, -98.5795], zoom_start=4)

# Marker colour for each hotspot class
color_map = {
    'Hot Spot (99%)': 'darkred',
    'Hot Spot (95%)': 'red',
    'Cold Spot (95%)': 'blue',
    'Cold Spot (99%)': 'darkblue',
    'Not Significant': 'gray'
}

# Only significant events are drawn (for performance), capped at a
# reproducible random sample of 5000 markers
significant = gdf.loc[gdf['hotspot_type'].ne('Not Significant')].copy()
if len(significant) > 5000:
    significant = significant.sample(n=5000, random_state=42)

for _, event in significant.iterrows():
    marker_color = color_map[event['hotspot_type']]
    # Radius grows with |z| and is capped at 15
    marker_radius = min(abs(event['gi_star_z']) * 2, 15)
    popup_html = f"""<b>{event['hotspot_type']}</b><br>
                  Event: {event['ev_id']}<br>
                  Date: {event['ev_date']}<br>
                  State: {event['ev_state']}<br>
                  Fatalities: {event['inj_tot_f']}<br>
                  Z-Score: {event['gi_star_z']:.3f}<br>
                  P-Value: {event['gi_star_p']:.4f}"""

    marker = folium.CircleMarker(
        location=[event['dec_latitude'], event['dec_longitude']],
        radius=marker_radius,
        color=marker_color,
        fill=True,
        fillColor=marker_color,
        fillOpacity=0.6,
        popup=popup_html,
    )
    marker.add_to(m)

# Fixed-position legend; the counts come from the full classification
# (hotspot_counts), not from the possibly sampled markers
legend_html = f'''<div style="position: fixed; 
                bottom: 50px; right: 50px; width: 280px; height: auto; 
                background-color: white; border:2px solid grey; z-index:9999; 
                font-size:14px; padding: 10px">
                <p><b>Getis-Ord Gi* Hotspots</b></p>
                <p style="color:darkred">● Hot Spot (99%): {hotspot_counts.get("Hot Spot (99%)", 0):,}</p>
                <p style="color:red">● Hot Spot (95%): {hotspot_counts.get("Hot Spot (95%)", 0):,}</p>
                <p style="color:blue">● Cold Spot (95%): {hotspot_counts.get("Cold Spot (95%)", 0):,}</p>
                <p style="color:darkblue">● Cold Spot (99%): {hotspot_counts.get("Cold Spot (99%)", 0):,}</p>
                <p><i>Circle size = |Z-score|</i></p>
                </div>'''
legend_element = folium.Element(legend_html)
m.get_root().html.add_child(legend_element)

# Write the standalone HTML map
map_path = MAP_DIR / 'getis_ord_hotspots.html'
m.save(str(map_path))
print(f'✅ Saved: {map_path}')

# Leave the map as the cell's last expression so the notebook renders it inline
m

## 7. Save Results

In [None]:
# Save hotspot GeoJSON: one feature per event with its Gi* score and class
output_columns = [
    'ev_id', 'ev_date', 'ev_state', 'dec_latitude', 'dec_longitude',
    'inj_tot_f', 'gi_star_z', 'gi_star_p', 'hotspot_type', 'geometry',
]
gdf_output = gdf[output_columns].copy()
gdf_output.to_file(DATA_DIR / 'getis_ord_hotspots.geojson', driver='GeoJSON')
print('✅ Saved: getis_ord_hotspots.geojson')

# Save the top hot spots as CSV. Passing the path (instead of a handle opened
# with open(..., 'w')) lets pandas control newline handling and the encoding,
# which avoids blank rows and locale-dependent encodings on Windows.
top_hot_spots.to_csv(DATA_DIR / 'hotspot_statistics.csv', index=False)
print('✅ Saved: hotspot_statistics.csv')

# Save statistics: weights configuration, class counts and state breakdowns
stats = {
    'spatial_weights': {
        'method': 'K-nearest neighbors',
        'k': 8,
        'transform': 'row-standardized'
    },
    'hotspot_counts': hotspot_counts.to_dict(),
    'top_10_hot_spots': top_hot_spots.to_dict('records'),
    'hot_spot_states': hot_spot_states.to_dict(),
    'cold_spot_states': cold_spot_states.to_dict()
}


def _json_default(value):
    """Convert values the json module cannot encode natively.

    NumPy scalars are unwrapped to their Python equivalents; anything else
    (e.g. date-like values in ``ev_date``) is written as its string form.
    """
    if isinstance(value, np.generic):
        return value.item()
    return str(value)


# The summary dict used to be assembled and then discarded; persist it as JSON
# next to the CSV so the collected statistics are actually written out.
with open(DATA_DIR / 'hotspot_statistics.json', 'w', encoding='utf-8') as f:
    json.dump(stats, f, indent=2, default=_json_default)
print('✅ Saved: hotspot_statistics.json')

## Summary

**Getis-Ord Gi\* Hotspot Analysis Complete** ✅

**Files Created**:
- `data/getis_ord_hotspots.geojson` - Hotspot classifications
- `data/hotspot_statistics.csv` - Top hot spots
- `maps/getis_ord_hotspots.html` - Interactive map
- `figures/getis_ord_z_distribution.png` - Z-score distribution
- `figures/getis_ord_hotspots_by_state.png` - State-level hotspots

**Next Steps**:
- Moran's I Autocorrelation (04_morans_i_autocorrelation.ipynb)