# ZIP Code Neighbor Network Visualization

可视化预计算的 ZIP code 邻接关系网络

**目标：**
1. 验证空间关系计算的正确性
2. 识别社区集群
3. 理解 NYC 的地理结构

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from sqlalchemy import text

# Add src to path
sys.path.insert(0, str(Path.cwd().parent / 'src'))

from noah_converter.utils.config import load_config
from noah_converter.utils.db_connection import PostgreSQLConnection

# Plot styling: seaborn "whitegrid" theme and a 15x10 default figure size
# for every figure that does not pass its own figsize.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (15, 10)

print("✓ Imports successful")

## 1. 连接数据库并加载数据

In [None]:
# Load the project configuration and build the PostgreSQL connection wrapper
# from its `source_db` section.
# NOTE(review): whether the constructor actually opens a connection is not
# visible here; the message below is printed unconditionally.
config = load_config()
pg_conn = PostgreSQLConnection(config.source_db)

print("✓ Connected to PostgreSQL")

In [None]:
# Load the ZIP centroid table: one row per ZIP with its centroid coordinates,
# area and perimeter, ordered by ZIP code.
query = """
SELECT zip_code, center_lat, center_lon, area_km2, perimeter_km
FROM zip_centroids
ORDER BY zip_code
"""

# The connection is scoped to this single read.
with pg_conn.engine.connect() as conn:
    df_zips = pd.read_sql(sql=text(query), con=conn)

zip_total = len(df_zips)
print(f"✓ Loaded {zip_total} ZIP codes")
df_zips.head()

In [None]:
# Load the precomputed neighbor pairs: one row per (from_zip, to_zip) with the
# distance, the adjacency flag and the shared boundary length.
query = """
SELECT from_zip, to_zip, distance_km, is_adjacent, shared_boundary_km
FROM zip_neighbors
ORDER BY from_zip, to_zip
"""

# The connection is scoped to this single read.
with pg_conn.engine.connect() as conn:
    df_neighbors = pd.read_sql(sql=text(query), con=conn)

# Split the row count by the adjacency flag.
# NOTE(review): the `~` negation assumes is_adjacent is a NULL-free boolean
# column — confirm against the table definition.
adjacency_flags = df_neighbors['is_adjacent']
relationship_total = len(df_neighbors)
print(f"✓ Loaded {relationship_total} neighbor relationships")
print(f"   - Adjacent: {adjacency_flags.sum()}")
print(f"   - Nearby: {(~adjacency_flags).sum()}")
df_neighbors.head()

## 2. 基本统计分析

In [None]:
# Distance distributions: every neighbor pair (left panel) versus adjacent
# pairs only (right panel), each with its mean marked by a dashed red line.
all_distances = df_neighbors['distance_km']
adjacent_distances = df_neighbors.loc[df_neighbors['is_adjacent'], 'distance_km']

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# (data, title, bin count, extra hist() keyword arguments) for each panel
panels = [
    (all_distances, 'Distance Distribution: All Neighbors', 30, {}),
    (adjacent_distances, 'Distance Distribution: Adjacent ZIPs Only', 20, {'color': 'green'}),
]

for ax, (distances, title, bin_count, extra_style) in zip(axes, panels):
    mean_km = distances.mean()
    ax.hist(distances, bins=bin_count, edgecolor='black', alpha=0.7, **extra_style)
    ax.set_xlabel('Distance (km)')
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.axvline(mean_km, color='red', linestyle='--', label=f"Mean: {mean_km:.2f} km")
    ax.legend()

plt.tight_layout()
plt.show()

# Summary statistics for the same two series
print("Distance statistics:")
print(f"  All neighbors: {all_distances.describe()}")
print(f"\n  Adjacent only: {adjacent_distances.describe()}")

In [None]:
# Neighbor count per ZIP: a ZIP is counted once for every row in which it
# appears, whether as from_zip or as to_zip.
# NOTE(review): if zip_neighbors stores each pair in both directions, these
# counts are twice the number of distinct neighbors — confirm with the loader.
from_counts = df_neighbors['from_zip'].value_counts()
to_counts = df_neighbors['to_zip'].value_counts()
combined_counts = pd.concat([from_counts, to_counts]).groupby(level=0).sum()
neighbor_counts = combined_counts.sort_values(ascending=False)

print("Neighbor count statistics:")
print(neighbor_counts.describe())
print("\nTop 10 ZIPs by neighbor count:")
print(neighbor_counts.head(10))
print("\nBottom 10 ZIPs by neighbor count:")
print(neighbor_counts.tail(10))

# Histogram of the counts with the mean marked
mean_count = neighbor_counts.mean()
plt.figure(figsize=(12, 6))
neighbor_counts.plot.hist(bins=20, edgecolor='black', alpha=0.7)
plt.xlabel('Number of Neighbors')
plt.ylabel('Number of ZIPs')
plt.title('Distribution of Neighbor Counts per ZIP')
plt.axvline(mean_count, color='red', linestyle='--', label=f"Mean: {mean_count:.1f}")
plt.legend()
plt.show()

## 3. 网络图可视化

In [None]:
# Build the undirected NetworkX graph of ZIP neighbor relationships.
G = nx.Graph()

# Floor (in km) applied to a distance before it is inverted into an edge
# weight. A zero distance would otherwise produce a division by zero / an
# infinite weight, which breaks the weighted spring layout and the Louvain
# partition that run on this graph later.
MIN_DISTANCE_KM = 1e-3

# Nodes: one per ZIP, carrying centroid coordinates and area.
# Columns are iterated in lockstep instead of using iterrows(), which builds a
# Series per row and coerces every value in the row to a common dtype.
for zip_code, lat, lon, area in zip(
    df_zips['zip_code'],
    df_zips['center_lat'],
    df_zips['center_lon'],
    df_zips['area_km2'],
):
    G.add_node(zip_code, lat=lat, lon=lon, area=area)

# Edges: one per neighbor row. An edge whose endpoint has no centroid row is
# skipped: add_edge() would silently create a node without lat/lon/area, and
# the geographic plots index those attributes directly.
skipped_edges = []
for from_zip, to_zip, distance_km, is_adjacent in zip(
    df_neighbors['from_zip'],
    df_neighbors['to_zip'],
    df_neighbors['distance_km'],
    df_neighbors['is_adjacent'],
):
    if from_zip not in G or to_zip not in G:
        skipped_edges.append((from_zip, to_zip))
        continue
    G.add_edge(
        from_zip,
        to_zip,
        distance=distance_km,
        is_adjacent=is_adjacent,
        # Closer pairs get a larger weight.
        weight=1.0 / max(distance_km, MIN_DISTANCE_KM),
    )

# nx.is_connected() raises on a graph with no nodes, so report False instead.
graph_is_connected = G.number_of_nodes() > 0 and nx.is_connected(G)

print(f"✓ Created network graph:")
print(f"   - Nodes: {G.number_of_nodes()}")
print(f"   - Edges: {G.number_of_edges()}")
print(f"   - Density: {nx.density(G):.4f}")
print(f"   - Connected: {graph_is_connected}")
if skipped_edges:
    print(f"⚠️  Skipped {len(skipped_edges)} neighbor rows whose ZIP is missing from zip_centroids")
    print(f"   First few: {skipped_edges[:5]}")

In [None]:
from matplotlib.lines import Line2D

# Option 1: geographic layout — every node is drawn at its centroid
# (longitude on x, latitude on y).
pos_geo = {}
for node, attrs in G.nodes(data=True):
    pos_geo[node] = (attrs['lon'], attrs['lat'])

plt.figure(figsize=(16, 12))

# Partition the edges by the adjacency flag in a single pass.
adjacent_edges, nearby_edges = [], []
for u, v, attrs in G.edges(data=True):
    bucket = adjacent_edges if attrs['is_adjacent'] else nearby_edges
    bucket.append((u, v))

# Nearby (non-adjacent) edges first: thin, dashed, light gray.
nx.draw_networkx_edges(
    G, pos_geo,
    edgelist=nearby_edges,
    edge_color='lightgray',
    style='dashed',
    width=0.5,
    alpha=0.3,
)

# Adjacent edges on top: solid black.
nx.draw_networkx_edges(
    G, pos_geo,
    edgelist=adjacent_edges,
    edge_color='black',
    width=1.5,
    alpha=0.6,
)

# Nodes, with marker size proportional to the ZIP's area (area * 5).
node_sizes = [attrs['area'] * 5 for _, attrs in G.nodes(data=True)]
nx.draw_networkx_nodes(
    G, pos_geo,
    node_color='skyblue',
    node_size=node_sizes,
    edgecolors='navy',
    linewidths=1.5,
    alpha=0.8,
)

# Labels are left off because they would likely be too dense; enable if needed.
# nx.draw_networkx_labels(G, pos_geo, font_size=6, font_color='black')

plt.title('NYC ZIP Code Neighbor Network (Geographic Layout)', fontsize=16, fontweight='bold')
plt.xlabel('Longitude', fontsize=12)
plt.ylabel('Latitude', fontsize=12)
plt.axis('on')
plt.grid(True, alpha=0.3)

# Legend built from proxy lines that mirror the two edge styles.
adjacent_proxy = Line2D([0], [0], color='black', linewidth=1.5, label='Adjacent (touching)')
nearby_proxy = Line2D([0], [0], color='lightgray', linewidth=0.5, linestyle='dashed', label='Nearby (within 10km)')
legend_elements = [adjacent_proxy, nearby_proxy]
plt.legend(handles=legend_elements, loc='upper right', fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# Option 2: spring (force-directed) layout, which emphasizes network structure
# over geography. The seed is fixed so the layout is reproducible.
pos_spring = nx.spring_layout(G, k=0.5, iterations=50, seed=42)

plt.figure(figsize=(16, 12))

# Edges: nearby pairs dashed and faint, adjacent pairs solid.
nx.draw_networkx_edges(G, pos_spring, edgelist=nearby_edges,
                       edge_color='lightgray', style='dashed', width=0.3, alpha=0.2)
nx.draw_networkx_edges(G, pos_spring, edgelist=adjacent_edges,
                       edge_color='black', width=1.0, alpha=0.5)

# Nodes colored by degree (number of incident edges).
degrees = dict(G.degree())
node_colors = [degrees[node] for node in G.nodes()]
node_collection = nx.draw_networkx_nodes(G, pos_spring,
                                         node_color=node_colors,
                                         node_size=300,
                                         cmap='YlOrRd',
                                         edgecolors='black',
                                         linewidths=1,
                                         alpha=0.9)

plt.title('NYC ZIP Code Neighbor Network (Spring Layout)', fontsize=16, fontweight='bold')

# Use the drawn node collection as the colorbar's mappable. It already belongs
# to the current Axes and carries the colormap and value range actually used
# for the nodes. A free-standing ScalarMappable has no Axes, so plt.colorbar()
# cannot tell where to place the bar unless ax/cax is passed; recent
# Matplotlib releases raise ValueError in that case.
plt.colorbar(node_collection, label='Degree (number of neighbors)', shrink=0.8)
plt.axis('off')
plt.tight_layout()
plt.show()

## 4. 网络分析

In [None]:
# Centrality metrics for every ZIP node. All three functions are called with
# their default arguments, i.e. no edge attribute is passed as a weight or
# distance here.
degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
closeness_centrality = nx.closeness_centrality(G)

# Assemble one row per ZIP. Each metric is looked up by node key, so the
# columns line up regardless of the iteration order of the individual dicts.
zip_codes = list(degree_centrality)
df_centrality = pd.DataFrame({
    'zip_code': zip_codes,
    'degree': [degree_centrality[z] for z in zip_codes],
    'betweenness': [betweenness_centrality[z] for z in zip_codes],
    'closeness': [closeness_centrality[z] for z in zip_codes],
})
df_centrality = df_centrality.sort_values('degree', ascending=False)

print("Top 10 ZIPs by Degree Centrality (most connected):")
print(df_centrality.head(10))

by_betweenness = df_centrality.sort_values('betweenness', ascending=False)
print("\nTop 10 ZIPs by Betweenness Centrality (bridge positions):")
print(by_betweenness.head(10))

In [None]:
# Community detection with the Louvain algorithm (optional python-louvain
# package, imported under the module name `community`).
#
# Only the import sits inside the `try`: an ImportError raised later, while
# partitioning or plotting, must not be reported as "package not installed".
try:
    import community as community_louvain
except ImportError:
    print("⚠️  python-louvain not installed. Install with: pip install python-louvain")
else:
    # Fixed random_state so the partition is reproducible between runs, in
    # line with the seeded spring layout used earlier in this notebook.
    communities = community_louvain.best_partition(G, random_state=42)
    num_communities = len(set(communities.values()))

    print(f"✓ Detected {num_communities} communities")

    # Draw the communities on the geographic layout.
    plt.figure(figsize=(16, 12))

    # One integer community id per node; the colormap turns ids into colors.
    node_colors = [communities[node] for node in G.nodes()]

    nx.draw_networkx_edges(G, pos_geo, edge_color='lightgray', width=0.5, alpha=0.3)
    nx.draw_networkx_nodes(G, pos_geo,
                           node_color=node_colors,
                           node_size=200,
                           cmap='tab20',
                           edgecolors='black',
                           linewidths=1,
                           alpha=0.8)

    plt.title(f'ZIP Code Communities ({num_communities} clusters detected)', fontsize=16, fontweight='bold')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.axis('on')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

## 5. 导出结果用于 Neo4j 迁移

In [None]:
# Data quality checks on the loaded tables before the Neo4j migration.
# Thresholds (km) used by the checks below.
MAX_NEIGHBOR_DISTANCE_KM = 15   # any neighbor pair farther apart than this is flagged
MAX_ADJACENT_DISTANCE_KM = 10   # adjacent pairs farther apart than this are suspicious

print("Data Quality Checks:")
print("=" * 60)

# 1. Does every ZIP appear in at least one neighbor row?
all_zips = set(df_zips['zip_code'])
zips_with_neighbors = set(df_neighbors['from_zip']) | set(df_neighbors['to_zip'])
isolated_zips = all_zips - zips_with_neighbors

print(f"\n1. Isolated ZIPs (no neighbors): {len(isolated_zips)}")
if isolated_zips:
    print(f"   {isolated_zips}")

# 2. Are the neighbor distances plausible?
unreasonable_distances = df_neighbors[df_neighbors['distance_km'] > MAX_NEIGHBOR_DISTANCE_KM]
print(f"\n2. Unreasonably far neighbors (>{MAX_NEIGHBOR_DISTANCE_KM}km): {len(unreasonable_distances)}")
if not unreasonable_distances.empty:
    print(unreasonable_distances.head())

# 3. Are pairs flagged as adjacent actually close to each other?
adjacent_far = df_neighbors[
    (df_neighbors['is_adjacent']) & (df_neighbors['distance_km'] > MAX_ADJACENT_DISTANCE_KM)
]
print(f"\n3. Adjacent ZIPs with distance >{MAX_ADJACENT_DISTANCE_KM}km (suspicious): {len(adjacent_far)}")
if not adjacent_far.empty:
    print(adjacent_far)

# 4. Does every ZIP referenced by a neighbor row have a centroid row?
#    A relationship needs both of its endpoints to exist.
unknown_zips = zips_with_neighbors - all_zips
print(f"\n4. Neighbor ZIPs missing from zip_centroids: {len(unknown_zips)}")
if unknown_zips:
    print(f"   {unknown_zips}")

# Only announce readiness when no check flagged anything; otherwise the
# success banner would contradict the findings printed above.
finding_count = (
    len(isolated_zips) + len(unreasonable_distances) + len(adjacent_far) + len(unknown_zips)
)

print("\n" + "=" * 60)
if finding_count == 0:
    print("✅ Data quality checks complete!")
    print("\nReady for Neo4j migration.")
else:
    print(f"⚠️  Data quality checks complete: {finding_count} finding(s) need review.")
    print("\nReview the findings above before running the Neo4j migration.")

In [None]:
# Cleanup: close the PostgreSQL connection wrapper created at the top of the
# notebook. Cells that query the database cannot be re-run after this
# without re-creating `pg_conn`.
pg_conn.close()
print("✓ Database connection closed")

## 总结

通过这个 notebook，我们：

1. ✅ 验证了 ZIP 邻接关系的计算正确性
2. ✅ 可视化了 NYC ZIP code 网络结构
3. ✅ 分析了网络拓扑特性（中心性、社区）
4. ✅ 确认数据质量，准备迁移到 Neo4j

**下一步：**
- 运行完整数据迁移脚本
- 在 Neo4j 中创建 NEIGHBORS 关系