# Debug: NaN Cities in Network

This notebook investigates why the top city by TEU is showing as NaN.

**Potential issues to check:**
1. Missing city names in the raw data
2. Incorrect node-to-city mapping
3. Data cleaning issues in `build_migration_network_for_year`
4. Index misalignment between adjacency matrix and city names

## 1. Setup

In [1]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Add src directory to path
notebook_dir = Path.cwd()
if 'notebooks' in str(notebook_dir):
    src_dir = notebook_dir.parent
else:
    src_dir = notebook_dir / 'src'

sys.path.insert(0, str(src_dir))

from config import BaseConfig
from utils.graph import build_migration_network_for_year

config = BaseConfig()
print("Setup complete!")

Setup complete!


## 2. Load Raw Data

In [2]:
# Load migration data
df = pd.read_csv(config.data_csv_path)

print(f"Total records: {len(df)}")
print(f"\nColumn names:")
print(df.columns.tolist())

# Check for missing values in city-related columns
print(f"\nMissing values in key columns:")
city_cols = ['current_city', 'hometown_Name_Prefecture', 'first_Name_Prefecture']
for col in city_cols:
    if col in df.columns:
        missing = df[col].isna().sum()
        print(f"  {col}: {missing} ({missing/len(df)*100:.1f}%)")

Total records: 169989

Column names:
['current_province', 'current_city', 'current_county', 'current_members_live_with', 'gender', 'year_born', 'edu_level', 'hometown_code', 'hometown', 'year_current_flow', 'month_current_flow', 'average_family_cost_per_month', 'average_family_income_per_month', 'year_first_flow', 'month_first_flow', 'first_flow_code', 'first_flow_location', 'num_flows_total', 'if_change_household_local', 'if_stay', 'how_long_to_stay', 'pc_key', 'hometown_Name_Province', 'hometown_Name_Prefecture', 'hometown_Name_County', 'hometown_lon', 'hometown_lat', 'first_Name_Province', 'first_Name_Prefecture', 'first_Name_County', 'first_lon', 'first_lat', 'current_lon', 'current_lat']

Missing values in key columns:
  current_city: 0 (0.0%)
  hometown_Name_Prefecture: 17600 (10.4%)
  first_Name_Prefecture: 33994 (20.0%)


## 3. Examine Network Building for Year 2015

In [3]:
# Test with year 2015 (one of the years being processed)
TEST_YEAR = 2015

# Build network using the utility function
G = build_migration_network_for_year(df, TEST_YEAR)

print(f"Network for year {TEST_YEAR}:")
print(f"  Nodes: {G.number_of_nodes()}")
print(f"  Edges: {G.number_of_edges()}")
print(f"\nNode types:")
print(f"  Type: {type(list(G.nodes())[0])}")

# Get list of nodes
node_list = list(G.nodes())
print(f"\nFirst 20 nodes:")
for i, node in enumerate(node_list[:20]):
    print(f"  {i}: {repr(node)} (type: {type(node).__name__})")

Network for year 2015:
  Nodes: 378
  Edges: 5486

Node types:
  Type: <class 'str'>

First 20 nodes:
  0: '鹰潭市' (type: str)
  1: '南昌市' (type: str)
  2: '深圳市' (type: str)
  3: '九江市' (type: str)
  4: '保定市' (type: str)
  5: '吉林市' (type: str)
  6: '第十师' (type: str)
  7: '赣州市' (type: str)
  8: nan (type: float)
  9: '菏泽市' (type: str)
  10: '苏州市' (type: str)
  11: '温州市' (type: str)
  12: '莆田市' (type: str)
  13: '开封市' (type: str)
  14: '百色市' (type: str)
  15: '阜新市' (type: str)
  16: '延边朝鲜族自治州' (type: str)
  17: '延边州' (type: str)
  18: '绥化市' (type: str)
  19: '鞍山市' (type: str)


## 4. Check for NaN Nodes

In [4]:
# Count how many nodes are NaN or None
nan_count = 0
none_count = 0
nan_nodes = []

for node in node_list:
    if pd.isna(node):
        nan_count += 1
        nan_nodes.append(node)
    elif node is None:
        none_count += 1
    elif str(node).lower() == 'nan':
        nan_count += 1
        nan_nodes.append(node)

print(f"Node quality check:")
print(f"  Total nodes: {len(node_list)}")
print(f"  NaN nodes: {nan_count}")
print(f"  None nodes: {none_count}")
print(f"  Valid nodes: {len(node_list) - nan_count - none_count}")

if nan_nodes:
    print(f"\n⚠️ Found {len(nan_nodes)} NaN nodes!")
    print(f"Sample NaN nodes: {nan_nodes[:5]}")

Node quality check:
  Total nodes: 378
  NaN nodes: 1
  None nodes: 0
  Valid nodes: 377

⚠️ Found 1 NaN nodes!
Sample NaN nodes: [nan]


## 5. Compute TEU and Check Top Nodes

In [5]:
# Convert to adjacency matrix
adj_matrix = nx.to_numpy_array(G, weight='weight')

# Compute TEU (node strength)
out_strength = adj_matrix.sum(axis=1)
in_strength = adj_matrix.sum(axis=0)
teu = out_strength + in_strength

print(f"TEU statistics:")
print(f"  Min: {teu.min():.0f}")
print(f"  Max: {teu.max():.0f}")
print(f"  Mean: {teu.mean():.0f}")
print(f"  Median: {np.median(teu):.0f}")

# Get top 10 indices by TEU
top_10_indices = np.argsort(teu)[-10:][::-1]

print(f"\nTop 10 nodes by TEU:")
print(f"{'Rank':<6} {'Index':<8} {'City Name':<30} {'TEU':<10}")
print("="*60)
for rank, idx in enumerate(top_10_indices, 1):
    city_name = node_list[idx]
    print(f"{rank:<6} {idx:<8} {repr(city_name):<30} {teu[idx]:<10.0f}")

TEU statistics:
  Min: 1
  Max: 6086
  Mean: 85
  Median: 38

Top 10 nodes by TEU:
Rank   Index    City Name                      TEU       
1      8        nan                            6086      
2      41       '北京市'                          864       
3      31       '上海市'                          771       
4      2        '深圳市'                          561       
5      23       '重庆市'                          527       
6      195      '广州市'                          526       
7      20       '天津市'                          465       
8      167      '成都市'                          358       
9      10       '苏州市'                          357       
10     138      '郑州市'                          335       


## 6. Investigate Raw Data for NaN Nodes

In [None]:
print("Analyzing raw data records that create NaN node...\\n")

df_year = df[df['year_current_flow'] == TEST_YEAR].copy()

# Find records where current_city is NaN (creates outgoing edges from NaN)
nan_current = df_year[df_year['current_city'].isna()].copy()
print(f"Records with NaN current_city: {len(nan_current)}")

# Find records where hometown is NaN (creates incoming edges to NaN)
nan_hometown = df_year[df_year['hometown_Name_Prefecture'].isna()].copy()
print(f"Records with NaN hometown: {len(nan_hometown)}")

print(f"\\n{'='*60}")
print("What does NaN current_city represent?")
print(f"{'='*60}")

if len(nan_current) > 0:
    # Check if these people have ANY location data
    print(f"\\nLocation data availability for NaN current_city records:")
    print(f"  Has current_province: {nan_current['current_province'].notna().sum()} / {len(nan_current)}")
    print(f"  Has current_county: {nan_current['current_county'].notna().sum()} / {len(nan_current)}")
    print(f"  Has current_lon/lat: {nan_current['current_lon'].notna().sum()} / {len(nan_current)}")
    
    # Show sample provinces/counties
    if nan_current['current_province'].notna().sum() > 0:
        print(f"\\n  Top provinces for NaN current_city:")
        print(nan_current['current_province'].value_counts().head(10))
    
    if nan_current['current_county'].notna().sum() > 0:
        print(f"\\n  Top counties for NaN current_city:")
        print(nan_current['current_county'].value_counts().head(10))
    
    # Where are they coming from?
    print(f"\\n  Where are these people coming from (hometown)?")
    if 'hometown_Name_Prefecture' in nan_current.columns:
        print(nan_current['hometown_Name_Prefecture'].value_counts().head(10))

print(f"\\n{'='*60}")
print("What does NaN hometown represent?")
print(f"{'='*60}")

if len(nan_hometown) > 0:
    # Where are they going?
    print(f"\\nLocation data for NaN hometown records:")
    print(f"  Current cities (where they're going):")
    print(nan_hometown['current_city'].value_counts().head(10))
    
    # Check if they have ANY hometown data
    print(f"\\n  Has hometown_province: {nan_hometown['hometown'].notna().sum()} / {len(nan_hometown)}")
    print(f"  Has first_flow location: {nan_hometown['first_Name_Prefecture'].notna().sum()} / {len(nan_hometown)}")

print(f"\\n{'='*60}")
print("INTERPRETATION")
print(f"{'='*60}")
print("\\nThe NaN node represents:")
print("  1. People whose current city is unknown (missing data)")
print("  2. People whose hometown is unknown (missing data)")
print("\\nThis could be due to:")
print("  - Incomplete survey responses")
print("  - Data entry errors")
print("  - Privacy redactions")
print("  - Coding issues in data collection")
print("\\nRecommendation:")
print("  → Filter out these records OR")
print("  → Impute missing cities based on province/county data")

## 13. Analyze What NaN Represents

In [None]:
if nan_nodes and len(connected_cities) > 0:
    print("Creating map visualization...\n")
    
    # Load city coordinates
    city_coords_map = {}
    for city in connected_cities:
        city_data = df[df['current_city'] == city][['current_lon', 'current_lat']].dropna()
        if len(city_data) == 0:
            # Try hometown
            city_data = df[df['hometown_Name_Prefecture'] == city][['hometown_lon', 'hometown_lat']].dropna()
            if len(city_data) > 0:
                city_coords_map[city] = (city_data.iloc[0]['hometown_lon'], city_data.iloc[0]['hometown_lat'])
        else:
            city_coords_map[city] = (city_data.iloc[0]['current_lon'], city_data.iloc[0]['current_lat'])
    
    print(f"Found coordinates for {len(city_coords_map)} / {len(connected_cities)} cities")
    
    # Create the map
    fig, ax = plt.subplots(figsize=(18, 14))
    
    # Load China provinces shapefile
    try:
        china_map = gpd.read_file(config.china_provinces_path)
        china_map.plot(ax=ax, color='#f0f0f0', edgecolor='#d9d9d9', linewidth=0.8)
    except Exception as e:
        print(f"Warning: Could not load China map shapefile: {e}")
        print("Continuing without background map...")
    
    ax.set_title(f"Migration Flows Connected to NaN Node (Year {TEST_YEAR})", 
                 fontsize=20, fontweight='bold', pad=20)
    
    # Draw cities connected to NaN
    for city, (lon, lat) in city_coords_map.items():
        if lon < 70 or lon > 140 or lat < 10 or lat > 60:
            continue  # Skip bad coordinates
        
        flow = city_flows.get(city, 0)
        size = min(flow * 5 + 100, 1000)  # Size based on flow volume
        
        # Color by incoming vs outgoing
        is_incoming = any(u == city for u, v, _ in incoming_edges if not pd.isna(u))
        is_outgoing = any(v == city for u, v, _ in outgoing_edges if not pd.isna(v))
        
        if is_incoming and is_outgoing:
            color = '#9333ea'  # Purple - both directions
        elif is_incoming:
            color = '#ef4444'  # Red - incoming (people going TO NaN from this city)
        else:
            color = '#3b82f6'  # Blue - outgoing (people from NaN going to this city)
        
        ax.scatter(lon, lat, s=size, c=color, edgecolors='white', 
                   linewidth=2, zorder=5, alpha=0.7)
        
        # Label top 10 cities
        if flow >= sorted(city_flows.values(), reverse=True)[min(9, len(city_flows)-1)]:
            ax.text(lon + 0.5, lat + 0.3, f"{city}\\n({flow})", 
                    fontsize=10, fontweight='bold', color='#333333', 
                    bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.7),
                    zorder=6)
    
    # Add legend
    from matplotlib.lines import Line2D
    legend_elements = [
        Line2D([0], [0], marker='o', color='w', markerfacecolor='#ef4444', 
               markersize=12, label=f'To NaN (incoming)'),
        Line2D([0], [0], marker='o', color='w', markerfacecolor='#3b82f6', 
               markersize=12, label=f'From NaN (outgoing)'),
        Line2D([0], [0], marker='o', color='w', markerfacecolor='#9333ea', 
               markersize=12, label='Both directions'),
    ]
    ax.legend(handles=legend_elements, loc='upper right', fontsize=12, framealpha=0.9)
    
    # Add text annotation
    text_info = f"Total migration flow: {sum(city_flows.values())} people\\n"
    text_info += f"Connected cities: {len(city_coords_map)}\\n"
    text_info += f"Incoming: {len(incoming_edges)} | Outgoing: {len(outgoing_edges)}"
    ax.text(0.02, 0.98, text_info, transform=ax.transAxes, 
            fontsize=11, verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))
    
    # Focus on mainland China
    ax.set_xlim(73, 136)
    ax.set_ylim(18, 54)
    ax.set_xlabel('Longitude', fontsize=12)
    ax.set_ylabel('Latitude', fontsize=12)
    
    plt.tight_layout()
    plt.show()
    
    print("\\n✓ Map visualization complete!")
else:
    print("Cannot create map: No NaN nodes or no connected cities with coordinates.")

In [None]:
import geopandas as gpd
import matplotlib.patches as patches

# Check if we have NaN nodes to visualize
if nan_nodes:
    print(f"Visualizing connections for NaN node on China map...\n")
    
    # Get the NaN node (just first one if multiple)
    nan_node = nan_nodes[0]
    
    # Get all edges involving the NaN node
    incoming_edges = list(G.in_edges(nan_node, data=True))
    outgoing_edges = list(G.out_edges(nan_node, data=True))
    
    print(f"NaN Node Statistics:")
    print(f"  Incoming edges: {len(incoming_edges)}")
    print(f"  Outgoing edges: {len(outgoing_edges)}")
    print(f"  Total edges: {len(incoming_edges) + len(outgoing_edges)}")
    
    # Get cities connected to NaN node
    connected_cities = set()
    for u, v, data in incoming_edges:
        if not pd.isna(u):
            connected_cities.add(u)
    for u, v, data in outgoing_edges:
        if not pd.isna(v):
            connected_cities.add(v)
    
    print(f"  Unique cities connected: {len(connected_cities)}")
    
    # Get top cities by flow
    city_flows = {}
    for u, v, data in incoming_edges:
        if not pd.isna(u):
            weight = data.get('weight', 1)
            city_flows[u] = city_flows.get(u, 0) + weight
    for u, v, data in outgoing_edges:
        if not pd.isna(v):
            weight = data.get('weight', 1)
            city_flows[v] = city_flows.get(v, 0) + weight
    
    top_cities = sorted(city_flows.items(), key=lambda x: x[1], reverse=True)[:20]
    print(f"\n  Top 20 cities by flow with NaN node:")
    for rank, (city, flow) in enumerate(top_cities, 1):
        print(f"    {rank}. {city}: {flow} people")
    
else:
    print("No NaN nodes found to visualize.")

## 12. Visualize NaN Node Connections on China Map

In [6]:
# If we found NaN nodes, let's see where they come from in the raw data
print("Checking raw data for year 2015...\n")

# Filter for year 2015
df_year = df[df['year_current_flow'] == TEST_YEAR].copy()

print(f"Total records for {TEST_YEAR}: {len(df_year)}")

# Check current_city column
print(f"\nCurrent city analysis:")
print(f"  Total values: {len(df_year)}")
print(f"  Missing (NaN): {df_year['current_city'].isna().sum()}")
print(f"  Unique cities: {df_year['current_city'].nunique()}")

# Check hometown column
print(f"\nHometown city analysis:")
if 'hometown_Name_Prefecture' in df_year.columns:
    print(f"  Total values: {len(df_year)}")
    print(f"  Missing (NaN): {df_year['hometown_Name_Prefecture'].isna().sum()}")
    print(f"  Unique cities: {df_year['hometown_Name_Prefecture'].nunique()}")

# Show sample of records with missing current_city
missing_current = df_year[df_year['current_city'].isna()]
if len(missing_current) > 0:
    print(f"\n⚠️ Found {len(missing_current)} records with missing current_city")
    print(f"\nSample records:")
    cols_to_show = ['current_city', 'hometown_Name_Prefecture', 'first_Name_Prefecture', 
                    'year_current_flow', 'current_members_live_with']
    print(missing_current[cols_to_show].head(10))

Checking raw data for year 2015...

Total records for 2015: 18651

Current city analysis:
  Total values: 18651
  Missing (NaN): 0
  Unique cities: 343

Hometown city analysis:
  Total values: 18651
  Missing (NaN): 1920
  Unique cities: 327


## 7. Trace How Network is Built

In [7]:
# Let's manually trace through how the network is built
print("Manual network building trace:\n")

# Read the source code of build_migration_network_for_year
import inspect
source = inspect.getsource(build_migration_network_for_year)
print("Function source code:")
print("="*60)
print(source)

Manual network building trace:

Function source code:
def build_migration_network_for_year(df: pd.DataFrame, year: int) -> nx.DiGraph:
    """
    Build directed migration network for a specific year.
    
    Parameters:
    -----------
    df : DataFrame
        Migration data
    year : int
        Year to analyze
        either year_first_flow or year_current_flow matches this year.
    
    Returns:
    --------
    G : nx.DiGraph
        Network with nodes (cities) and edges (migration flows)
    """
    G = nx.DiGraph()
    
    # Filter data for the year
    year_data = df[
        ((df['year_first_flow'] == year) | (df['year_current_flow'] == year))
    ]

    
    for _, row in year_data.iterrows():
        # Extract city names and coordinates
        hometown = row['hometown_Name_Prefecture']
        first_city = row['first_Name_Prefecture']
        current_city = row['current_city']
        
        hometown_pos = (row['hometown_lon'], row['hometown_lat'])
        first_pos

## 8. Check Edge Construction

In [8]:
# Let's see what edges are being created with NaN nodes
print("Checking edges involving NaN nodes...\n")

# Get edges from the graph
edges_with_nan = []
for u, v, data in G.edges(data=True):
    if pd.isna(u) or pd.isna(v) or str(u).lower() == 'nan' or str(v).lower() == 'nan':
        edges_with_nan.append((u, v, data.get('weight', 0)))

print(f"Edges involving NaN nodes: {len(edges_with_nan)}")

if edges_with_nan:
    print(f"\nSample edges with NaN:")
    for u, v, w in edges_with_nan[:10]:
        print(f"  {repr(u)} -> {repr(v)} (weight={w})")
    
    # Check degree of NaN node
    for node in nan_nodes[:1]:  # Just check first NaN node
        in_degree = G.in_degree(node, weight='weight')
        out_degree = G.out_degree(node, weight='weight')
        print(f"\nDegree statistics for NaN node:")
        print(f"  In-degree (weighted): {in_degree}")
        print(f"  Out-degree (weighted): {out_degree}")
        print(f"  Total strength: {in_degree + out_degree}")

Checking edges involving NaN nodes...

Edges involving NaN nodes: 573

Sample edges with NaN:
  '南昌市' -> nan (weight=5)
  '深圳市' -> nan (weight=2)
  '九江市' -> nan (weight=3)
  '保定市' -> nan (weight=5)
  '吉林市' -> nan (weight=7)
  '赣州市' -> nan (weight=9)
  nan -> '赣州市' (weight=19)
  nan -> '莆田市' (weight=3)
  nan -> '黔东南苗族侗族自治州' (weight=4)
  nan -> '大连市' (weight=32)

Degree statistics for NaN node:
  In-degree (weighted): 1322
  Out-degree (weighted): 4764
  Total strength: 6086


## 9. Check Raw Data Records Creating NaN Edges

In [None]:
# Find records where either current_city or hometown is NaN
print("Records with NaN in migration flow:\n")

df_year = df[df['year_current_flow'] == TEST_YEAR].copy()

# Records where current_city is NaN
nan_current = df_year[df_year['current_city'].isna()]
print(f"Records with NaN current_city: {len(nan_current)}")

# Records where hometown is NaN
if 'hometown_Name_Prefecture' in df_year.columns:
    nan_hometown = df_year[df_year['hometown_Name_Prefecture'].isna()]
    print(f"Records with NaN hometown: {len(nan_hometown)}")
    
    # DETAILED ANALYSIS OF NaN HOMETOWN RECORDS
    print(f"\n{'='*80}")
    print("DETAILED ANALYSIS: Why is hometown_Name_Prefecture NaN?")
    print(f"{'='*80}")
    
    if len(nan_hometown) > 0:
        # Check what hometown-related data IS available
        print(f"\nAvailability of hometown-related fields:")
        print(f"  hometown_code: {nan_hometown['hometown_code'].notna().sum()} / {len(nan_hometown)} records")
        print(f"  hometown (raw): {nan_hometown['hometown'].notna().sum()} / {len(nan_hometown)} records")
        print(f"  hometown_Name_Province: {nan_hometown['hometown_Name_Province'].notna().sum()} / {len(nan_hometown)} records")
        print(f"  hometown_Name_County: {nan_hometown['hometown_Name_County'].notna().sum()} / {len(nan_hometown)} records")
        print(f"  hometown_lon: {nan_hometown['hometown_lon'].notna().sum()} / {len(nan_hometown)} records")
        print(f"  hometown_lat: {nan_hometown['hometown_lat'].notna().sum()} / {len(nan_hometown)} records")
        
        # Show sample of actual values for these fields
        print(f"\n{'='*80}")
        print("Sample of NaN hometown records (showing all hometown-related fields):")
        print(f"{'='*80}")
        hometown_cols = ['hometown_code', 'hometown', 'hometown_Name_Province', 
                         'hometown_Name_Prefecture', 'hometown_Name_County',
                         'hometown_lon', 'hometown_lat', 'current_city']
        print(nan_hometown[hometown_cols].head(20).to_string())
        
        # Check if there's a pattern in hometown_code
        print(f"\n{'='*80}")
        print("Hometown codes for NaN prefecture records:")
        print(f"{'='*80}")
        print(nan_hometown['hometown_code'].value_counts().head(20))
        
        # Check if there's a pattern in raw hometown field
        print(f"\n{'='*80}")
        print("Raw hometown values for NaN prefecture records:")
        print(f"{'='*80}")
        print(nan_hometown['hometown'].value_counts().head(20))
        
        # Check provinces
        print(f"\n{'='*80}")
        print("Hometown provinces for NaN prefecture records:")
        print(f"{'='*80}")
        print(nan_hometown['hometown_Name_Province'].value_counts().head(20))

# Show distribution of these records
if len(nan_current) > 0:
    print(f"\nNaN current_city - Hometown distribution:")
    if 'hometown_Name_Prefecture' in df_year.columns:
        hometown_dist = nan_current['hometown_Name_Prefecture'].value_counts().head(10)
        print(hometown_dist)

if 'hometown_Name_Prefecture' in df_year.columns and len(nan_hometown) > 0:
    print(f"\n{'='*80}")
    print("NaN hometown - Current city distribution:")
    print(f"{'='*80}")
    current_dist = nan_hometown['current_city'].value_counts().head(20)
    print(current_dist)

## 10. Proposed Solutions

In [None]:
# Show COMPLETE records for people with NaN hometown to understand the full picture
print("="*80)
print("COMPLETE RECORDS: First 10 people with NaN hometown_Name_Prefecture")
print("="*80)

if 'nan_hometown' in locals() and len(nan_hometown) > 0:
    # Display all columns for first 10 records
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', 50)
    
    print("\nShowing ALL fields for 10 sample records:\n")
    sample_records = nan_hometown.head(10)
    
    for idx, (row_idx, row) in enumerate(sample_records.iterrows(), 1):
        print(f"\n{'─'*80}")
        print(f"Record #{idx} (Index: {row_idx})")
        print(f"{'─'*80}")
        for col in df.columns:
            value = row[col]
            # Highlight NaN values
            if pd.isna(value):
                print(f"  {col:<35} : ❌ NaN")
            else:
                print(f"  {col:<35} : {value}")
    
    # Reset pandas display options
    pd.reset_option('display.max_columns')
    pd.reset_option('display.width')
    pd.reset_option('display.max_colwidth')
    
    # Summary statistics
    print(f"\n{'='*80}")
    print("PATTERN ANALYSIS")
    print(f"{'='*80}")
    
    # Check if NaN hometown is associated with certain provinces or regions
    print("\nDo these NaN hometown records have other location markers?")
    print(f"  Have first_flow location: {nan_hometown['first_Name_Prefecture'].notna().sum()} / {len(nan_hometown)}")
    print(f"  Have current location: {nan_hometown['current_city'].notna().sum()} / {len(nan_hometown)}")
    
    # Check year patterns
    print(f"\nYear patterns:")
    if 'year_first_flow' in nan_hometown.columns:
        print(f"  year_first_flow distribution:")
        print(nan_hometown['year_first_flow'].value_counts().head(10))
    
    # Check demographic patterns
    print(f"\nDemographic patterns:")
    if 'gender' in nan_hometown.columns:
        print(f"  Gender: {nan_hometown['gender'].value_counts()}")
    if 'edu_level' in nan_hometown.columns:
        print(f"\n  Education level:")
        print(nan_hometown['edu_level'].value_counts())
        
else:
    print("No NaN hometown records to analyze.")

## 9.5. Examine Complete Records with NaN Hometown

In [10]:
print("="*60)
print("DIAGNOSTIC SUMMARY")
print("="*60)

print(f"\n1. Data Quality Issues Found:")
if nan_count > 0:
    print(f"   ✗ {nan_count} NaN nodes in network ({nan_count/len(node_list)*100:.1f}%)")
else:
    print(f"   ✓ No NaN nodes found")

if len(missing_current) > 0:
    print(f"   ✗ {len(missing_current)} records with missing current_city")
else:
    print(f"   ✓ No missing current_city values")

print(f"\n2. Recommended Solutions:")
print("   Option A: Filter out NaN nodes when building network")
print("   Option B: Replace NaN with a placeholder (e.g., 'Unknown City')")
print("   Option C: Exclude records with missing city information")

print(f"\n3. Implementation:")
print("   Modify build_migration_network_for_year() to handle NaN values")
print("   Add data validation before creating edges")

# Quantify impact
if nan_count > 0 and len(nan_nodes) > 0:
    nan_node_idx = node_list.index(nan_nodes[0])
    nan_teu = teu[nan_node_idx]
    print(f"\n4. Impact:")
    print(f"   NaN node has TEU = {nan_teu:.0f}")
    print(f"   This is {nan_teu/teu.sum()*100:.1f}% of total migration flow")
    print(f"   Rank: #{len(teu) - np.searchsorted(np.sort(teu), nan_teu)} out of {len(teu)}")

DIAGNOSTIC SUMMARY

1. Data Quality Issues Found:
   ✗ 1 NaN nodes in network (0.3%)
   ✓ No missing current_city values

2. Recommended Solutions:
   Option A: Filter out NaN nodes when building network
   Option B: Replace NaN with a placeholder (e.g., 'Unknown City')
   Option C: Exclude records with missing city information

3. Implementation:
   Modify build_migration_network_for_year() to handle NaN values
   Add data validation before creating edges

4. Impact:
   NaN node has TEU = 6086
   This is 18.8% of total migration flow
   Rank: #1 out of 378


## 11. Test Fix: Filter NaN Nodes

In [11]:
# Test solution: rebuild network excluding NaN nodes
print("Testing fix: Exclude NaN nodes\n")

# Filter data to exclude NaN cities
df_year_clean = df_year[
    df_year['current_city'].notna() & 
    df_year['hometown_Name_Prefecture'].notna()
].copy()

print(f"Original records: {len(df_year)}")
print(f"Clean records: {len(df_year_clean)}")
print(f"Removed: {len(df_year) - len(df_year_clean)} ({(len(df_year) - len(df_year_clean))/len(df_year)*100:.1f}%)")

# Rebuild network with clean data
# This is a simple test - actual fix should be in the build function
G_clean = nx.DiGraph()

for _, row in df_year_clean.iterrows():
    current = row['current_city']
    hometown = row['hometown_Name_Prefecture']
    
    # Skip if either is NaN (double check)
    if pd.isna(current) or pd.isna(hometown):
        continue
    
    # Add edge
    if G_clean.has_edge(hometown, current):
        G_clean[hometown][current]['weight'] += 1
    else:
        G_clean.add_edge(hometown, current, weight=1)

print(f"\nCleaned network:")
print(f"  Nodes: {G_clean.number_of_nodes()}")
print(f"  Edges: {G_clean.number_of_edges()}")

# Check for NaN nodes in cleaned network
node_list_clean = list(G_clean.nodes())
nan_in_clean = sum(1 for node in node_list_clean if pd.isna(node))
print(f"  NaN nodes: {nan_in_clean}")

# Compute TEU for clean network
adj_clean = nx.to_numpy_array(G_clean, weight='weight')
teu_clean = adj_clean.sum(axis=1) + adj_clean.sum(axis=0)

top_10_clean = np.argsort(teu_clean)[-10:][::-1]
print(f"\nTop 10 nodes in cleaned network:")
for rank, idx in enumerate(top_10_clean, 1):
    city = node_list_clean[idx]
    print(f"  {rank}. {city}: TEU={teu_clean[idx]:.0f}")

Testing fix: Exclude NaN nodes

Original records: 18651
Clean records: 16731
Removed: 1920 (10.3%)

Cleaned network:
  Nodes: 376
  Edges: 5552
  NaN nodes: 0

Top 10 nodes in cleaned network:
  1. 重庆市: TEU=1618
  2. 北京市: TEU=603
  3. 上海市: TEU=458
  4. 南宁市: TEU=426
  5. 哈尔滨市: TEU=412
  6. 合肥市: TEU=392
  7. 成都市: TEU=385
  8. 天津市: TEU=369
  9. 石家庄市: TEU=360
  10. 郑州市: TEU=346
