In [7]:
import sys
from pathlib import Path

# Add src directory to path (notebook is in src/notebooks/)
notebook_dir = Path.cwd()
if 'notebooks' in str(notebook_dir):
    # Running from notebooks directory
    src_dir = notebook_dir.parent
else:
    # Running from project root
    src_dir = notebook_dir / 'src'

sys.path.insert(0, str(src_dir))

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from config import BaseConfig
from utils.graph import build_migration_network_for_year

config = BaseConfig()

In [8]:
# Load migration data
df = pd.read_csv(config.data_csv_path)

print(f"Total migration records: {len(df):,}")
print(f"Data loaded: {len(df)} migration records")
print(f"Year range: {df['year_first_flow'].min():.0f} - {df['year_current_flow'].max():.0f}")
print(f"\nAvailable years:")
all_years = sorted(set(df['year_first_flow'].dropna().astype(int)) | 
                   set(df['year_current_flow'].dropna().astype(int)))
print(f"  {min(all_years)} to {max(all_years)} ({len(all_years)} unique years)")
print(f"\nFirst few rows:")
df.head()

Total migration records: 169,989
Data loaded: 169989 migration records
Year range: 1944 - 2017

Available years:
  1944 to 2017 (71 unique years)

First few rows:


Unnamed: 0,current_province,current_city,current_county,current_members_live_with,gender,year_born,edu_level,hometown_code,hometown,year_current_flow,...,hometown_Name_County,hometown_lon,hometown_lat,first_Name_Province,first_Name_Prefecture,first_Name_County,first_lon,first_lat,current_lon,current_lat
0,广东省,深圳市,光明新区,4,2,1989,4,360681,江西省鹰潭市贵溪市,2015,...,贵溪市,117.186973,28.188428,江西省,南昌市,青山湖区,115.905297,28.719082,,
1,广东省,深圳市,光明新区,3,2,1981,5,360402,江西省九江市濂溪区,2015,...,濂溪区,116.039436,29.634605,广东省,深圳市,南山区,113.937903,22.554902,,
2,福建省,厦门市,同安区,4,2,1986,4,511623,四川省广安市邻水县,2013,...,邻水县,106.99183,30.258922,福建省,泉州市,丰泽区,118.617882,24.922059,118.102758,24.776209
3,新疆生产建设兵团,第六师,军户农场,3,1,1973,3,652301,新疆维吾尔自治区昌吉回族自治州昌吉市,2000,...,昌吉市,87.059347,44.091644,,,,,,,
4,北京市,北京市,朝阳区,2,2,1984,5,511702,四川省达州市通川区,2010,...,通川区,107.432388,31.362205,四川省,成都市,武侯区,104.022906,30.610118,116.508837,39.951928


In [11]:
# Define year range to process
START_YEAR = 2000
END_YEAR = 2016

# Filter available years within the range
years_to_process = [y for y in all_years if START_YEAR <= y <= END_YEAR]

print(f"\n=== Processing Networks for Years {START_YEAR}-{END_YEAR} ===")
print(f"Years to process: {years_to_process}")
print(f"Total: {len(years_to_process)} years\n")


=== Processing Networks for Years 2000-2016 ===
Years to process: [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016]
Total: 17 years



In [12]:
# Create output directory
output_dir = Path('network_data')
output_dir.mkdir(exist_ok=True)

# Store summary statistics
summary_stats = []

# Process each year
for YEAR in years_to_process:
    print(f"Processing year {YEAR}...")
    
    # Build network for this year
    G = build_migration_network_for_year(df, YEAR)
    
    if G.number_of_nodes() == 0 or G.number_of_edges() == 0:
        print(f"  ⚠ Skipping {YEAR} - no data")
        continue
    
    # Save weighted directed network
    adj_matrix_weighted_directed = nx.to_scipy_sparse_array(G, weight='weight', format='csr')
    np.save(output_dir / f'network_{YEAR}_weighted_directed.npy', adj_matrix_weighted_directed.toarray())
    
    # Store statistics
    summary_stats.append({
        'year': YEAR,
        'nodes': G.number_of_nodes(),
        'edges': G.number_of_edges(),
        'total_migration': sum(nx.get_edge_attributes(G, 'weight').values()),
        'density': nx.density(G),
        'avg_degree': 2 * G.number_of_edges() / G.number_of_nodes() if G.number_of_nodes() > 0 else 0
    })
    
    print(f"  ✓ Saved {YEAR}: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

print(f"\n=== Processing Complete ===")
print(f"Networks saved to: {output_dir.absolute()}")
print(f"Total files created: {len(summary_stats)}")

# Display summary table
summary_df = pd.DataFrame(summary_stats)
print("\n=== Network Summary Statistics ===")
print(summary_df.to_string(index=False))

Processing year 2000...
  ✓ Saved 2000: 372 nodes, 3389 edges
Processing year 2001...
  ✓ Saved 2001: 367 nodes, 2246 edges
Processing year 2002...
  ✓ Saved 2002: 365 nodes, 2845 edges
Processing year 2003...
  ✓ Saved 2003: 369 nodes, 3092 edges
Processing year 2004...
  ✓ Saved 2004: 366 nodes, 2950 edges
Processing year 2005...
  ✓ Saved 2005: 374 nodes, 3418 edges
Processing year 2006...
  ✓ Saved 2006: 371 nodes, 3537 edges
Processing year 2007...
  ✓ Saved 2007: 374 nodes, 4045 edges
Processing year 2008...
  ✓ Saved 2008: 376 nodes, 4301 edges
Processing year 2009...
  ✓ Saved 2009: 378 nodes, 4203 edges
Processing year 2010...
  ✓ Saved 2010: 377 nodes, 4794 edges
Processing year 2011...
  ✓ Saved 2011: 375 nodes, 4085 edges
Processing year 2012...
  ✓ Saved 2012: 381 nodes, 4899 edges
Processing year 2013...
  ✓ Saved 2013: 381 nodes, 4924 edges
Processing year 2014...
  ✓ Saved 2014: 379 nodes, 4997 edges
Processing year 2015...
  ✓ Saved 2015: 378 nodes, 5486 edges
Processi

In [13]:
# Example: Load and use the saved networks

# Choose a year to load (use first processed year as example)
example_year = years_to_process[0] if years_to_process else 2015

print(f"=== Loading Weighted Directed Network for Year {example_year} ===\n")

# Load the weighted directed adjacency matrix
adj_matrix = np.load(output_dir / f'network_{example_year}_weighted_directed.npy')

print(f"Shape: {adj_matrix.shape}")
print(f"Type: {adj_matrix.dtype}")
print(f"Total edges: {np.count_nonzero(adj_matrix)}")
print(f"Total migration volume: {np.sum(adj_matrix):.0f}")
print(f"Max edge weight: {np.max(adj_matrix):.0f}")
print(f"Mean edge weight (non-zero): {adj_matrix[adj_matrix > 0].mean():.2f}")

# Convert back to NetworkX graph if needed
G_loaded = nx.from_numpy_array(adj_matrix, create_using=nx.DiGraph)
print(f"\nNetworkX DiGraph:")
print(f"  Nodes: {G_loaded.number_of_nodes()}")
print(f"  Edges: {G_loaded.number_of_edges()}")

# Example: Load multiple years for comparison
print(f"\n=== Multiple Years Comparison ===")
for year in years_to_process[:5]:  # Show first 5 years as example
    adj = np.load(output_dir / f'network_{year}_weighted_directed.npy')
    total_migration = np.sum(adj)
    print(f"Year {year}: {adj.shape[0]} nodes, {np.count_nonzero(adj)} edges, migration volume: {total_migration:.0f}")

=== Loading Weighted Directed Network for Year 2000 ===

Shape: (372, 372)
Type: int64
Total edges: 3389
Total migration volume: 8623
Max edge weight: 239
Mean edge weight (non-zero): 2.54

NetworkX DiGraph:
  Nodes: 372
  Edges: 3389

=== Multiple Years Comparison ===
Year 2000: 372 nodes, 3389 edges, migration volume: 8623
Year 2001: 367 nodes, 2246 edges, migration volume: 4287
Year 2002: 365 nodes, 2845 edges, migration volume: 5985
Year 2003: 369 nodes, 3092 edges, migration volume: 6770
Year 2004: 366 nodes, 2950 edges, migration volume: 6235


In [14]:
# Convert weighted directed network to unweighted and undirected

# Load a weighted directed network
year = example_year
adj_weighted_directed = np.load(output_dir / f'network_{year}_weighted_directed.npy')

print(f"=== Converting Network (Year {year}) ===\n")

# 1. Convert to unweighted (binary): any edge > 0 becomes 1
adj_unweighted_directed = (adj_weighted_directed > 0).astype(int)
print(f"Unweighted Directed:")
print(f"  Non-zero entries: {np.count_nonzero(adj_unweighted_directed)}")

# 2. Convert to undirected (symmetrize by taking maximum or sum)
# Option A: Use maximum (if there's an edge in either direction, keep it)
adj_unweighted_undirected = np.maximum(adj_unweighted_directed, adj_unweighted_directed.T)

# Option B: Alternative - average weights for weighted undirected
adj_weighted_undirected = (adj_weighted_directed + adj_weighted_directed.T) / 2

print(f"\nUnweighted Undirected:")
print(f"  Non-zero entries: {np.count_nonzero(adj_unweighted_undirected)}")
print(f"  Edges: {np.count_nonzero(adj_unweighted_undirected) // 2}")  # Divide by 2 for undirected

# Convert to NetworkX graphs to verify
G_unweighted_undirected = nx.from_numpy_array(adj_unweighted_undirected)
print(f"\nNetworkX Graph (Unweighted Undirected):")
print(f"  Nodes: {G_unweighted_undirected.number_of_nodes()}")
print(f"  Edges: {G_unweighted_undirected.number_of_edges()}")

# Save the converted version if needed
# np.save(output_dir / f'network_{year}_unweighted_undirected.npy', adj_unweighted_undirected)

=== Converting Network (Year 2000) ===

Unweighted Directed:
  Non-zero entries: 3389

Unweighted Undirected:
  Non-zero entries: 6199
  Edges: 3099

NetworkX Graph (Unweighted Undirected):
  Nodes: 372
  Edges: 3100
