# Visualization of Chinese Migration Network Data

This notebook visualizes:
1. **df_geo**: Geographic boundaries of Chinese administrative regions
2. **cleaned_data**: Migration flows between cities

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import warnings
warnings.filterwarnings('ignore')

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (15, 10)
plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

## 1. Load Data

In [34]:
# Load geographic data with special handling for the geometry column
# The geometry column contains complex coordinate data that can cause parsing issues
try:
    # First try: Use quoting to handle commas within fields
    df_geo = pd.read_csv('df_sin_geo.csv', quoting=1, on_bad_lines='skip')
    print(f"Loaded with QUOTE_ALL")
except Exception as e:
    try:
        # Second try: Skip geometry column which has complex data
        df_geo = pd.read_csv('df_geo.csv', usecols=lambda x: x != 'geometry', on_bad_lines='skip')
        print(f"Loaded without geometry column")
    except Exception as e2:
        # Final try: Use python engine which is more flexible
        df_geo = pd.read_csv('df_geo.csv', engine='python', on_bad_lines='skip')
        print(f"Loaded with Python engine")

print(f"\nGeographic data shape: {df_geo.shape}")
print(f"Columns: {df_geo.columns.tolist()}")
print(f"\nFirst few rows:")
df_geo.head()

Loaded with QUOTE_ALL

Geographic data shape: (2901, 14)
Columns: ['Code_County', 'Code_Perfecture', 'Code_Province', 'Name_Province', 'Name_Perfecture', 'Name_County', 'Pinyin', 'Pop_2000', 'Pop_2010', 'Pop_2017', 'Pop_2018', 'Area', 'Density', 'pc_key']

First few rows:


Unnamed: 0,Code_County,Code_Perfecture,Code_Province,Name_Province,Name_Perfecture,Name_County,Pinyin,Pop_2000,Pop_2010,Pop_2017,Pop_2018,Area,Density,pc_key
0,110101,1101,11,北京市,北京市,东城区,Dōngchéng Qū [incl. Chongwen],881763.0,919253.0,,822000.0,41.79,19669.78,北京市_北京市_东城区
1,110102,1101,11,北京市,北京市,西城区,Xīchéng Qū [incl. Xuanwu],1232823.0,1243315.0,,1179000.0,50.47,23360.41,北京市_北京市_西城区
2,110114,1101,11,北京市,北京市,昌平区,Chāngpíng Qū,614821.0,1660501.0,,2108000.0,1342.0,1570.79,北京市_北京市_昌平区
3,110115,1101,11,北京市,北京市,大兴区,Dàxīng Qū,671444.0,1365112.0,,1796000.0,1053.0,1705.6,北京市_北京市_大兴区
4,110111,1101,11,北京市,北京市,房山区,Fángshān Qū,814367.0,944832.0,,1188000.0,1995.0,595.49,北京市_北京市_房山区


In [35]:
# Load migration data
cleaned_data = pd.read_csv('../src/data/data.csv')
print(f"Migration data shape: {cleaned_data.shape}")
print(f"\nColumns: {cleaned_data.columns.tolist()}")
print(f"\nFirst few rows:")
cleaned_data.head()

Migration data shape: (169989, 34)

Columns: ['current_province', 'current_city', 'current_county', 'current_members_live_with', 'gender', 'year_born', 'edu_level', 'hometown_code', 'hometown', 'year_current_flow', 'month_current_flow', 'average_family_cost_per_month', 'average_family_income_per_month', 'year_first_flow', 'month_first_flow', 'first_flow_code', 'first_flow_location', 'num_flows_total', 'if_change_household_local', 'if_stay', 'how_long_to_stay', 'pc_key', 'hometown_Name_Province', 'hometown_Name_Prefecture', 'hometown_Name_County', 'hometown_lon', 'hometown_lat', 'first_Name_Province', 'first_Name_Prefecture', 'first_Name_County', 'first_lon', 'first_lat', 'current_lon', 'current_lat']

First few rows:


Unnamed: 0,current_province,current_city,current_county,current_members_live_with,gender,year_born,edu_level,hometown_code,hometown,year_current_flow,...,hometown_Name_County,hometown_lon,hometown_lat,first_Name_Province,first_Name_Prefecture,first_Name_County,first_lon,first_lat,current_lon,current_lat
0,广东省,深圳市,光明新区,4,2,1989,4,360681,江西省鹰潭市贵溪市,2015,...,贵溪市,117.186973,28.188428,江西省,南昌市,青山湖区,115.905297,28.719082,,
1,广东省,深圳市,光明新区,3,2,1981,5,360402,江西省九江市濂溪区,2015,...,濂溪区,116.039436,29.634605,广东省,深圳市,南山区,113.937903,22.554902,,
2,福建省,厦门市,同安区,4,2,1986,4,511623,四川省广安市邻水县,2013,...,邻水县,106.99183,30.258922,福建省,泉州市,丰泽区,118.617882,24.922059,118.102758,24.776209
3,新疆生产建设兵团,第六师,军户农场,3,1,1973,3,652301,新疆维吾尔自治区昌吉回族自治州昌吉市,2000,...,昌吉市,87.059347,44.091644,,,,,,,
4,北京市,北京市,朝阳区,2,2,1984,5,511702,四川省达州市通川区,2010,...,通川区,107.432388,31.362205,四川省,成都市,武侯区,104.022906,30.610118,116.508837,39.951928


In [36]:
#make a new column with a string in the form current_province "_" current_city "_" current_county
cleaned_data['location'] = cleaned_data['current_province'] + "_" + cleaned_data['current_city'] + "_" + cleaned_data['current_county']

In [37]:
#make a new column cleaned_data['current_code'] by taking cleaned_data['location'] and df_geo['pckey'] and finding where cleaned_data['location'] matches df_geo['pckey'], then taking the corresponding df_geo['Code_County'] value
cleaned_data = cleaned_data.merge(df_geo[['pc_key', 'Code_County']], left_on='location', right_on='pc_key', how='left')
cleaned_data.rename(columns={'Code_County': 'current_code'}, inplace=True)
#current_code should be an int


In [38]:
cleaned_data['current_code'] = cleaned_data['current_code'].astype('Int64')

In [39]:
#copy cleaned_data to a new df
df_final = cleaned_data.copy()
#for checking if current_code is nan we just need to check if df_final['current_code'] is nan
#but for checking if hometown_code or first_flow_code is nan, we need to check if they are in df_geo['Code_County']
df_final['hometown_code_valid'] = df_final['hometown_code'].isin(df_geo['Code_County'])
df_final['first_flow_code_valid'] = df_final['first_flow_code'].isin(df_geo['Code_County'])
df_final['current_code_valid'] = df_final['current_code'].notna()

#df_final models the migration of a person from hometown_code to first_flow_code to current_code,
#however if the hometown_code or first_flow_code is nan, 
#we cannot use that row for migration analysis, 
#but we can still use hometown->first_flow if current_code is nan, 
# and we can still use first_flow->current if hometown_code is nan, 
# if first_flow_code is also nan, then we cannot use that row at all

df_final['migration_type'] = np.where(
    df_final['hometown_code_valid'] & df_final['first_flow_code_valid'] & df_final['current_code_valid'], 'full_migration',
    np.where(
        df_final['hometown_code_valid'] & df_final['first_flow_code_valid'] & ~df_final['current_code_valid'], 'hometown_to_first_flow',
        np.where(
            ~df_final['hometown_code_valid'] & df_final['first_flow_code_valid'] & df_final['current_code_valid'], 'first_flow_to_current',
            'invalid'
        )
    )
)

In [40]:
#make a new df and drop invalid migrations
df_migration = df_final[df_final['migration_type'] != 'invalid'].copy()
#create a migration id based on the index
df_migration['migration_id'] = df_migration.index

In [41]:
#Now create a new df that models the migration on steps timewise, so we will convert hometown, firstflow and current to the time it happend year_first_flow, month_first_flow, year_current_flow, month_current_flow, year_born
df_migration_steps = pd.DataFrame()
df_migration_steps['migration_id'] = df_migration['migration_id']

# Create two separate step DataFrames, each with its own migration_id
step1 = pd.DataFrame({
    'migration_id': df_migration['migration_id'],  # Include migration_id here!
    'step': 'hometown_to_first_flow',
    'from_code': df_migration['hometown_code'],
    'to_code': df_migration['first_flow_code'],
    'year': df_migration['year_first_flow'],
    'month': df_migration['month_first_flow']
})

step2 = pd.DataFrame({
    'migration_id': df_migration['migration_id'],  # Include migration_id here!
    'step': 'first_flow_to_current',
    'from_code': df_migration['first_flow_code'],
    'to_code': df_migration['current_code'],
    'year': df_migration['year_current_flow'],
    'month': df_migration['month_current_flow']
})

# Now concatenate - each step has migration_id
df_migration_steps = pd.concat([step1, step2], ignore_index=True)


In [42]:
#drop the nan rows of df_migration_steps
df_migration_steps = df_migration_steps.dropna(subset=['from_code', 'to_code', 'year', 'month'])

In [43]:
#check how many flows have the same from and to code, and give the amount
df_migration_steps['same_from_to'] = df_migration_steps['from_code'] == df_migration_steps['to_code']

In [44]:
#drop the same from to rows true 
df_migration_steps = df_migration_steps[~df_migration_steps['same_from_to']]

In [45]:
df_migration

Unnamed: 0,current_province,current_city,current_county,current_members_live_with,gender,year_born,edu_level,hometown_code,hometown,year_current_flow,...,current_lon,current_lat,location,pc_key_y,current_code,hometown_code_valid,first_flow_code_valid,current_code_valid,migration_type,migration_id
0,广东省,深圳市,光明新区,4,2,1989,4,360681,江西省鹰潭市贵溪市,2015,...,,,广东省_深圳市_光明新区,,,True,True,False,hometown_to_first_flow,0
1,广东省,深圳市,光明新区,3,2,1981,5,360402,江西省九江市濂溪区,2015,...,,,广东省_深圳市_光明新区,,,True,True,False,hometown_to_first_flow,1
2,福建省,厦门市,同安区,4,2,1986,4,511623,四川省广安市邻水县,2013,...,118.102758,24.776209,福建省_厦门市_同安区,福建省_厦门市_同安区,350212,True,True,True,full_migration,2
4,北京市,北京市,朝阳区,2,2,1984,5,511702,四川省达州市通川区,2010,...,116.508837,39.951928,北京市_北京市_朝阳区,北京市_北京市_朝阳区,110105,True,True,True,full_migration,4
5,辽宁省,营口市,鲅鱼圈区,3,2,1990,3,230229,黑龙江省齐齐哈尔市克山县,2001,...,122.171994,40.259183,辽宁省_营口市_鲅鱼圈区,辽宁省_营口市_鲅鱼圈区,210804,True,True,True,full_migration,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169984,上海市,上海市,闵行区,3,2,1959,2,330602,浙江省绍兴市越城区,1982,...,121.414389,31.089025,上海市_上海市_闵行区,上海市_上海市_闵行区,310112,True,True,True,full_migration,169984
169985,安徽省,宣城市,宁国市,4,1,1976,3,341823,安徽省宣城市泾县,2005,...,118.985855,30.497997,安徽省_宣城市_宁国市,安徽省_宣城市_宁国市,341881,True,True,True,full_migration,169985
169986,北京市,北京市,海淀区,4,2,1977,6,131022,河北省廊坊市固安县,1997,...,116.227491,40.025917,北京市_北京市_海淀区,北京市_北京市_海淀区,110108,True,True,True,full_migration,169986
169987,浙江省,杭州市,江干区,4,2,1971,3,330213,浙江省宁波市奉化区,1992,...,120.291011,30.312594,浙江省_杭州市_江干区,浙江省_杭州市_江干区,330104,True,True,True,full_migration,169987


In [46]:
df_migration.columns

Index(['current_province', 'current_city', 'current_county',
       'current_members_live_with', 'gender', 'year_born', 'edu_level',
       'hometown_code', 'hometown', 'year_current_flow', 'month_current_flow',
       'average_family_cost_per_month', 'average_family_income_per_month',
       'year_first_flow', 'month_first_flow', 'first_flow_code',
       'first_flow_location', 'num_flows_total', 'if_change_household_local',
       'if_stay', 'how_long_to_stay', 'pc_key_x', 'hometown_Name_Province',
       'hometown_Name_Prefecture', 'hometown_Name_County', 'hometown_lon',
       'hometown_lat', 'first_Name_Province', 'first_Name_Prefecture',
       'first_Name_County', 'first_lon', 'first_lat', 'current_lon',
       'current_lat', 'location', 'pc_key_y', 'current_code',
       'hometown_code_valid', 'first_flow_code_valid', 'current_code_valid',
       'migration_type', 'migration_id'],
      dtype='object')

In [47]:
df_migration_steps
#extract the ones with migration id =7
df_migration_steps[df_migration_steps['migration_id'] == 309]

Unnamed: 0,migration_id,step,from_code,to_code,year,month,same_from_to
245,309,hometown_to_first_flow,231223,230602,2010,9,False


In [48]:
#using migration_id merge df_migration_steps with df_migration to get more info i want the columns: 'current_members_live_with', 'gender', 'edu_level','average_family_cost_per_month', 'average_family_income_per_month', 'num_flows_total', 'if_change_household_local','if_stay', 'how_long_to_stay',
#and if step == 'hometown_to_first_flow', then get 'hometown_lon','hometown_lat', 'first_lon', 'first_lat' save them as from_lon, from_lat, to_lon, to_lat
#and if step == 'first_flow_to_current', then get 'first_lon', 'first_lat', 'current_lon','current_lat' save them as from_lon, from_lat, to_lon, to_lat
df_migration_steps = df_migration_steps.merge(
    df_migration[['migration_id', 'current_members_live_with', 'gender', 'edu_level','average_family_cost_per_month', 'average_family_income_per_month', 'num_flows_total', 'if_change_household_local','if_stay', 'how_long_to_stay',
                  'hometown_lon','hometown_lat', 'first_lon', 'first_lat', 'current_lon','current_lat']],
    on='migration_id',
    how='left'
)
df_migration_steps['from_lon'] = np.where(
    df_migration_steps['step'] == 'hometown_to_first_flow',
    df_migration_steps['hometown_lon'],
    df_migration_steps['first_lon']
)
df_migration_steps['from_lat'] = np.where(
    df_migration_steps['step'] == 'hometown_to_first_flow',
    df_migration_steps['hometown_lat'],
    df_migration_steps['first_lat']
)
df_migration_steps['to_lon'] = np.where(
    df_migration_steps['step'] == 'hometown_to_first_flow',
    df_migration_steps['first_lon'],
    df_migration_steps['current_lon']
)
df_migration_steps['to_lat'] = np.where(
    df_migration_steps['step'] == 'hometown_to_first_flow',
    df_migration_steps['first_lat'],
    df_migration_steps['current_lat']
)
df_migration_steps = df_migration_steps.drop(columns=['hometown_lon','hometown_lat', 'first_lon', 'first_lat', 'current_lon','current_lat', 'same_from_to'])
df_migration_steps

Unnamed: 0,migration_id,step,from_code,to_code,year,month,current_members_live_with,gender,edu_level,average_family_cost_per_month,average_family_income_per_month,num_flows_total,if_change_household_local,if_stay,how_long_to_stay,from_lon,from_lat,to_lon,to_lat
0,0,hometown_to_first_flow,360681,360111,2008,11,4,2,4,10000.0,,4,1,1,3.0,117.186973,28.188428,115.905297,28.719082
1,1,hometown_to_first_flow,360402,440305,2000,5,3,2,5,60000.0,,3,1,1,5.0,116.039436,29.634605,113.937903,22.554902
2,2,hometown_to_first_flow,511623,350503,2001,3,4,2,4,40000.0,,2,1,1,5.0,106.991830,30.258922,118.617882,24.922059
3,4,hometown_to_first_flow,511702,510107,2003,9,2,2,5,9000.0,,2,1,1,5.0,107.432388,31.362205,104.022906,30.610118
4,5,hometown_to_first_flow,230229,210804,2001,10,3,2,3,5000.0,,1,3,1,5.0,125.670668,48.168083,122.171994,40.259183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178258,169976,first_flow_to_current,370112,110106,1995,1,4,1,6,17000.0,99000.0,2,1,1,5.0,117.178458,36.604883,116.242485,39.834557
178259,169977,first_flow_to_current,310110,310112,1997,9,5,1,6,50000.0,99000.0,1,1,1,5.0,121.524257,31.300951,121.414389,31.089025
178260,169983,first_flow_to_current,310101,310112,1990,3,3,2,3,10000.0,100000.0,1,3,1,5.0,121.479050,31.217317,121.414389,31.089025
178261,169985,first_flow_to_current,310112,341881,2005,10,4,1,3,15000.0,120000.0,3,2,1,5.0,121.414389,31.089025,118.985855,30.497997


In [49]:
#give me df_migration_steps sorted by migration_id
df_migration_steps.sort_values(by=['migration_id', 'step']).reset_index(drop=True)


Unnamed: 0,migration_id,step,from_code,to_code,year,month,current_members_live_with,gender,edu_level,average_family_cost_per_month,average_family_income_per_month,num_flows_total,if_change_household_local,if_stay,how_long_to_stay,from_lon,from_lat,to_lon,to_lat
0,0,hometown_to_first_flow,360681,360111,2008,11,4,2,4,10000.0,,4,1,1,3.0,117.186973,28.188428,115.905297,28.719082
1,1,hometown_to_first_flow,360402,440305,2000,5,3,2,5,60000.0,,3,1,1,5.0,116.039436,29.634605,113.937903,22.554902
2,2,first_flow_to_current,350503,350212,2013,11,4,2,4,40000.0,,2,1,1,5.0,118.617882,24.922059,118.102758,24.776209
3,2,hometown_to_first_flow,511623,350503,2001,3,4,2,4,40000.0,,2,1,1,5.0,106.991830,30.258922,118.617882,24.922059
4,4,first_flow_to_current,510107,110105,2010,3,2,2,5,9000.0,,2,1,1,5.0,104.022906,30.610118,116.508837,39.951928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178258,169985,hometown_to_first_flow,341823,310112,1996,2,4,1,3,15000.0,120000.0,3,2,1,5.0,118.362169,30.595836,121.414389,31.089025
178259,169986,first_flow_to_current,110105,110108,1997,7,4,2,6,120000.0,180000.0,1,1,1,5.0,116.508837,39.951928,116.227491,40.025917
178260,169986,hometown_to_first_flow,131022,110105,1997,7,4,2,6,120000.0,180000.0,1,1,1,5.0,116.278512,39.345288,116.508837,39.951928
178261,169987,hometown_to_first_flow,330213,330104,1992,3,4,2,3,100000.0,200000.0,2,1,1,5.0,121.368450,29.613198,120.291011,30.312594


In [50]:
df_migration_steps.to_csv('df_migration_steps.csv', index=False)

In [51]:
df_geo

Unnamed: 0,Code_County,Code_Perfecture,Code_Province,Name_Province,Name_Perfecture,Name_County,Pinyin,Pop_2000,Pop_2010,Pop_2017,Pop_2018,Area,Density,pc_key
0,110101,1101,11,北京市,北京市,东城区,Dōngchéng Qū [incl. Chongwen],881763.0,919253.0,,822000.0,41.79,19669.78,北京市_北京市_东城区
1,110102,1101,11,北京市,北京市,西城区,Xīchéng Qū [incl. Xuanwu],1232823.0,1243315.0,,1179000.0,50.47,23360.41,北京市_北京市_西城区
2,110114,1101,11,北京市,北京市,昌平区,Chāngpíng Qū,614821.0,1660501.0,,2108000.0,1342.00,1570.79,北京市_北京市_昌平区
3,110115,1101,11,北京市,北京市,大兴区,Dàxīng Qū,671444.0,1365112.0,,1796000.0,1053.00,1705.60,北京市_北京市_大兴区
4,110111,1101,11,北京市,北京市,房山区,Fángshān Qū,814367.0,944832.0,,1188000.0,1995.00,595.49,北京市_北京市_房山区
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2896,710113,7101,71,台湾省,台湾省,屏东县,,909364.0,864529.0,,819184.0,2775.60,295.14,台湾省_台湾省_屏东县
2897,710114,7101,71,台湾省,台湾省,台东县,,244612.0,228290.0,,216781.0,3515.25,61.67,台湾省_台湾省_台东县
2898,710102,7101,71,台湾省,台湾省,宜兰县,,465799.0,459061.0,,454178.0,2143.63,211.87,台湾省_台湾省_宜兰县
2899,710109,7101,71,台湾省,台湾省,云林县,,743562.0,713556.0,,681306.0,1290.83,527.80,台湾省_台湾省_云林县


In [52]:
# Create new df_geo with coordinates from df_migration

# Extract hometown codes and coordinates
hometown_geo = df_migration[['hometown_code', 'hometown_lon', 'hometown_lat']].copy()
hometown_geo.columns = ['code', 'lon', 'lat']

# Extract first flow codes and coordinates
first_geo = df_migration[['first_flow_code', 'first_lon', 'first_lat']].copy()
first_geo.columns = ['code', 'lon', 'lat']

# Extract current codes and coordinates
current_geo = df_migration[['current_code', 'current_lon', 'current_lat']].copy()
current_geo.columns = ['code', 'lon', 'lat']

# Combine all three
all_locations = pd.concat([hometown_geo, first_geo, current_geo], ignore_index=True)

# Remove rows with missing coordinates or codes
all_locations = all_locations.dropna(subset=['code', 'lon', 'lat'])

# Remove duplicates, keeping the first occurrence
df_geo_new = all_locations.drop_duplicates(subset=['code'], keep='first').reset_index(drop=True)

print(f"New df_geo shape: {df_geo_new.shape}")
print(f"Number of unique location codes with coordinates: {len(df_geo_new)}")
print(f"\nFirst few rows:")
df_geo_new.head()

New df_geo shape: (2697, 3)
Number of unique location codes with coordinates: 2697

First few rows:


Unnamed: 0,code,lon,lat
0,360681,117.186973,28.188428
1,360402,116.039436,29.634605
2,511623,106.99183,30.258922
3,511702,107.432388,31.362205
4,230229,125.670668,48.168083


In [55]:
# Join df_geo_new with the original df_geo
df_geo_combined = df_geo.merge(
    df_geo_new,
    left_on='Code_County',
    right_on='code',
    how='left'
)

# Drop the duplicate 'code' column since we already have 'Code_County'
df_geo_combined = df_geo_combined.drop(columns=['code'])

print(f"Combined df_geo shape: {df_geo_combined.shape}")
print(f"Columns: {df_geo_combined.columns.tolist()}")
print(f"\nRows with coordinates: {df_geo_combined['lon'].notna().sum()}")
print(f"Rows without coordinates: {df_geo_combined['lon'].isna().sum()}")
print(f"\nFirst few rows:")
df_geo_combined.head(10)

Combined df_geo shape: (2901, 16)
Columns: ['Code_County', 'Code_Perfecture', 'Code_Province', 'Name_Province', 'Name_Perfecture', 'Name_County', 'Pinyin', 'Pop_2000', 'Pop_2010', 'Pop_2017', 'Pop_2018', 'Area', 'Density', 'pc_key', 'lon', 'lat']

Rows with coordinates: 2697
Rows without coordinates: 204

First few rows:


Unnamed: 0,Code_County,Code_Perfecture,Code_Province,Name_Province,Name_Perfecture,Name_County,Pinyin,Pop_2000,Pop_2010,Pop_2017,Pop_2018,Area,Density,pc_key,lon,lat
0,110101,1101,11,北京市,北京市,东城区,Dōngchéng Qū [incl. Chongwen],881763.0,919253.0,,822000.0,41.79,19669.78,北京市_北京市_东城区,116.410638,39.910818
1,110102,1101,11,北京市,北京市,西城区,Xīchéng Qū [incl. Xuanwu],1232823.0,1243315.0,,1179000.0,50.47,23360.41,北京市_北京市_西城区,116.359265,39.910127
2,110114,1101,11,北京市,北京市,昌平区,Chāngpíng Qū,614821.0,1660501.0,,2108000.0,1342.0,1570.79,北京市_北京市_昌平区,116.204649,40.214497
3,110115,1101,11,北京市,北京市,大兴区,Dàxīng Qū,671444.0,1365112.0,,1796000.0,1053.0,1705.6,北京市_北京市_大兴区,116.414539,39.648675
4,110111,1101,11,北京市,北京市,房山区,Fángshān Qū,814367.0,944832.0,,1188000.0,1995.0,595.49,北京市_北京市_房山区,115.847163,39.717472
5,110116,1101,11,北京市,北京市,怀柔区,Huáiróu Qū,296002.0,372887.0,,414000.0,2123.0,195.01,北京市_北京市_怀柔区,116.579904,40.628582
6,110109,1101,11,北京市,北京市,门头沟区,Méntóugōu Qū,266591.0,290476.0,,331000.0,1448.0,228.59,北京市_北京市_门头沟区,115.785713,39.992851
7,110118,1101,11,北京市,北京市,密云区,Mìyún Qū,420019.0,467680.0,,495000.0,2226.0,222.37,北京市_北京市_密云区,116.988109,40.52506
8,110117,1101,11,北京市,北京市,平谷区,Pínggŭ Qū,396701.0,415958.0,,456000.0,948.0,481.01,北京市_北京市_平谷区,117.138748,40.207501
9,110113,1101,11,北京市,北京市,顺义区,Shùnyì Qū,636479.0,876620.0,,1169000.0,1009.0,1158.57,北京市_北京市_顺义区,116.717963,40.149286


In [56]:
# === BUILD MULTI-LEVEL GRAPH PARADIGMS ===
# We'll create 3 levels: County (6-digit), Prefecture (4-digit), Province (2-digit)

# First, add administrative level codes to df_migration_steps
df_migration_steps['from_province'] = (df_migration_steps['from_code'] // 10000).astype(int)
df_migration_steps['to_province'] = (df_migration_steps['to_code'] // 10000).astype(int)
df_migration_steps['from_prefecture'] = (df_migration_steps['from_code'] // 100).astype(int)
df_migration_steps['to_prefecture'] = (df_migration_steps['to_code'] // 100).astype(int)

print("Added administrative level codes to df_migration_steps")
print(f"Sample codes:")
print(f"  County: {df_migration_steps['from_code'].iloc[0]}")
print(f"  Prefecture: {df_migration_steps['from_prefecture'].iloc[0]}")
print(f"  Province: {df_migration_steps['from_province'].iloc[0]}")

# Now create coordinate lookups for each level
print("\n=== Creating coordinate lookups for each administrative level ===")

# County level (already have this in df_geo_combined)
county_coords = df_geo_combined[['Code_County', 'lon', 'lat']].dropna()
county_coords.columns = ['code', 'lon', 'lat']
print(f"County level: {len(county_coords)} locations with coordinates")

# Prefecture level - aggregate from county level (use mean of coordinates)
prefecture_coords = df_geo_combined[['Code_Perfecture', 'lon', 'lat']].dropna()
prefecture_coords = prefecture_coords.groupby('Code_Perfecture', as_index=False).agg({
    'lon': 'mean',
    'lat': 'mean'
})
prefecture_coords.columns = ['code', 'lon', 'lat']
print(f"Prefecture level: {len(prefecture_coords)} locations with coordinates")

# Province level - aggregate from county level (use mean of coordinates)
province_coords = df_geo_combined[['Code_Province', 'lon', 'lat']].dropna()
province_coords = province_coords.groupby('Code_Province', as_index=False).agg({
    'lon': 'mean',
    'lat': 'mean'
})
province_coords.columns = ['code', 'lon', 'lat']
print(f"Province level: {len(province_coords)} locations with coordinates")

Added administrative level codes to df_migration_steps
Sample codes:
  County: 360681
  Prefecture: 3606
  Province: 36

=== Creating coordinate lookups for each administrative level ===
County level: 2697 locations with coordinates
Prefecture level: 331 locations with coordinates
Province level: 31 locations with coordinates


In [57]:
# === AGGREGATION STRATEGY FOR SOCIOECONOMIC FEATURES ===
# When multiple migration steps are aggregated into one edge:
#
# - Flow count: COUNT (number of individual migrations)
# - Continuous variables (cost, income, how_long_to_stay, num_flows_total): MEAN
# - Ordinal variables (edu_level, current_members_live_with): MEAN (can be interpreted as average level)
# - Categorical variables (gender, if_change_household_local, if_stay): MODE (most common)
# - Temporal (year, month): MEAN (average migration time)

def aggregate_flows(df, from_col, to_col, coords_dict):
    """
    Aggregate migration flows at a given administrative level.
    
    Parameters:
    - df: dataframe with migration steps
    - from_col: column name for origin code (e.g., 'from_province')
    - to_col: column name for destination code (e.g., 'to_province')
    - coords_dict: dictionary mapping codes to (lon, lat)
    
    Returns:
    - Aggregated dataframe with one row per unique origin-destination pair
    """
    
    # Group by origin-destination pairs
    agg_dict = {
        'migration_id': 'count',  # Number of flows
        'year': 'mean',
        'month': 'mean',
        'current_members_live_with': 'mean',
        'edu_level': 'mean',
        'average_family_cost_per_month': 'mean',
        'average_family_income_per_month': 'mean',
        'num_flows_total': 'mean',
        'how_long_to_stay': 'mean',
        # For categorical variables, we'll handle separately
    }
    
    aggregated = df.groupby([from_col, to_col], as_index=False).agg(agg_dict)
    aggregated.rename(columns={'migration_id': 'flow_count'}, inplace=True)
    
    # Handle categorical variables with mode (most common value)
    def safe_mode(x):
        mode_result = x.mode()
        return mode_result.iloc[0] if len(mode_result) > 0 else x.iloc[0]
    
    categorical_agg = df.groupby([from_col, to_col], as_index=False).agg({
        'gender': safe_mode,
        'if_change_household_local': safe_mode,
        'if_stay': safe_mode
    })
    
    # Merge categorical aggregations
    aggregated = aggregated.merge(categorical_agg, on=[from_col, to_col])
    
    # Add coordinates
    aggregated = aggregated.merge(
        coords_dict.rename(columns={'code': from_col, 'lon': 'from_lon', 'lat': 'from_lat'}),
        on=from_col,
        how='left'
    )
    aggregated = aggregated.merge(
        coords_dict.rename(columns={'code': to_col, 'lon': 'to_lon', 'lat': 'to_lat'}),
        on=to_col,
        how='left'
    )
    
    return aggregated

print("Aggregation function defined.")

Aggregation function defined.


In [58]:
# === CREATE GRAPH DATAFRAMES AT THREE LEVELS ===

print("Creating multi-level migration graphs...\n")

# 1. PROVINCE LEVEL (2-digit codes)
print("1. PROVINCE LEVEL")
df_graph_province = aggregate_flows(
    df_migration_steps,
    'from_province',
    'to_province',
    province_coords
)
print(f"   Nodes (provinces): {len(set(df_graph_province['from_province']) | set(df_graph_province['to_province']))}")
print(f"   Edges (province-to-province flows): {len(df_graph_province)}")
print(f"   Total migrations: {df_graph_province['flow_count'].sum()}")

# 2. PREFECTURE LEVEL (4-digit codes)
print("\n2. PREFECTURE LEVEL")
df_graph_prefecture = aggregate_flows(
    df_migration_steps,
    'from_prefecture',
    'to_prefecture',
    prefecture_coords
)
print(f"   Nodes (prefectures): {len(set(df_graph_prefecture['from_prefecture']) | set(df_graph_prefecture['to_prefecture']))}")
print(f"   Edges (prefecture-to-prefecture flows): {len(df_graph_prefecture)}")
print(f"   Total migrations: {df_graph_prefecture['flow_count'].sum()}")

# 3. COUNTY LEVEL (6-digit codes)
print("\n3. COUNTY LEVEL")
df_graph_county = aggregate_flows(
    df_migration_steps,
    'from_code',
    'to_code',
    county_coords
)
print(f"   Nodes (counties): {len(set(df_graph_county['from_code']) | set(df_graph_county['to_code']))}")
print(f"   Edges (county-to-county flows): {len(df_graph_county)}")
print(f"   Total migrations: {df_graph_county['flow_count'].sum()}")

print("\n=== Summary ===")
print(f"All three graph paradigms created successfully!")
print(f"\nDataframes available:")
print(f"  - df_graph_province: Province-level migration graph")
print(f"  - df_graph_prefecture: Prefecture-level migration graph")
print(f"  - df_graph_county: County-level migration graph")

Creating multi-level migration graphs...

1. PROVINCE LEVEL
   Nodes (provinces): 32
   Edges (province-to-province flows): 954
   Total migrations: 178263

2. PREFECTURE LEVEL
   Nodes (prefectures): 360
   Edges (prefecture-to-prefecture flows): 22770
   Total migrations: 178263

3. COUNTY LEVEL
   Nodes (counties): 3386
   Edges (county-to-county flows): 95317
   Total migrations: 178263

=== Summary ===
All three graph paradigms created successfully!

Dataframes available:
  - df_graph_province: Province-level migration graph
  - df_graph_prefecture: Prefecture-level migration graph
  - df_graph_county: County-level migration graph


In [59]:
# Display sample data from each level
print("=== SAMPLE DATA ===\n")

print("PROVINCE LEVEL (first 5 rows):")
print(df_graph_province.head())
print(f"\nColumns: {df_graph_province.columns.tolist()}")

print("\n" + "="*80 + "\n")

print("PREFECTURE LEVEL (first 5 rows):")
print(df_graph_prefecture.head())

print("\n" + "="*80 + "\n")

print("COUNTY LEVEL (first 5 rows):")
print(df_graph_county.head())

=== SAMPLE DATA ===

PROVINCE LEVEL (first 5 rows):
   from_province  to_province  flow_count         year     month  \
0             11           11        1221  2008.515152  5.987715   
1             11           12         255  2011.282353  5.337255   
2             11           13         254  2013.066929  5.559055   
3             11           14          72  2011.097222  4.847222   
4             11           15          82  2010.536585  5.841463   

   current_members_live_with  edu_level  average_family_cost_per_month  \
0                   3.047502   4.401310                    6650.319410   
1                   3.309804   3.458824                    3814.117647   
2                   3.019685   3.606299                    3147.244094   
3                   3.069444   3.750000                    3247.222222   
4                   3.073171   3.780488                    3682.439024   

   average_family_income_per_month  num_flows_total  how_long_to_stay  gender  \
0            

In [70]:
# FIXED VERSION: create_migration_graph function
def create_migration_graph(df, from_col, to_col):
    """
    Create a NetworkX directed graph from migration flow dataframe.
    
    Parameters:
    - df: dataframe with aggregated migration flows
    - from_col: column name for origin code
    - to_col: column name for destination code
    
    Returns:
    - NetworkX DiGraph with node and edge attributes
    """
    
    # Create directed graph
    G = nx.DiGraph()
    
    # Add edges with attributes
    for idx, row in df.iterrows():
        from_node = row[from_col]
        to_node = row[to_col]
        
        # Skip if coordinates are missing
        if pd.isna(row['from_lon']) or pd.isna(row['to_lon']):
            continue
        
        # Add edge with all attributes
        G.add_edge(from_node, to_node,
                   weight=row['flow_count'],
                   flow_count=row['flow_count'],
                   avg_year=row['year'],
                   avg_month=row['month'],
                   avg_members=row['current_members_live_with'],
                   avg_edu_level=row['edu_level'],
                   avg_cost=row['average_family_cost_per_month'],
                   avg_income=row['average_family_income_per_month'],
                   avg_num_flows=row['num_flows_total'],
                   avg_stay_duration=row['how_long_to_stay'],
                   mode_gender=row['gender'],
                   mode_change_household=row['if_change_household_local'],
                   mode_if_stay=row['if_stay'])
    
    # Add node attributes (coordinates and aggregated features)
    # First, aggregate node-level features from both origin and destination
    node_features = {}
    
    for idx, row in df.iterrows():
        from_node = row[from_col]
        to_node = row[to_col]
        
        # Skip if coordinates are missing
        if pd.isna(row['from_lon']) or pd.isna(row['to_lon']):
            continue
        
        # Initialize node features if not exists
        if from_node not in node_features:
            node_features[from_node] = {
                'lon': row['from_lon'],
                'lat': row['from_lat'],
                'out_flows': [],
                'in_flows': []
            }
        if to_node not in node_features:
            node_features[to_node] = {
                'lon': row['to_lon'],
                'lat': row['to_lat'],
                'out_flows': [],
                'in_flows': []
            }
        
        # Track flows for degree calculations
        node_features[from_node]['out_flows'].append(row['flow_count'])
        node_features[to_node]['in_flows'].append(row['flow_count'])
    
    # Set node attributes - ONLY for nodes that exist in the graph
    for node in G.nodes():
        if node in node_features:
            features = node_features[node]
            if pd.notna(features['lon']) and pd.notna(features['lat']):
                G.nodes[node]['lon'] = features['lon']
                G.nodes[node]['lat'] = features['lat']
                G.nodes[node]['pos'] = (features['lon'], features['lat'])
                G.nodes[node]['out_flow_total'] = sum(features['out_flows'])
                G.nodes[node]['in_flow_total'] = sum(features['in_flows'])
                G.nodes[node]['total_flow'] = sum(features['out_flows']) + sum(features['in_flows'])
    
    return G

print("FIXED: create_migration_graph() function redefined.")

FIXED: create_migration_graph() function redefined.


In [71]:
# === CREATE NETWORKX GRAPHS AT THREE LEVELS ===

print("Creating NetworkX graphs...\n")

# 1. Province-level graph
G_province = create_migration_graph(df_graph_province, 'from_province', 'to_province')
print(f"PROVINCE LEVEL GRAPH:")
print(f"  Nodes: {G_province.number_of_nodes()}")
print(f"  Edges: {G_province.number_of_edges()}")
print(f"  Density: {nx.density(G_province):.4f}")
print(f"  Is connected: {nx.is_weakly_connected(G_province)}")

# 2. Prefecture-level graph
G_prefecture = create_migration_graph(df_graph_prefecture, 'from_prefecture', 'to_prefecture')
print(f"\nPREFECTURE LEVEL GRAPH:")
print(f"  Nodes: {G_prefecture.number_of_nodes()}")
print(f"  Edges: {G_prefecture.number_of_edges()}")
print(f"  Density: {nx.density(G_prefecture):.4f}")
print(f"  Is connected: {nx.is_weakly_connected(G_prefecture)}")

# 3. County-level graph
G_county = create_migration_graph(df_graph_county, 'from_code', 'to_code')
print(f"\nCOUNTY LEVEL GRAPH:")
print(f"  Nodes: {G_county.number_of_nodes()}")
print(f"  Edges: {G_county.number_of_edges()}")
print(f"  Density: {nx.density(G_county):.4f}")
print(f"  Is connected: {nx.is_weakly_connected(G_county)}")

print("\n" + "="*80)
print("All graphs created successfully!")
print("\nAvailable NetworkX graphs:")
print("  - G_province: Province-level migration network")
print("  - G_prefecture: Prefecture-level migration network")
print("  - G_county: County-level migration network")

Creating NetworkX graphs...

PROVINCE LEVEL GRAPH:
  Nodes: 31
  Edges: 944
  Density: 1.0151
  Is connected: True

PREFECTURE LEVEL GRAPH:
  Nodes: 331
  Edges: 22531
  Density: 0.2063
  Is connected: True

COUNTY LEVEL GRAPH:
  Nodes: 2697
  Edges: 88395
  Density: 0.0122
  Is connected: True

All graphs created successfully!

Available NetworkX graphs:
  - G_province: Province-level migration network
  - G_prefecture: Prefecture-level migration network
  - G_county: County-level migration network


In [72]:
# === EXAMPLE: HOW TO USE THE GRAPHS ===

print("=== EXAMPLES OF GRAPH USAGE ===\n")

# Example 1: Access node attributes
print("1. NODE ATTRIBUTES (Province level example):")
sample_node = list(G_province.nodes())[0]
print(f"   Node {sample_node}:")
for attr, value in G_province.nodes[sample_node].items():
    print(f"     {attr}: {value}")

# Example 2: Access edge attributes
print("\n2. EDGE ATTRIBUTES (Province level example):")
sample_edge = list(G_province.edges())[0]
print(f"   Edge {sample_edge[0]} → {sample_edge[1]}:")
for attr, value in G_province.edges[sample_edge].items():
    print(f"     {attr}: {value}")

# Example 3: Calculate basic network metrics
print("\n3. BASIC NETWORK METRICS:")
print(f"   Province level:")
print(f"     - Average in-degree: {sum(dict(G_province.in_degree()).values()) / G_province.number_of_nodes():.2f}")
print(f"     - Average out-degree: {sum(dict(G_province.out_degree()).values()) / G_province.number_of_nodes():.2f}")
print(f"     - Number of weakly connected components: {nx.number_weakly_connected_components(G_province)}")

# Example 4: Top migration flows
print("\n4. TOP 5 MIGRATION FLOWS (Province level by flow_count):")
edge_weights = [(u, v, data['flow_count']) for u, v, data in G_province.edges(data=True)]
edge_weights.sort(key=lambda x: x[2], reverse=True)
for i, (u, v, weight) in enumerate(edge_weights[:5], 1):
    print(f"   {i}. {u} → {v}: {weight:,} migrations")

# Example 5: Saving graphs
print("\n5. SAVE GRAPHS TO FILES:")
print("   You can save these graphs using:")
print("   - nx.write_gexf(G_province, 'province_migration.gexf')  # For Gephi")
print("   - nx.write_graphml(G_province, 'province_migration.graphml')  # For other tools")
print("   - nx.write_edgelist(G_province, 'province_migration.edgelist')  # Simple text format")

=== EXAMPLES OF GRAPH USAGE ===

1. NODE ATTRIBUTES (Province level example):
   Node 11.0:
     lon: 116.40518322333331
     lat: 40.06189728703077
     pos: (np.float64(116.40518322333331), np.float64(40.06189728703077))
     out_flow_total: 3479.0
     in_flow_total: 9917.0
     total_flow: 13396.0

2. EDGE ATTRIBUTES (Province level example):
   Edge 11.0 → 11.0:
     weight: 1221.0
     flow_count: 1221.0
     avg_year: 2008.5151515151515
     avg_month: 5.987714987714988
     avg_members: 3.0475020475020473
     avg_edu_level: 4.401310401310401
     avg_cost: 6650.31941031941
     avg_income: 13558.585585585586
     avg_num_flows: 1.7387387387387387
     avg_stay_duration: 4.255281690140845
     mode_gender: 1.0
     mode_change_household: 1.0
     mode_if_stay: 1.0

3. BASIC NETWORK METRICS:
   Province level:
     - Average in-degree: 30.45
     - Average out-degree: 30.45
     - Number of weakly connected components: 1

4. TOP 5 MIGRATION FLOWS (Province level by flow_count):


In [73]:
from collections import defaultdict

def build_migration_network_for_year(df_steps, year, level='province'):
    """
    Build a migration network for a specific year.
    
    Parameters:
    - df_steps: df_migration_steps dataframe
    - year: year to filter for
    - level: 'province', 'prefecture', or 'county'
    
    Returns:
    - NetworkX DiGraph for the specified year and level
    """
    # Filter data for the specified year (within 1 year tolerance)
    df_year = df_steps[(df_steps['year'] >= year - 0.5) & (df_steps['year'] <= year + 0.5)].copy()
    
    print(f"Filtering for year {year}: {len(df_year)} migration steps found")
    
    if len(df_year) == 0:
        print(f"Warning: No data found for year {year}")
        return nx.DiGraph()
    
    # Determine which columns to use based on level
    if level == 'province':
        from_col, to_col = 'from_province', 'to_province'
        coords = province_coords
    elif level == 'prefecture':
        from_col, to_col = 'from_prefecture', 'to_prefecture'
        coords = prefecture_coords
    else:  # county
        from_col, to_col = 'from_code', 'to_code'
        coords = county_coords
    
    # Aggregate flows for this year
    df_aggregated = aggregate_flows(df_year, from_col, to_col, coords)
    
    # Create graph
    G = create_migration_graph(df_aggregated, from_col, to_col)
    
    print(f"Created {level}-level graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
    
    return G

print("Function build_migration_network_for_year() defined.")

Function build_migration_network_for_year() defined.


In [74]:
def detect_communities(G):
    """
    Detect communities using Louvain algorithm.
    Returns partition dict and modularity score.
    """
    if G.number_of_nodes() == 0:
        print("Warning: Empty graph, cannot detect communities")
        return {}, 0.0
    
    # Convert to undirected for community detection
    G_und = G.to_undirected()
    
    # Use Louvain algorithm (need python-louvain package)
    try:
        import community as community_louvain
        partition = community_louvain.best_partition(G_und, weight='weight')
        modularity = community_louvain.modularity(partition, G_und, weight='weight')
        print("Using Louvain algorithm (python-louvain)")
    except ImportError:
        print("python-louvain not found, using greedy modularity")
        # Fallback to greedy modularity
        communities = nx.community.greedy_modularity_communities(G_und, weight='weight')
        partition = {}
        for i, community in enumerate(communities):
            for node in community:
                partition[node] = i
        modularity = nx.community.modularity(G_und, communities, weight='weight')
    
    return partition, modularity

print("Function detect_communities() defined.")

Function detect_communities() defined.


In [75]:
# === TEST: COMMUNITY DETECTION ON 2015 DATA ===

print("="*80)
print("COMMUNITY DETECTION TEST - YEAR 2015")
print("="*80)

# Test at province level
print("\n--- PROVINCE LEVEL ---")
G_test_province = build_migration_network_for_year(df_migration_steps, 2015, level='province')
partition_province, mod_province = detect_communities(G_test_province)
n_communities_province = len(set(partition_province.values()))

print(f"\nCommunity detection results (2015, province level):")
print(f"  Number of communities: {n_communities_province}")
print(f"  Modularity Q: {mod_province:.3f}")

# Count nodes per community
community_sizes_province = defaultdict(int)
for node, comm in partition_province.items():
    community_sizes_province[comm] += 1

print(f"  Community sizes: {dict(sorted(community_sizes_province.items()))}")

# Show which provinces are in each community
print(f"\n  Community composition:")
communities_dict = defaultdict(list)
for node, comm in partition_province.items():
    communities_dict[comm].append(node)

for comm_id in sorted(communities_dict.keys()):
    nodes = sorted(communities_dict[comm_id])
    print(f"    Community {comm_id} ({len(nodes)} provinces): {nodes}")

COMMUNITY DETECTION TEST - YEAR 2015

--- PROVINCE LEVEL ---
Filtering for year 2015: 13239 migration steps found


Created province-level graph: 31 nodes, 759 edges
Using Louvain algorithm (python-louvain)

Community detection results (2015, province level):
  Number of communities: 17
  Modularity Q: 0.666
  Community sizes: {0: 4, 1: 1, 2: 1, 3: 1, 4: 3, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 4, 11: 2, 12: 1, 13: 2, 14: 2, 15: 3, 16: 2}

  Community composition:
    Community 0 (4 provinces): [np.float64(11.0), np.float64(12.0), np.float64(13.0), np.float64(41.0)]
    Community 1 (1 provinces): [np.float64(15.0)]
    Community 2 (1 provinces): [np.float64(45.0)]
    Community 3 (1 provinces): [np.float64(14.0)]
    Community 4 (3 provinces): [np.float64(50.0), np.float64(51.0), np.float64(54.0)]
    Community 5 (1 provinces): [np.float64(52.0)]
    Community 6 (1 provinces): [np.float64(53.0)]
    Community 7 (1 provinces): [np.float64(61.0)]
    Community 8 (1 provinces): [np.float64(62.0)]
    Community 9 (1 provinces): [np.float64(63.0)]
    Community 10 (4 provinces): [np.float64(31.0), np.float64

In [76]:
# Optional: Test at prefecture level
print("\n" + "="*80)
print("\n--- PREFECTURE LEVEL (optional) ---")
G_test_prefecture = build_migration_network_for_year(df_migration_steps, 2015, level='prefecture')
partition_prefecture, mod_prefecture = detect_communities(G_test_prefecture)
n_communities_prefecture = len(set(partition_prefecture.values()))

print(f"\nCommunity detection results (2015, prefecture level):")
print(f"  Number of communities: {n_communities_prefecture}")
print(f"  Modularity Q: {mod_prefecture:.3f}")

# Count nodes per community
community_sizes_prefecture = defaultdict(int)
for node, comm in partition_prefecture.items():
    community_sizes_prefecture[comm] += 1

print(f"  Top 10 largest communities:")
sorted_communities = sorted(community_sizes_prefecture.items(), key=lambda x: x[1], reverse=True)
for i, (comm_id, size) in enumerate(sorted_communities[:10], 1):
    print(f"    {i}. Community {comm_id}: {size} prefectures")



--- PREFECTURE LEVEL (optional) ---
Filtering for year 2015: 13239 migration steps found
Created prefecture-level graph: 331 nodes, 5045 edges
Using Louvain algorithm (python-louvain)

Community detection results (2015, prefecture level):
  Number of communities: 13
  Modularity Q: 0.571
  Top 10 largest communities:
    1. Community 3: 49 prefectures
    2. Community 7: 46 prefectures
    3. Community 5: 40 prefectures
    4. Community 10: 38 prefectures
    5. Community 0: 34 prefectures
    6. Community 4: 27 prefectures
    7. Community 9: 22 prefectures
    8. Community 6: 20 prefectures
    9. Community 1: 14 prefectures
    10. Community 12: 13 prefectures


In [77]:
# Add community information to the graph as node attributes
def add_community_to_graph(G, partition):
    """Add community assignments as node attributes."""
    nx.set_node_attributes(G, partition, 'community')
    return G

# Add communities to the test graph
G_test_province = add_community_to_graph(G_test_province, partition_province)

print("="*80)
print("\nCommunity attributes added to graph!")
print(f"\nExample node with community attribute:")
sample_node = list(G_test_province.nodes())[0]
print(f"  Node {sample_node}:")
print(f"    Community: {G_test_province.nodes[sample_node].get('community', 'N/A')}")
print(f"    All attributes: {dict(G_test_province.nodes[sample_node])}")

# Summary statistics
print(f"\n=== SUMMARY ===")
print(f"Year 2015 migration network (province level):")
print(f"  - {G_test_province.number_of_nodes()} provinces")
print(f"  - {G_test_province.number_of_edges()} migration flows")
print(f"  - {n_communities_province} communities detected")
print(f"  - Modularity Q = {mod_province:.3f}")
print(f"  - Average community size: {G_test_province.number_of_nodes() / n_communities_province:.1f} provinces/community")


Community attributes added to graph!

Example node with community attribute:
  Node 11.0:
    Community: 0
    All attributes: {'lon': np.float64(116.40518322333331), 'lat': np.float64(40.06189728703077), 'pos': (np.float64(116.40518322333331), np.float64(40.06189728703077)), 'out_flow_total': np.float64(342.0), 'in_flow_total': np.float64(614.0), 'total_flow': np.float64(956.0), 'community': 0}

=== SUMMARY ===
Year 2015 migration network (province level):
  - 31 provinces
  - 759 migration flows
  - 17 communities detected
  - Modularity Q = 0.666
  - Average community size: 1.8 provinces/community


In [78]:
# === COUNTY LEVEL COMMUNITY DETECTION ===
print("\n" + "="*80)
print("\n--- COUNTY LEVEL ---")
G_test_county = build_migration_network_for_year(df_migration_steps, 2015, level='county')
partition_county, mod_county = detect_communities(G_test_county)
n_communities_county = len(set(partition_county.values()))

print(f"\nCommunity detection results (2015, county level):")
print(f"  Number of communities: {n_communities_county}")
print(f"  Modularity Q: {mod_county:.3f}")

# Count nodes per community
community_sizes_county = defaultdict(int)
for node, comm in partition_county.items():
    community_sizes_county[comm] += 1

print(f"\n  Community size distribution:")
sorted_communities_county = sorted(community_sizes_county.items(), key=lambda x: x[1], reverse=True)
print(f"    Total communities: {len(sorted_communities_county)}")
print(f"    Largest community: {sorted_communities_county[0][1]} counties" if sorted_communities_county else "    No communities")
print(f"    Smallest community: {sorted_communities_county[-1][1]} counties" if sorted_communities_county else "    No communities")
print(f"    Average community size: {sum(community_sizes_county.values()) / len(community_sizes_county):.1f} counties" if community_sizes_county else "    N/A")

print(f"\n  Top 20 largest communities:")
for i, (comm_id, size) in enumerate(sorted_communities_county[:20], 1):
    print(f"    {i:2d}. Community {comm_id}: {size:3d} counties")

# Add community info to graph
G_test_county = add_community_to_graph(G_test_county, partition_county)

print(f"\n  Community attributes added to county-level graph!")



--- COUNTY LEVEL ---
Filtering for year 2015: 13239 migration steps found
Created county-level graph: 2388 nodes, 10230 edges
Using Louvain algorithm (python-louvain)

Community detection results (2015, county level):
  Number of communities: 16
  Modularity Q: 0.579

  Community size distribution:
    Total communities: 16
    Largest community: 341 counties
    Smallest community: 2 counties
    Average community size: 149.2 counties

  Top 20 largest communities:
     1. Community 4: 341 counties
     2. Community 0: 327 counties
     3. Community 1: 314 counties
     4. Community 9: 246 counties
     5. Community 11: 209 counties
     6. Community 8: 202 counties
     7. Community 2: 195 counties
     8. Community 3: 112 counties
     9. Community 10: 103 counties
    10. Community 6:  83 counties
    11. Community 15:  76 counties
    12. Community 5:  65 counties
    13. Community 7:  57 counties
    14. Community 14:  54 counties
    15. Community 12:   2 counties
    16. Comm

In [None]:
# CORRECTED VERSION: Edges with only weight, nodes with simple averaged socioeconomics
def create_migration_graph_v2(df, from_col, to_col):
    """
    Create a NetworkX directed graph from migration flow dataframe.
    
    EDGES: Only contain 'weight' = number of different people who migrated on this route
    NODES: Simple averaged socioeconomic characteristics from connected edges
    
    Parameters:
    - df: dataframe with aggregated migration flows (one row per unique from-to pair)
    - from_col: column name for origin code
    - to_col: column name for destination code
    
    Returns:
    - NetworkX DiGraph
    """
    
    G = nx.DiGraph()
    
    # Track edges connected to each node for aggregation
    node_outgoing_edges = {}  # node -> list of edge data
    node_incoming_edges = {}  # node -> list of edge data
    node_coords = {}  # node -> (lon, lat)
    
    # Process each edge in the dataframe
    for idx, row in df.iterrows():
        from_node = row[from_col]
        to_node = row[to_col]
        
        # Skip if coordinates are missing
        if pd.isna(row['from_lon']) or pd.isna(row['to_lon']):
            continue
        
        # Calculate weight: count of different people (entries) on this route
        # This is already in row['flow_count'] which represents the count of migration_ids
        weight = row['flow_count']
        
        # Add edge with ONLY weight
        G.add_edge(from_node, to_node, weight=weight)
        
        # Store coordinates
        node_coords[from_node] = (row['from_lon'], row['from_lat'])
        node_coords[to_node] = (row['to_lon'], row['to_lat'])
        
        # Store edge socioeconomic data for node aggregation
        edge_data = {
            'year': row['year'],
            'edu_level': row['edu_level'],
            'income': row['average_family_income_per_month'],
            'cost': row['average_family_cost_per_month'],
            'members': row['current_members_live_with'],
            'stay_duration': row['how_long_to_stay'],
            'num_flows': row['num_flows_total']
        }
        
        # Track for source node (outgoing)
        if from_node not in node_outgoing_edges:
            node_outgoing_edges[from_node] = []
        node_outgoing_edges[from_node].append(edge_data)
        
        # Track for destination node (incoming)
        if to_node not in node_incoming_edges:
            node_incoming_edges[to_node] = []
        node_incoming_edges[to_node].append(edge_data)
    
    # Set node attributes
    for node in G.nodes():
        # Coordinates
        if node in node_coords:
            lon, lat = node_coords[node]
            G.nodes[node]['lon'] = lon
            G.nodes[node]['lat'] = lat
            G.nodes[node]['pos'] = (lon, lat)
        
        # Outgoing characteristics (simple average, each edge counts equally)
        if node in node_outgoing_edges:
            out_edges = node_outgoing_edges[node]
            G.nodes[node]['out_avg_year'] = np.mean([e['year'] for e in out_edges if pd.notna(e['year'])])
            G.nodes[node]['out_avg_edu'] = np.mean([e['edu_level'] for e in out_edges if pd.notna(e['edu_level'])])
            G.nodes[node]['out_avg_income'] = np.mean([e['income'] for e in out_edges if pd.notna(e['income'])])
            G.nodes[node]['out_avg_cost'] = np.mean([e['cost'] for e in out_edges if pd.notna(e['cost'])])
            G.nodes[node]['out_avg_members'] = np.mean([e['members'] for e in out_edges if pd.notna(e['members'])])
            G.nodes[node]['out_avg_stay'] = np.mean([e['stay_duration'] for e in out_edges if pd.notna(e['stay_duration'])])
        
        # Incoming characteristics (simple average, each edge counts equally)
        if node in node_incoming_edges:
            in_edges = node_incoming_edges[node]
            G.nodes[node]['in_avg_year'] = np.mean([e['year'] for e in in_edges if pd.notna(e['year'])])
            G.nodes[node]['in_avg_edu'] = np.mean([e['edu_level'] for e in in_edges if pd.notna(e['edu_level'])])
            G.nodes[node]['in_avg_income'] = np.mean([e['income'] for e in in_edges if pd.notna(e['income'])])
            G.nodes[node]['in_avg_cost'] = np.mean([e['cost'] for e in in_edges if pd.notna(e['cost'])])
            G.nodes[node]['in_avg_members'] = np.mean([e['members'] for e in in_edges if pd.notna(e['members'])])
            G.nodes[node]['in_avg_stay'] = np.mean([e['stay_duration'] for e in in_edges if pd.notna(e['stay_duration'])])
        
        # Flow counts
        G.nodes[node]['total_out_flow'] = sum(G[node][neighbor]['weight'] for neighbor in G.successors(node))
        G.nodes[node]['total_in_flow'] = sum(G[predecessor][node]['weight'] for predecessor in G.predecessors(node))
        G.nodes[node]['net_flow'] = G.nodes[node]['total_in_flow'] - G.nodes[node]['total_out_flow']
    
    return G

print("create_migration_graph_v2() defined: Edges with only weight, nodes with simple averaged socioeconomics")