In [2]:
#### Calculate the distance between each dialect ####
# This code calculates the dialectal distance between each pair of sub-dialect group
# Programmer: Dan Qin
# Date: 2018.07.08

# import libraries
import pandas as pd
import numpy as np

In [5]:
# load data
dia_dict = pd.read_csv("data/Chinese_dialectdict_compl.csv")
dia_dict.head(1)

Unnamed: 0,语系,语族,方言大区,方言区/语支,方言片/语种,Supergroup,Dialect group,Sub-dialect group
0,汉藏,汉语,官话,东北官话,吉沈片,Mandarin,Northeastern,Jishen


In [3]:
### Calculate distance between dialects ###
# create a dataframe of sub-dialect group pairs
dia_pairs = pd.DataFrame(index = dia_dict["方言片/语种"], 
                         columns = dia_dict["方言片/语种"])
dia_pairs.head(1)

方言片/语种,吉沈片,哈阜片,黑松片,京承片,朝峰片,保唐片,石济片,沧惠片,登连片,青莱片,...,土族,锡伯,赫哲,佤,京,布赓,阿美,回辉,塔吉克,朝鲜语
方言片/语种,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
吉沈片,,,,,,,,,,,...,,,,,,,,,,


In [4]:
# calculate the distance between every two sub-dialect groups
for i in range(len(dia_pairs.index)):
    for j in range(len(dia_pairs.columns)):
        # pair of sub-dialect groups
        dia_1 = dia_pairs.index[i]
        dia_2 = dia_pairs.columns[j]
        
        # find their rows in the dialect dictionary
        row_1 = dia_dict.loc[dia_dict["方言片/语种"] == dia_1]
        row_2 = dia_dict.loc[dia_dict["方言片/语种"] == dia_2]
        
        # initialize dialectal distance
        distance = 0 
        
        # Assign a distance value according to the dialect tree
        # Assignment Rules:
        # 1. When two counties belong to the same sub-group, the distance is 0
        # 2. If different sub-groups but same group, the distance is 1
        # 3. If different groups but same super-group, the distance is 2
        # 4. If different dialect groups but same branch, the distance is 3
        # 5. Else, the distance is 4
        if dia_1 == dia_2:
            pass
        elif row_1["方言区/语支"].values[0] == row_2["方言区/语支"].values[0]:
            distance = 1
        elif row_1["方言大区"].values[0] == row_2["方言大区"].values[0]:
            distance = 2
        elif row_1["语族"].values[0] == row_2["语族"].values[0]:
            distance = 3
        else:
            distance = 4
         
        dia_pairs.iloc[i,j] = distance

In [5]:
# check
dia_pairs.head(3)

方言片/语种,吉沈片,哈阜片,黑松片,京承片,朝峰片,保唐片,石济片,沧惠片,登连片,青莱片,...,土族,锡伯,赫哲,佤,京,布赓,阿美,回辉,塔吉克,朝鲜语
方言片/语种,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
吉沈片,0,1,1,2,2,2,2,2,2,2,...,4,4,4,4,4,4,4,4,4,4
哈阜片,1,0,1,2,2,2,2,2,2,2,...,4,4,4,4,4,4,4,4,4,4
黑松片,1,1,0,2,2,2,2,2,2,2,...,4,4,4,4,4,4,4,4,4,4


In [152]:
# save it to file
dia_pairs.to_csv("data/Chinese_dialect_distance.csv", encoding = "utf_8_sig")

In [6]:
# # TEMP LOADING
# dia_pairs = pd.read_csv("data/Chinese_dialect_distance.csv",index_col=0)

In [7]:
# load county dialect data
county_dia = pd.read_csv("data/CH_dialect_county_compl.csv")
county_dia.head(1)

Unnamed: 0,AdCode,Province,Prefecture,County,方言大区,方言区/语支,方言片/语种,SGroup,DiaGroup,SubDiaGroup
0,110101,北京市,北京市,东城区,官话,北京官话,京承片,Mandarin,Beijing,Jingcheng


In [8]:
# load county population data
county_pop = pd.read_csv("data/2011_census_pop_age_edu.csv")
county_pop.head(2)

Unnamed: 0,省级,地级,县市,总人口,年轻人口(20-39),高等教育人口,年轻人比例,高等教育人口比例,市总人口,县市人口比例
0,北京市,北京市,北京市,19612368.0,8556982.0,6177772.0,0.436305,0.314994,19612368,1.0
1,北京市,北京市,东城区,573180.0,213458.0,213156.0,0.37241,0.371883,19612368,0.029225


In [9]:
# list of province and prefecture names
prov_list = county_pop["省级"].value_counts().index.tolist()
pref_list = county_pop["地级"].value_counts().index.tolist()

del_list = ['南沙群岛', '西沙群岛', '省直辖县级行政区划', '市辖区', '中沙群岛的岛礁及其海域', '自治区直辖县级行政区划']

# keep county rows in census data, drop province and prefectures
for province in prov_list:
    county_pop = county_pop.drop(county_pop[county_pop.县市 == province].index)

for prefecture in pref_list:
    county_pop = county_pop.drop(county_pop[county_pop.县市 == prefecture].index)
    
for item in del_list:
    county_pop = county_pop.drop(county_pop[county_pop.县市 == item].index)

county_pop.count()

省级             2869
地级             2869
县市             2869
总人口            2869
年轻人口(20-39)    2869
高等教育人口         2869
年轻人比例          2869
高等教育人口比例       2869
市总人口           2869
县市人口比例         2869
dtype: int64

In [10]:
len(pref_list)

366

In [40]:
### Altered county names ###
# as census data was collected in 2011, some of the county names have altered
# list of latest county names
county_list = county_dia["County"].tolist()

# list of county names in census data
census_clist = county_pop["县市"].tolist()

# check for altered county names
altered = list(set(census_clist) - set(county_list))
shouldbe = list(set(county_list) - set(census_clist))

county_popcp = county_pop.copy()

leftnames = []
record = []
temp = None
i = 0
for row in county_popcp.itertuples():
    if row.县市 in altered:
        temp = row.县市
        for j in range(len(shouldbe)):  
            # if the county name remains the same, only the division name changed
            if (row.县市[:-1] == shouldbe[j][:-1]):
                # assign the county with its latest name
                county_popcp.loc[county_popcp.index[i], "县市"] = shouldbe[j]
                record.append(shouldbe[j])
                temp = None
                break            
        if (temp != None):
            leftnames.append(temp)
    i += 1
    
len(leftnames)

89

In [36]:
# names that have altered totally
print(leftnames)

['崇文区', '宣武区', '蓟县', '石家庄市桥东区', '唐海县', '邯郸县', '内邱县', '保定市新市区', '北市区', '南市区', '宣化县', '承德市双桥区', '东陵区', '铁岭市清河区', '加格达奇', '松岭区', '新林区', '呼中区', '卢湾区', '闸北区', '白下区', '下关区', '崇安区', '南长区', '北塘区', '九里区', '戚墅堰区', '沧浪区', '平江区', '金阊区', '新浦区', '淮安市清河区', '楚州区', '清浦区', '维扬区', '江东区', '绍兴县', '居巢区', '金家庄区', '铜官山区', '狮子山区', '铜陵县', '永定县', '星子县', '赣县', '巿北区', '四方区', '胶南市', '枣庄市巿中区', '济宁市市中区', '苍山县', '陵县', '金明区', '开封县', '许昌县', '陕县', '狮河区', '郧县', '襄阳区', '沙巿区', '汩罗市', '永定区', '萝岗区', '茂港区', '梅县', '蝶山区', '平桂管理区', '万盛区', '重庆市双桥区', '开县', '郫县', '安县', '广元市市中区', '元坝区', '内江市巿中区', '达县', '小河区', '盘县', '遵义县', '万山特区', '潞西市', '班嘎县', '户县', '华县', '吴旗县', '芒崖行委', '柴旦行委', '冷湖行委', '乌鲁木齐市新市区']


In [37]:
# dictionary for altered county names
cname_dict = {"吴旗县":"吴起县","郫县":"郫都区","汩罗市":"汨罗市","盘县":"盘州市",
             "苍山县":"兰陵县","潞西市":"芒市","清浦区":"清江浦区",
             "襄阳区":"襄州区","梅县":"梅县区",
             "班嘎县":"班戈县","铜官山区":"铜官区","狮子山区":"铜官区",
             "狮河区":"浉河区","安县":"安州区","开县":"开州区",
             "开封县":"祥符区","唐海县":"曹妃甸区","许昌县":"建安区","陕县":"陕州区",
             "东陵区":"浑南区","郧县":"郧阳区","内邱县":"内丘县",
             "新浦区":"阜新市海州区","金阊区":"姑苏区","宣化县":"宣化区","崇文区":"东城区",
             "江东区":"鄞州区","蓟县":"蓟州区","松岭区":"呼玛县","新林区":"呼玛县",
             "呼中区":"呼玛县","金家庄区":"花山区","遵义县":"播州区",
             "永定县":"龙岩市永定区","茂港区":"电白区","胶南市":"黄岛区","崇安区":"梁溪区",
             "绍兴县":"柯桥区","闸北区":"静安区","沙巿区":"沙市区","蝶山区":"万秀区",
             "户县":"鄠邑区","陵县":"陵城区","万盛区":"南川区",
             "加格达奇":"呼玛县","金明区":"龙亭区","萝岗区":"黄埔区","北塘区":"梁溪区",
             "达县":"达川区","四方区":"市北区","下关区":"南京市鼓楼区","南市区":"黄浦区",
             "宣武区":"西城区","元坝区":"昭化区","楚州区":"淮安区","华县":"华州区",
             "卢湾区":"黄浦区","冷湖行委":"德令哈市","赣县":"赣县区","平桂管理区":"八步区",
             "南长区":"梁溪区","铜陵县":"义安区","巿北区":"市北区","戚墅堰区":"武进区",
             "星子县":"庐山市","居巢区":"巢湖市","平江区":"姑苏区","白下区":"秦淮区",
             "万山特区":"万山区","九里区":"南京市鼓楼区","安县":"安州区","柴旦行委":"德令哈市",
             "北市区":"莲池区","沧浪区":"姑苏区","潞西市":"芒市","盘县":"盘州市",
             "芒崖行委":"德令哈市","小河区":"乌当区","邯郸县":"邯山区","维扬区":"邗江区",
             "石家庄市桥东区":"石家庄市长安区","保定市新市区":"竞秀区","承德市双桥区":"双桥区",
              "铁岭市清河区":"清河区","淮安市清河区":"淮安区","枣庄市巿中区":"枣庄市市中区",
              "济宁市市中区":"任城区","永定区":"张家界市永定区","重庆市双桥区":"渝中区",
              "广元市市中区":"利州区","内江市巿中区":"内江市市中区","乌鲁木齐市新市区":"天山区"}


In [59]:
# replace the county names with the latest names
i = 0
for row in county_popcp.itertuples():
    county = row.县市
    if county in cname_dict.keys():
        county_popcp.loc[county_popcp.index[i],"县市"] = cname_dict[county]
       
    i += 1

In [20]:
### Calculate dialectal distance between counties ###
# OPTION I: Assign the dialectal distance of county pairs(parallel structure)
# (This option creates really large dataset)

# # county list of the new dataframe
# census_clist_new = county_popcp["县市"].value_counts().index.tolist()

# # create a list of county pairs
# county_1 = []
# county_2 = []
# for i in range(len(census_clist_new)):
#     for j in range(len(census_clist_new)):
#         # pairs of counties
#         county_1.append(census_clist_new[i])
#         county_2.append(census_clist_new[j])

# # create a dataframe to store dialectal distance of county pairs
# index = np.arange(len(county_1))
# columns = ['County_1','County_2','DiaDist']
# county_dist = pd.DataFrame(index = index ,columns = columns)

# county_dist["County_1"] = county_1
# county_dist["County_2"] = county_2
# county_dist.head()

In [292]:
# # OPTION I(continue)
# # fill in DiaDist column
# DiaDist = []
# for row in county_dist.itertuples():
#     # pair of counties
#     county_1 = row.County_1
#     county_2 = row.County_2
    
#     # get the dialect of the specified county 
#     if county_1 in cname_dict.keys():
#         county_1 = cname_dict[county_1]
#     if county_2 in cname_dict.keys():
#         county_2 = cname_dict[county_2]
        
#     index_1 = county_list.index(county_1)
#     index_2 = county_list.index(county_2)
    
#     dia_1 = county_dialist[index_1]
#     dia_2 = county_dialist[index_2]
    
#     # get the dialectal distance between the two counties
#     distance = dia_pairs.loc[dia_1, dia_2]
    
#     DiaDist.append(distance)
    
# # add the column to dataframe
# county_dist["DiaDist"] = DiaDist
# county_dist.head(3)

In [62]:
# OPTION II: Assign the dialectal distance of county pairs(cross structure)

# county list in the population data
census_clist_new = county_popcp["县市"].tolist()

# as pandas is extremely slow with loops,use a numpy array instead
county_array = np.empty([len(census_clist_new),len(census_clist_new)])
county_array[:] = np.nan

# dialect of counties
county_dialist = county_dia["方言片/语种"].tolist()

# loop through each pair of counties
for i in range(len(census_clist_new)):
    for j in range(len(census_clist_new)):
        # pairs of counties
        county_1 = census_clist_new[i]
        county_2 = census_clist_new[j]
        
        # find the index of the counties
        index_1 = county_list.index(county_1)
        index_2 = county_list.index(county_2)
        
        # find the dialect of the specified county
        dia_1 = county_dialist[index_1]
        dia_2 = county_dialist[index_2]
        
        # get the dialectal distance between the two counties
        distance = dia_pairs.loc[dia_1, dia_2]
        
        # store the value to the array
        county_array[i,j] = distance  
 

In [63]:
# create a dataframe
county_dist_2 = pd.DataFrame(data = county_array,
                         index = census_clist_new,
                         columns = census_clist_new)
county_dist_2.head(1)

Unnamed: 0,东城区,西城区,东城区.1,西城区.1,北京市朝阳区,丰台区,石景山区,海淀区,门头沟区,房山区,...,布尔津县,富蕴县,福海县,哈巴河县,青河县,吉木乃县,石河子市,阿拉尔市,图木舒克市,五家渠市
东城区,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,4.0,4.0,4.0


In [64]:
# save it to file 
county_dist_2.to_csv("data/CH_county_diadist_2.csv", encoding = "utf_8_sig")

In [11]:
# TEMP LOADING
# county_dist_2 = pd.read_csv("data/CH_county_diadist_2.csv",index_col = 0)

In [12]:
### Calculate dialectal distance between prefectures ###
# remove irrelevant names in pref_list
pref_list.remove("省直辖县级行政区划")
pref_list.remove("自治区直辖县级行政区划")

for province in prov_list:
    pref_list.remove(province)

# add the four municipalities
pref_list = ["北京市","上海市","天津市","重庆市"] + pref_list 
    
# create a list of prefecture pairs
pref_1 = []
pref_2 = []
for i in range(len(pref_list)):
    for j in range(len(pref_list)):
        # pairs of prefectures
        pref_1.append(pref_list[i])
        pref_2.append(pref_list[j])

# create a dataframe to store dialectal distance of prefecture pairs
index = np.arange(len(pref_1))
columns = ['Pref_1','Pref_2','DiaDist']
pref_dist = pd.DataFrame(index = index ,columns = columns)

pref_dist["Pref_1"] = pref_1
pref_dist["Pref_2"] = pref_2
pref_dist.head()

Unnamed: 0,Pref_1,Pref_2,DiaDist
0,北京市,北京市,
1,北京市,上海市,
2,北京市,天津市,
3,北京市,重庆市,
4,北京市,保定市,


In [66]:
# Calculate the dialectal distance between prefectures
# calculation formula: d(A,B) = ∑i ∑j S_Ai * S_Bj * d_ij 
    # d(A,B): dialectal distance between prefecture a and b
    # S_Ai: population proportion of county i in prefecture a
    # S_Bj: population proportion of county j in prefecture b
    # d_ij: dialectal distance between county i and j
    
# list for dialectal distance
DiaDist = []

for row in pref_dist.itertuples():
    # pairs of prefectures
    p1 = row.Pref_1
    p2 = row.Pref_2
    
    # county subset of each prefecture
    sub_1 = county_popcp.loc[county_popcp.地级 == p1]
    sub_2 = county_popcp.loc[county_popcp.地级 == p2]
    
    # initialize dialectal distance
    distance = 0
        
    for row1 in sub_1.itertuples():
        for row2 in sub_2.itertuples():
            # counties from either prefecture
            county_1 = row1.县市
            county_2 = row2.县市
                
            # get the dialectal distance between two counties
            index_1 = census_clist_new.index(county_1)
            index_2 = census_clist_new.index(county_2)
            county_distance = county_array[index_1,index_2]
            
            # calculate the distance based on population proportion
            distance += row1.县市人口比例 * row2.县市人口比例 * county_distance
        
    DiaDist.append(distance)


In [67]:
# add the column to dataframe and check the results
pref_dist["DiaDist"] = DiaDist
pref_dist.head(2)

Unnamed: 0,Pref_1,Pref_2,DiaDist
0,北京市,北京市,0.083037
1,北京市,上海市,3.0


In [68]:
# load adcode data
adcode = pd.read_csv("data/CH_administrative_code_1708.csv")
adcode.head(1)

Unnamed: 0,行政区划代码,省级,地级,县级,区号,邮编
0,110000,北京市,,,,


In [69]:
# only keep prefecture rows in adcode data
drop_index = []
for row in adcode.itertuples():
    if type(row.县级) == str:
        drop_index.append(row[0])

adcode = adcode.drop(index = drop_index)        

In [70]:
# merge the adcode with the diadist data
pref_dist_r = pd.merge(pref_dist, adcode, left_on="Pref_1", right_on="地级",how="left")
pref_dist_r = pref_dist_r.drop(columns = ["地级","省级","县级","区号","邮编"])
pref_dist_r = pref_dist_r.rename(columns={"行政区划代码":"PrefCode_1"})
pref_dist_r = pd.merge(pref_dist_r, adcode, left_on="Pref_2", right_on="地级",how="left")
pref_dist_r = pref_dist_r.drop(columns = ["地级","省级","县级","区号","邮编"])
pref_dist_r = pref_dist_r.rename(columns={"行政区划代码":"PrefCode_2"})
pref_dist_r.head()    

Unnamed: 0,Pref_1,Pref_2,DiaDist,PrefCode_1,PrefCode_2
0,北京市,北京市,0.083037,110100,110100
1,北京市,上海市,3.0,110100,310100
2,北京市,天津市,1.816805,110100,120100
3,北京市,重庆市,2.0,110100,500100
4,北京市,保定市,1.881126,110100,130600


In [71]:
# save it to file
pref_dist_r.to_csv("data/CH_pref_diadist.csv", index = False, encoding = "utf_8_sig")

In [72]:
# ### Option II: Calculate the dialectal distance of prefecture pairs(loop with lists) 
# ### SLOW!!!
# # create a numpy array to store the dialectal distance
# pref_array = np.empty([len(pref_list),len(pref_list)])
# pref_array[:] = np.nan

# # Calculate the dialectal distance between prefectures
# # calculation formula: d(A,B) = ∑i ∑j S_Ai * S_Bj * d_ij 
#     # d(A,B): dialectal distance between prefecture a and b
#     # S_Ai: population proportion of county i in prefecture a
#     # S_Bj: population proportion of county j in prefecture b
#     # d_ij: dialectal distance between county i and j

# # loop through each pair of prefectures
# for i in range(len(pref_list)):
#     for j in range(len(pref_list)):
#         # pairs of prefectures
#         pref_1 = pref_list[i]
#         pref_2 = pref_list[j]
        
#         # list of counties in each prefecture     
#         counties_1 = []
#         counties_2 = []
        
#         # list of county population
#         cpop_1 = []
#         cpop_2 = []
        
#         for row in county_pop.itertuples():
#             if (row.地级 == pref_1) :
#                 counties_1.append(row.县市)
#                 cpop_1.append(row.县市人口比例)
#             elif (row.地级 == pref_2) :
#                 counties_2.append(row.县市)
#                 cpop_2.append(row.县市人口比例)
        
#         distance = 0
        
#         for m in range(len(counties_1):
#             for n in range(len(counties_2):
#                 county_1 = counties_1[m]
#                 county_2 = counties_2[n]
#                 c1_pop = cpop_1[m]
#                 c2_pop = cpop_2[n]
                
#                 index_1 = county_list.index(county_1)
#                 index_2 = county_list.index(county_2)
#                 county_dist = county_array[index_1,index_2]
#                 distance += row1.县市人口比例 * row2.县市人口比例 * county_dist
                           
#         pref_array[i,j] = distance


In [73]:
# ### Option III: Calculate the dialectal distance of prefecture pairs(loop with itertuples)
# ### SLOW!!!
# # create a numpy array to store the dialectal distance
# pref_array = np.empty([len(pref_list),len(pref_list)])
# pref_array[:] = np.nan

# # loop through each pair of prefectures
# for i in range(len(pref_list)):
#     for j in range(len(pref_list)):
#         # pairs of prefectures
#         pref_1 = pref_list[i]
#         pref_2 = pref_list[j]
        
#         # counties in each prefecture(slow!)
#         sub_1 = county_pop.loc[county_pop.地级 == pref_1]
#         sub_2 = county_pop.loc[county_pop.地级 == pref_2]
        
#         distance = 0
        
#         for row1 in sub_1.itertuples():
#             for row2 in sub_2.itertuples():
#                 # select a county from either prefecture
#                 county_1 = row1.县市
#                 county_2 = row2.县市
                
#                 # get the dialectal distance between counties
#                 index_1 = county_list.index(county_1)
#                 index_2 = county_list.index(county_2)
#                 county_dist = county_array[index_1,index_2]
                
#                 distance += row1.县市人口比例 * row2.县市人口比例 * county_dist
        
#         pref_array[i,j] = distance
