In [1]:
import numpy as np
import pandas as pd
import glob
import os
from importlib import reload
from sklearn.metrics.pairwise import haversine_distances

In [2]:
filename = '../data/geonames_countryInfo.txt'
meta = pd.read_csv(filename, skiprows=49, usecols=[0,1,4,5,6,7,8,9], keep_default_na=False)
isos = pd.unique(meta['ISO'])

meta.head()

Unnamed: 0,ISO,ISO3,Country,Capital,Area(in sq km),Population,Continent,Income
0,AD,AND,Andorra,Andorra la Vella,468.0,77006,EU,HI
1,AE,ARE,United Arab Emirates,Abu Dhabi,82880.0,9630959,AS,HI
2,AF,AFG,Afghanistan,Kabul,647500.0,37172386,AS,LO
3,AG,ATG,Antigua and Barbuda,St. John's,443.0,96286,,X
4,AI,AIA,Anguilla,The Valley,102.0,13254,,X


In [3]:
filename = '../data/geonames_disputed_territories.csv'
conflict = pd.read_csv(filename, keep_default_na=False)
print(conflict.shape)
conflict.head()

(2, 4)


Unnamed: 0,ISO,name,claimed_by_iso,claimed_by_name
0,CY,Cyprus,TR,Turkey
1,UA,Ukraine,RU,Russia


In [4]:
filename = '../data/geonames_dependent_countries_territories.csv'
depend = pd.read_csv(filename, keep_default_na=False)
print(depend.shape)
depend.head()

(51, 6)


Unnamed: 0,ISO,name,depends_from,featurecode,children,notes
0,AI,Anguila,GB,PCLD,-,
1,AQ,Antarctica,12 countries,ADM1,-,
2,AS,American Samoa,US,ADMD,ADMD,
3,AW,Aruba,NL,PCLIX,-,
4,AX,Åland Islands,FI,ISLS,-,


In [5]:
filename = '../data/geonames_cities500.txt'
popd = pd.read_table(filename, header=None, dtype={10:'str', 13:'str'}, keep_default_na=False)
print(popd.shape)
popdcolumns = ['geonameid', 'name', 'asciiname', 'alternatenames',
               'latitude', 'longitude', 'feature_class', 'feature_code',
               'country_code', 'cc2', 'admin1_code', 'admin2_code', 'admin3_code',
               'admin4_code', 'population', 'elevation','dem', 'timezone', 'modification_date']
popd.columns = popdcolumns
popd = popd.loc[:, ['asciiname', 'country_code', 'population', 'latitude', 'longitude']]
popd = popd[popd['population'] > 100]
print(popd.shape)

popd.head()

(198934, 19)
(166795, 5)


Unnamed: 0,asciiname,country_code,population,latitude,longitude
0,Andorra la Vella,AD,20430.0,42.50779,1.52109
1,Arinsal,AD,1419.0,42.57205,1.48453
2,Canillo,AD,3292.0,42.5676,1.59756
3,El Tarter,AD,1052.0,42.57952,1.65362
4,Encamp,AD,11223.0,42.53474,1.58014


# Connection to research cities

In [43]:
src = '../results/'
dst = '../plots/'

filename = src + 'tally_merged_locations.csv'
auth = pd.read_csv(filename)

print(auth.shape)
countries = pd.unique(auth.country)

(6424, 6)


In [7]:
country = 'Botswana'
if country in ['England', 'North Ireland', 'Scotland', 'Wales']:
    iso = 'GB'
else:
    iso = meta[meta['Country'] == country]['ISO'].values[0]
print(country, iso)

Botswana BW


In [19]:
resc = auth[auth['country'] == country].copy()
resloc = resc.loc[:, ['latitude', 'longitude']].values
resloc = np.deg2rad(resloc)

isocolonies = depend[depend['depends_from']==iso]['ISO'].values
isoconflict = conflict[conflict['claimed_by_iso']==iso]['ISO'].values

if len(isocolonies) > 0:
    colonies = []
    for isoc in isocolonies:
        colonies.append(popd[popd['country_code'] == isoc].copy())
    colonies.append(popd[popd['country_code'] == iso].copy())
    popc = pd.concat(colonies).sort_values(by='population', ascending=False).copy()
elif len(isoconflict) > 0:
    conflicts = []
    for isoc in isoconflict:
        conflicts.append(popd[popd['country_code'] == isoc].copy())
    conflicts.append(popd[popd['country_code'] == iso].copy())
    popc = pd.concat(conflicts).sort_values(by='population', ascending=False).copy()
else:
    popc = popd[popd['country_code'] == iso].copy().sort_values(by='population', ascending=False)
    
poploc = popc.loc[:, ['latitude', 'longitude']].values
poploc = np.deg2rad(poploc)
print(popc.shape)

r2p = haversine_distances(resloc, poploc)
r2p *= 6371000/1000
print(r2p.shape)

(105, 5)
(7, 105)


In [20]:
popc

Unnamed: 0,asciiname,country_code,population,latitude,longitude
15691,Gaborone,BW,208411.0,-24.65451,25.90859
15689,Francistown,BW,89979.0,-21.17000,27.50778
15747,Molepolole,BW,63248.0,-24.40659,25.49508
15771,Selebi-Phikwe,BW,53727.0,-21.97895,27.84296
15736,Maun,BW,49945.0,-19.98333,23.41667
...,...,...,...,...,...
15706,Kavimba,BW,566.0,-18.01667,24.60000
15781,Toteng,BW,556.0,-20.38333,22.95000
15682,Bokspits,BW,547.0,-26.90000,20.70000
15751,Moremi,BW,546.0,-22.57234,27.45058


In [21]:
resc

Unnamed: 0,incountry_rank,country,location,score,latitude,longitude
458,0,Botswana,"Gaborone, Botswana",89,-24.628208,25.923147
459,1,Botswana,"Maun, Botswana",14,-19.995262,23.418077
460,2,Botswana,"Palapye, Botswana",8,-22.551487,27.11471
461,3,Botswana,"Mmadinare, Botswana",2,-21.88106,27.751381
462,4,Botswana,"Kanye, Botswana",1,-24.988061,25.34302
463,5,Botswana,"Sebele, Francistown, Botswana",1,-21.1661,27.51436
464,6,Botswana,"Ghanzi, Botswana",1,-21.696099,21.648186


In [22]:
cdata = auth[auth['country'] == country].copy()
metro = np.zeros(len(r2p), dtype=int)
matchs = ['' for i in range(len(metro))]
isom = ['' for i in range(len(metro))]
diffs = np.zeros(len(metro))

for i in range(len(metro)):
    argmatch = np.argmin(r2p[i])
    diffs[i] = r2p[i, argmatch]
    matchs[i] = popc.iloc[argmatch, 0]
    isom[i] = popc.iloc[argmatch, 1]
    metro[i] = popc.iloc[argmatch, 2]
    popc.iloc[argmatch, 2] = 0
for i in range(len(metro)):
    if isom[i] not in isoconflict:
        argmatch = np.nonzero(r2p[i] < 35)[0]
        matches = popc.iloc[argmatch]
        isomatches = matches[matches['country_code'] == isom[i]]
        totmetro = np.sum(isomatches['population'].values)
        metro[i] += totmetro
        popc.loc[isomatches.index, 'population'] = 0

cdata['matched_iso'] = isom
cdata['matched_pop'] = matchs
cdata['population'] = metro
cdata['difference'] = diffs

In [23]:
cdata

Unnamed: 0,incountry_rank,country,location,score,latitude,longitude,matched_iso,matched_pop,population,difference
458,0,Botswana,"Gaborone, Botswana",89,-24.628208,25.923147,BW,Gaborone,357616,3.273878
459,1,Botswana,"Maun, Botswana",14,-19.995262,23.418077,BW,Maun,49945,1.33492
460,2,Botswana,"Palapye, Botswana",8,-22.551487,27.11471,BW,Palapye,39569,1.223745
461,3,Botswana,"Mmadinare, Botswana",2,-21.88106,27.751381,BW,Selebi-Phikwe,55652,14.412236
462,4,Botswana,"Kanye, Botswana",1,-24.988061,25.34302,BW,Kanye,67615,2.58673
463,5,Botswana,"Sebele, Francistown, Botswana",1,-21.1661,27.51436,BW,Francistown,112413,0.808443
464,6,Botswana,"Ghanzi, Botswana",1,-21.696099,21.648186,BW,Ghanzi,10977,0.313322


In [18]:
cdata[cdata['population'] < 10].sort_values(by='matched_pop')

Unnamed: 0,incountry_rank,country,location,score,latitude,longitude,matched_iso,matched_pop,population,difference
459,1,Botswana,"Maun, Botswana",14,-19.995262,23.418077,BW,Maun,0,1.33492
461,3,Botswana,"Mmadinare, Botswana",2,-21.88106,27.751381,BW,Selebi-Phikwe,0,14.412236


In [129]:
cdata[cdata['matched_pop'] == "Cheorwon"]

Unnamed: 0,incountry_rank,country,location,score,latitude,longitude,matched_iso,matched_pop,population,difference


In [13]:
cdata[cdata['matched_iso'] != iso]

Unnamed: 0,incountry_rank,country,location,score,latitude,longitude,matched_iso,matched_pop,population,difference


In [103]:
cdata.sort_values(by = 'difference', ascending=False).head(15)

Unnamed: 0,incountry_rank,country,location,score,latitude,longitude,matched_iso,matched_pop,population,difference
5231,47,South Korea,"Sejong City, South Korea",10,35.907757,127.766922,KR,Yeongdong,24154,29.728257
5243,59,South Korea,"Mitynang Si, Gyeongsangnam, South Korea",2,35.4606,128.2132,KR,Changnyeong,0,27.052663
5244,60,South Korea,"Hamyang, South Korea",2,35.520461,127.725176,KR,Changsu,26463,23.725397
5232,48,South Korea,"Jangheung, Jeollanam Do, South Korea",8,34.681686,126.906928,KR,Yeongam,55000,23.320246
5239,55,South Korea,"Hadong Gun, Gyeongnam, South Korea",3,35.067211,127.751269,KR,Kwangyang,89281,17.954987
5215,31,South Korea,"Eumseong, Chungbuk, South Korea",60,36.939679,127.690502,KR,Koesan,44461,17.064871
5217,33,South Korea,"Pochon, Gyeonggi Do, South Korea",58,37.894915,127.200355,KR,Yangju,179923,13.978722
5249,65,South Korea,"Seongju, Gyeongbuk Prov, South Korea",1,35.919008,128.282974,KR,Waegwan,29691,13.179996
5233,49,South Korea,"Bonghwa Gun, Gyeongsangbuk D, South Korea",6,36.893093,128.732375,KR,Eisen,84625,12.028989
5214,30,South Korea,"Seocheon, Chungcheongnam, South Korea",71,36.080331,126.691328,KR,Gunsan,243406,11.453856


# Concatenating all countries

In [44]:
cdata = []
cnone = []

for i in range(len(countries)):
    country=countries[i]
    if country in ['England', 'North Ireland', 'Scotland', 'Wales']:
        iso = 'GB'
    else:
        iso = meta[meta['Country'] == country]['ISO'].values[0]
        
    resc = auth[auth['country'] == country].copy()
    resloc = resc.loc[:, ['latitude', 'longitude']].values
    resloc = np.deg2rad(resloc)
    
    isocolonies = depend[depend['depends_from']==iso]['ISO'].values
    isoconflict = conflict[conflict['claimed_by_iso']==iso]['ISO'].values

    if len(isocolonies) > 0:
        colonies = []
        for isoc in isocolonies:
            colonies.append(popd[popd['country_code'] == isoc].copy())
        colonies.append(popd[popd['country_code'] == iso].copy())
        popc = pd.concat(colonies).sort_values(by='population', ascending=False).copy()
    elif len(isoconflict) > 0:
        conflicts = []
        for isoc in isoconflict:
            conflicts.append(popd[popd['country_code'] == isoc].copy())
        conflicts.append(popd[popd['country_code'] == iso].copy())
        popc = pd.concat(conflicts).sort_values(by='population', ascending=False).copy()
    else:
        popc = popd[popd['country_code'] == iso].copy().sort_values(by='population', ascending=False)

    poploc = popc.loc[:, ['latitude', 'longitude']].values
    poploc = np.deg2rad(poploc)
    
    r2p = haversine_distances(resloc, poploc)
    r2p *= 6371000/1000
    
    metro = np.zeros(len(r2p), dtype=int)
    matchs = ['' for i in range(len(metro))]
    isom = ['' for i in range(len(metro))]
    diffs = np.zeros(len(metro))

    for i in range(len(metro)):
        argmatch = np.argmin(r2p[i])
        diffs[i] = r2p[i, argmatch]
        matchs[i] = popc.iloc[argmatch, 0]
        isom[i] = popc.iloc[argmatch, 1]
        metro[i] = popc.iloc[argmatch, 2]
        popc.iloc[argmatch, 2] = 0
    for i in range(len(metro)):
        if isom[i] not in isoconflict:
            argmatch = np.nonzero(r2p[i] < 25)[0]
            matches = popc.iloc[argmatch]
            isomatches = matches[matches['country_code'] == isom[i]]
            totmetro = np.sum(isomatches['population'].values)
            metro[i] += totmetro
            popc.loc[isomatches.index, 'population'] = 0
    
    resc['country_code'] = iso
    resc['matched_iso'] = isom
    resc['matched_pop'] = matchs
    resc['dist_difference'] = diffs
    resc['population'] = metro
    
    cdata.append(resc)
    cnone.append(popc[popc['population'] > 0])

In [45]:
dfred = pd.concat(cdata, ignore_index=True)
print(dfred.shape)
dfred = dfred.iloc[:, [6,1,2,3,0,7,8,9,10,4,5]]
dfred.head()

(6424, 11)


Unnamed: 0,country_code,country,location,score,incountry_rank,matched_iso,matched_pop,dist_difference,population,latitude,longitude
0,AF,Afghanistan,"Kabul, Afghanistan",7,0,AF,Kabul,4.419195,4489112,34.555349,69.207486
1,AF,Afghanistan,"Kandahar, Afghanistan",3,1,AF,Kandahar,3.089936,523300,31.628871,65.737175
2,AF,Afghanistan,"Lashkar Gah, Helmand, Afghanistan",2,2,AF,Lashkar Gah,3.883134,43934,31.609565,64.408208
3,AF,Afghanistan,"Jalalabad, Afghanistan",1,3,AF,Jalalabad,0.612165,271900,34.428353,70.457802
4,AF,Afghanistan,"Herat, Afghanistan",1,4,AF,Herat,0.657768,586235,34.352865,62.204029


In [49]:
filename = '../data/merged_research_cities500.csv'
dfred.to_csv(filename, index=False)

In [27]:
print(len(dfred[dfred['population'] < 1]))
dfred[dfred['population'] < 1].iloc[:50]

0


Unnamed: 0,country_code,country,location,score,incountry_rank,matched_iso,matched_pop,dist_difference,population,latitude,longitude


In [53]:
dfred[dfred['country_code'] == 'MG'].head(10)

Unnamed: 0,country_code,country,location,score,incountry_rank,matched_iso,matched_pop,dist_difference,population,latitude,longitude
3586,MG,Madagascar,"Antananarivo, Madagascar",235,0,MG,Antananarivo,4.850209,1402433,-18.87919,47.507905
3587,MG,Madagascar,"Mahajanga, Madagascar",8,1,MG,Mahajanga,3.27899,154657,-15.692149,46.333686
3588,MG,Madagascar,"Antsirabe, Madagascar",7,2,MG,Antsirabe,0.908773,237038,-19.873008,47.029116
3589,MG,Madagascar,"Toamasina, Madagascar",3,3,MG,Toamasina,0.882668,206373,-18.144281,49.395784
3590,MG,Madagascar,"Antsiranana, Madagascar",2,4,MG,Antsiranana,0.697308,82937,-12.323135,49.294283
3591,MG,Madagascar,"Toliara, Madagascar",1,5,MG,Toliara,1.93007,122319,-23.351619,43.685494
3592,MG,Madagascar,"Fianarantsoa, Madagascar",1,6,MG,Fianarantsoa,0.286396,176227,-21.454615,47.087504
3593,MG,Madagascar,"Antalaha, Madagascar",1,7,MG,Antalaha,0.64463,59112,-14.906124,50.278549
3594,MG,Madagascar,"Ambositra, Madagascar",1,8,MG,Ambositra,0.709503,30353,-20.536263,47.245975


In [28]:
dfred.sort_values(by='dist_difference', ascending=False).head(25)

Unnamed: 0,country_code,country,location,score,incountry_rank,matched_iso,matched_pop,dist_difference,population,latitude,longitude
3903,KZ,Kazakhstan,"Nauchnyi, Kazakhstan",1,16,KZ,Saykhin,1077.097935,4825,44.724599,34.010742
1774,CO,Colombia,"Araracuara, Amazonas, Colombia",1,44,CO,Miraflores,221.560876,4864,-0.633333,-72.25
1599,CN,China,"Delingha, China",1,394,CN,Yanglong,185.457272,2404,37.36947,97.36087
4519,OM,Oman,"Jumah, Oman",3,5,OM,Hayma',171.246661,1294,21.473533,55.975413
3462,IQ,Iraq,"Anbar, Iraq",4,8,IQ,Hit,146.603545,31901,32.559761,41.919647
4901,RU,Russia,"Leninskie Gory, Russia",5,56,RU,Yerbogachen,146.052626,2019,61.52401,105.318756
1203,CL,Chile,"Ohiggins, Chile",1,30,CL,Cochrane,134.598041,2867,-48.46615,-72.559215
1069,CA,Canada,"W Hill, ON, Canada",5,81,CA,La Ronge,132.569109,5671,56.130366,-106.346771
1106,CA,Canada,"Calvert Isl, BC, Canada",2,118,CA,Port McNeill,126.257274,2064,51.555873,-128.036517
1765,CO,Colombia,"Caqueta, Colombia",2,35,CO,Cartagena del Chaira,122.708316,7586,0.869892,-73.841906
