# Assigns standardized locations to entities in the KG
**[Work in progress]**

This notebook standardizes location information for ingestion into a Knowledge Graph.

Authors: Peter Rose (pwrose@ucsd.edu), Braden Riggs

In [1]:
import difflib
import os
import re
import time
from pathlib import Path

import pandas as pd
import unidecode

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columns

In [3]:
# Resolve the Neo4j import directory from the environment. Fail early with a
# clear message instead of the opaque TypeError that Path(None) would raise
# when the variable is not set.
_neo4j_import = os.getenv('NEO4J_IMPORT')
if _neo4j_import is None:
    raise EnvironmentError(
        "Environment variable NEO4J_IMPORT is not set; "
        "it must point to the Neo4j import directory."
    )
NEO4J_IMPORT = Path(_neo4j_import)
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-b9d10363-6d59-4deb-9595-2cb904a99d1d/installation-4.1.0/import


### Import data sources that have an `origLocation` property

In [4]:
df1 = pd.read_csv(NEO4J_IMPORT / '01b-Nextstrain.csv', dtype='str', usecols=['origLocation'])
df2 = pd.read_csv(NEO4J_IMPORT / '01d-CNCBStrain.csv', dtype='str', usecols=['origLocation'])
df3 = pd.read_csv(NEO4J_IMPORT / '02a-JHUCasesGlobal.csv', dtype='str', usecols=['origLocation'])
df = pd.concat([df1, df2, df3])

In [5]:
df.drop_duplicates(inplace=True)
df.fillna('', inplace=True)

In [6]:
df.shape

(2457, 1)

In [7]:
df.head()

Unnamed: 0,origLocation
0,"China,Guangdong,Zhuhai"
1,"Taiwan,Taipei"
2,"China,Hubei,Wuhan"
7,"Thailand,Bangkok"
8,Thailand


### Load custom GeoName mappings

In [8]:
ref = pd.read_csv("../../reference_data/GeoNameMapping.csv", usecols=['origName', 'geoName'], comment='#', dtype='str')

In [9]:
ref.head()

Unnamed: 0,origName,geoName
0,Burma,Myanmar
1,Congo (Kinshasa),Democratic Republic of the Congo
2,Congo (Brazzaville),Republic of the Congo
3,Cote d'Ivoire,Ivory Coast
4,Czech Republic,Czechia


In [10]:
name_list = ref[['origName', 'geoName']].values.tolist()

In [11]:
def fix_locations(text, mappings=None):
    """Rewrite the leading part of a comma-separated location using custom mappings.

    Each mapping is an (origName, geoName) pair. A mapping applies when `text`
    starts with origName and the match ends either at the end of `text` or at
    a comma, i.e. it covers whole location levels only. Mappings are applied
    in order, so a later mapping can match the result of an earlier one.

    Parameters
    ----------
    text : str
        Location string such as "USA,California".
    mappings : iterable of (str, str) pairs, optional
        Mappings to apply. Defaults to the notebook-level `name_list`.

    Returns
    -------
    str
        The location with every applicable prefix mapping applied.
    """
    if mappings is None:
        mappings = name_list
    for orig_name, geo_name in mappings:
        if not text.startswith(orig_name):
            continue
        rest = text[len(orig_name):]
        # full length match or match at a comma
        if rest == '' or rest[0] == ',':
            # Replace only the matched prefix. str.replace() would also rewrite
            # any later occurrence of orig_name further down the location path.
            text = geo_name + rest
    return text

### Apply custom GeoName mappings

In [12]:
df['geoLocation'] = df['origLocation'].apply(lambda x: fix_locations(x))

In [13]:
df.query('geoLocation != origLocation').head()

Unnamed: 0,origLocation,geoLocation
10,"USA,California","United States,California"
75,"USA,California,Orange County CA","United States,California,Orange County"
126,"USA,Arizona,Phoenix","United States,Arizona,Phoenix"
143,"USA,California,Los Angeles County","United States,California,Los Angeles County"
150,"USA,Illinois","United States,Illinois"


### Match Cruiseships

In [14]:
cruiseships = df[df['geoLocation'].str.startswith('CruiseShip,')].copy()

In [15]:
cruiseships.head(10)

Unnamed: 0,origLocation,geoLocation
13902,"Italy,Cruise ship","CruiseShip,Cruise ship"
46652,"United States,Cruise_Ship_1,California","CruiseShip,Cruise_Ship_1"
46658,"United States,Cruise_Ship_2,California","CruiseShip,Cruise_Ship_2"
865,Diamond Princess,"CruiseShip,Diamond Princess"
4104,"Canada,Grand Princess","CruiseShip,Grand Princess"
6004,"Canada,Diamond Princess","CruiseShip,Diamond Princess"
7591,MS Zaandam,"CruiseShip,MS Zaandam"


In [16]:
cruiseships = cruiseships[['origLocation', 'geoLocation']]
cruiseships.to_csv(NEO4J_IMPORT / "10a-GeoLinkCruiseShip.csv", index=False)

In [17]:
##### Remove Cruiseship data from data frame

In [18]:
df = df[~df['geoLocation'].str.startswith('CruiseShip,')].copy()

In [19]:
df.shape

(2450, 2)

In [20]:
def remove_special_characters(text):
    """Return a canonical, lower-case, ASCII form of a location name.

    The result is used as a join key between source locations and GeoNames
    names: it is lower-cased, transliterated to ASCII, has '-', '_' and ','
    replaced by spaces, has digits removed, and is stripped of surrounding
    whitespace.

    Parameters
    ----------
    text : object
        Value to canonicalize; it is converted with str() first.

    Returns
    -------
    str
    """
    text = str(text).lower()
    # transliterate German umlaut explicitly, before the generic ASCII
    # transliteration below is applied
    text = text.replace('ü', 'ue')
    # transliterate Unicode string into the closest possible ASCII representation
    text = unidecode.unidecode(text)
    # treat hyphens, underscores and commas as word separators
    for separator in ("-", "_", ","):
        text = text.replace(separator, " ")
    # Remove digits. This used to be str.replace('\d+', ''), which searches for
    # the literal characters "\d+" and therefore never removed anything.
    text = re.sub(r'\d+', '', text)
    return text.strip()

In [21]:
def remove_affix(text):
    """Strip common administrative-division words from a location name.

    Used to retry a match after the exact canonical-name match failed.
    "city", "parish" and "county" are deliberately kept here; remove_affix2
    removes those as well.

    Parameters
    ----------
    text : object
        Location name; it is converted with str() and lower-cased first.

    Returns
    -------
    str
        The name without the affixes and digits, stripped of surrounding
        whitespace.
    """
    text = str(text).lower()
    # Affixes are removed one after another, in this order, as plain
    # substrings. They therefore also match inside longer words
    # (e.g. "al " at the end of "central ").
    affixes = (
        "region",
        "oblast",  # regions in Russia
        "al ",  # regions in Saudi Arabia
        "prefecture",  # regions in Japan
        "province",  # regions in China
        "district",
        "gemeente",  # regions in the Netherlands
        "wahlkreis",  # regions in Switzerland
        "canton",  # regions in Switzerland
        "municipality",
        "metropolitan",
        "state",
    )
    for affix in affixes:
        text = text.replace(affix, "")
    # Remove digits. This used to be str.replace('\d+', ''), which searches for
    # the literal characters "\d+" and therefore never removed anything.
    text = re.sub(r'\d+', '', text)
    return text.strip()

In [22]:
def remove_affix2(text):
    """Strip administrative-division words, including city/county/parish.

    More aggressive variant of remove_affix: in addition to the affixes removed
    there, it also removes "city", "county" and "parish".

    Parameters
    ----------
    text : object
        Location name; it is converted with str() and lower-cased first.

    Returns
    -------
    str
        The name without the affixes and digits, stripped of surrounding
        whitespace. Inner whitespace is not collapsed.
    """
    text = str(text).lower()
    # Affixes are removed one after another, in this order, as plain
    # substrings. They therefore also match inside longer words
    # (e.g. "al " at the end of "central ").
    affixes = (
        "city",
        "region",
        "county",
        "oblast",  # regions in Russia
        "al ",  # regions in Saudi Arabia
        "prefecture",  # regions in Japan
        "province",  # regions in China
        "district",
        "parish",
        "gemeente",  # regions in the Netherlands
        "wahlkreis",  # regions in Switzerland
        "canton",  # regions in Switzerland
        "municipality",
        "metropolitan",
        "state",
    )
    for affix in affixes:
        text = text.replace(affix, "")
    # Remove digits. This used to be str.replace('\d+', ''), which searches for
    # the literal characters "\d+" and therefore never removed anything.
    text = re.sub(r'\d+', '', text)
    return text.strip()

In [23]:
def get_close_match(x, name):
    """Return the entry of `name` that is most similar to `x`, or '' if none is.

    `name` is the collection of candidate strings. Only the single best
    candidate is considered, and it must reach a difflib similarity of at
    least 0.9 to be returned.
    """
    best = difflib.get_close_matches(x, name, n=1, cutoff=0.9)
    return best[0] if best else ''

In [24]:
df.shape

(2450, 2)

In [25]:
# Number of commas = number of levels below the top-level location.
df['locationLevels'] = df['geoLocation'].str.count(',')
# Split into at most four levels. expand=True only creates as many columns as
# the deepest location needs, so reindex to exactly four columns; otherwise the
# assignment fails whenever no location in the data is four levels deep.
df[['loc0', 'loc1', 'loc2', 'loc3']] = (
    df['geoLocation'].str.split(',', n=3, expand=True).reindex(columns=range(4))
)
# Levels that do not exist for a location become empty strings.
df.fillna('', inplace=True)

## Match Countries

In [26]:
countries = pd.DataFrame(df['loc0'].copy())
countries.drop_duplicates(inplace=True)
countries.dropna(inplace=True)
countries.columns=['origName']
countries['canonicalName'] = countries.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [27]:
countries.shape

(218, 2)

In [28]:
countries.head()

Unnamed: 0,origName,canonicalName
0,China,china
1,Taiwan,taiwan
7,Thailand,thailand
10,United States,united states
15,Canada,canada


In [29]:
geo_countries = pd.read_csv(NEO4J_IMPORT / "00e-GeoNamesCountry.csv", dtype='str', usecols=['name'])

In [30]:
geo_countries['canonicalName'] = geo_countries.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [31]:
countries = countries.merge(geo_countries, on='canonicalName', how='left')
countries.fillna('', inplace=True)

### Exact country matches

In [32]:
country_exact_match = countries.query("name != ''").copy()

In [33]:
country_exact_match.shape

(217, 3)

In [34]:
country_no_match = countries.query("name == ''").copy()

In [35]:
country_no_match.head()

Unnamed: 0,origName,canonicalName,name
114,Macau,macau,


### Match Admin1

In [36]:
admin1 = pd.DataFrame(df['loc1'].copy())
admin1.drop_duplicates(inplace=True)
admin1.dropna(inplace=True)
admin1.columns=['origName']
admin1['canonicalName'] = admin1.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [37]:
geo_admin1 = pd.read_csv(NEO4J_IMPORT / "00f-GeoNamesAdmin1.csv", dtype='str', usecols=['name'])

In [38]:
geo_admin1['canonicalName'] = geo_admin1.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [39]:
admin1 = admin1.merge(geo_admin1, on='canonicalName', how='left')
admin1.fillna('', inplace=True)

In [40]:
admin1_exact_match = admin1.query("name != ''").copy()
admin1_exact_match.drop_duplicates(inplace=True)

In [41]:
admin1_exact_match.shape

(486, 3)

In [42]:
admin1_exact_match.head(10)

Unnamed: 0,origName,canonicalName,name
0,Guangdong,guangdong,Guangdong
1,Taipei,taipei,Taipei
2,Hubei,hubei,Hubei
3,Bangkok,bangkok,Bangkok
5,California,california,California
6,Ontario,ontario,Ontario
7,Shanghai,shanghai,Shanghai
9,Hokkaido,hokkaido,Hokkaido
11,Kuala Lumpur,kuala lumpur,Kuala Lumpur
12,New South Wales,new south wales,New South Wales


In [43]:
geo_admin1.query("name == 'Masovia'")

Unnamed: 0,name,canonicalName


In [44]:
admin1_exact_match.query("name == 'Masovia'")

Unnamed: 0,origName,canonicalName,name


##### Remaining locations that don't match a GeoName location

In [45]:
admin1 = admin1.query("name == ''").copy()
admin1.drop('name', axis=1, inplace=True)

In [46]:
admin1.shape

(755, 2)

In [47]:
admin1['canonicalNameNoAffix'] = admin1.apply(lambda x: remove_affix(x['canonicalName']), axis=1)

In [48]:
geo_admin1['canonicalNameNoAffix'] = geo_admin1.apply(lambda x: remove_affix(x['canonicalName']), axis=1)
admin1 = admin1.merge(geo_admin1[['name', 'canonicalNameNoAffix']], on='canonicalNameNoAffix', how='left')
admin1.fillna('', inplace=True)

In [49]:
admin1_no_affix_match = admin1.query("name != ''")

In [50]:
admin1_no_affix_match.shape

(34, 4)

In [51]:
admin1_no_affix_match.head()

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
27,Osun State,osun state,osun,Osun
28,Kwara State,kwara state,kwara,Kwara
54,Chisinau,chisinau,chisinau,Chisinau Municipality
56,Ogun State,ogun state,ogun,Ogun
58,Oyo State,oyo state,oyo,Oyo


In [52]:
admin1_no_match = admin1.query("name == ''").copy()
admin1_no_match.query("origName != ''", inplace=True)

In [53]:
admin1_no_match.head()

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
1,Kanto,kanto,kanto,
2,Hokurikushinsyu,hokurikushinsyu,hokurikushinsyu,
3,Hong Kong,hong kong,hong kong,
4,Kaohsiung,kaohsiung,kaohsiung,
5,Samut Prakarn,samut prakarn,samut prakarn,


### Match Admin2

In [54]:
admin2 = pd.DataFrame(df['loc2'].copy())
admin2.columns=['origName']
admin2.query("origName != ''", inplace=True)
admin2.head()

Unnamed: 0,origName
0,Zhuhai
2,Wuhan
36,Huizhou
37,Sydney
39,Clayton AU


In [55]:
a1_no_match = pd.DataFrame(admin1_no_match['origName']).copy()

In [56]:
a1_no_match.head()

Unnamed: 0,origName
1,Kanto
2,Hokurikushinsyu
3,Hong Kong
4,Kaohsiung
5,Samut Prakarn


In [57]:
admin2.head()

Unnamed: 0,origName
0,Zhuhai
2,Wuhan
36,Huizhou
37,Sydney
39,Clayton AU


In [58]:
# carry over regions from location level 1 that might be Admin2s or Cities
admin2 = pd.concat([admin2,a1_no_match])
admin2.drop_duplicates(inplace=True)
admin2.dropna(inplace=True)
admin2.columns=['origName']
admin2['canonicalName'] = admin2.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [59]:
geo_admin2 = pd.read_csv(NEO4J_IMPORT / "00g-GeoNamesAdmin2.csv", dtype='str', usecols=['name'])

In [60]:
geo_admin2['canonicalName'] = geo_admin2.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [61]:
admin2 = admin2.merge(geo_admin2, on='canonicalName', how='left')
admin2.fillna('', inplace=True)

In [62]:
admin2_exact_match = admin2.query("name != ''").copy()
admin2_exact_match.drop_duplicates(inplace=True)

In [63]:
admin2_exact_match.shape

(415, 3)

In [64]:
admin2_exact_match.head()

Unnamed: 0,origName,canonicalName,name
6,Melbourne,melbourne,Melbourne
7,Orange County,orange county,Orange County
15,Paris,paris,Paris
17,Brisbane,brisbane,Brisbane
23,Los Angeles County,los angeles county,Los Angeles County


##### Remaining locations that don't match a GeoName location

In [65]:
admin2_no_match = admin2.query("name == ''").copy()

In [66]:
admin2_no_match.shape

(1138, 3)

### Match Cities

In [67]:
city = pd.DataFrame(df['loc3'].copy())
city.columns=['origName']
city.query("origName != ''", inplace=True)
city.head(100)

Unnamed: 0,origName
1589,Queens
1825,Manhattan
544,Sheffield
2382,Manhattan
3103,Brooklyn
3898,Queens
10162,Havlickuv Brod
13869,St Anton am Arlberg
36109,Bilbao
48448,Caraman


In [68]:
a2_no_match = pd.DataFrame(admin2_no_match['origName'])

In [69]:
# carry over the locations that did not match an Admin2 (this includes the
# level-1 names carried over earlier); they might be Cities
city = pd.concat([a2_no_match, city])
city.drop_duplicates(inplace=True)
city.dropna(inplace=True)
city.columns=['origName']
# canonical (lower-case, ASCII) form used as the join key against GeoNames cities
city['canonicalName'] = city.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [70]:
geo_city = pd.read_csv(NEO4J_IMPORT / "00h-GeoNamesCity.csv", dtype='str', usecols=['name'])

In [71]:
geo_city['canonicalName'] = geo_city.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [72]:
city = city.merge(geo_city, on='canonicalName', how='left')
city.fillna('', inplace=True)

In [73]:
city_exact_match = city.query("name != ''").copy()
city_exact_match.drop_duplicates(inplace=True)

In [74]:
city_exact_match.shape

(765, 3)

In [75]:
city_exact_match.head()

Unnamed: 0,origName,canonicalName,name
0,Zhuhai,zhuhai,Zhuhai
1,Wuhan,wuhan,Wuhan
2,Huizhou,huizhou,Huizhou
3,Sydney,sydney,Sydney
7,Kyoto,kyoto,Kyoto


In [76]:
no_match = city.query("name == ''").copy()

In [77]:
no_match.shape

(382, 3)

In [78]:
no_match.head()

Unnamed: 0,origName,canonicalName,name
5,Clayton AU,clayton au,
6,Jönköping,jonkoping,
15,Gyeonggi,gyeonggi,
17,Gomez Palacios,gomez palacios,
18,Amelia Denis de Icaza,amelia denis de icaza,


### Match other divisions

In [79]:
divisions = no_match['origName']

In [80]:
# `divisions` is a column taken from `no_match`. Build a new Series instead of
# modifying it in place, so that `no_match` is never touched and pandas has no
# reason to emit a SettingWithCopyWarning.
divisions = divisions.drop_duplicates().dropna()
divisions = divisions[divisions != '']
divisions.shape

(382,)

In [81]:
divisions = pd.DataFrame(divisions)
divisions.columns=['origName']

In [82]:
divisions['canonicalName'] = divisions.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [83]:
divisions.head()

Unnamed: 0,origName,canonicalName
5,Clayton AU,clayton au
6,Jönköping,jonkoping
15,Gyeonggi,gyeonggi
17,Gomez Palacios,gomez palacios
18,Amelia Denis de Icaza,amelia denis de icaza


In [84]:
geo_admin1 = pd.read_csv(NEO4J_IMPORT / "00f-GeoNamesAdmin1.csv", dtype='str', usecols=['name'])
geo_admin2 = pd.read_csv(NEO4J_IMPORT / "00g-GeoNamesAdmin2.csv", dtype='str', usecols=['name'])
geo_city = pd.read_csv(NEO4J_IMPORT / "00h-GeoNamesCity.csv", dtype='str', usecols=['name'])

In [85]:
geo_divisions = pd.concat([geo_admin1, geo_admin2, geo_city])

In [86]:
geo_divisions.drop_duplicates(inplace=True)

In [87]:
geo_divisions.shape

(154213, 1)

In [88]:
geo_divisions.dropna(inplace=True)
geo_divisions.columns=['name']
geo_divisions['canonicalName'] = geo_divisions.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [89]:
geo_divisions.head()

Unnamed: 0,name,canonicalName
0,Sant Julia de Loria,sant julia de loria
1,Ordino,ordino
2,La Massana,la massana
3,Encamp,encamp
4,Canillo,canillo


In [90]:
divisions['canonicalNameNoAffix'] = divisions.apply(lambda x: remove_affix2(x['canonicalName']), axis=1)

In [91]:
geo_divisions['canonicalNameNoAffix'] = geo_divisions.apply(lambda x: remove_affix2(x['canonicalName']), axis=1)
divisions = divisions.merge(geo_divisions[['name', 'canonicalNameNoAffix']], on='canonicalNameNoAffix', how='left')
divisions.fillna('', inplace=True)

In [92]:
divisions.head()

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
0,Clayton AU,clayton au,clayton au,
1,Jönköping,jonkoping,jonkoping,
2,Gyeonggi,gyeonggi,gyeonggi,
3,Gomez Palacios,gomez palacios,gomez palacios,
4,Amelia Denis de Icaza,amelia denis de icaza,amelia denis de icaza,


In [93]:
divisions_exact_match = divisions.query("name != ''")

In [94]:
divisions_exact_match.shape

(45, 4)

In [95]:
divisions_exact_match.head()

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
11,Heinsberg District,heinsberg district,heinsberg,Heinsberg
13,Aichi,aichi,aichi,Aichi
32,eThekwini,ethekwini,ethekwini,eThekwini Metropolitan Municipality
40,Mirpur,mirpur,mirpur,Mirpur District
42,Zululand,zululand,zululand,Zululand District Municipality


### Collect all location matches

In [96]:
matches = pd.concat([country_exact_match, admin1_exact_match, admin1_no_affix_match, admin2_exact_match, divisions_exact_match, city_exact_match])

In [97]:
matches = matches[['origName', 'name']]

In [98]:
matches.shape

(1962, 2)

In [99]:
matches.tail()

Unnamed: 0,origName,name
1467,Brooklyn,Brooklyn
1475,Havlickuv Brod,Havlickuv Brod
1476,St Anton am Arlberg,St Anton am Arlberg
1477,Bilbao,Bilbao
1478,Caraman,Caraman


In [100]:
matches.query("origName == 'Zhuhai City'")

Unnamed: 0,origName,name
75,Zhuhai City,Zhuhai


In [101]:
# NOTE: this rebinds the global `name_list` that fix_locations() iterates over.
# Re-running the "Apply custom GeoName mappings" cell after this one would use
# these pairs instead of the ones loaded from GeoNameMapping.csv.
name_list = matches[['origName', 'name']].values.tolist()
# origName -> matched name. If an origName occurs more than once in `matches`,
# the last occurrence wins.
name_dict = {name: val for name, val in name_list}

In [102]:
name_dict.get('France,Normandy')

In [103]:
# Look up the matched name for each of the four location levels;
# levels without a match get an empty string.
for level in ('0', '1', '2', '3'):
    df['geoName' + level] = df['loc' + level].apply(lambda loc: name_dict.get(loc, ''))

In [104]:
df.head()

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3
0,"China,Guangdong,Zhuhai","China,Guangdong,Zhuhai",2,China,Guangdong,Zhuhai,,China,Guangdong,Zhuhai,
1,"Taiwan,Taipei","Taiwan,Taipei",1,Taiwan,Taipei,,,Taiwan,Taipei,,
2,"China,Hubei,Wuhan","China,Hubei,Wuhan",2,China,Hubei,Wuhan,,China,Hubei,Wuhan,
7,"Thailand,Bangkok","Thailand,Bangkok",1,Thailand,Bangkok,,,Thailand,Bangkok,,
8,Thailand,Thailand,0,Thailand,,,,Thailand,,,


In [105]:
df[(df['geoName1'] == '') & (df['geoName2'] != '')]

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3
799,"Bahrain,Capital Governorate,Manama","Bahrain,Capital Governorate,Manama",2,Bahrain,Capital Governorate,Manama,,Bahrain,,Manama,
834,"Italy,Lazio,Rome","Italy,Lazio,Rome",2,Italy,Lazio,Rome,,Italy,,Rome,
1040,"Colombia,Departamento de Amazonas,Leticia","Colombia,Departamento de Amazonas,Leticia",2,Colombia,Departamento de Amazonas,Leticia,,Colombia,,Leticia,
1081,"North Macedonia,Northeastern Region,Kumanovo","North Macedonia,Northeastern Region,Kumanovo",2,North Macedonia,Northeastern Region,Kumanovo,,North Macedonia,,Kumanovo,
1171,"Bangladesh,Chattogram,Chandpur","Bangladesh,Chattogram,Chandpur",2,Bangladesh,Chattogram,Chandpur,,Bangladesh,,Chandpur,
1426,"Vietnam,Red River Delta,Hanoi","Vietnam,Red River Delta,Hanoi",2,Vietnam,Red River Delta,Hanoi,,Vietnam,,Hanoi,
1932,"Costa Rica,Cartago CR,La Union","Costa Rica,Cartago CR,La Union",2,Costa Rica,Cartago CR,La Union,,Costa Rica,,La Union,
2035,"Poland,Malopolskie,Kraków","Poland,Malopolskie,Kraków",2,Poland,Malopolskie,Kraków,,Poland,,Krakow,
2462,"Czech Republic,Hranice Na Moravě,Prostějov","Czechia,Hranice Na Moravě,Prostějov",2,Czechia,Hranice Na Moravě,Prostějov,,Czechia,,Prostejov,
2612,"Oman,Dakhiliyah,Nizwa","Oman,Dakhiliyah,Nizwa",2,Oman,Dakhiliyah,Nizwa,,Oman,,Nizwa,


### Left align all geoNames

In [106]:
# Shift the non-empty geoNames of every row to the left so that no empty level
# precedes a filled one. The previous two pairwise swaps left a row such as
# ('', '', 'X') as ('', 'X', '') and relied on positional x[0] indexing of a
# label-indexed Series, which pandas has deprecated.
geo_cols = ['geoName1', 'geoName2', 'geoName3']


def _left_align(values):
    """Return `values` with the non-empty strings first, padded with '' on the right."""
    filled = [value for value in values if value != '']
    return filled + [''] * (len(values) - len(filled))


df[geo_cols] = pd.DataFrame(
    [_left_align(row) for row in df[geo_cols].values.tolist()],
    index=df.index,
    columns=geo_cols,
)

In [107]:
df.query("origLocation == 'North Macedonia,Pelagonia,Prilep'")

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3
3776,"North Macedonia,Pelagonia,Prilep","North Macedonia,Pelagonia,Prilep",2,North Macedonia,Pelagonia,Prilep,,North Macedonia,Prilep,,


In [108]:
df[(df['geoName1'] == '') & (df['geoName2'] != '')]

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3


Test: spot-check a location whose path was expanded by the custom GeoName mappings.

In [109]:
df.query("origLocation == 'Austria,St. Anton'")

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3
13869,"Austria,St. Anton","Austria,Tyrol,Politischer Bezirk Landeck,St An...",3,Austria,Tyrol,Politischer Bezirk Landeck,St Anton am Arlberg,Austria,Tyrol,Politischer Bezirk Landeck,St Anton am Arlberg


In [110]:
df = df[(df['origLocation'] != '') & (df['geoName0'] != '')]
df = df[['origLocation', 'geoName0', 'geoName1', 'geoName2', 'geoName3', 'locationLevels']]
df.to_csv(NEO4J_IMPORT / "10a-GeoLink.csv", index=False)

### Remaining locations that do not match GeoNames.org locations

In [111]:
geo_divisions['canonicalName'] = geo_divisions.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [112]:
geo_divisions.head()

Unnamed: 0,name,canonicalName,canonicalNameNoAffix
0,Sant Julia de Loria,sant julia de loria,sant julia de loria
1,Ordino,ordino,ordino
2,La Massana,la massana,la massana
3,Encamp,encamp,encamp
4,Canillo,canillo,canillo


In [113]:
# Keep only the divisions that are still unmatched. Take an explicit copy, as
# the other query() results in this notebook do, because a `name` column is
# assigned on this frame in the next cell.
divisions = divisions.query("name == ''").copy()

In [114]:
divisions['name'] = divisions.apply(lambda x: get_close_match(x['canonicalName'], geo_divisions['canonicalName']), axis=1)

In [115]:
divisions_close_match = divisions.query("name != ''")

In [116]:
divisions_close_match.head(300)

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
1,Jönköping,jonkoping,jonkoping,joenkoeping
3,Gomez Palacios,gomez palacios,gomez palacios,gomez palacio
16,Lamine,lamine,lamine,lamin
17,Mehsana,mehsana,mehsana,mesana
18,Barka,barka,barka,barkam
19,Fenoarivo,fenoarivo,fenoarivo,fenoarivobe
22,Al Matariyyah,al matariyyah,matariyyah,al matariyah
23,Columbia County WI,columbia county wi,columbia wi,columbia county
24,Jefferson County WI,jefferson county wi,jefferson wi,jefferson county
48,Manah,manah,manah,matnah


### Locations that do not match

In [117]:
divisions_no_match = divisions.query("name == ''")

In [118]:
divisions_no_match.shape

(247, 4)

In [119]:
divisions_no_match.head(300)

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
0,Clayton AU,clayton au,clayton au,
2,Gyeonggi,gyeonggi,gyeonggi,
4,Amelia Denis de Icaza,amelia denis de icaza,amelia denis de icaza,
5,Belisario Frías,belisario frias,belisario frias,
6,Vitoria-Gasteiz,vitoria gasteiz,vitoria gasteiz,
7,Omar Torrijos,omar torrijos,omar torrijos,
8,Ernesto Córdoba,ernesto cordoba,ernesto cordoba,
9,Donostia-San Sebastian,donostia san sebastian,donostia san sebastian,
10,Arnulfo Arias,arnulfo arias,arnulfo arias,
12,Contamines,contamines,contamines,
