# Assigns standardized locations to entities in the KG
**[Work in progress]**

This notebook standardizes location information for ingestion into a Knowledge Graph.

Author: Peter Rose (pwrose@ucsd.edu), Braden Riggs 

In [1]:
import os
import pandas as pd
from pathlib import Path
import time
import unidecode
import difflib

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [3]:
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-328d8379-6ab4-4cc1-a397-2de37909d2e4/installation-4.1.0/import


### Import data sources that have an `origLocation` property

In [4]:
df1 = pd.read_csv(NEO4J_IMPORT / '01b-Nextstrain.csv', dtype='str', usecols=['origLocation'])
df2 = pd.read_csv(NEO4J_IMPORT / '01d-CNCBStrain.csv', dtype='str', usecols=['origLocation'])
df3 = pd.read_csv(NEO4J_IMPORT / '02a-JHUCasesGlobal.csv', dtype='str', usecols=['origLocation'])
df4 = pd.read_csv(NEO4J_IMPORT / '02b-CDSCases.csv', dtype='str', usecols=['origLocation'])
df5 = pd.read_csv(NEO4J_IMPORT / '02d-GOBMXCasesAdmin1.csv', dtype='str', usecols=['origLocation'])
df6 = pd.read_csv(NEO4J_IMPORT / '02d-GOBMXCasesAdmin2.csv', dtype='str', usecols=['origLocation'])

df = pd.concat([df1, df2, df3, df4, df5, df6])

In [5]:
df.drop_duplicates(inplace=True)
df.fillna('', inplace=True)

In [6]:
df.shape

(8157, 1)

In [7]:
df.head()

Unnamed: 0,origLocation
0,"China,Guangdong"
1,"China,Hubei,Wuhan"
6,Singapore
7,"Thailand,Bangkok"
11,"Taiwan,Taipei"


In [8]:
df[df['origLocation'].str.contains('D.C.')].head()

Unnamed: 0,origLocation
274903,"United States,Washington, D.C."
275124,"United States,Washington, D.C.,District of Col..."


### Load custom GeoName mappings

In [9]:
ref = pd.read_csv("../../reference_data/GeoNameMapping.csv", usecols=['origName', 'geoName'], comment='#', dtype='str')

In [10]:
ref.head()

Unnamed: 0,origName,geoName
0,Burma,Myanmar
1,Cape Verde,Cabo Verde
2,Congo (Kinshasa),Democratic Republic of the Congo
3,Congo (Brazzaville),Republic of the Congo
4,Congo-Brazzaville,Republic of the Congo


In [11]:
name_list = ref[['origName', 'geoName']].values.tolist()

In [12]:
def fix_locations(text):
    for loc in name_list:
        if text.startswith(loc[0]):
            # full length match or match at a comma
            if len(text) == len(loc[0]) or text[len(loc[0])] == ',':
                text = text.replace(loc[0], loc[1])
    return text

### Apply custom GeoName mappings

In [13]:
df['geoLocation'] = df['origLocation'].apply(lambda x: fix_locations(x))

In [14]:
df.query('geoLocation != origLocation').head()

Unnamed: 0,origLocation,geoLocation
22,"Spain,Comunitat Valenciana,Valencia","Spain,Valencia,Valencia"
74,"Panama,Panama Center,Las Cumbres","Panama,Panama,Las Cumbres"
88,"Panama,Panama Center","Panama,Panama"
89,"Panama,Panama Center,Chilibre","Panama,Panama,Chilibre"
91,"Panama,Panama Center,Pedregal","Panama,Panama,Pedregal"


### Match Cruiseships

In [15]:
cruiseships = df[df['geoLocation'].str.startswith('CruiseShip,')].copy()

In [16]:
cruiseships.head(10)

Unnamed: 0,origLocation,geoLocation
864,Diamond Princess,"CruiseShip,Diamond Princess"
4103,"Canada,Grand Princess","CruiseShip,Grand Princess"
6003,"Canada,Diamond Princess","CruiseShip,Diamond Princess"
7590,MS Zaandam,"CruiseShip,MS Zaandam"


In [17]:
cruiseships = cruiseships[['origLocation', 'geoLocation']]
cruiseships.to_csv(NEO4J_IMPORT / "10a-GeoLinkCruiseShip.csv", index=False)

##### Remove Cruiseship data from data frame

In [18]:
df = df[~df['geoLocation'].str.startswith('CruiseShip,')].copy()

In [19]:
df.shape

(8153, 2)

In [20]:
df.head(100)

Unnamed: 0,origLocation,geoLocation
0,"China,Guangdong","China,Guangdong"
1,"China,Hubei,Wuhan","China,Hubei,Wuhan"
6,Singapore,Singapore
7,"Thailand,Bangkok","Thailand,Bangkok"
11,"Taiwan,Taipei","Taiwan,Taipei"
12,"China,Guangdong,Huizhou","China,Guangdong,Huizhou"
15,"Israel,Central District,Kfar Saba","Israel,Central District,Kfar Saba"
20,Thailand,Thailand
21,"Spain,Extremadura","Spain,Extremadura"
22,"Spain,Comunitat Valenciana,Valencia","Spain,Valencia,Valencia"


In [21]:
def remove_special_characters(text):
    text = str(text)
    text = text.lower()
    # transliterate umlauts
    text = text.replace('ü', 'ue') 
    text = text.replace('ä', 'ae') 
    text = text.replace('ö', 'oe')
    # transliterate Unicode string into the closest possible ASCII representation
    text = unidecode.unidecode(text)
    text = text.replace("-"," ")
    text = text.replace("_"," ")
    text = text.replace(","," ")
    text = text.replace("."," ")
    text = text.replace('\d+', '')
    text = text.strip()

    return text

In [22]:
def remove_affix(text):
    text = str(text)
    text = text.lower()
    # try match without prefix or suffix
    text = text.replace("region", "")
    text = text.replace("oblast", "") # regions in Russia
    text = text.replace("al ", "") # regions in Saudi Arabia
    text = text.replace("prefecture", "") # regions in Japan
    text = text.replace("province", "") # regions in China
    text = text.replace("district", "")
    text = text.replace("canton", "") # regions in Switzerland
    text = text.replace("municipality", "")
    text = text.replace("municipio", "")  # regions in Mexico
    text = text.replace("metropolitan", "")
    text = text.replace("voivodeship", "")
    text = text.replace("state", "")
    text = text.replace('\d+', '')
    text = text.strip()
    return text

In [23]:
def remove_affix2(text):
    text = str(text)
    text = text.lower()
    # try match without prefix or suffix
    text = text.replace("city", "")
    text = text.replace("region", "")
    text = text.replace("county", "")
    text = text.replace("oblast", "") # regions in Russia
    text = text.replace("al ", "") # regions in Saudi Arabia
    text = text.replace("prefecture", "") # regions in Japan
    text = text.replace("province", "") # regions in China
    text = text.replace("district", "")
    text = text.replace("parish", "")
    text = text.replace("gemeente", "") # regions in the Netherlands
    text = text.replace("wahlkreis", "") # regions in Switzerland
    text = text.replace("canton", "") # regions in Switzerland
    text = text.replace("municipality", "")
    text = text.replace("municipio", "") # regions in Mexico
    text = text.replace("metropolitan", "")
    text = text.replace("voivodeship", "")
    text = text.replace("novads", "")
    text = text.replace("state", "")
    text = text.replace("st ", "")
    text = text.replace('\d+', '')
    text = text.strip()
    return text

In [24]:
def get_close_match(x, name):
    z = difflib.get_close_matches(x, name, n=1, cutoff=0.9)
    if z:
        return z[0]
    else:
        return ''

In [25]:
df.shape

(8153, 2)

In [26]:
df['locationLevels'] = df['geoLocation'].str.count(',')
df[['loc0', 'loc1', 'loc2', 'loc3']] = df['geoLocation'].str.split(',', n=3, expand=True)
df.fillna('', inplace=True)

## Match Countries

In [27]:
countries = pd.DataFrame(df['loc0'].copy())
countries.drop_duplicates(inplace=True)
countries.dropna(inplace=True)
countries.columns=['origName']
countries['canonicalName'] = countries.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [28]:
countries.shape

(221, 2)

In [29]:
countries.head()

Unnamed: 0,origName,canonicalName
0,China,china
6,Singapore,singapore
7,Thailand,thailand
11,Taiwan,taiwan
15,Israel,israel


In [30]:
geo_countries = pd.read_csv(NEO4J_IMPORT / "00e-GeoNamesCountry.csv", dtype='str', usecols=['name'])

In [31]:
geo_countries['canonicalName'] = geo_countries.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [32]:
countries = countries.merge(geo_countries, on='canonicalName', how='left')
countries.fillna('', inplace=True)

### Exact country matches

In [33]:
country_exact_match = countries.query("name != ''").copy()

In [34]:
country_exact_match.shape

(220, 3)

In [35]:
country_no_match = countries.query("name == ''").copy()

In [36]:
country_no_match.head(10)

Unnamed: 0,origName,canonicalName,name
103,Crimea,crimea,


### Match Admin1

In [37]:
admin1 = pd.DataFrame(df['loc1'].copy())
admin1.drop_duplicates(inplace=True)
admin1.dropna(inplace=True)
admin1.columns=['origName']
admin1['canonicalName'] = admin1.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [38]:
geo_admin1 = pd.read_csv(NEO4J_IMPORT / "00f-GeoNamesAdmin1.csv", dtype='str', usecols=['name'])

In [39]:
geo_admin1['canonicalName'] = geo_admin1.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [40]:
admin1 = admin1.merge(geo_admin1, on='canonicalName', how='left')
admin1.fillna('', inplace=True)

In [41]:
admin1_exact_match = admin1.query("name != ''").copy()
admin1_exact_match.drop_duplicates(inplace=True)

In [42]:
admin1_exact_match.shape

(952, 3)

In [43]:
admin1_exact_match.head(10)

Unnamed: 0,origName,canonicalName,name
0,Guangdong,guangdong,Guangdong
1,Hubei,hubei,Hubei
3,Bangkok,bangkok,Bangkok
4,Taipei,taipei,Taipei
5,Central District,central district,Central District
6,Extremadura,extremadura,Extremadura
7,Valencia,valencia,Valencia
8,Seoul,seoul,Seoul
9,Zhejiang,zhejiang,Zhejiang
10,Tolima,tolima,Tolima


##### Remaining locations that don't match a GeoName location

In [44]:
admin1 = admin1.query("name == ''").copy()
admin1.drop('name', axis=1, inplace=True)

In [45]:
admin1.shape

(3234, 2)

In [46]:
admin1['canonicalNameNoAffix'] = admin1.apply(lambda x: remove_affix(x['canonicalName']), axis=1)

In [47]:
geo_admin1['canonicalNameNoAffix'] = geo_admin1.apply(lambda x: remove_affix(x['canonicalName']), axis=1)
admin1 = admin1.merge(geo_admin1[['name', 'canonicalNameNoAffix']], on='canonicalNameNoAffix', how='left')
admin1.fillna('', inplace=True)

In [48]:
admin1_no_affix_match = admin1.query("name != ''")

In [49]:
admin1_no_affix_match.shape

(254, 4)

In [50]:
admin1_no_affix_match.head()

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
11,Osun State,osun state,osun,Osun
14,Kwara State,kwara state,kwara,Kwara
39,Tel Aviv District,tel aviv district,tel aviv,Tel Aviv
56,Chisinau,chisinau,chisinau,Chisinau Municipality
58,Oyo State,oyo state,oyo,Oyo


In [51]:
admin1_no_match = admin1.query("name == ''").copy()
admin1_no_match.query("origName != ''", inplace=True)

In [52]:
admin1_no_match.shape

(2986, 4)

In [53]:
admin1_no_match.head()

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
1,Nakhonnayok,nakhonnayok,nakhonnayok,
2,Tbilisi,tbilisi,tbilisi,
3,Athens,athens,athens,
4,Castilla y Leon,castilla y leon,castilla y leon,
5,Singapore,singapore,singapore,


### Match Admin2

In [54]:
admin2 = pd.DataFrame(df['loc2'].copy())
admin2.columns=['origName']
admin2.query("origName != ''", inplace=True)
admin2.head()

Unnamed: 0,origName
1,Wuhan
12,Huizhou
15,Kfar Saba
22,Valencia
24,Seoul


In [55]:
a1_no_match = pd.DataFrame(admin1_no_match['origName']).copy()

In [56]:
a1_no_match.shape

(2986, 1)

In [57]:
a1_no_match.head()

Unnamed: 0,origName
1,Nakhonnayok
2,Tbilisi
3,Athens
4,Castilla y Leon
5,Singapore


In [58]:
# carry over regions from location level 1 that might be Admin2s or Cities
admin2 = pd.concat([admin2,a1_no_match])
admin2.drop_duplicates(inplace=True)
admin2.dropna(inplace=True)
admin2.columns=['origName']
admin2['canonicalName'] = admin2.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [59]:
geo_admin2 = pd.read_csv(NEO4J_IMPORT / "00g-GeoNamesAdmin2.csv", dtype='str', usecols=['name'])

In [60]:
geo_admin2['canonicalName'] = geo_admin2.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [61]:
admin2 = admin2.merge(geo_admin2, on='canonicalName', how='left')
admin2.fillna('', inplace=True)

In [62]:
admin2_exact_match = admin2.query("name != ''").copy()
admin2_exact_match.drop_duplicates(inplace=True)

In [63]:
admin2_exact_match.shape

(4354, 3)

In [64]:
admin2_exact_match.head()

Unnamed: 0,origName,canonicalName,name
3,Valencia,valencia,Valencia
5,Ibague,ibague,Ibague
11,Pueblo Nuevo,pueblo nuevo,Pueblo Nuevo
15,Pacora,pacora,Pacora
23,Leon,leon,Leon


##### Remaining locations that don't match a GeoName location

In [65]:
admin2_no_match = admin2.query("name == ''").copy()

In [66]:
admin2_no_match.shape

(1039, 3)

### Match Cities

In [67]:
city = pd.DataFrame(df['loc3'].copy())
city.columns=['origName']
city.query("origName != ''", inplace=True)
city.head(100)

Unnamed: 0,origName
911,Manhattan
2834,Brooklyn
298,Sheffield
1605,Manhattan
2116,Brooklyn
2597,Queens
2868,Companigonj
2970,Goshbag / Lamchori


In [68]:
a2_no_match = pd.DataFrame(admin2_no_match['origName'])

In [69]:
# carry over regions from location level 1 that might be Admin2s or Cities
city = pd.concat([a2_no_match, city])
city.drop_duplicates(inplace=True)
city.dropna(inplace=True)
city.columns=['origName']
city['canonicalName'] = city.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [70]:
geo_city = pd.read_csv(NEO4J_IMPORT / "00h-GeoNamesCity.csv", dtype='str', usecols=['name'])

In [71]:
geo_city['canonicalName'] = geo_city.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [72]:
city = city.merge(geo_city, on='canonicalName', how='left')
city.fillna('', inplace=True)

In [73]:
city_exact_match = city.query("name != ''").copy()
city_exact_match.drop_duplicates(inplace=True)

In [74]:
city_exact_match.shape

(444, 3)

In [75]:
city_exact_match.head()

Unnamed: 0,origName,canonicalName,name
0,Wuhan,wuhan,Wuhan
1,Huizhou,huizhou,Huizhou
2,Kfar Saba,kfar saba,Kfar Saba
3,Seoul,seoul,Seoul
4,Las Cumbres,las cumbres,Las Cumbres


In [76]:
no_match = city.query("name == ''").copy()

In [77]:
no_match.shape

(600, 3)

In [78]:
no_match.head()

Unnamed: 0,origName,canonicalName,name
12,Amelia Denis de Icaza,amelia denis de icaza,
16,Omar Torrijos,omar torrijos,
17,Vitoria-Gasteiz,vitoria gasteiz,
19,Arnulfo Arias,arnulfo arias,
20,Don Bosco,don bosco,


### Match other divisions

In [79]:
divisions = no_match['origName']

In [80]:
divisions.drop_duplicates(inplace=True)
divisions.dropna(inplace=True)
divisions = divisions[divisions != '']
divisions.shape

(600,)

In [81]:
divisions = pd.DataFrame(divisions)
divisions.columns=['origName']

In [82]:
divisions['canonicalName'] = divisions.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [83]:
divisions.head()

Unnamed: 0,origName,canonicalName
12,Amelia Denis de Icaza,amelia denis de icaza
16,Omar Torrijos,omar torrijos
17,Vitoria-Gasteiz,vitoria gasteiz
19,Arnulfo Arias,arnulfo arias
20,Don Bosco,don bosco


In [84]:
geo_admin1 = pd.read_csv(NEO4J_IMPORT / "00f-GeoNamesAdmin1.csv", dtype='str', usecols=['name'])
geo_admin2 = pd.read_csv(NEO4J_IMPORT / "00g-GeoNamesAdmin2.csv", dtype='str', usecols=['name'])
geo_city = pd.read_csv(NEO4J_IMPORT / "00h-GeoNamesCity.csv", dtype='str', usecols=['name'])

In [85]:
geo_divisions = pd.concat([geo_admin1, geo_admin2, geo_city])

In [86]:
geo_divisions.drop_duplicates(inplace=True)

In [87]:
geo_divisions.shape

(154242, 1)

In [88]:
geo_divisions.dropna(inplace=True)
geo_divisions.columns=['name']
geo_divisions['canonicalName'] = geo_divisions.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [89]:
geo_divisions.head()

Unnamed: 0,name,canonicalName
0,Sant Julia de Loria,sant julia de loria
1,Ordino,ordino
2,La Massana,la massana
3,Encamp,encamp
4,Canillo,canillo


In [90]:
divisions['canonicalNameNoAffix'] = divisions.apply(lambda x: remove_affix2(x['canonicalName']), axis=1)

In [91]:
geo_divisions['canonicalNameNoAffix'] = geo_divisions.apply(lambda x: remove_affix2(x['canonicalName']), axis=1)
divisions = divisions.merge(geo_divisions[['name', 'canonicalNameNoAffix']], on='canonicalNameNoAffix', how='left')
divisions.fillna('', inplace=True)

In [92]:
divisions.head(1000)

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
0,Amelia Denis de Icaza,amelia denis de icaza,amelia denis de icaza,
1,Omar Torrijos,omar torrijos,omar torrijos,
2,Vitoria-Gasteiz,vitoria gasteiz,vitoria gasteiz,
3,Arnulfo Arias,arnulfo arias,arnulfo arias,
4,Don Bosco,don bosco,don bosco,
5,Santa Ana PA,santa ana pa,santa ana pa,
6,Aichi,aichi,aichi,Aichi
7,Clayton AU,clayton au,clayton au,
8,Heinsberg District,heinsberg district,heinsberg,Heinsberg
9,Bogor Regency,bogor regency,bogor regency,


In [93]:
divisions_exact_match = divisions.query("name != ''")

In [94]:
divisions_exact_match.shape

(290, 4)

In [95]:
divisions_exact_match.head()

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
6,Aichi,aichi,aichi,Aichi
8,Heinsberg District,heinsberg district,heinsberg,Heinsberg
25,iLembe,ilembe,ilembe,iLembe District Municipality
26,eThekwini,ethekwini,ethekwini,eThekwini Metropolitan Municipality
35,uMkhanyakude,umkhanyakude,umkhanyakude,uMkhanyakude District Municipality


### Collect all location matches

In [96]:
matches = pd.concat([country_exact_match, admin1_exact_match, admin1_no_affix_match, admin2_exact_match, city_exact_match, divisions_exact_match])

In [97]:
matches = matches[['origName', 'name']]

In [98]:
matches.shape

(6514, 2)

In [99]:
matches.tail()

Unnamed: 0,origName,name
623,Västernorrland County,Vaesternorrland
624,Jämtland County,Jaemtland
635,Buri Ram Province,Buri Ram
636,Suphan Buri Province,Suphan Buri
641,Mykolaiv Oblast,Mykolaiv


In [100]:
matches.query("origName == 'Zhuhai City'")

Unnamed: 0,origName,name
84,Zhuhai City,Zhuhai


In [101]:
name_list = matches[['origName', 'name']].values.tolist()
name_dict = {name: val for name, val in name_list}

In [102]:
name_dict.get('Zhuhai City')

'Zhuhai'

#### Assign GeoNames for Countries

In [103]:
country_list = country_exact_match[['origName', 'name']].values.tolist()
country_dict = {name: val for name, val in country_list}

In [104]:
df['geoName0'] = df['loc0'].apply(lambda x: country_dict.get(x, ''))

#### Assign GeoNames for Admin1 divisions

In [105]:
admin1_match = pd.concat([admin1_exact_match, admin1_no_affix_match])
admin1_list = admin1_match[['origName', 'name']].values.tolist()
admin1_dict = {name: val for name, val in admin1_list}

In [106]:
admin1_dict.get("United States,New York")

In [107]:
df['geoName1'] = df['loc1'].apply(lambda x: admin1_dict.get(x, ''))

#### Assign GeoNames for Admin2 divisions

In [108]:
admin2_list = admin2_exact_match[['origName', 'name']].values.tolist()
admin2_dict = {name: val for name, val in admin2_list}

In [109]:
df['geoName2'] = df['loc2'].apply(lambda x: admin2_dict.get(x, ''))

#### Assign GeoNames for Cities

In [110]:
city_list = city_exact_match[['origName', 'name']].values.tolist()
city_dict = {name: val for name, val in city_list}

In [111]:
df['geoName3'] = df['loc3'].apply(lambda x: city_dict.get(x, ''))

In [112]:
df.query("origLocation == 'United States,New York'").head()

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3
289,"United States,New York","United States,New York",1,United States,New York,,,United States,New York,,


#### Assign best match for any unassigned locations

In [113]:
def get_name(loc, geoName):
    if geoName == '':
        return name_dict.get(loc, '')
    else:
        return geoName

In [114]:
df['geoName0'] = df.apply(lambda x: get_name(x['loc0'], x['geoName0']), axis=1)
df['geoName1'] = df.apply(lambda x: get_name(x['loc1'], x['geoName1']), axis=1)
df['geoName2'] = df.apply(lambda x: get_name(x['loc2'], x['geoName2']), axis=1)
df['geoName3'] = df.apply(lambda x: get_name(x['loc3'], x['geoName3']), axis=1)

In [115]:
df.head()

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3
0,"China,Guangdong","China,Guangdong",1,China,Guangdong,,,China,Guangdong,,
1,"China,Hubei,Wuhan","China,Hubei,Wuhan",2,China,Hubei,Wuhan,,China,Hubei,Wuhan,
6,Singapore,Singapore,0,Singapore,,,,Singapore,,,
7,"Thailand,Bangkok","Thailand,Bangkok",1,Thailand,Bangkok,,,Thailand,Bangkok,,
11,"Taiwan,Taipei","Taiwan,Taipei",1,Taiwan,Taipei,,,Taiwan,Taipei,,


In [116]:
df[(df['geoName1'] == '') & (df['geoName2'] != '')]

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3
143,"Spain,Castilla y Leon,Leon","Spain,Castilla y Leon,Leon",2,Spain,Castilla y Leon,Leon,,Spain,,Leon,
294,"Sweden,Småland,Jönköping","Sweden,Småland,Jönköping",2,Sweden,Småland,Jönköping,,Sweden,,Joenkoeping,
315,"Italy,Lazio,Rome","Italy,Lazio,Rome",2,Italy,Lazio,Rome,,Italy,,Rome,
732,"Bahrain,Capital Governorate,Manama","Bahrain,Capital Governorate,Manama",2,Bahrain,Capital Governorate,Manama,,Bahrain,,Manama,
1362,"Belgium,Tournai-Mouscron,Mouscron","Belgium,Tournai-Mouscron,Mouscron",2,Belgium,Tournai-Mouscron,Mouscron,,Belgium,,Mouscron,
1476,"Poland,Malopolskie,Kraków","Poland,Malopolskie,Kraków",2,Poland,Malopolskie,Kraków,,Poland,,Krakow,
1489,"Oman,Dakhiliyah,Nizwa","Oman,Dakhiliyah,Nizwa",2,Oman,Dakhiliyah,Nizwa,,Oman,,Nizwa,
1588,"Italy,Marche,Ancona","Italy,Marche,Ancona",2,Italy,Marche,Ancona,,Italy,,Ancona,
1769,"Vietnam,Red River Delta,Hanoi","Vietnam,Red River Delta,Hanoi",2,Vietnam,Red River Delta,Hanoi,,Vietnam,,Hanoi,
2317,"Spain,Castilla y Leon,Villarejo de Orbigo","Spain,Castilla y Leon,Villarejo de Orbigo",2,Spain,Castilla y Leon,Villarejo de Orbigo,,Spain,,Villarejo de Orbigo,


### Left align all geoNames

In [117]:
df[['geoName1','geoName2']] = df[['geoName1','geoName2']].apply(lambda x: x if x[0] != '' else [x[1],x[0]], axis=1)

In [118]:
# There are no cases that match this condition
# df[['geoName2','geoName3']] = df[['geoName2','geoName3']].apply(lambda x: x if x[0] != '' else [x[1],x[0]], axis=1)

In [119]:
df[(df['geoName2'] == '') & (df['geoName3'] != '')]

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3


In [120]:
def calc_location_levels(row):
    level = 3
    if row['geoName3'] == '':
        level = 2
    if row['geoName2'] == '':
        level = 1
    if row['geoName1'] == '':
        level = 0
    return level

In [121]:
df['locationLevels'] = df.apply(lambda x: calc_location_levels(x), axis=1)

In [122]:
df.query("origLocation == 'North Macedonia,Pelagonia,Prilep'")

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3


In [123]:
df[(df['geoName1'] == '') & (df['geoName2'] != '')]

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3


Test

In [124]:
df.query("origLocation == 'Austria,St. Anton'")

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3


In [125]:
df.query("geoName1 == 'District of Columbia'").head()

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3
274903,"United States,Washington, D.C.","United States,District of Columbia",1,United States,District of Columbia,,,United States,District of Columbia,,
275124,"United States,Washington, D.C.,District of Col...","United States,District of Columbia,District of...",2,United States,District of Columbia,District of Columbia,,United States,District of Columbia,District of Columbia,


In [126]:
df = df[(df['origLocation'] != '') & (df['geoName0'] != '')]
df = df[['origLocation', 'geoName0', 'geoName1', 'geoName2', 'geoName3', 'locationLevels']]
df.to_csv(NEO4J_IMPORT / "10a-GeoLink.csv", index=False)

### Remaining locations that do not match GeoNames.org locations

In [127]:
geo_divisions['canonicalName'] = geo_divisions.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [128]:
geo_divisions.head()

Unnamed: 0,name,canonicalName,canonicalNameNoAffix
0,Sant Julia de Loria,sant julia de loria,sant julia de loria
1,Ordino,ordino,ordino
2,La Massana,la massana,la massana
3,Encamp,encamp,encamp
4,Canillo,canillo,canillo


In [129]:
divisions = divisions.query("name == ''")

In [130]:
divisions['name'] = divisions.apply(lambda x: get_close_match(x['canonicalName'], geo_divisions['canonicalName']), axis=1)

In [131]:
divisions_close_match = divisions.query("name != ''")

In [132]:
divisions_close_match.shape

(72, 4)

In [133]:
divisions_close_match.head(300)

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
15,Tolemaida,tolemaida,tolemaida,ptolemaida
19,Barka,barka,barka,barkam
22,Murbach,murbach,murbach,urbach
23,Roucourt,roucourt,roucourt,rocourt
43,Mehsana,mehsana,mehsana,mesana
45,Fenoarivo,fenoarivo,fenoarivo,fenoarivobe
47,Al Matariyyah,al matariyyah,matariyyah,al matariyah
49,Winnebago County IL,winnebago county il,winnebago il,winnebago county
50,Kynice,kynice,kynice,krynice
65,Ekaterinburg,ekaterinburg,ekaterinburg,yekaterinburg


### Locations that do not match

In [134]:
divisions_no_match = divisions.query("name == ''")

In [135]:
divisions_no_match.shape

(294, 4)

In [136]:
divisions_no_match.head(500)

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
0,Amelia Denis de Icaza,amelia denis de icaza,amelia denis de icaza,
1,Omar Torrijos,omar torrijos,omar torrijos,
2,Vitoria-Gasteiz,vitoria gasteiz,vitoria gasteiz,
3,Arnulfo Arias,arnulfo arias,arnulfo arias,
4,Don Bosco,don bosco,don bosco,
5,Santa Ana PA,santa ana pa,santa ana pa,
7,Clayton AU,clayton au,clayton au,
9,Bogor Regency,bogor regency,bogor regency,
10,Timika,timika,timika,
11,Contamines,contamines,contamines,
