# Assigns standardized locations to entities in the KG
**[Work in progress]**

This notebook standardizes location information for ingestion into a Knowledge Graph.

Author: Peter Rose (pwrose@ucsd.edu), Braden Riggs 

In [1]:
import os
import pandas as pd
from pathlib import Path
import time
import unidecode
import difflib

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [3]:
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-19636412-9e74-4bac-8a4c-c6c8b49bb9d3/installation-4.1.0/import


### Import data sources that have an `origLocation` property

In [4]:
df1 = pd.read_csv(NEO4J_IMPORT / '01d-CNCBStrain.csv', dtype='str', usecols=['origLocation'])
df2 = pd.read_csv(NEO4J_IMPORT / '02a-JHUCasesGlobal.csv', dtype='str', usecols=['origLocation'])
df = pd.concat([df1, df2])

In [5]:
df.drop_duplicates(inplace=True)
df.fillna('', inplace=True)

In [6]:
df.shape

(2104, 1)

In [7]:
df.head()

Unnamed: 0,origLocation
0,"China,Hubei"
2,"Thailand,Nonthaburi Province"
4,"China,Hubei,Wuhan"
13,"China,Zhejiang"
17,"China,Guangdong Province,Zhuhai City"


### Load custom GeoName mappings

In [8]:
ref = pd.read_csv("../../reference_data/GeoNameMapping.csv", usecols=['origName', 'geoName'], comment='#', dtype='str')

In [9]:
ref.head()

Unnamed: 0,origName,geoName
0,Burma,Myanmar
1,Congo (Kinshasa),Democratic Republic of the Congo
2,Congo (Brazzaville),Republic of the Congo
3,Cote d'Ivoire,Ivory Coast
4,Czech Republic,Czechia


In [10]:
name_list = ref[['origName', 'geoName']].values.tolist()

In [11]:
def fix_locations(text):
    for loc in name_list:
        if text.startswith(loc[0]):
            text = text.replace(loc[0], loc[1])
    return text

### Apply custom GeoName mappings

In [12]:
df['geoLocation'] = df['origLocation'].apply(lambda x: fix_locations(x))

In [13]:
df.query('geoLocation != origLocation').head()

Unnamed: 0,origLocation,geoLocation
51,"China,Taiwan,Kaohsiung","Taiwan,Kaohsiung"
149,"China,Taiwan,Taipei","Taiwan,Taipei"
195,"France,Rhone-Alpes,Contamines","France,Auvergne-Rhone-Alpes,Contamines"
240,"China,Hong Kong",Hong Kong
349,"Netherlands,Nootdorp","Netherlands,Gemeente Pijnacker-Nootdorp"


### Match Cruiseships

In [14]:
cruiseships = df[df['geoLocation'].str.startswith('CruiseShip,')].copy()

In [15]:
cruiseships.head(10)

Unnamed: 0,origLocation,geoLocation
13902,"Italy,Cruise ship","CruiseShip,Cruise ship"
46652,"United States,Cruise_Ship_1,California","CruiseShip,Cruise_Ship_1"
46658,"United States,Cruise_Ship_2,California","CruiseShip,Cruise_Ship_2"
865,Diamond Princess,"CruiseShip,Diamond Princess"
4104,"Canada,Grand Princess","CruiseShip,Grand Princess"
6004,"Canada,Diamond Princess","CruiseShip,Diamond Princess"
7591,MS Zaandam,"CruiseShip,MS Zaandam"


In [16]:
cruiseships = cruiseships[['origLocation', 'geoLocation']]
cruiseships.to_csv(NEO4J_IMPORT / "10a-GeoLinkCruiseShip.csv", index=False)

In [17]:
##### Remove Cruiseship data from data frame

In [18]:
df = df[~df['geoLocation'].str.startswith('CruiseShip,')].copy()

In [19]:
df.shape

(2097, 2)

In [20]:
def remove_special_characters(text):
    text = str(text)
    text = text.lower()
    # transliterate German umlaut
    text = text.replace('ü', 'ue') 
    # transliterate Unicode string into the closest possible ASCII representation
    text = unidecode.unidecode(text)
    text = text.replace("-"," ")
    text = text.replace("_"," ")
    text = text.replace(","," ")
    text = text.replace('\d+', '')
    text = text.strip()

    return text

In [21]:
def remove_affix(text):
    text = str(text)
    text = text.lower()
    # try match without prefix or suffix
#    text = text.replace("city", "")
    text = text.replace("region", "")
    text = text.replace("oblast", "") # regions in Russia
    text = text.replace("al ", "") # regions in Saudi Arabia
    text = text.replace("prefecture", "") # regions in Japan
    text = text.replace("province", "") # regions in China
    text = text.replace("district", "")
#    text = text.replace("parish", "")
    text = text.replace("gemeente", "") # regions in the Netherlands
    text = text.replace("wahlkreis", "") # regions in Switzerland
    text = text.replace("canton", "") # regions in Switzerland
    text = text.replace("municipality", "")
    text = text.replace("metropolitan", "")
    text = text.replace("state", "")
#    text = text.replace("county", "")
    text = text.replace('\d+', '')
    text = text.strip()
    return text

In [22]:
def remove_affix2(text):
    text = str(text)
    text = text.lower()
    # try match without prefix or suffix
    text = text.replace("city", "")
    text = text.replace("region", "")
    text = text.replace("oblast", "") # regions in Russia
    text = text.replace("al ", "") # regions in Saudi Arabia
    text = text.replace("prefecture", "") # regions in Japan
    text = text.replace("province", "") # regions in China
    text = text.replace("district", "")
    text = text.replace("parish", "")
    text = text.replace("gemeente", "") # regions in the Netherlands
    text = text.replace("wahlkreis", "") # regions in Switzerland
    text = text.replace("canton", "") # regions in Switzerland
    text = text.replace("municipality", "")
    text = text.replace("metropolitan", "")
    text = text.replace("state", "")
#    text = text.replace("county", "")
    text = text.replace('\d+', '')
    text = text.strip()
    return text

In [23]:
def get_close_match(x, name):
    z = difflib.get_close_matches(x, name, n=1, cutoff=0.9)
    if z:
        return z[0]
    else:
        return ''

In [24]:
df.shape

(2097, 2)

In [25]:
df['locationLevels'] = df['geoLocation'].str.count(',')
df[['loc0', 'loc1', 'loc2', 'loc3']] = df['geoLocation'].str.split(',', n=3, expand=True)
df.fillna('', inplace=True)

## Match Countries

In [26]:
countries = pd.DataFrame(df['loc0'].copy())
countries.drop_duplicates(inplace=True)
countries.dropna(inplace=True)
countries.columns=['origName']
countries['canonicalName'] = countries.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [27]:
countries.shape

(213, 2)

In [28]:
countries.head()

Unnamed: 0,origName,canonicalName
0,China,china
2,Thailand,thailand
32,United States,united states
51,Taiwan,taiwan
78,France,france


In [29]:
geo_countries = pd.read_csv(NEO4J_IMPORT / "00e-GeoNamesCountry.csv", dtype='str', usecols=['name'])

In [30]:
geo_countries['canonicalName'] = geo_countries.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [31]:
countries = countries.merge(geo_countries, on='canonicalName', how='left')
countries.fillna('', inplace=True)

### Exact country matches

In [32]:
country_exact_match = countries.query("name != ''").copy()

In [33]:
country_exact_match.shape

(213, 3)

In [34]:
country_no_match = countries.query("name == ''").copy()

In [35]:
country_no_match.head()

Unnamed: 0,origName,canonicalName,name


### Match Admin1

In [36]:
admin1 = pd.DataFrame(df['loc1'].copy())
admin1.drop_duplicates(inplace=True)
admin1.dropna(inplace=True)
admin1.columns=['origName']
admin1['canonicalName'] = admin1.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [37]:
geo_admin1 = pd.read_csv(NEO4J_IMPORT / "00f-GeoNamesAdmin1.csv", dtype='str', usecols=['name'])

In [38]:
geo_admin1['canonicalName'] = geo_admin1.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [39]:
admin1 = admin1.merge(geo_admin1, on='canonicalName', how='left')
admin1.fillna('', inplace=True)

In [40]:
admin1_exact_match = admin1.query("name != ''").copy()
admin1_exact_match.drop_duplicates(inplace=True)

In [41]:
admin1_exact_match.shape

(423, 3)

In [42]:
admin1_exact_match.head(10)

Unnamed: 0,origName,canonicalName,name
0,Hubei,hubei,Hubei
2,Zhejiang,zhejiang,Zhejiang
4,Guangdong,guangdong,Guangdong
5,Washington,washington,Washington
6,Illinois,illinois,Illinois
8,California,california,California
9,Arizona,arizona,Arizona
10,Ile-de-France,ile de france,Ile-de-France
11,Yunnan,yunnan,Yunnan
12,Victoria,victoria,Victoria


##### Remaining locations that don't match a GeoName location

In [43]:
admin1 = admin1.query("name == ''").copy()
admin1.drop('name', axis=1, inplace=True)

In [44]:
admin1.shape

(711, 2)

In [45]:
admin1['canonicalNameNoAffix'] = admin1.apply(lambda x: remove_affix(x['canonicalName']), axis=1)

In [46]:
geo_admin1['canonicalNameNoAffix'] = geo_admin1.apply(lambda x: remove_affix(x['canonicalName']), axis=1)
admin1 = admin1.merge(geo_admin1[['name', 'canonicalNameNoAffix']], on='canonicalNameNoAffix', how='left')
admin1.fillna('', inplace=True)

In [47]:
admin1_no_affix_match = admin1.query("name != ''")

In [48]:
admin1_no_affix_match.shape

(31, 4)

In [49]:
admin1_no_affix_match.head()

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
0,Nonthaburi Province,nonthaburi province,nonthaburi,Nonthaburi
1,Guangdong Province,guangdong province,guangdong,Guangdong
8,Kerala State,kerala state,kerala,Kerala
38,Riyadh,riyadh,riyadh,Riyadh Region
49,Amazonas State,amazonas state,amazonas,Amazonas


In [50]:
admin1_no_match = admin1.query("name == ''").copy()
admin1_no_match.query("origName != ''", inplace=True)

In [51]:
admin1_no_match.head()

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
2,Kaohsiung,kaohsiung,kaohsiung,
4,Leuven,leuven,leuven,
5,Kathmandu,kathmandu,kathmandu,
6,Rome,rome,rome,
7,Sihanoukville,sihanoukville,sihanoukville,


### Match Admin2

In [52]:
admin2 = pd.DataFrame(df['loc2'].copy())
admin2.columns=['origName']
admin2.query("origName != ''", inplace=True)
admin2.head()

Unnamed: 0,origName
4,Wuhan
17,Zhuhai City
20,Shenzhen City
28,Shenzhen
32,Snohomish County


In [53]:
a1_no_match = pd.DataFrame(admin1_no_match['origName']).copy()

In [54]:
a1_no_match.head()

Unnamed: 0,origName
2,Kaohsiung
4,Leuven
5,Kathmandu
6,Rome
7,Sihanoukville


In [55]:
admin2.head()

Unnamed: 0,origName
4,Wuhan
17,Zhuhai City
20,Shenzhen City
28,Shenzhen
32,Snohomish County


In [56]:
# carry over regions from location level 1 that might be Admin2s or Cities
admin2 = pd.concat([admin2,a1_no_match])
admin2.drop_duplicates(inplace=True)
admin2.dropna(inplace=True)
admin2.columns=['origName']
admin2['canonicalName'] = admin2.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [57]:
geo_admin2 = pd.read_csv(NEO4J_IMPORT / "00g-GeoNamesAdmin2.csv", dtype='str', usecols=['name'])

In [58]:
geo_admin2['canonicalName'] = geo_admin2.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [59]:
admin2 = admin2.merge(geo_admin2, on='canonicalName', how='left')
admin2.fillna('', inplace=True)

In [60]:
admin2_exact_match = admin2.query("name != ''").copy()
admin2_exact_match.drop_duplicates(inplace=True)

In [61]:
admin2_exact_match.shape

(375, 3)

In [62]:
admin2_exact_match.head()

Unnamed: 0,origName,canonicalName,name
3,Shenzhen,shenzhen,Shenzhen
4,Snohomish County,snohomish county,Snohomish County
7,Orange County,orange county,Orange County
17,Paris,paris,Paris
22,Gold Coast,gold coast,Gold Coast


##### Remaining locations that don't match a GeoName location

In [63]:
admin2_no_match = admin2.query("name == ''").copy()

In [64]:
admin2_no_match.shape

(1007, 3)

### Match Cities

In [65]:
city = pd.DataFrame(df['loc3'].copy())
city.columns=['origName']
city.query("origName != ''", inplace=True)
city.head(100)

Unnamed: 0,origName
544,Sheffield
10162,Havlickuv Brod
13869,St Anton am Arlberg
36109,Bilbao
48448,Caraman
52389,Companigonj
52491,Goshbag / Lamchori


In [66]:
a2_no_match = pd.DataFrame(admin2_no_match['origName'])

In [67]:
# carry over regions from location level 1 that might be Admin2s or Cities
city = pd.concat([a2_no_match, city])
city.drop_duplicates(inplace=True)
city.dropna(inplace=True)
city.columns=['origName']
city['canonicalName'] = city.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [68]:
geo_city = pd.read_csv(NEO4J_IMPORT / "00h-GeoNamesCity.csv", dtype='str', usecols=['name'])

In [69]:
geo_city['canonicalName'] = geo_city.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [70]:
city = city.merge(geo_city, on='canonicalName', how='left')
city.fillna('', inplace=True)

In [71]:
city_exact_match = city.query("name != ''").copy()
city_exact_match.drop_duplicates(inplace=True)

In [72]:
city_exact_match.shape

(678, 3)

In [73]:
city_exact_match.head()

Unnamed: 0,origName,canonicalName,name
0,Wuhan,wuhan,Wuhan
3,Chicago,chicago,Chicago
4,Los Angeles,los angeles,Los Angeles
13,Phoenix,phoenix,Phoenix
19,Clayton,clayton,Clayton


In [74]:
no_match = city.query("name == ''").copy()

In [75]:
no_match.shape

(335, 3)

In [76]:
no_match.head()

Unnamed: 0,origName,canonicalName,name
1,Zhuhai City,zhuhai city,
2,Shenzhen City,shenzhen city,
17,Guangzhou City,guangzhou city,
18,Pu'er,pu'er,
40,Zhongxian,zhongxian,


### Match other divisions

In [77]:
divisions = no_match['origName']

In [78]:
divisions.drop_duplicates(inplace=True)
divisions.dropna(inplace=True)
divisions = divisions[divisions != '']
divisions.shape

(335,)

In [79]:
divisions = pd.DataFrame(divisions)
divisions.columns=['origName']

Unnamed: 0,origName
1,Zhuhai City
2,Shenzhen City
17,Guangzhou City
18,Pu'er
40,Zhongxian


In [80]:
divisions['canonicalName'] = divisions.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [81]:
divisions.head()

Unnamed: 0,origName,canonicalName
1,Zhuhai City,zhuhai city
2,Shenzhen City,shenzhen city
17,Guangzhou City,guangzhou city
18,Pu'er,pu'er
40,Zhongxian,zhongxian


In [82]:
geo_admin1 = pd.read_csv(NEO4J_IMPORT / "00f-GeoNamesAdmin1.csv", dtype='str', usecols=['name'])
geo_admin2 = pd.read_csv(NEO4J_IMPORT / "00g-GeoNamesAdmin2.csv", dtype='str', usecols=['name'])
geo_city = pd.read_csv(NEO4J_IMPORT / "00h-GeoNamesCity.csv", dtype='str', usecols=['name'])

In [83]:
geo_divisions = pd.concat([geo_admin1, geo_admin2, geo_city])

In [84]:
geo_divisions.drop_duplicates(inplace=True)

In [85]:
geo_divisions.shape

(154206, 1)

In [86]:
geo_divisions.dropna(inplace=True)
geo_divisions.columns=['name']
geo_divisions['canonicalName'] = geo_divisions.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [87]:
geo_divisions.head()

Unnamed: 0,name,canonicalName
0,Sant Julia de Loria,sant julia de loria
1,Ordino,ordino
2,La Massana,la massana
3,Encamp,encamp
4,Canillo,canillo


In [88]:
divisions['canonicalNameNoAffix'] = divisions.apply(lambda x: remove_affix2(x['canonicalName']), axis=1)

In [89]:
geo_divisions['canonicalNameNoAffix'] = geo_divisions.apply(lambda x: remove_affix2(x['canonicalName']), axis=1)
divisions = divisions.merge(geo_divisions[['name', 'canonicalNameNoAffix']], on='canonicalNameNoAffix', how='left')
divisions.fillna('', inplace=True)

In [90]:
divisions.head()

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
0,Zhuhai City,zhuhai city,zhuhai,Zhuhai
1,Shenzhen City,shenzhen city,shenzhen,Shenzhen
2,Guangzhou City,guangzhou city,guangzhou,Guangzhou
3,Pu'er,pu'er,pu'er,Pu'er City
4,Zhongxian,zhongxian,zhongxian,


In [91]:
divisions_exact_match = divisions.query("name != ''")

In [92]:
divisions_exact_match.shape

(33, 4)

In [93]:
divisions_exact_match.head()

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
0,Zhuhai City,zhuhai city,zhuhai,Zhuhai
1,Shenzhen City,shenzhen city,shenzhen,Shenzhen
2,Guangzhou City,guangzhou city,guangzhou,Guangzhou
3,Pu'er,pu'er,pu'er,Pu'er City
7,Heinsberg District,heinsberg district,heinsberg,Heinsberg


### Collect all location matches

In [94]:
matches = pd.concat([country_exact_match, admin1_exact_match, admin1_no_affix_match, admin2_exact_match, divisions_exact_match, city_exact_match])

In [95]:
matches = matches[['origName', 'name']]

In [96]:
matches.shape

(1753, 2)

In [97]:
matches.tail()

Unnamed: 0,origName,name
1317,Sheffield,Sheffield
1324,Havlickuv Brod,Havlickuv Brod
1325,St Anton am Arlberg,St Anton am Arlberg
1326,Bilbao,Bilbao
1327,Caraman,Caraman


In [98]:
matches.query("origName == 'Zhuhai City'")

Unnamed: 0,origName,name
0,Zhuhai City,Zhuhai


In [99]:
name_list = matches[['origName', 'name']].values.tolist()
name_dict = {name: val for name, val in name_list}

In [100]:
name_dict.get('France,Normandy')

In [101]:
df['geoName0'] = df['loc0'].apply(lambda x: name_dict.get(x, ''))
df['geoName1'] = df['loc1'].apply(lambda x: name_dict.get(x, ''))
df['geoName2'] = df['loc2'].apply(lambda x: name_dict.get(x, ''))
df['geoName3'] = df['loc3'].apply(lambda x: name_dict.get(x, ''))

In [102]:
df.head()

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3
0,"China,Hubei","China,Hubei",1,China,Hubei,,,China,Hubei,,
2,"Thailand,Nonthaburi Province","Thailand,Nonthaburi Province",1,Thailand,Nonthaburi Province,,,Thailand,Nonthaburi,,
4,"China,Hubei,Wuhan","China,Hubei,Wuhan",2,China,Hubei,Wuhan,,China,Hubei,Wuhan,
13,"China,Zhejiang","China,Zhejiang",1,China,Zhejiang,,,China,Zhejiang,,
17,"China,Guangdong Province,Zhuhai City","China,Guangdong Province,Zhuhai City",2,China,Guangdong Province,Zhuhai City,,China,Guangdong,Zhuhai,


Test

In [103]:
df.query("origLocation == 'Austria,St. Anton'")

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3
13869,"Austria,St. Anton","Austria,Tyrol,Politischer Bezirk Landeck,St An...",3,Austria,Tyrol,Politischer Bezirk Landeck,St Anton am Arlberg,Austria,Tyrol,Politischer Bezirk Landeck,St Anton am Arlberg


In [104]:
df = df[(df['origLocation'] != '') & (df['geoName0'] != '')]
df = df[['origLocation', 'geoName0', 'geoName1', 'geoName2', 'geoName3', 'locationLevels']]
df.to_csv(NEO4J_IMPORT / "10a-GeoLink.csv", index=False)

### Remaining locations that do not match GeoNames.org locations

In [105]:
geo_divisions['canonicalName'] = geo_divisions.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [106]:
geo_divisions.head()

Unnamed: 0,name,canonicalName,canonicalNameNoAffix
0,Sant Julia de Loria,sant julia de loria,sant julia de loria
1,Ordino,ordino,ordino
2,La Massana,la massana,la massana
3,Encamp,encamp,encamp
4,Canillo,canillo,canillo


In [107]:
divisions = divisions.query("name == ''")

In [108]:
divisions['name'] = divisions.apply(lambda x: get_close_match(x['canonicalName'], geo_divisions['canonicalName']), axis=1)

In [109]:
divisions_close_match = divisions.query("name != ''")

In [110]:
divisions_close_match.head(300)

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
4,Zhongxian,zhongxian,zhongxian,zhongxiang
5,Huaian,huaian,huaian,huainan
8,Crépy-en -Valois,crepy en valois,crepy en valois,crepy en valois
10,Northamtonshire,northamtonshire,northamtonshire,northamptonshire
11,San Francisco County,san francisco county,san francisco county,saint francis county
19,Thiensvill,thiensvill,thiensvill,thiensville
23,Tehatta,tehatta,tehatta,thatta
34,Southeast,southeast,southeast,south east
39,Sagunt,sagunt,sagunt,sagunto
40,Lutxent,lutxent,lutxent,llutxent


### Locations that do not match

In [111]:
divisions_no_match = divisions.query("name == ''")

In [112]:
divisions_no_match.shape

(217, 4)

In [113]:
divisions_no_match.head(300)

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
6,Contamines,contamines,contamines,
9,South Yorkshire,south yorkshire,south yorkshire,
12,Meudon la Forêt,meudon la foret,meudon la foret,
13,Bronx,bronx,bronx,
14,Jian,jian,jian,
16,Xishuangbanna,xishuangbanna,xishuangbanna,
17,Campbellsp,campbellsp,campbellsp,
18,South Milw,south milw,south milw,
20,Port Washi,port washi,port washi,
21,Unknown County,unknown county,unknown county,
