# Assigns standardized locations to entities in the KG
**[Work in progress]**

This notebook standardizes location information for ingestion into a Knowledge Graph.

Author: Peter Rose (pwrose@ucsd.edu), Braden Riggs 

In [1]:
import os
import pandas as pd
from pathlib import Path
import time
import unidecode
import difflib

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [3]:
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


### Import data sources that have an `origLocation` property

In [4]:
df2 = pd.read_csv(NEO4J_IMPORT / '01c-CNCBStrain.csv', dtype='str', usecols=['origLocation'])
df3 = pd.read_csv(NEO4J_IMPORT / '02a-JHUCasesGlobal.csv', dtype='str', usecols=['origLocation'])
df4 = pd.read_csv(NEO4J_IMPORT / '02b-CDSCases.csv', dtype='str', usecols=['origLocation'])
df5 = pd.read_csv(NEO4J_IMPORT / '02d-GOBMXCasesAdmin1.csv', dtype='str', usecols=['origLocation'])
df6 = pd.read_csv(NEO4J_IMPORT / '02d-GOBMXCasesAdmin2.csv', dtype='str', usecols=['origLocation'])

df = pd.concat([df2, df3, df4, df5, df6])

In [5]:
df.drop_duplicates(inplace=True)
df.fillna('', inplace=True)

In [6]:
df.shape

(9821, 1)

In [7]:
df.head()

Unnamed: 0,origLocation
0,"China,Hubei"
1,"Thailand,Nonthaburi Province"
3,"China,Hubei,Wuhan"
6,"Japan,Kanagawa Prefecture"
8,"China,Zhejiang"


In [8]:
df[df['origLocation'].str.contains('D.C.')].head()

Unnamed: 0,origLocation
274903,"United States,Washington, D.C."
275124,"United States,Washington, D.C.,District of Col..."


### Load custom GeoName mappings

In [9]:
ref = pd.read_csv("../../reference_data/GeoNameMapping.csv", usecols=['origName', 'geoName'], comment='#', dtype='str')

In [10]:
ref.head()

Unnamed: 0,origName,geoName
0,Burma,Myanmar
1,Cape Verde,Cabo Verde
2,Congo (Kinshasa),Democratic Republic of the Congo
3,Congo (Brazzaville),Republic of the Congo
4,Congo-Brazzaville,Republic of the Congo


In [11]:
name_list = ref[['origName', 'geoName']].values.tolist()

In [12]:
def fix_locations(text):
    for loc in name_list:
        if text.startswith(loc[0]):
            # full length match or match at a comma
            if len(text) == len(loc[0]) or text[len(loc[0])] == ',':
                text = text.replace(loc[0], loc[1])
    return text

### Apply custom GeoName mappings

In [13]:
df['geoLocation'] = df['origLocation'].apply(lambda x: fix_locations(x))

In [14]:
df.query('geoLocation != origLocation').head()

Unnamed: 0,origLocation,geoLocation
24,"China,Taiwan,Kaohsiung","Taiwan,Kaohsiung"
100,"China,Taiwan,Taipei","Taiwan,Taipei"
119,"China,Hong Kong",Hong Kong
156,"France,Rhone-Alpes,Contamines","France,Auvergne-Rhone-Alpes,Contamines"
333,"Netherlands,Nootdorp","Netherlands,Gemeente Pijnacker-Nootdorp"


### Match Cruiseships

In [15]:
cruiseships = df[df['geoLocation'].str.startswith('CruiseShip,')].copy()

In [16]:
cruiseships.head(10)

Unnamed: 0,origLocation,geoLocation
18012,"Italy,Cruise ship","CruiseShip,Cruise ship"
74606,"United States,Cruise_Ship_1,California","CruiseShip,Cruise_Ship_1"
74612,"United States,Cruise_Ship_2,California","CruiseShip,Cruise_Ship_2"
864,Diamond Princess,"CruiseShip,Diamond Princess"
4103,"Canada,Grand Princess","CruiseShip,Grand Princess"
6003,"Canada,Diamond Princess","CruiseShip,Diamond Princess"
7590,MS Zaandam,"CruiseShip,MS Zaandam"


In [17]:
cruiseships = cruiseships[['origLocation', 'geoLocation']]
cruiseships.to_csv(NEO4J_IMPORT / "10a-GeoLinkCruiseShip.csv", index=False)

##### Remove Cruiseship data from data frame

In [18]:
df = df[~df['geoLocation'].str.startswith('CruiseShip,')].copy()

In [19]:
df.shape

(9814, 2)

In [20]:
df.head(100)

Unnamed: 0,origLocation,geoLocation
0,"China,Hubei","China,Hubei"
1,"Thailand,Nonthaburi Province","Thailand,Nonthaburi Province"
3,"China,Hubei,Wuhan","China,Hubei,Wuhan"
6,"Japan,Kanagawa Prefecture","Japan,Kanagawa Prefecture"
8,"China,Zhejiang","China,Zhejiang"
10,"Asia,China,Guangdong,Zhuhai","Asia,China,Guangdong,Zhuhai"
11,"China,Guangdong Province,Shenzhen City","China,Guangdong Province,Shenzhen City"
15,"China,Guangdong,Shenzhen","China,Guangdong,Shenzhen"
17,"United States,Washington,Snohomish County","United States,Washington,Snohomish County"
18,"United States,Illinois,Chicago","United States,Illinois,Chicago"


In [21]:
def remove_special_characters(text):
    text = str(text)
    text = text.lower()
    # transliterate umlauts
    text = text.replace('ü', 'ue') 
    text = text.replace('ä', 'ae') 
    text = text.replace('ö', 'oe')
    # transliterate Unicode string into the closest possible ASCII representation
    text = unidecode.unidecode(text)
    text = text.replace("-"," ")
    text = text.replace("_"," ")
    text = text.replace(","," ")
    text = text.replace("."," ")
    text = text.replace('\d+', '')
    text = text.strip()

    return text

In [22]:
def remove_affix(text):
    text = str(text)
    text = text.lower()
    # try match without prefix or suffix
    text = text.replace("region", "")
    text = text.replace("oblast", "") # regions in Russia
    text = text.replace("al ", "") # regions in Saudi Arabia
    text = text.replace("prefecture", "") # regions in Japan
    text = text.replace("province", "") # regions in China
    text = text.replace("district", "")
    text = text.replace("canton", "") # regions in Switzerland
    text = text.replace("municipality", "")
    text = text.replace("municipio", "")  # regions in Mexico
    text = text.replace("metropolitan", "")
    text = text.replace("voivodeship", "")
    text = text.replace("state", "")
    text = text.replace('\d+', '')
    text = text.strip()
    return text

In [23]:
def remove_affix2(text):
    text = str(text)
    text = text.lower()
    # try match without prefix or suffix
    text = text.replace("city", "")
    text = text.replace("region", "")
    text = text.replace("county", "")
    text = text.replace("oblast", "") # regions in Russia
    text = text.replace("al ", "") # regions in Saudi Arabia
    text = text.replace("prefecture", "") # regions in Japan
    text = text.replace("province", "") # regions in China
    text = text.replace("district", "")
    text = text.replace("parish", "")
    text = text.replace("gemeente", "") # regions in the Netherlands
    text = text.replace("wahlkreis", "") # regions in Switzerland
    text = text.replace("canton", "") # regions in Switzerland
    text = text.replace("municipality", "")
    text = text.replace("municipio", "") # regions in Mexico
    text = text.replace("metropolitan", "")
    text = text.replace("voivodeship", "")
    text = text.replace("novads", "")
    text = text.replace("state", "")
    text = text.replace("st ", "")
    text = text.replace('\d+', '')
    text = text.strip()
    return text

In [24]:
def get_close_match(x, name):
    z = difflib.get_close_matches(x, name, n=1, cutoff=0.9)
    if z:
        return z[0]
    else:
        return ''

In [25]:
df.shape

(9814, 2)

In [26]:
df['locationLevels'] = df['geoLocation'].str.count(',')
df[['loc0', 'loc1', 'loc2', 'loc3']] = df['geoLocation'].str.split(',', n=3, expand=True)
df.fillna('', inplace=True)

## Match Countries

In [27]:
countries = pd.DataFrame(df['loc0'].copy())
countries.drop_duplicates(inplace=True)
countries.dropna(inplace=True)
countries.columns=['origName']
countries['canonicalName'] = countries.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [28]:
countries.shape

(228, 2)

In [29]:
countries.head()

Unnamed: 0,origName,canonicalName
0,China,china
1,Thailand,thailand
6,Japan,japan
10,Asia,asia
17,United States,united states


In [30]:
geo_countries = pd.read_csv(NEO4J_IMPORT / "00e-GeoNamesCountry.csv", dtype='str', usecols=['name'])

In [31]:
geo_countries['canonicalName'] = geo_countries.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [32]:
countries = countries.merge(geo_countries, on='canonicalName', how='left')
countries.fillna('', inplace=True)

### Exact country matches

In [33]:
country_exact_match = countries.query("name != ''").copy()

In [34]:
country_exact_match.shape

(223, 3)

In [35]:
country_no_match = countries.query("name == ''").copy()

In [36]:
country_no_match.head(10)

Unnamed: 0,origName,canonicalName,name
3,Asia,asia,
105,Democratic Republic of Congo,democratic republic of congo,
131,Palestine,palestine,
133,CotedIvoire,cotedivoire,
135,Czech Repubic,czech repubic,


### Match Admin1

In [37]:
admin1 = pd.DataFrame(df['loc1'].copy())
admin1.drop_duplicates(inplace=True)
admin1.dropna(inplace=True)
admin1.columns=['origName']
admin1['canonicalName'] = admin1.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [38]:
geo_admin1 = pd.read_csv(NEO4J_IMPORT / "00f-GeoNamesAdmin1.csv", dtype='str', usecols=['name'])

In [39]:
geo_admin1['canonicalName'] = geo_admin1.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [40]:
admin1 = admin1.merge(geo_admin1, on='canonicalName', how='left')
admin1.fillna('', inplace=True)

In [41]:
admin1_exact_match = admin1.query("name != ''").copy()
admin1_exact_match.drop_duplicates(inplace=True)

In [42]:
admin1_exact_match.shape

(1037, 3)

In [43]:
admin1_exact_match.head(10)

Unnamed: 0,origName,canonicalName,name
0,Hubei,hubei,Hubei
3,Zhejiang,zhejiang,Zhejiang
6,Guangdong,guangdong,Guangdong
7,Washington,washington,Washington
8,Illinois,illinois,Illinois
10,California,california,California
11,Arizona,arizona,Arizona
13,Ile-de-France,ile de france,Ile-de-France
14,Yunnan,yunnan,Yunnan
15,Victoria,victoria,Victoria


##### Remaining locations that don't match a GeoName location

In [44]:
admin1 = admin1.query("name == ''").copy()
admin1.drop('name', axis=1, inplace=True)

In [45]:
admin1.shape

(3813, 2)

In [46]:
admin1['canonicalNameNoAffix'] = admin1.apply(lambda x: remove_affix(x['canonicalName']), axis=1)

In [47]:
geo_admin1['canonicalNameNoAffix'] = geo_admin1.apply(lambda x: remove_affix(x['canonicalName']), axis=1)
admin1 = admin1.merge(geo_admin1[['name', 'canonicalNameNoAffix']], on='canonicalNameNoAffix', how='left')
admin1.fillna('', inplace=True)

In [48]:
admin1_no_affix_match = admin1.query("name != ''")

In [49]:
admin1_no_affix_match.shape

(272, 4)

In [50]:
admin1_no_affix_match.head()

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
0,Nonthaburi Province,nonthaburi province,nonthaburi,Nonthaburi
1,Kanagawa Prefecture,kanagawa prefecture,kanagawa,Kanagawa
3,Guangdong Province,guangdong province,guangdong,Guangdong
11,Kerala State,kerala state,kerala,Kerala
57,Riyadh,riyadh,riyadh,Riyadh Region


In [51]:
admin1_no_match = admin1.query("name == ''").copy()
admin1_no_match.query("origName != ''", inplace=True)

In [52]:
admin1_no_match.shape

(3548, 4)

In [53]:
admin1_no_match.head()

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
2,China,china,china,
4,Kaohsiung,kaohsiung,kaohsiung,
6,Rome,rome,rome,
7,Leuven,leuven,leuven,
8,Kathmandu,kathmandu,kathmandu,


### Match Admin2

In [54]:
admin2 = pd.DataFrame(df['loc2'].copy())
admin2.columns=['origName']
admin2.query("origName != ''", inplace=True)
admin2.head()

Unnamed: 0,origName
3,Wuhan
10,Guangdong
11,Shenzhen City
15,Shenzhen
17,Snohomish County


In [55]:
a1_no_match = pd.DataFrame(admin1_no_match['origName']).copy()

In [56]:
a1_no_match.shape

(3548, 1)

In [57]:
a1_no_match.head()

Unnamed: 0,origName
2,China
4,Kaohsiung
6,Rome
7,Leuven
8,Kathmandu


In [58]:
# carry over regions from location level 1 that might be Admin2s or Cities
admin2 = pd.concat([admin2,a1_no_match])
admin2.drop_duplicates(inplace=True)
admin2.dropna(inplace=True)
admin2.columns=['origName']
admin2['canonicalName'] = admin2.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [59]:
geo_admin2 = pd.read_csv(NEO4J_IMPORT / "00g-GeoNamesAdmin2.csv", dtype='str', usecols=['name'])

In [60]:
geo_admin2['canonicalName'] = geo_admin2.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [61]:
admin2 = admin2.merge(geo_admin2, on='canonicalName', how='left')
admin2.fillna('', inplace=True)

In [62]:
admin2_exact_match = admin2.query("name != ''").copy()
admin2_exact_match.drop_duplicates(inplace=True)

In [63]:
admin2_exact_match.shape

(4659, 3)

In [64]:
admin2_exact_match.head()

Unnamed: 0,origName,canonicalName,name
3,Shenzhen,shenzhen,Shenzhen
4,Snohomish County,snohomish county,Snohomish County
7,Orange County,orange county,Orange County
17,Paris,paris,Paris
22,Gold Coast,gold coast,Gold Coast


##### Remaining locations that don't match a GeoName location

In [65]:
admin2_no_match = admin2.query("name == ''").copy()

In [66]:
admin2_no_match.shape

(2123, 3)

### Match Cities

In [67]:
city = pd.DataFrame(df['loc3'].copy())
city.columns=['origName']
city.query("origName != ''", inplace=True)
city.head(100)

Unnamed: 0,origName
10,Zhuhai
571,Sheffield
2513,Manhattan
3471,Brooklyn
4084,Queens
12715,Havlickuv Brod
17978,St Anton am Arlberg
60296,Bilbao
64551,Whitehall
77482,Caraman


In [68]:
a2_no_match = pd.DataFrame(admin2_no_match['origName'])

In [69]:
# carry over regions from location level 1 that might be Admin2s or Cities
city = pd.concat([a2_no_match, city])
city.drop_duplicates(inplace=True)
city.dropna(inplace=True)
city.columns=['origName']
city['canonicalName'] = city.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [70]:
geo_city = pd.read_csv(NEO4J_IMPORT / "00h-GeoNamesCity.csv", dtype='str', usecols=['name'])

In [71]:
geo_city['canonicalName'] = geo_city.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [72]:
city = city.merge(geo_city, on='canonicalName', how='left')
city.fillna('', inplace=True)

In [73]:
city_exact_match = city.query("name != ''").copy()
city_exact_match.drop_duplicates(inplace=True)

In [74]:
city_exact_match.shape

(1137, 3)

In [75]:
city_exact_match.head()

Unnamed: 0,origName,canonicalName,name
0,Wuhan,wuhan,Wuhan
3,Chicago,chicago,Chicago
4,Los Angeles,los angeles,Los Angeles
13,Phoenix,phoenix,Phoenix
19,Clayton,clayton,Clayton


In [76]:
no_match = city.query("name == ''").copy()

In [77]:
no_match.shape

(1011, 3)

In [78]:
no_match.head()

Unnamed: 0,origName,canonicalName,name
1,Guangdong,guangdong,
2,Shenzhen City,shenzhen city,
17,Guangzhou City,guangzhou city,
18,Pu'er,pu'er,
40,Zhongxian,zhongxian,


### Match other divisions

In [79]:
divisions = no_match['origName']

In [80]:
divisions.drop_duplicates(inplace=True)
divisions.dropna(inplace=True)
divisions = divisions[divisions != '']
divisions.shape

(1011,)

In [81]:
divisions = pd.DataFrame(divisions)
divisions.columns=['origName']

In [82]:
divisions['canonicalName'] = divisions.apply(lambda x: remove_special_characters(x['origName']), axis=1)

In [83]:
divisions.head()

Unnamed: 0,origName,canonicalName
1,Guangdong,guangdong
2,Shenzhen City,shenzhen city
17,Guangzhou City,guangzhou city
18,Pu'er,pu'er
40,Zhongxian,zhongxian


In [84]:
geo_admin1 = pd.read_csv(NEO4J_IMPORT / "00f-GeoNamesAdmin1.csv", dtype='str', usecols=['name'])
geo_admin2 = pd.read_csv(NEO4J_IMPORT / "00g-GeoNamesAdmin2.csv", dtype='str', usecols=['name'])
geo_city = pd.read_csv(NEO4J_IMPORT / "00h-GeoNamesCity.csv", dtype='str', usecols=['name'])

In [85]:
geo_divisions = pd.concat([geo_admin1, geo_admin2, geo_city])

In [86]:
geo_divisions.drop_duplicates(inplace=True)

In [87]:
geo_divisions.shape

(154242, 1)

In [88]:
geo_divisions.dropna(inplace=True)
geo_divisions.columns=['name']
geo_divisions['canonicalName'] = geo_divisions.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [89]:
geo_divisions.head()

Unnamed: 0,name,canonicalName
0,Sant Julia de Loria,sant julia de loria
1,Ordino,ordino
2,La Massana,la massana
3,Encamp,encamp
4,Canillo,canillo


In [90]:
divisions['canonicalNameNoAffix'] = divisions.apply(lambda x: remove_affix2(x['canonicalName']), axis=1)

In [91]:
geo_divisions['canonicalNameNoAffix'] = geo_divisions.apply(lambda x: remove_affix2(x['canonicalName']), axis=1)
divisions = divisions.merge(geo_divisions[['name', 'canonicalNameNoAffix']], on='canonicalNameNoAffix', how='left')
divisions.fillna('', inplace=True)

In [92]:
divisions.head()

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
0,Guangdong,guangdong,guangdong,Guangdong
1,Shenzhen City,shenzhen city,shenzhen,Shenzhen
2,Guangzhou City,guangzhou city,guangzhou,Guangzhou
3,Pu'er,pu'er,pu'er,Pu'er City
4,Zhongxian,zhongxian,zhongxian,
5,Huaian,huaian,huaian,
6,Contamines,contamines,contamines,
7,Heinsberg District,heinsberg district,heinsberg,Heinsberg
8,Nazi Abad,nazi abad,nazi abad,
9,Crépy-en -Valois,crepy en valois,crepy en valois,


In [93]:
divisions_exact_match = divisions.query("name != ''")

In [94]:
divisions_exact_match.shape

(348, 4)

In [95]:
divisions_exact_match.head()

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
0,Guangdong,guangdong,guangdong,Guangdong
1,Shenzhen City,shenzhen city,shenzhen,Shenzhen
2,Guangzhou City,guangzhou city,guangzhou,Guangzhou
3,Pu'er,pu'er,pu'er,Pu'er City
7,Heinsberg District,heinsberg district,heinsberg,Heinsberg


### Collect all location matches

In [96]:
matches = pd.concat([country_exact_match, admin1_exact_match, admin1_no_affix_match, admin2_exact_match, city_exact_match, divisions_exact_match])

In [97]:
matches = matches[['origName', 'name']]

In [98]:
matches.shape

(7676, 2)

In [99]:
matches.tail()

Unnamed: 0,origName,name
1037,Västernorrland County,Vaesternorrland
1038,Jämtland County,Jaemtland
1049,Buri Ram Province,Buri Ram
1050,Suphan Buri Province,Suphan Buri
1055,Mykolaiv Oblast,Mykolaiv


In [100]:
matches.query("origName == 'Zhuhai City'")

Unnamed: 0,origName,name


In [101]:
name_list = matches[['origName', 'name']].values.tolist()
name_dict = {name: val for name, val in name_list}

In [102]:
name_dict.get('Zhuhai City')

#### Assign GeoNames for Countries

In [103]:
country_list = country_exact_match[['origName', 'name']].values.tolist()
country_dict = {name: val for name, val in country_list}

In [104]:
df['geoName0'] = df['loc0'].apply(lambda x: country_dict.get(x, ''))

#### Assign GeoNames for Admin1 divisions

In [105]:
admin1_match = pd.concat([admin1_exact_match, admin1_no_affix_match])
admin1_list = admin1_match[['origName', 'name']].values.tolist()
admin1_dict = {name: val for name, val in admin1_list}

In [106]:
admin1_dict.get("United States,New York")

In [107]:
df['geoName1'] = df['loc1'].apply(lambda x: admin1_dict.get(x, ''))

#### Assign GeoNames for Admin2 divisions

In [108]:
admin2_list = admin2_exact_match[['origName', 'name']].values.tolist()
admin2_dict = {name: val for name, val in admin2_list}

In [109]:
df['geoName2'] = df['loc2'].apply(lambda x: admin2_dict.get(x, ''))

#### Assign GeoNames for Cities

In [110]:
city_list = city_exact_match[['origName', 'name']].values.tolist()
city_dict = {name: val for name, val in city_list}

In [111]:
df['geoName3'] = df['loc3'].apply(lambda x: city_dict.get(x, ''))

In [112]:
df.query("origLocation == 'United States,New York'").head()

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3
556,"United States,New York","United States,New York",1,United States,New York,,,United States,New York,,


#### Assign best match for any unassigned locations

In [113]:
def get_name(loc, geoName):
    if geoName == '':
        return name_dict.get(loc, '')
    else:
        return geoName

In [114]:
df['geoName0'] = df.apply(lambda x: get_name(x['loc0'], x['geoName0']), axis=1)
df['geoName1'] = df.apply(lambda x: get_name(x['loc1'], x['geoName1']), axis=1)
df['geoName2'] = df.apply(lambda x: get_name(x['loc2'], x['geoName2']), axis=1)
df['geoName3'] = df.apply(lambda x: get_name(x['loc3'], x['geoName3']), axis=1)

In [115]:
df.head()

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3
0,"China,Hubei","China,Hubei",1,China,Hubei,,,China,Hubei,,
1,"Thailand,Nonthaburi Province","Thailand,Nonthaburi Province",1,Thailand,Nonthaburi Province,,,Thailand,Nonthaburi,,
3,"China,Hubei,Wuhan","China,Hubei,Wuhan",2,China,Hubei,Wuhan,,China,Hubei,Wuhan,
6,"Japan,Kanagawa Prefecture","Japan,Kanagawa Prefecture",1,Japan,Kanagawa Prefecture,,,Japan,Kanagawa,,
8,"China,Zhejiang","China,Zhejiang",1,China,Zhejiang,,,China,Zhejiang,,


In [116]:
df[(df['geoName1'] == '') & (df['geoName2'] != '')]

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3
2543,"France,Centre-Val de Loire,Tours","France,Centre-Val de Loire,Tours",2,France,Centre-Val de Loire,Tours,,France,,Tours,
5059,"France,Centre - Val de Loire,Abondant","France,Centre - Val de Loire,Abondant",2,France,Centre - Val de Loire,Abondant,,France,,Abondant,
17637,"India,Telengana,Hyderabad","India,Telengana,Hyderabad",2,India,Telengana,Hyderabad,,India,,Hyderabad,
54071,"Czech Republic,Hranice na Moravě,Prostějov","Czechia,Hranice na Moravě,Prostějov",2,Czechia,Hranice na Moravě,Prostějov,,Czechia,,Prostejov,
66567,"Oman,Dakhiliyah,Nizwa","Oman,Dakhiliyah,Nizwa",2,Oman,Dakhiliyah,Nizwa,,Oman,,Nizwa,
70884,"Panama,Panama West,Vista Alegre","Panama,Panama West,Vista Alegre",2,Panama,Panama West,Vista Alegre,,Panama,,Vista Alegre,
70898,"Panama,Panama West,Herrera","Panama,Panama West,Herrera",2,Panama,Panama West,Herrera,,Panama,,Herrera,
70934,"Panama,Panama West,Guadalupe","Panama,Panama West,Guadalupe",2,Panama,Panama West,Guadalupe,,Panama,,Guadalupe,
70951,"Panama,Panama West,Veracruz","Panama,Panama West,Veracruz",2,Panama,Panama West,Veracruz,,Panama,,Veracruz,
77483,"France,NA,Limoges","France,NA,Limoges",2,France,,Limoges,,France,,Limoges,


### Left align all geoNames

In [117]:
df[['geoName1','geoName2']] = df[['geoName1','geoName2']].apply(lambda x: x if x[0] != '' else [x[1],x[0]], axis=1)

In [118]:
# There are no cases that match this condition
# df[['geoName2','geoName3']] = df[['geoName2','geoName3']].apply(lambda x: x if x[0] != '' else [x[1],x[0]], axis=1)

In [119]:
df[(df['geoName2'] == '') & (df['geoName3'] != '')]

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3
12715,"Czech Republic,Vysocina Region,Bela - Ledec na...","Czechia,Vysocina Region,Bela - Ledec na Sazavo...",3,Czechia,Vysocina Region,Bela - Ledec na Sazavou,Havlickuv Brod,Czechia,Vysocina,,Havlickuv Brod
106865,"Czech Republic,Central Bohemian Region,Mlada B...","Czechia,Central Bohemian Region,Mlada Boleslav...",3,Czechia,Central Bohemian Region,Mlada Boleslav,Knezmost,Czechia,Mlada Boleslav,,Knezmost


In [120]:
def calc_location_levels(row):
    level = 3
    if row['geoName3'] == '':
        level = 2
    if row['geoName2'] == '':
        level = 1
    if row['geoName1'] == '':
        level = 0
    return level

In [121]:
df['locationLevels'] = df.apply(lambda x: calc_location_levels(x), axis=1)

In [122]:
df.query("origLocation == 'North Macedonia,Pelagonia,Prilep'")

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3


In [123]:
df[(df['geoName1'] == '') & (df['geoName2'] != '')]

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3


Test

In [124]:
df.query("origLocation == 'Austria,St. Anton'")

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3
17978,"Austria,St. Anton","Austria,Tyrol,Politischer Bezirk Landeck,St An...",3,Austria,Tyrol,Politischer Bezirk Landeck,St Anton am Arlberg,Austria,Tyrol,Politischer Bezirk Landeck,St Anton am Arlberg


In [125]:
df.query("geoName1 == 'District of Columbia'").head()

Unnamed: 0,origLocation,geoLocation,locationLevels,loc0,loc1,loc2,loc3,geoName0,geoName1,geoName2,geoName3
8044,"United States,DC","United States,District of Columbia",1,United States,District of Columbia,,,United States,District of Columbia,,
37022,"United States,Washington DC","United States,District of Columbia",1,United States,District of Columbia,,,United States,District of Columbia,,
85058,"United States,District of Columbia,Distrsict o...","United States,District of Columbia",1,United States,District of Columbia,,,United States,District of Columbia,,
161305,"United States,District of Columbia","United States,District of Columbia",1,United States,District of Columbia,,,United States,District of Columbia,,
274903,"United States,Washington, D.C.","United States,District of Columbia",1,United States,District of Columbia,,,United States,District of Columbia,,


In [126]:
df = df[(df['origLocation'] != '') & (df['geoName0'] != '')]
df = df[['origLocation', 'geoName0', 'geoName1', 'geoName2', 'geoName3', 'locationLevels']]
df.to_csv(NEO4J_IMPORT / "10a-GeoLink.csv", index=False)

### Remaining locations that do not match GeoNames.org locations

In [127]:
geo_divisions['canonicalName'] = geo_divisions.apply(lambda x: remove_special_characters(x['name']), axis=1)

In [128]:
geo_divisions.head()

Unnamed: 0,name,canonicalName,canonicalNameNoAffix
0,Sant Julia de Loria,sant julia de loria,sant julia de loria
1,Ordino,ordino,ordino
2,La Massana,la massana,la massana
3,Encamp,encamp,encamp
4,Canillo,canillo,canillo


In [129]:
divisions = divisions.query("name == ''")

In [130]:
divisions['name'] = divisions.apply(lambda x: get_close_match(x['canonicalName'], geo_divisions['canonicalName']), axis=1)

In [131]:
divisions_close_match = divisions.query("name != ''")

In [132]:
divisions_close_match.shape

(180, 4)

In [133]:
divisions_close_match.head(300)

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
4,Zhongxian,zhongxian,zhongxian,zhongxiang
5,Huaian,huaian,huaian,huainan
9,Crépy-en -Valois,crepy en valois,crepy en valois,crepy en valois
11,Northamtonshire,northamtonshire,northamtonshire,northamptonshire
12,Nottinghamhisre,nottinghamhisre,nottinghamhisre,nottinghamshire
24,Tehatta,tehatta,tehatta,thatta
38,Southeast,southeast,southeast,south east
43,Sagunt,sagunt,sagunt,sagunto
45,Lutxent,lutxent,lutxent,llutxent
51,Togui,togui,togui,toguei


### Locations that do not match

In [134]:
divisions_no_match = divisions.query("name == ''")

In [135]:
divisions_no_match.shape

(551, 4)

In [136]:
divisions_no_match.head(500)

Unnamed: 0,origName,canonicalName,canonicalNameNoAffix,name
6,Contamines,contamines,contamines,
8,Nazi Abad,nazi abad,nazi abad,
10,South Yorkshire,south yorkshire,south yorkshire,
15,Meudon la Forêt,meudon la foret,meudon la foret,
16,Grand Princess cruise ship,grand princess cruise ship,grand princess cruise ship,
17,Jian,jian,jian,
20,Xishuangbanna,xishuangbanna,xishuangbanna,
21,Grand Princess,grand princess,grand princess,
22,Unknown County,unknown county,unknown,
23,Darjeeling,darjeeling,darjeeling,
