# Add data from GeoNames

**[Work in progress]**

This notebook adds latitude, longitude, elevation, and population data from GeoNames to Country, Admin1, Admin2, and City .csv files for ingestion into the Knowledge Graph.

Data source: [GeoNames.org](https://download.geonames.org/export/dump/)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
from pathlib import Path
from io import BytesIO
import io
import csv
import requests
from zipfile import ZipFile
import pandas as pd

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [3]:
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-19636412-9e74-4bac-8a4c-c6c8b49bb9d3/installation-4.1.0/import


### Download data from GeoNames

In [4]:
country_url = 'https://download.geonames.org/export/dump/allCountries.zip'

In [5]:
content = requests.get(country_url)
zf = ZipFile(BytesIO(content.content))

for item in zf.namelist():
    print("File in zip: "+  item)

File in zip: allCountries.txt


In [6]:
# Intermediate data file is cached here
CACHE = Path(NEO4J_IMPORT / 'cache')
CACHE.mkdir(exist_ok=True)

### Cache cleaned-up data
Fix encoding and extract features for class A (country, state, region,...) and P(city, village,...)

In [7]:
encoding = 'utf-8'
path = CACHE / 'allCountries.csv'

In [8]:
try:
    with zf.open('allCountries.txt') as readfile:
        with open(path, "w") as file_out:
            writer = csv.writer(file_out)
            for line in io.TextIOWrapper(readfile, encoding):
                row = line.strip().split("\t")
                if row[6] == 'A' or row[6] == 'P':
                    writer.writerow([row[0], row[4], row[5], row[14], row[15]])
except:
    print('Download of allCountries.txt failed, using cached version of data')

### Read cleaned-up data file from Cache
If data download failed, the cached file from a previous run is used.

In [9]:
columns = ['geonameId', 'latitude', 'longitude', 'population', 'elevation']

In [10]:
df = pd.read_csv(path, names=columns, dtype='str', header=0)
df.fillna('', inplace=True)

In [11]:
# Missing population data are sometimes represented as zero, these will be ignored.
df['population'] = df['population'].str.replace('0', '')

### Add latitude, longitude, and elevation data for countries
Note, elevation data are excluded here since they are missing for countries.

In [12]:
dfc = df[['geonameId', 'latitude', 'longitude', 'population']]

In [13]:
dfc.head()

Unnamed: 0,geonameId,latitude,longitude,population
0,3038832,42.53176,1.56654,
1,3038899,42.48597,1.4891,
2,3038987,42.56461,1.52757,
3,3038999,42.57688,1.66769,62.0
4,3039039,42.53695,1.58068,


In [14]:
country = pd.read_csv(NEO4J_IMPORT / "00e-GeoNamesCountry.csv", dtype='str')

In [15]:
country = pd.merge(country, dfc, on='geonameId', how='left')
country.fillna('', inplace=True)

Fix id and iso code for Namibia (NA is interpreted as NaN in Pandas!)

In [16]:
# reset the id and iso code for Namibi.
index = country.query("iso3 == 'NAM'").index
country.at[index, 'iso'] = 'NA'
country.at[index, 'id'] = 'NA'

In [17]:
country.to_csv(NEO4J_IMPORT / "00e-GeoNamesCountry.csv", index=False)

In [18]:
country.head(300)

Unnamed: 0,id,name,iso,iso3,isoNumeric,areaSqKm,geonameId,latitude,longitude,population
0,AD,Andorra,AD,AND,20,468.0,,,,
1,AE,United Arab Emirates,AE,ARE,784,82880.0,290557.0,23.75,54.5,963959.0
2,AF,Afghanistan,AF,AFG,4,647500.0,1149361.0,33.0,66.0,37172386.0
3,AG,Antigua and Barbuda,AG,ATG,28,443.0,3576396.0,17.05,-61.8,96286.0
4,AI,Anguilla,AI,AIA,660,102.0,3573511.0,18.21667,-63.05,13254.0
5,AL,Albania,AL,ALB,8,28748.0,,,,
6,AM,Armenia,AM,ARM,51,29800.0,,,,
7,AO,Angola,AO,AGO,24,1246700.0,3351879.0,-12.5,18.5,389762.0
8,AQ,Antarctica,AQ,ATA,10,14000000.0,6697173.0,-82.67628,8.78906,
9,AR,Argentina,AR,ARG,32,2766890.0,,,,


### Add latitude, longitude, elevation, and population data for admin1 divisions

In [19]:
admin1 = pd.read_csv(NEO4J_IMPORT / "00f-GeoNamesAdmin1.csv", dtype='str')
admin1 = pd.merge(admin1, df, on='geonameId', how='left')
admin1.fillna('', inplace=True)
admin1.to_csv(NEO4J_IMPORT / "00f-GeoNamesAdmin1.csv", index=False)

In [20]:
admin1.head()

Unnamed: 0,id,name,code,parentId,geonameId,latitude,longitude,population,elevation
0,AD.06,Sant Julia de Loria,6,AD,3039162,42.46247,1.48247,9448,
1,AD.05,Ordino,5,AD,3039676,42.59758,1.52573,3467,
2,AD.04,La Massana,4,AD,3040131,42.55417,1.48333,8953,
3,AD.03,Encamp,3,AD,3040684,42.53333,1.63333,13685,
4,AD.02,Canillo,2,AD,3041203,42.58333,1.65833,567,


### Add latitude, longitude, elevation, and population data for admin2 divisions

In [21]:
admin2 = pd.read_csv(NEO4J_IMPORT / "00g-GeoNamesAdmin2.csv", dtype='str')
admin2 = pd.merge(admin2, df, on='geonameId', how='left')
admin2.fillna('', inplace=True)
admin2.to_csv(NEO4J_IMPORT / "00g-GeoNamesAdmin2.csv", index=False)

In [22]:
admin2.head()

Unnamed: 0,id,name,geonameId,parentId,latitude,longitude,population,elevation
0,AE.01.101,Abu Dhabi Municipality,12047239,AE.01,24.41361,54.43295,1797294,
1,AE.01.102,Al Ain Municipality,12047240,AE.01,24.15223,55.8204,776935,
2,AE.01.103,Al Dhafra,12047241,AE.01,23.65745,53.72225,319433,
3,AE.04.701,Al Fujairah Municipality,12047242,AE.04,25.13557,56.33279,168822,
4,AE.04.702,Dibba Al Fujairah Municipality,12047243,AE.04,25.5858,56.24792,67989,


### Add latitude, longitude, elevation, and population data for cities

In [23]:
city = pd.read_csv(NEO4J_IMPORT / "00h-GeoNamesCity.csv", dtype='str')
city = pd.merge(city, df, on='geonameId', how='left')
city.fillna('', inplace=True)
city.to_csv(NEO4J_IMPORT / "00h-GeoNamesCity.csv", index=False)

In [24]:
city.head()

Unnamed: 0,id,name,parentId,geonameId,latitude,longitude,population,elevation
0,3040051,les Escaldes,AD.08,3040051,42.50729,1.53414,15853,
1,3041563,Andorra la Vella,AD.07,3041563,42.50779,1.52109,243,
2,290594,Umm Al Quwain City,AE.07,290594,25.56473,55.55517,62747,
3,291074,Ras Al Khaimah City,AE.05,291074,25.78953,55.9432,351943,
4,291580,Zayed City,AE.01.103,291580,23.65416,53.70522,63482,
