# Create Lists of Cities

**[Work in progress]**

This notebook creates a list of cities for ingestion into a Knowledge Graph.

Data source: [GeoNames.org](https://download.geonames.org/export/dump/)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
from pathlib import Path
import pandas as pd

In [2]:
# Lift the pandas display limits so that frames are rendered in full.
pd.set_option('display.max_rows', None)     # display all rows
pd.set_option('display.max_columns', None)  # display all columns

In [3]:
# Root directory of the Neo4j installation, taken from the NEO4J_HOME
# environment variable. Fail early with an explicit message when the variable
# is missing; Path(None) would otherwise raise an opaque TypeError.
neo4j_home_env = os.getenv('NEO4J_HOME')
if neo4j_home_env is None:
    raise RuntimeError(
        "Environment variable NEO4J_HOME is not set; "
        "point it at the Neo4j installation directory before running this notebook."
    )
NEO4J_HOME = Path(neo4j_home_env)
print(NEO4J_HOME)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-4af96121-2328-4e2f-ba60-6d8b728a26d5/installation-4.0.3


### Read city data (population > 15,000)

In [4]:
# Download the GeoNames city archive and keep it in memory as a ZipFile.
url = 'https://download.geonames.org/export/dump/cities15000.zip'
file_name = "cities15000.txt"  # name of the tab-separated member inside the archive

# Use the response as a context manager so the HTTP connection is released
# as soon as the payload has been read into memory.
with urlopen(url) as resp:
    zipfile = ZipFile(BytesIO(resp.read()))

In [5]:
# Column names for the header-less, tab-separated GeoNames file.
names = [
        'geonameid','name','asciiname','alternatenames','latitude','longitude','feature class',
        'feature code','country code','cc2','admin1 code','admin2 code','admin3 code','admin4 code',
        'population','elevation','dem','timezone','modification date'
]

# keep_default_na=False / na_values=['']: by default pandas treats strings such
# as "NA" as missing values, which would erase the country code "NA" (Namibia)
# and corrupt the derived parentId. Only truly empty fields count as missing.
# The code columns are read as text so values like "073" keep their leading
# zeros regardless of type inference.
with zipfile.open(file_name) as city_file:
    city = pd.read_csv(
        city_file,
        sep="\t",
        low_memory=False,
        names=names,
        dtype={'country code': str, 'admin1 code': str, 'admin2 code': str},
        keep_default_na=False,
        na_values=[''],
    )

# Keep only the columns used downstream and replace missing values with ''.
city = city[['geonameid', 'asciiname', 'country code', 'admin1 code', 'admin2 code', 'population', 'elevation']]
city = city.fillna('')

In [6]:
# Report how many city records were loaded.
print('Number of cities', len(city))

Number of cities 24359


In [7]:
def get_location_id(country, admin1, admin2):
    """Build a dot-separated location identifier.

    Starts from ``country`` and appends ``admin1`` and then ``admin2``,
    each preceded by a '.', skipping any of the two that is the empty string.

    Examples: ('US', 'CA', '073') -> 'US.CA.073'; ('AD', '08', '') -> 'AD.08'.
    """
    location_id = country
    for code in (admin1, admin2):
        if code != '':
            location_id = location_id + '.' + code

    return location_id

### Standardize column names for Knowledge Graph
* id: unique identifier for city (GeoNames id)
* name: name of node
* parentId: unique identifier for the parent location (country, admin1, and admin2 codes joined by '.')
* properties: camelCase

In [8]:
# Rename the GeoNames columns to the Knowledge Graph conventions in one pass.
city.rename(columns={'geonameid': 'id', 'asciiname': 'name'}, inplace=True)

# Derive the parent location id from the country/admin1/admin2 codes of each row.
city['parentId'] = city[['country code', 'admin1 code', 'admin2 code']].apply(
    lambda codes: get_location_id(*codes), axis=1
)

### Example

In [9]:
# Show every city whose name is exactly 'San Diego'.
city[city['name'] == 'San Diego']

Unnamed: 0,id,name,country code,admin1 code,admin2 code,population,elevation,parentId
4215,3621926,San Diego,CR,02,303,16991,,CR.02.303
23213,5391811,San Diego,US,CA,73,1394928,20.0,US.CA.073


### Export a minimum subset for now

In [10]:
# Restrict the frame to the exported columns and blank out any missing values.
city = city[['id', 'name', 'population', 'elevation', 'parentId']].fillna('')

In [11]:
# Preview the first five rows of the export table.
city.head(n=5)

Unnamed: 0,id,name,population,elevation,parentId
0,3040051,les Escaldes,15853,,AD.08
1,3041563,Andorra la Vella,20430,,AD.07
2,290594,Umm Al Quwain City,62747,,AE.07
3,291074,Ras Al Khaimah City,351943,,AE.05
4,291580,Zayed City,63482,,AE.01.103


In [12]:
# Write the city table, without the index column, into the Neo4j import directory.
city.to_csv(
    path_or_buf=NEO4J_HOME / "import/00h-GeoNamesCity.csv",
    index=False,
)