# Cities

**[Work in progress]**

This notebook creates a .csv file with city information (population > 1000) for ingestion into the Knowledge Graph.

Data source: [GeoNames.org](https://download.geonames.org/export/dump/)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
from pathlib import Path
import pandas as pd

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [3]:
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-9f7418e6-ef5d-4a2d-ae16-29a5a6814849/installation-4.1.0/import


### Read City data

In [4]:
names = [
        'geonameid','name','asciiname','alternatenames','latitude','longitude','feature class',
        'feature code','country code','cc2','admin1 code','admin2 code','admin3 code','admin4 code',
        'population','elevation','dem','timezone','modification date'
]

Read city data (population > 15,000)

In [5]:
url = 'https://download.geonames.org/export/dump/cities15000.zip'
file_name = "cities15000.txt"
resp = urlopen(url)
zipfile = ZipFile(BytesIO(resp.read()))
city_15k = pd.read_csv(zipfile.open(file_name), sep="\t", low_memory=False, names=names)

Read city data (population > 5,000)

In [6]:
url = 'https://download.geonames.org/export/dump/cities5000.zip'
file_name = "cities5000.txt"
resp = urlopen(url)
zipfile = ZipFile(BytesIO(resp.read()))
city_5k = pd.read_csv(zipfile.open(file_name), sep="\t", low_memory=False, names=names)

Read city data (population > 1000)

In [7]:
url = 'https://download.geonames.org/export/dump/cities1000.zip'
file_name = "cities1000.txt"
resp = urlopen(url)
zipfile = ZipFile(BytesIO(resp.read()))
city_1k = pd.read_csv(zipfile.open(file_name), sep="\t", low_memory=False, names=names)

In [8]:
# TODO read city data (population > 500)

In [9]:
city = pd.concat([city_15k, city_5k, city_1k])

In [10]:
city = city[['geonameid', 'asciiname', 'country code', 'admin1 code', 'admin2 code']]
city.fillna('', inplace=True)

#### Remove duplicates

In [11]:
city.drop_duplicates('geonameid', inplace=True)

In [12]:
print('Number of cities', city.shape[0])

Number of cities 137497


In [13]:
def get_location_id(country, admin1, admin2):
    location = country
    if admin1 != '':
        location = location + '.' + admin1
    if admin2 != '':
        location = location + '.' + admin2
        
    return location

### Standardize column names for Knowlege Graph
* id: unique identifier for country
* name: name of node
* parentId: unique identifier for continent
* properties: camelCase

In [14]:
city.rename(columns={'geonameid': 'geonameId'}, inplace=True)
city['id'] = city['geonameId']
city.rename(columns={'asciiname': 'name'}, inplace=True)
city['parentId'] = city.apply(lambda row: get_location_id(row['country code'], 
                                                         row['admin1 code'], 
                                                         row['admin2 code']), axis=1)

### Example

In [15]:
city.query("name == 'San Diego'")

Unnamed: 0,geonameId,name,country code,admin1 code,admin2 code,id,parentId
4337,3621926,San Diego,CR,02,303.0,3621926,CR.02.303
23384,5391811,San Diego,US,CA,73.0,5391811,US.CA.073
8212,3669947,San Diego,CO,10,20750.0,3669947,CO.10.20750
53075,3590312,San Diego,GT,22,,3590312,GT.22
53461,3602368,San Diego,HN,07,,3602368,HN.07
81073,3827294,San Diego,MX,17,,3827294,MX.17
81232,3973609,San Diego,MX,24,24.0,3973609,MX.24.024
81922,3987339,San Diego,MX,25,6.0,3987339,MX.25.006
83547,4024589,San Diego,MX,11,37.0,4024589,MX.11.037
84474,8858713,San Diego,MX,21,174.0,8858713,MX.21.174


### Export a minimum subset for now

In [16]:
city = city[['id', 'name', 'parentId', 'geonameId']]
city.fillna('', inplace=True)

In [17]:
city.head()

Unnamed: 0,id,name,parentId,geonameId
0,3040051,les Escaldes,AD.08,3040051
1,3041563,Andorra la Vella,AD.07,3041563
2,290594,Umm Al Quwain City,AE.07,290594
3,291074,Ras Al Khaimah City,AE.05,291074
4,291580,Zayed City,AE.01.103,291580


In [18]:
city.to_csv(NEO4J_IMPORT / "00h-GeoNamesCity.csv", index=False)