# GeoNames preparation
Optionally download the GeoNames data for Switzerland, then project all records to Swiss coordinates (E and N) and save the result to a file.

In [1]:
import pandas as pd
import time
import os

# own modules
from gazmatch import gazetteers
from gazmatch import projection

In [2]:
# Directory that holds the GeoNames input file and receives the exported result
data_dir = "data"

### Optional: download latest GeoNames Switzerland data

In [None]:
# Imports are kept local to this cell because the download step is optional.
import urllib.request
from zipfile import ZipFile

geonames_ch_zip_url = r'http://download.geonames.org/export/dump/CH.zip'
geonames_ch_zip_local = os.path.join(data_dir, 'CH.zip')

# urlretrieve does not create missing directories, so make sure the target exists
os.makedirs(data_dir, exist_ok=True)

try:
    # download the archive, then unpack only the CH.txt member into data_dir
    urllib.request.urlretrieve(geonames_ch_zip_url, geonames_ch_zip_local)
    with ZipFile(geonames_ch_zip_local, 'r') as archive:
        archive.extract('CH.txt', data_dir)
finally:
    # always remove the temporary archive, even if the download or extraction failed
    if os.path.exists(geonames_ch_zip_local):
        os.remove(geonames_ch_zip_local)

### Load GeoNames

In [3]:
# Build the Swiss GeoNames gazetteer from the files in data_dir;
# its records are exposed as a DataFrame on the .df attribute.
geonames = gazetteers.GeoNamesCH(verbose=True, data_dir=data_dir)

# preview the first rows
geonames.df.head()

We have 67796 records in GeoNames for Switzerland and 19 columns.


Unnamed: 0,geonameid,name,asciiname,alternatenames,latitude,longitude,feature class,feature code,country code,cc2,admin1 code,admin2 code,admin3 code,admin4 code,population,elevation,dem,timezone,modification date
0,2657883,Zuger See,Zuger See,"Lac de Zoug,Lago di Zug,Lai da Zug,Lake Zug,La...",47.1313,8.48335,H,LK,CH,,00,,,,0,413.0,411,Europe/Zurich,2012-02-01
1,2657884,Zwischbergental,Zwischbergental,"Zwischberg-Thal,Zwischbergental",46.16667,8.13333,T,VAL,CH,CH,VS,,,,0,,1671,Europe/Zurich,2012-01-17
2,2657885,Zwischbergen,Zwischbergen,"Zwischbergen,ci wei shi bei gen,茨維施貝根",46.16366,8.11575,P,PPL,CH,,VS,2301.0,6011.0,,127,,1322,Europe/Zurich,2012-01-17
3,2657886,Zwingen,Zwingen,"Cvingen,ci wen gen,Цвинген,茨溫根",47.43825,7.53027,P,PPL,CH,,BL,1302.0,2793.0,,2162,,342,Europe/Zurich,2013-02-28
4,2657887,Zweisimmen,Zweisimmen,"Cvajzimmen,Zweisimmen,Zweisimmeni vald,ci wei ...",46.55539,7.37302,P,PPL,CH,,BE,248.0,794.0,,2813,,934,Europe/Zurich,2017-02-03


In [4]:
# Work on an independent (deep) copy so the gazetteer's own frame stays untouched.
geonames_copy = geonames.df.copy(deep=True)
print(geonames_copy.shape)

(67796, 19)


### Project all features in batches

In [5]:
# Project the records in fixed-size batches rather than in one single call.
max_features = 20000  # number of records per batch
total_records = geonames_copy.shape[0]
dfs = []  # one projected DataFrame per batch, appended in row order

for batch_start in range(0, total_records, max_features):
    print("Start index was %s" %batch_start)
    batch = geonames_copy.iloc[batch_start:batch_start + max_features]

    # (lat, lon) tuples in the row order of the batch
    latlon_coords = list(zip(batch['latitude'].tolist(), batch['longitude'].tolist()))
    print("Started with %s lat-lon coordinates." %len(latlon_coords))

    # time only the projection call itself
    tic = time.time()
    swiss_coords = projection.latlon_to_swiss_batch(latlon_coords)
    elapsed = time.time() - tic
    print("Ended up with %s swiss coordinates." %len(swiss_coords))
    print('Projection took %0.3fs' % elapsed)

    # expects every returned item to be an (E, N) pair; split them into two sequences
    eastings, northings = zip(*swiss_coords)

    # assign() returns a new frame, so the slice of geonames_copy is never mutated
    dfs.append(
        batch.assign(
            gn_E=pd.Series(list(eastings), index=batch.index),
            gn_N=pd.Series(list(northings), index=batch.index),
        )
    )

Start index was 0
Started with 20000 lat-lon coordinates.
Ended up with 20000 swiss coordinates.
Projection took 4.621s
Start index was 20000
Started with 20000 lat-lon coordinates.
Ended up with 20000 swiss coordinates.
Projection took 3.118s
Start index was 40000
Started with 20000 lat-lon coordinates.
Ended up with 20000 swiss coordinates.
Projection took 3.603s
Start index was 60000
Started with 7796 lat-lon coordinates.
Ended up with 7796 swiss coordinates.
Projection took 1.546s


### Concatenate the batches back into one DataFrame

In [6]:
# Stack the per-batch frames row-wise (axis 0 is the pandas default for concat).
df_all = pd.concat(dfs)
df_all.shape

(67796, 21)

In [7]:
# sanity check: select the rows whose projected coordinate is missing and print
# the shape of that selection (the row count should be 0 for both columns)
for coord_column in ('gn_E', 'gn_N'):
    missing_rows = df_all[df_all[coord_column].isnull()]
    print(missing_rows.shape)

(0, 21)
(0, 21)


In [8]:
# Write the projected records as UTF-8, tab-separated text without the index column.
path_filename_csv = os.path.join(data_dir, 'geonames_ch_swisscoords.csv')
df_all.to_csv(path_filename_csv, sep='\t', encoding='utf-8', index=False)