# 4. Clean and geocode locations
Clean the location strings found in the previous step and geocode them, i.e. convert each string into a structured result that includes a geometry such as point coordinates.

In [1]:
import os
import re
import pandas as pd

# own files
from pysci import docutils as du
from pysci import geoparse as gp
from pysci import geocode as gc

### Preliminaries

In [2]:
# Read the locations found in the previous step (tab-separated, UTF-8).
path_to_content_locations = os.path.join('results', 'locations.tsv')
df_locations = pd.read_csv(path_to_content_locations, sep='\t', quotechar='"', encoding='utf-8')
# The file was saved together with its row index, which pandas reads back as a
# stray 'Unnamed: 0' column. Drop any such column; this is a no-op if the file
# was written without an index. The .copy() gives an independent frame so that
# the columns added further down do not trigger a SettingWithCopyWarning.
is_stray_index_col = df_locations.columns.str.startswith('Unnamed')
df_locations = df_locations.loc[:, ~is_stray_index_col].copy()
df_locations.head()

Unnamed: 0,filename,content_locations,use_xml,location_sentences
0,Liu_et_al-2015-Insect_Conservation_and_Diversity,Xitiange,True,This study was conducted at Xitiange village i...
1,Liu_et_al-2015-Insect_Conservation_and_Diversity,Northeastern Miyun County (,True,This study was conducted at Xitiange village i...
2,Liu_et_al-2015-Insect_Conservation_and_Diversity,Beijing city,True,The area is situated about 70 km north of Beij...
3,Liu_et_al-2015-Insect_Conservation_and_Diversity,Xitiange,True,This study was conducted at Xitiange village i...
4,Liu_et_al-2015-Insect_Conservation_and_Diversity,Woodland,True,"Woodland was also planted with Populus spp., b..."


In [3]:
# Create the googlemaps.Client object: you will need an API key for this!
# The key is read from the GOOGLE_MAPS_API_KEY environment variable so that a real
# key never has to be saved inside the notebook; the placeholder is only a fallback.
gmaps = gc.create_google_geocoder(api_key=os.environ.get('GOOGLE_MAPS_API_KEY', 'YOUR-API-KEY-HERE'))

# Cache of geocoding results, so we don't re-query identical strings.
# Set load_saved_cache to True to start from the cache saved by an earlier run
# instead of an empty one.
load_saved_cache = False
if load_saved_cache:
    # NOTE(review): presumably this unpickles the file -- only load a cache you created yourself.
    local_cache_google = du.load_data('local_cache_google.pkl')
    print("The cache has values for %s string keys." %len(local_cache_google))
else:
    local_cache_google = {}

### Geocode locations

In [4]:
def _geocode_row(raw_location):
    """Geocode one location string.

    Returns a 5-tuple (clean text, latitude, longitude, formatted address,
    location type). The placeholder strings gp.NO_METHODS_STRING and
    gp.NO_LOCATIONS_STRING are not geocoded but copied into all five slots.
    When the geocoder has no top result, the last four slots hold
    gc.NO_RESULT_STRING.
    """
    if raw_location == gp.NO_METHODS_STRING or raw_location == gp.NO_LOCATIONS_STRING:
        return (raw_location,) * 5

    cleaned = gc.clean_for_geocode(raw_location)
    best = gc.geocode_with_cache_google(cleaned, gmaps, local_cache_google)
    if not best:
        return (cleaned,) + (gc.NO_RESULT_STRING,) * 4

    geometry = best['geometry']
    point = geometry['location']
    return (cleaned, point['lat'], point['lng'],
            best['formatted_address'], geometry['location_type'])


# one result list per output column, filled in row order
loc_clean, loc_lats, loc_lons, loc_strings, loc_types = [], [], [], [], []
result_lists = (loc_clean, loc_lats, loc_lons, loc_strings, loc_types)

for raw_location in df_locations['content_locations'].tolist():
    for result_list, value in zip(result_lists, _geocode_row(raw_location)):
        result_list.append(value)

print("Done.")

Done.


In [5]:
# Report how many strings are cached, then save the cache to disk
# so that it can be reused by a later run.
n_cached_keys = len(local_cache_google)
print("The cache now has values for %s string keys." %n_cached_keys)
du.pickle_data(local_cache_google, 'local_cache_google.pkl')

The cache now has values for 5 string keys.
pickled data at local_cache_google.pkl


True

### Tidy and export

In [6]:
# Attach the geocoding results to the locations table:
# one new column per result list, added in this order.
geocode_columns = [
    ('clean_content_loc', loc_clean),
    ('geocode_lat', loc_lats),
    ('geocode_lon', loc_lons),
    ('geocode_str', loc_strings),
    ('geocode_type', loc_types),
]
for column_name, column_values in geocode_columns:
    df_locations[column_name] = column_values
df_locations.head()

Unnamed: 0,filename,content_locations,use_xml,location_sentences,clean_content_loc,geocode_lat,geocode_lon,geocode_str,geocode_type
0,Liu_et_al-2015-Insect_Conservation_and_Diversity,Xitiange,True,This study was conducted at Xitiange village i...,Xitiange,no-geocode-result,no-geocode-result,no-geocode-result,no-geocode-result
1,Liu_et_al-2015-Insect_Conservation_and_Diversity,Northeastern Miyun County (,True,This study was conducted at Xitiange village i...,Northeastern Miyun County,40.3769,116.843,"Miyun, Beijing, China",APPROXIMATE
2,Liu_et_al-2015-Insect_Conservation_and_Diversity,Beijing city,True,The area is situated about 70 km north of Beij...,Beijing city,39.9042,116.407,"Beijing, China",APPROXIMATE
3,Liu_et_al-2015-Insect_Conservation_and_Diversity,Xitiange,True,This study was conducted at Xitiange village i...,Xitiange,no-geocode-result,no-geocode-result,no-geocode-result,no-geocode-result
4,Liu_et_al-2015-Insect_Conservation_and_Diversity,Woodland,True,"Woodland was also planted with Populus spp., b...",Woodland,45.9046,-122.744,"Woodland, WA 98674, USA",APPROXIMATE


In [8]:
# Column layout of the exported table: the raw and cleaned location first,
# then the geocoding result, then the remaining input columns.
cols_in_order = [
    'filename',
    'content_locations',
    'clean_content_loc',
    'geocode_str',
    'geocode_type',
    'geocode_lat',
    'geocode_lon',
    'use_xml',
    'location_sentences',
]
df_locations_clean = df_locations.loc[:, cols_in_order]

# Write the result as a tab-separated file without the row index.
path_to_geocoded_locations = os.path.join('results', 'locations_geocoded.tsv')
df_locations_clean.to_csv(path_to_geocoded_locations, sep='\t', index=False,
                          quotechar='"', encoding='utf-8')