# Geocoding Addresses
## This notebook contains code to retrieve addresses from the articles dataframe and send them to Google's Geocoding service to receive lat/long coordinates for locating in a mapping service.
# Load articles data

In [1]:
# Put the parent directory at the front of the import search path so that
# packages living one level above this notebook can be imported.
# Background: https://stackoverflow.com/questions/34478398
import os
import sys
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.insert(0, module_path)

In [2]:
from tagnews.utils import load_data as ld
import numpy as np
import pandas as pd
import pickle

In [3]:
# Load the articles dataframe through the project's tagnews data loader.
df = ld.load_data()

In [4]:
# Preview the first rows to confirm the data loaded with the expected columns.
df.head()

Unnamed: 0_level_0,feedname,url,title,bodytext,relevant,created,last_modified,news_source_id,author,locations,...,UNSPC,ILSC,ARSN,BURG,DUI,FRUD,ROBB,TASR,COPA,DIGP
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
195256,L,http://chicago.cbslocal.com/2015/01/26/city-sa...,City Says Reports Of Potholes Down From A Year...,**CHICAGO (CBS) **- The city of Chicago is sta...,False,2015-01-26 18:55:18.051663+00,2015-02-09 18:56:05.593997+00,100,,[],...,0,0,0,0,0,0,0,0,0,0
195257,L,http://chicago.cbslocal.com/2015/01/26/browns-...,Browns WR Josh Gordon Fails Another Drug Test,**CLEVELAND (AP)** -- Josh Gordon's troubles h...,False,2015-01-26 18:55:19.088182+00,2015-02-09 18:56:05.478395+00,100,,[],...,0,0,0,0,0,0,0,0,0,0
195258,L,http://chicago.cbslocal.com/2015/01/26/chicago...,Chicago-Based Medical Technology Company Expan...,**Chicago (CBS)** -- A medical industry techno...,False,2015-01-26 18:55:20.116429+00,2015-02-09 18:56:05.579096+00,100,,[],...,0,0,0,0,0,0,0,0,0,0
195259,L,http://chicago.cbslocal.com/2015/01/26/emmas-b...,Emma’s Big Ten Power Rankings: Conference Race...,"By Chris Emma-\n\n**(CBS)** With win No. 1,000...",False,2015-01-26 18:55:21.141739+00,2015-02-09 18:56:05.469932+00,100,,[],...,0,0,0,0,0,0,0,0,0,0
195260,L,http://chicago.cbslocal.com/2015/01/26/14-year...,14-Year-Old Boy Fatally Shot In Riverdale,**(CBS)** -- A 14-year-old boy was fatally sho...,True,2015-01-26 18:55:22.162578+00,2015-02-09 18:56:04.852817+00,100,,[],...,0,0,0,0,0,0,0,0,0,0


# Count the articles that have at least one tag set (any truthy value in the columns from `OEMC` onward).

In [5]:
# Flag each row that has at least one truthy value in the columns from
# 'OEMC' through the last column, then count the flagged rows.
has_any_flag = df.loc[:, 'OEMC':].any(axis=1)
has_any_flag.sum()

40009

# Count all the articles with addresses transcribed from the articles.

In [6]:
# A non-empty 'locations' value is truthy, so summing the booleans counts
# the articles that have at least one transcribed location.
df['locations'].apply(bool).sum()

354

In [7]:
# Keep only the articles whose 'locations' value is non-empty.
has_locations = df.locations.apply(bool)
addr_list = df.locations[has_locations]

In [8]:
# Preview the per-article location lists (indexed by article_id).
addr_list.head()

article_id
198122    [{'start': 69, 'end': 83, 'text': 'Southwest S...
208118    [{'start': 331, 'end': 342, 'text': 'Rogers Pa...
210368    [{'start': 140, 'end': 146, 'text': 'Uptown', ...
210741    [{'start': 164, 'end': 175, 'text': 'South Sho...
212073    [{'start': 191, 'end': 202, 'text': 'Irving Pa...
Name: locations, dtype: object

# Count the total number of addresses transcribed from the articles to be geocoded.

In [9]:
# Each entry is a list of address dicts; add up the list lengths.
addr_list.apply(len).sum()

1147

In [10]:
# Count the transcribed addresses that do not yet carry a 'lat_long' entry.
#
# Every address dict is checked individually. The previous version looked
# only at the first address of each article and then added the whole list
# length, which miscounts articles that are only partially geocoded.
#
# The Series is iterated directly (yielding each article's list of address
# dicts) instead of via Series.iteritems(), which was removed in pandas 2.0.
count = sum(
    1
    for locations in addr_list
    for location in locations
    if 'lat_long' not in location
)
count

1147

# Run the following two cells if you are making changes to lat_long.py. They let this notebook automatically pick up those changes for testing.

In [11]:
# Load IPython's autoreload extension, which provides the %autoreload magic used in the next cell.
%load_ext autoreload

In [12]:
# Mode 2 reloads all modules before each cell executes, so edits to lat_long.py take effect without restarting the kernel.
%autoreload 2

# Please also note that you will need to supply an API key from Google's Geocoding API site.
In the shell you launched this notebook from, enter the following command with your Geocoding API key from Google:
```
        export GOOGLE_GEOCODE_API_KEY=api...
```


In [13]:
import lat_long as ll

# Read the Google Geocoding API key from the environment rather than storing
# it in the notebook. Two variable names are accepted:
#   - GOOGLE_GEOCODE_API_KEY (the more specific name), checked first
#   - GOOGLE_API_KEY (the name this cell originally read), as a fallback
# A KeyError is still raised when neither is set, now with a message that
# says how to fix it.
api_key = os.environ.get("GOOGLE_GEOCODE_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise KeyError(
        "No Google Geocoding API key found. Set GOOGLE_GEOCODE_API_KEY "
        "(or GOOGLE_API_KEY) in the shell before launching this notebook."
    )

# Main program to gather available locations data that does not yet have lat/long coordinates. 
### Set `test = True` to run smaller batches to prevent reaching Google's query limits too quickly.

In [14]:
# Geocode the transcribed addresses in df. test=False processes the full set; set test=True to run a smaller batch.
latlong_data = ll.get_lat_long(df, api_key, test=False)

N/A% (0 of 1147) |                       | Elapsed Time: 0:00:00 ETA:  --:--:--

354 articles with 1147 addresses will be processed.


100% (1147 of 1147) |#####################| Elapsed Time: 0:05:29 Time: 0:05:29


In [15]:
# Inspect the geocoded address dicts of the first article (positional lookup).
latlong_data.iloc[0]

[{'cleaned span': (60, 74),
  'cleaned text': 'Southwest Side',
  'end': 83,
  'lat_long': Location(Southwest Side, Chicago, IL, USA, (41.7977249, -87.7172616, 0.0)),
  'start': 69,
  'text': 'Southwest Side'},
 {'cleaned span': (402, 433),
  'cleaned text': '6600 block of South Springfield',
  'end': 442,
  'lat_long': Location(6600 S Springfield Ave, Chicago, IL 60629, USA, (41.7732183, -87.7201869, 0.0)),
  'start': 411,
  'text': '6600 block of South Springfield'},
 {'cleaned span': (466, 497),
  'cleaned text': '6300 block of South Springfield',
  'end': 506,
  'lat_long': Location(6300 S Springfield Ave, Chicago, IL 60629, USA, (41.7786783, -87.7203155, 0.0)),
  'start': 475,
  'text': '6300 block of South Springfield'}]

In [16]:
# Number of articles in the geocoded result; should equal the number of articles that have transcribed addresses.
len(latlong_data)

354

In [17]:
# Total number of address dicts across all articles in the geocoded result.
latlong_data.apply(len).sum()

1147

In [18]:
# Persist the geocoded result so the slow, quota-limited geocoding run does not have to be repeated.
# NOTE(review): pickle files should only be loaded back from trusted sources.
latlong_data.to_pickle('addr_geotag_list.pkl')

# Running the code block below shows that some of the queries didn't return results, for a variety of reasons.

In [19]:
# Gather every address dict that came back without a 'lat_long' entry,
# then report how many there are and show them.
no_results = []
for article_locations in latlong_data:
    no_results.extend(
        location
        for location in article_locations
        if 'lat_long' not in location.keys()
    )
count = len(no_results)
print('Number of addresses that didn\'t recieve lat/log coords: {}.'.format(count))
no_results

Number of addresses that didn't recieve lat/log coords: 22.


[{'cleaned span': (834, 840),
  'cleaned text': 'Austin',
  'end': 849,
  'start': 843,
  'text': 'Austin'},
 {'cleaned span': (173, 190),
  'cleaned text': 'Back of the Yards',
  'end': 207,
  'start': 190,
  'text': 'Back of the Yards'},
 {'cleaned span': (233, 241),
  'cleaned text': 'Lawndale',
  'end': 250,
  'start': 242,
  'text': 'Lawndale'},
 {'cleaned span': (84, 94),
  'cleaned text': 'South Side',
  'end': 111,
  'start': 101,
  'text': 'South Side'},
 {'cleaned span': (1807, 1833),
  'cleaned text': '4000 block of South Kedzie',
  'end': 1850,
  'start': 1824,
  'text': '4000 block of South Kedzie'},
 {'cleaned span': (166, 203),
  'cleaned text': '800 block of North Springfield Avenue',
  'end': 220,
  'start': 183,
  'text': '800 block of North Springfield Avenue'},
 {'cleaned span': (195, 224),
  'cleaned text': '2200 block of West 18th Place',
  'end': 242,
  'start': 212,
  'text': '2200 block of West 18th Place '},
 {'cleaned span': (763, 797),
  'cleaned text': '480

# Code block below can be run to see how the query works. You will need to supply your own api_key.

In [None]:
from geopy.geocoders import GoogleV3

# Take the API key from the environment instead of hard-coding it here.
# The previous `api_key = ''` silently overwrote any key loaded earlier in
# the notebook and invited pasting a secret into the file. Falls back to ''
# (the old value) when no variable is set.
api_key = (
    os.environ.get('GOOGLE_GEOCODE_API_KEY')
    or os.environ.get('GOOGLE_API_KEY')
    or ''
)
# timeout is in seconds.
g = GoogleV3(api_key=api_key, timeout=10)

In [None]:
# First coordinate (presumably the latitude) of the second geocoded address
# of the first article.
# .iloc[0] selects the first article by position. A plain [0] on a Series
# with an integer index is a label lookup and raises KeyError unless an
# article_id of 0 exists.
latlong_data.iloc[0][1]['lat_long'][1][0]

In [None]:
# Geocode one sample address (third address of the first article) to show
# how a single query works.
# addr_list is indexed by article_id, so the first article has to be
# selected by position with .iloc; addr_list[0] would be a label lookup and
# raise KeyError.
sample = addr_list.iloc[0][2]
# NOTE(review): 'cleaned text' is only present if the geocoding step has
# added it to these dicts -- confirm. Fall back to the raw 'text' otherwise.
addr = sample.get('cleaned text', sample['text'])
# Restrict the geocoder to results in Chicago.
location = g.geocode(addr, components={'locality':'Chicago'})
location

# Code to construct URL for displaying lat/longs in Google Static Map

Was not able to get this to work. The map renders, but with no markers. Abandoning this for now.

In [None]:
import mapper as m
import webbrowser

In [None]:
# Build the static-map URL with the project's mapper helper and open it in
# the default browser.
# The API key comes from the `api_key` variable instead of a hard-coded
# empty string, so no secret has to be pasted into the notebook.
# NOTE: the resulting URL may embed the key; clear the output of any cell
# that displays mapURL before sharing the notebook.
mapURL = m.mapper(addr_list, api_key=api_key)
webbrowser.open(mapURL)

In [None]:
# Check the length of the generated URL.
# NOTE(review): presumably to compare against the Static Maps URL length limit -- confirm.
len(mapURL)

In [None]:
# Hand-build a Google Static Maps URL with a single marker at the second
# geocoded address of the first article, then open it in the browser.

# .iloc[0] selects the first article by position; a plain [0] on a Series
# with an integer index is a label lookup and can raise KeyError.
marker_location = latlong_data.iloc[0][1]['lat_long']
marker_lat = marker_location[1][0]
marker_lng = marker_location[1][1]

# https is used because the Maps APIs are served over TLS.
baseURL = 'https://maps.googleapis.com/maps/api/staticmap?center=Chicago,IL&zoom=12&size=640x640&scale=2'

# Marker styles and the location are separated by '|' (URL-encoded as %7C):
#   markers=size:mid|color:red|<lat>,<lng>
markers = '&markers=size:mid%7Ccolor:red%7C{},{}'.format(marker_lat, marker_lng)

# Append the configured API key; the request is not authenticated without it.
# NOTE: mapURL now embeds the key -- clear the output of any cell that
# displays it before sharing the notebook.
mapURL = baseURL + markers + '&key=' + api_key
webbrowser.open(mapURL)

In [None]:
# Display the generated URL for inspection.
mapURL