# Geocoding Addresses
## This notebook contains code to retrieve addresses from the articles dataframe and send them to Google's Geocoding service to receive lat/long coordinates for locating in a mapping service.

# Load articles data

In [1]:
# Make packages that live one directory above this notebook importable by
# putting that directory at the front of the module search path.
# See https://stackoverflow.com/questions/34478398
import os
import sys

module_path = os.path.abspath(os.pardir)
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path = [module_path, *sys.path]

In [2]:
from tagnews.utils import load_data as ld
import numpy as np
import pandas as pd

In [3]:
# Load the articles via tagnews' load_data helper; the result is the
# DataFrame (indexed by article_id) previewed in the next cell.
df = ld.load_data()

In [4]:
# Preview the first rows to check the columns (article text, metadata,
# 'locations', and the per-tag indicator columns).
df.head()

Unnamed: 0_level_0,feedname,url,title,bodytext,relevant,created,last_modified,news_source_id,author,locations,...,UNSPC,ILSC,ARSN,BURG,DUI,FRUD,ROBB,TASR,COPA,DIGP
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
195256,L,http://chicago.cbslocal.com/2015/01/26/city-sa...,City Says Reports Of Potholes Down From A Year...,**CHICAGO (CBS) **- The city of Chicago is sta...,False,2015-01-26 18:55:18.051663+00,2015-02-09 18:56:05.593997+00,100,,[],...,0,0,0,0,0,0,0,0,0,0
195257,L,http://chicago.cbslocal.com/2015/01/26/browns-...,Browns WR Josh Gordon Fails Another Drug Test,**CLEVELAND (AP)** -- Josh Gordon's troubles h...,False,2015-01-26 18:55:19.088182+00,2015-02-09 18:56:05.478395+00,100,,[],...,0,0,0,0,0,0,0,0,0,0
195258,L,http://chicago.cbslocal.com/2015/01/26/chicago...,Chicago-Based Medical Technology Company Expan...,**Chicago (CBS)** -- A medical industry techno...,False,2015-01-26 18:55:20.116429+00,2015-02-09 18:56:05.579096+00,100,,[],...,0,0,0,0,0,0,0,0,0,0
195259,L,http://chicago.cbslocal.com/2015/01/26/emmas-b...,Emma’s Big Ten Power Rankings: Conference Race...,"By Chris Emma-\n\n**(CBS)** With win No. 1,000...",False,2015-01-26 18:55:21.141739+00,2015-02-09 18:56:05.469932+00,100,,[],...,0,0,0,0,0,0,0,0,0,0
195260,L,http://chicago.cbslocal.com/2015/01/26/14-year...,14-Year-Old Boy Fatally Shot In Riverdale,**(CBS)** -- A 14-year-old boy was fatally sho...,True,2015-01-26 18:55:22.162578+00,2015-02-09 18:56:04.852817+00,100,,[],...,0,0,0,0,0,0,0,0,0,0


## Count the articles that have at least one tag set (any nonzero value in the columns from `OEMC` onward).

In [5]:
# Select every column from 'OEMC' to the last one, flag the rows where any of
# those columns is truthy, and count the flagged rows.
columns_from_oemc = df.loc[:, 'OEMC':]
columns_from_oemc.any(axis=1).sum()

39846

## Count all the articles with addresses transcribed from the articles.

In [11]:
# An article counts when its 'locations' entry is truthy (i.e. non-empty).
df['locations'].apply(bool).sum()

313

In [48]:
# Keep only the 'locations' entries that are non-empty; the result is a Series
# that retains the original article_id index.
has_locations = df['locations'].apply(bool)
addr_list = df.loc[has_locations, 'locations']

In [52]:
# Preview the filtered Series: each value is a list of location dicts with
# 'start', 'end' and 'text' keys (see the output below).
addr_list.head()

article_id
198122    [{'start': 69, 'end': 83, 'text': 'Southwest S...
208118    [{'start': 331, 'end': 342, 'text': 'Rogers Pa...
210368    [{'start': 140, 'end': 146, 'text': 'Uptown', ...
210741    [{'start': 164, 'end': 175, 'text': 'South Sho...
212073    [{'start': 191, 'end': 202, 'text': 'Irving Pa...
Name: locations, dtype: object

## Count the total number of addresses transcribed from the articles to be geocoded.

In [57]:
# Sum the number of location entries across all articles.
addr_list.apply(len).sum()

1026

In [55]:
# Count the addresses that do not have a 'lat_long' entry yet.
# Every address is checked individually: looking only at the first address of
# an article would count (or skip) all of that article's addresses based on
# that single entry, and would fail on an empty list.
count = sum(
    1
    for article_addresses in addr_list
    for address in article_addresses
    if 'lat_long' not in address
)
print(count)

1026


## Run the following two cells if you are making changes to lat_long.py. They let this notebook automatically reload those changes for testing.

In [59]:
# Load IPython's autoreload extension so the %autoreload magic is available.
%load_ext autoreload

In [60]:
# Mode 2: re-import modules before each cell runs, so edits to lat_long.py
# take effect without restarting the kernel.
%autoreload 2

##  Please also note that you will need to supply an api_key from Google's Geocoding API site.

In [62]:
import os

import lat_long as ll

# Never hardcode the Google Geocoding API key in the notebook: it ends up in
# version control and in shared copies. Export it before starting Jupyter,
# e.g. `export GOOGLE_API_KEY=...`. A missing variable raises KeyError here,
# which is clearer than a failed request later on.
api_key = os.environ['GOOGLE_API_KEY']

## Main program to gather available locations data that does not yet have lat/long coordinates. 
### Set `test = True` to run smaller batches to prevent reaching Google's query limits too quickly.

In [64]:
# Geocode the transcribed locations in df using the supplied API key.
# test=False processes every pending address (about 4.5 minutes for 1011
# addresses in the run recorded below); per the note above, test=True runs a
# smaller batch to stay under Google's query limits.
latlong_data = ll.get_lat_long(df, api_key, test=False)

N/A% (0 of 1011) |                       | Elapsed Time: 0:00:00 ETA:  --:--:--

308 articles with 1011 addresses will be processed.


100% (1011 of 1011) |#####################| Elapsed Time: 0:04:35 Time: 0:04:35


In [65]:
# Inspect one article's geocoded addresses. Index 307 is presumably the last
# entry, since the run above reported 308 articles -- confirm if the data changes.
latlong_data[307]

[{'cleaned span': (66, 76),
  'cleaned text': 'Morgan Par',
  'end': 93,
  'lat_long': Location(Chicago, IL, USA, (41.8781136, -87.6297982, 0.0)),
  'start': 83,
  'text': 'Morgan Par'},
 {'cleaned span': (108, 122),
  'cleaned text': 'Far South Side',
  'end': 139,
  'lat_long': Location(Chicago, IL, USA, (41.8781136, -87.6297982, 0.0)),
  'start': 125,
  'text': 'Far South Side'},
 {'cleaned span': (201, 232),
  'cleaned text': '1100 block of West 112th Place,',
  'end': 249,
  'lat_long': Location(1100 W 112th Pl, Chicago, IL 60643, USA, (41.6894454, -87.6495735, 0.0)),
  'start': 218,
  'text': '1100 block of West 112th Place,'},
 {'cleaned span': (445, 478),
  'cleaned text': '11000 block of South State Street',
  'end': 495,
  'lat_long': Location(11000 S State St, Chicago, IL 60628, USA, (41.694279, -87.623482, 0.0)),
  'start': 462,
  'text': '11000 block of South State Street'}]

## Can see from running code block below that some of the queries didn't return results for a variety of reasons.

In [66]:
# Gather every address dict that came back without a 'lat_long' entry,
# flattening the per-article lists into a single list.
no_results = [
    address
    for article_addresses in latlong_data
    for address in article_addresses
    if 'lat_long' not in address
]
count = len(no_results)
print('Number of addresses that didn\'t recieve lat/log coords: {}.'.format(count))
no_results

Number of addresses that didn't recieve lat/log coords: 19.


[{'cleaned span': (137, 147),
  'cleaned text': 'South Side',
  'end': 156,
  'start': 146,
  'text': 'South Side'},
 {'cleaned span': (799, 831),
  'cleaned text': 'Peterson Avenue and Pulaski Road',
  'end': 840,
  'start': 808,
  'text': 'Peterson Avenue and Pulaski Road'},
 {'cleaned span': (95, 106),
  'cleaned text': 'Bronzeville',
  'end': 124,
  'start': 112,
  'text': 'Bronzeville '},
 {'cleaned span': (52, 66),
  'cleaned text': 'Princeton Park',
  'end': 75,
  'start': 61,
  'text': 'Princeton Park'},
 {'cleaned span': (80, 94),
  'cleaned text': 'Auburn Gresham',
  'end': 111,
  'start': 97,
  'text': 'Auburn Gresham'},
 {'cleaned span': (458, 478),
  'cleaned text': '4301 W. Chicago Ave.',
  'end': 497,
  'start': 477,
  'text': '4301 W. Chicago Ave.'},
 {'cleaned span': (527, 537),
  'cleaned text': 'North Side',
  'end': 554,
  'start': 544,
  'text': 'North Side'},
 {'cleaned span': (463, 470),
  'cleaned text': 'Beecher',
  'end': 2390,
  'start': 2382,
  'text': 'Beec

## Code block below can be run to see how the query works. You will need to supply your own api_key.

In [None]:
import os

from geopy.geocoders import GoogleV3

# Supply your own key through the GOOGLE_API_KEY environment variable rather
# than pasting it into the notebook. The fallback is an empty string, which is
# what this cell used before when no key was filled in.
api_key = os.environ.get('GOOGLE_API_KEY', '')
# timeout is in seconds.
g = GoogleV3(api_key=api_key, timeout=10)

In [None]:
# addr_list is indexed by article_id, so plain addr_list[100] would look for an
# article whose id is 100 and raise KeyError; .iloc selects by position.
# NOTE(review): 'cleaned text' only appears in the get_lat_long output shown
# above -- confirm the entries in addr_list carry it before relying on this.
addr_list.iloc[100][3]['cleaned text']

In [None]:
# Geocode a single sample address, restricting matches to Chicago.
# .iloc selects the first article by position; addr_list[0] would be a label
# lookup on the article_id index and raise KeyError.
addr = addr_list.iloc[0][2]['cleaned text']
location = g.geocode(addr, components={'locality': 'Chicago'})
location

## Code to construct URL for displaying lat/longs in Google Static Map

In [67]:
import os
from urllib.parse import urlencode

# Build the Google Static Maps request URL as a string; a bare URL in a code
# cell is not valid Python. The API key is read from the GOOGLE_API_KEY
# environment variable instead of being embedded in the notebook, and the URL
# is deliberately not displayed so the key does not leak into saved output.
STATIC_MAP_ENDPOINT = 'https://maps.googleapis.com/maps/api/staticmap'
static_map_params = [
    ('center', 'Chicago,IL'),
    ('zoom', 12),
    ('size', '640x640'),
    ('key', os.environ.get('GOOGLE_API_KEY', '')),
]
# safe=',' keeps the comma in "Chicago,IL" unescaped.
static_map_url = '{}?{}'.format(
    STATIC_MAP_ENDPOINT, urlencode(static_map_params, safe=',')
)