In [1]:
import logging
import os
import time
from io import StringIO

import folium # map rendering library
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import requests # library to handle requests
import wikipedia as wp #
from geopy.geocoders import Nominatim #geocoding library
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans 

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


   
#Get the html source for wikipedia containging table to clean
html = wp.page("List of postal codes of Canada: M").html().encode("UTF-8")
df = pd.read_html(html)[0]
df.to_csv('beautifulsoup_pandas.csv',header=0,index=False)
new_header = df.iloc[0] #grab the first row for the header
df = df[1:] #take the data less the header row
df.columns = new_header #set the header row as the df header
df = df[df.Borough !='Not assigned'] #drop rows if Borough is not assigned
df['Neighbourhood'] = np.where(df['Neighbourhood'] == 'Not assigned', df['Borough'], df['Neighbourhood']) #assign Borough to Neighbourhood, if Neighbourhood is not assigned
df= df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(','.join).reset_index() #combining postcode with same value into single row
print(df.shape)
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(df['Borough'].unique()),
        df.shape[0]
    )
)

#Following part of code is to get Latitide and Longitude coordinates for Neighbourhoods
PostCode_series = df['Postcode'].copy()
Borough_series=df['Borough'].copy()
neighborhoods_series=df['Neighbourhood'].copy()
addresses=(neighborhoods_series+','+Borough_series+ ',Toronto,Ontario,'+PostCode_series)#concatenated series of addresses

logger = logging.getLogger("root")
logger.setLevel(logging.DEBUG)
# create console handler
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)
BACKOFF_TIME = 30
API_KEY='AIzaSyBB4VEIERWf0tlL_nPG7ybvUkc8ED28kDo'
RETURN_FULL_RESULTS = False
def get_google_results(address, api_key=None, return_full_response=False):
    """
    Get geocode results from Google Maps Geocoding API.
    
    Note, that in the case of multiple google geocode reuslts, this function returns details of the FIRST result.
    
    @param address: String address as accurate as possible. For Example "18 Grafton Street, Dublin, Ireland"
    @param api_key: String API key if present from google. 
                    If supplied, requests will use your allowance from the Google API. If not, you
                    will be limited to the free usage of 2500 requests per day.
    @param return_full_response: Boolean to indicate if you'd like to return the full response from google. This
                    is useful if you'd like additional location details for storage or parsing later.
    """
    # Setting up Geocoding url
    geocode_url = "https://maps.googleapis.com/maps/api/geocode/json?address={}".format(address)
    if api_key is not None:
        geocode_url = geocode_url + "&key={}".format(api_key)
        
    # Ping google for the reuslts:
    results = requests.get(geocode_url)
    # Results will be in JSON format - convert to dict using requests functionality
    results = results.json()
    
    # if there's no results or an error, return empty results.
    if len(results['results']) == 0:
        output = {
            "formatted_address" : None,
            "latitude": None,
            "longitude": None,
            "accuracy": None,
            "google_place_id": None,
            "type": None,
            "postcode": None
        }
    else:    
        answer = results['results'][0]
        output = {
            "formatted_address" : answer.get('formatted_address'),
            "latitude": answer.get('geometry').get('location').get('lat'),
            "longitude": answer.get('geometry').get('location').get('lng'),
            "accuracy": answer.get('geometry').get('location_type'),
            "google_place_id": answer.get("place_id"),
            "type": ",".join(answer.get('types')),
            "postcode": ",".join([x['long_name'] for x in answer.get('address_components') 
                                  if 'postal_code' in x.get('types')])
        }
        
    # Append some other details:    
    output['input_string'] = address
    output['number_of_results'] = len(results['results'])
    output['status'] = results.get('status')
    if return_full_response is True:
        output['response'] = results
    
    return output

#------------------ PROCESSING LOOP -----------------------------

# Create a list to hold results
results = []
# Go through each address in turn
for address in addresses:
    # While the address geocoding is not finished:
    geocoded = False
    while geocoded is not True:
        # Geocode the address with google
        try:
            geocode_result = get_google_results(address, API_KEY, return_full_response=RETURN_FULL_RESULTS)
        except Exception as e:
            logger.exception(e)
            logger.error("Major error with {}".format(address))
            logger.error("Skipping!")
            geocoded = True
            
        # If we're over the API limit, backoff for a while and try again later.
        if geocode_result['status'] == 'OVER_QUERY_LIMIT':
            logger.info("Hit Query Limit! Backing off for a bit.")
            time.sleep(BACKOFF_TIME * 60) # sleep for 30 minutes
            geocoded = False
        else:
            # If we're ok with API use, save the results
            # Note that the results might be empty / non-ok - log this
            if geocode_result['status'] != 'OK':
                logger.warning("Error geocoding {}: {}".format(address, geocode_result['status']))
            logger.debug("Geocoded: {}: {}".format(address, geocode_result['status']))
            results.append(geocode_result)           
            geocoded = True

    # Print status every 100 addresses
    if len(results) % 100 == 0:
    	logger.info("Completed {} of {} address".format(len(results), len(addresses)))
            
# All done
logger.info("Finished geocoding all addresses")
results=pd.DataFrame(results)
df["latitude"]=results["latitude"]#storing corresponding latitude to main dataframe
df["longitude"]=results["longitude"]#storing corresponding longitude to main dataframe
print(df)


(103, 3)
The dataframe has 11 boroughs and 103 neighborhoods.


Geocoded: Rouge,Malvern,Scarborough,Toronto,Ontario,M1B: OK
Geocoded: Highland Creek,Rouge Hill,Port Union,Scarborough,Toronto,Ontario,M1C: OK
Geocoded: Guildwood,Morningside,West Hill,Scarborough,Toronto,Ontario,M1E: OK
Geocoded: Woburn,Scarborough,Toronto,Ontario,M1G: OK
Geocoded: Cedarbrae,Scarborough,Toronto,Ontario,M1H: OK
Geocoded: Scarborough Village,Scarborough,Toronto,Ontario,M1J: OK
Geocoded: East Birchmount Park,Ionview,Kennedy Park,Scarborough,Toronto,Ontario,M1K: OK
Geocoded: Clairlea,Golden Mile,Oakridge,Scarborough,Toronto,Ontario,M1L: OK
Geocoded: Cliffcrest,Cliffside,Scarborough Village West,Scarborough,Toronto,Ontario,M1M: OK
Geocoded: Birch Cliff,Cliffside West,Scarborough,Toronto,Ontario,M1N: OK
Geocoded: Dorset Park,Scarborough Town Centre,Wexford Heights,Scarborough,Toronto,Ontario,M1P: OK
Geocoded: Maryvale,Wexford,Scarborough,Toronto,Ontario,M1R: OK
Geocoded: Agincourt,Scarborough,Toronto,Ontario,M1S: OK
Geocoded: Clarks Corners,Sullivan,Tam O'Shanter,Scarboroug

    Postcode           Borough  \
0        M1B       Scarborough   
1        M1C       Scarborough   
2        M1E       Scarborough   
3        M1G       Scarborough   
4        M1H       Scarborough   
5        M1J       Scarborough   
6        M1K       Scarborough   
7        M1L       Scarborough   
8        M1M       Scarborough   
9        M1N       Scarborough   
10       M1P       Scarborough   
11       M1R       Scarborough   
12       M1S       Scarborough   
13       M1T       Scarborough   
14       M1V       Scarborough   
15       M1W       Scarborough   
16       M1X       Scarborough   
17       M2H        North York   
18       M2J        North York   
19       M2K        North York   
20       M2L        North York   
21       M2M        North York   
22       M2N        North York   
23       M2P        North York   
24       M2R        North York   
25       M3A        North York   
26       M3B        North York   
27       M3C        North York   
28       M3H  