# TOC
<div class="alert alert-block alert-info" style="margin-top: 20px">

* <a href="#item1">Wrangle wikipedia Postal Code data</a>
* <a href="#item2">Geocoding</a>

</div>

# Toronto neighborhood

In [283]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#import json # library to handle JSON files

#from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#import requests # library to handle requests
#from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
#import matplotlib.cm as cm
#import matplotlib.colors as colors

# import k-means from clustering stage
#from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
#import folium # map rendering library

#import lxml

In [284]:
toronto_boroughs = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
toronto_boroughs = toronto_boroughs[0]
toronto_boroughs.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Queen's Park,Not assigned
8,M8A,Not assigned,Not assigned
9,M9A,Downtown Toronto,Queen's Park


## Drop rows where Borough is "Not assigned"

In [285]:
toronto_1 = toronto_boroughs[toronto_boroughs['Borough'].map(lambda x: str(x) != "Not assigned")]
toronto_1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


## Assign name of Borough to Neighbourhoods with value "Not assigned"

In [286]:
toronto_1["Neighbourhood"] = toronto_1["Neighbourhood"].where(cond=toronto_1["Neighbourhood"] != "Not assigned", other=toronto_1["Borough"], axis=0)
toronto_1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


## Group Neighbourhoods in comma separated lists, by Borough

In [287]:
toronto_1['Neighbourhood'] = toronto_1[['Postcode','Borough','Neighbourhood']].groupby(['Postcode','Borough'])['Neighbourhood'].\
    transform(lambda x: ','.join(x))

toronto_1 = toronto_1[["Postcode", "Borough", "Neighbourhood"]].drop_duplicates()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [288]:
# Rename Postcode
toronto_1.columns = ["PostalCode", "Borough", "Neighbourhood"]

In [289]:
# Reset the index
toronto_1.reset_index(drop=True, inplace=True)

In [290]:
# Verify 1 case
toronto_1[toronto_1["PostalCode"] == "M1R"]

Unnamed: 0,PostalCode,Borough,Neighbourhood
71,M1R,Scarborough,"Maryvale,Wexford"


In [291]:
toronto_1.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [292]:
toronto_1.shape

(103, 3)

# Geocoding

In [293]:
import geocoder # import geocoder
import json

In [294]:
# Check if this provider works
geojson = geocoder.arcgis('M8Z, Toronto, Ontario').json
geojson

{'address': 'M8Z',
 'bbox': {'northeast': [43.65363000000003, -79.49794976199996],
  'southwest': [43.59563000000002, -79.55594976199995]},
 'confidence': 5,
 'lat': 43.624630000000025,
 'lng': -79.52694976199996,
 'ok': True,
 'quality': 'Postal',
 'raw': {'name': 'M8Z',
  'extent': {'xmin': -79.55594976199995,
   'ymin': 43.59563000000002,
   'xmax': -79.49794976199996,
   'ymax': 43.65363000000003},
  'feature': {'geometry': {'x': -79.52694976199996, 'y': 43.624630000000025},
   'attributes': {'Score': 100, 'Addr_Type': 'Postal'}}},
 'score': 100,
 'status': 'OK'}

In [295]:
# The google provider doesn't work, this one seems to do (and it doesn't need the silly while loop)
def getLatLng(postal_code):
    g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code)).json
    lat = g["lat"]
    lng = g["lng"]
    return lat, lng

In [296]:
# Check if the function works
getLatLng("M8Z")

(43.624630000000025, -79.52694976199996)

In [297]:
# Apply the function to the entire dataframe
toronto_1["lat", "lng"] = toronto_1.apply(lambda x: getLatLng(x.PostalCode), axis=1 )

In [298]:
toronto_1.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,"(lat, lng)"
0,M3A,North York,Parkwoods,"(43.75242000000003, -79.32924245299995)"
1,M4A,North York,Victoria Village,"(43.73060024600005, -79.31326499999994)"
2,M5A,Downtown Toronto,Harbourfront,"(43.65029500000003, -79.35916572299999)"
3,M6A,North York,"Lawrence Heights,Lawrence Manor","(43.72327000000007, -79.45128601699997)"
4,M7A,Queen's Park,Queen's Park,"(43.66115033500006, -79.39171499999998)"


In [300]:
# Create 2 columns for lat and lng instead of one column containing tuples
toronto_1[["lat", "lng"]] = pd.DataFrame( toronto_1[("lat", "lng")].tolist() )
toronto_1.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,"(lat, lng)",lat,lng
0,M3A,North York,Parkwoods,"(43.75242000000003, -79.32924245299995)",43.75242,-79.329242
1,M4A,North York,Victoria Village,"(43.73060024600005, -79.31326499999994)",43.7306,-79.313265
2,M5A,Downtown Toronto,Harbourfront,"(43.65029500000003, -79.35916572299999)",43.650295,-79.359166
3,M6A,North York,"Lawrence Heights,Lawrence Manor","(43.72327000000007, -79.45128601699997)",43.72327,-79.451286
4,M7A,Queen's Park,Queen's Park,"(43.66115033500006, -79.39171499999998)",43.66115,-79.391715


In [301]:
# Drop the old tuple column
toronto_1.drop([("lat", "lng")], axis=1, inplace=True)

In [302]:
toronto_1.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,lat,lng
0,M3A,North York,Parkwoods,43.75242,-79.329242
1,M4A,North York,Victoria Village,43.7306,-79.313265
2,M5A,Downtown Toronto,Harbourfront,43.650295,-79.359166
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.72327,-79.451286
4,M7A,Queen's Park,Queen's Park,43.66115,-79.391715
