In [9]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
!pip install folium
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2019.6.16  |       hecc5488_0         145 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    certifi-2019.6.16          |           py36_1         149 KB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

**Let's continue to import necessary libraries as well as take a look at the data that we'll be using from this Wikipedia link.**

In [10]:
from bs4 import BeautifulSoup
url=  requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
soup = BeautifulSoup(url.text,'lxml')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className=document.documentElement.className.replace(/(^|\s)client-nojs(\s|$)/,"$1client-js$2");RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":906439794,"wgRevisionId":906439794,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":!1,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June",

In [11]:
import csv
csv_file=open('canada1.csv','w')
csv_writer=csv.writer(csv_file)
csv_writer.writerow(['Postcode', 'Borough', 'Neighbourhood'])
for tr in soup.find_all('tr')[1:]:
    tds = tr.find_all('td')
    if len(tds)==3:
        Postcode=tds[0].text
        Borough=tds[1].text
        Neighbourhood=tds[2].text
        csv_writer.writerow([Postcode, Borough, Neighbourhood])
csv_file.close()

**Now let's load the data from a CSV file that's been cleaned.**

In [17]:
import pandas as pd
df_cd=pd.read_csv('canada1.csv')

In [18]:
df_cd[df_cd.columns] = df_cd.apply(lambda x: x.str.strip('\n'))
df_cd

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


**Get those 'Not assigned' values out of here!**

In [19]:
df_ab=df_cd[df_cd.Borough!='Not assigned']
df_ab

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


**Now we'll combine neighborhoods with similar postal codes, and we'll separate them with a comma.**

In [20]:
df_ab.set_index(['Postcode','Borough'],inplace=True)
res = df_ab.groupby(level=['Postcode','Borough'], sort=False).agg( ','.join)

In [21]:
res.reset_index()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


**Now we'll assign the Borough's name to any Neighborhood where a 'Not assigned' values lingers.**

In [34]:
res.loc[res['Neighbourhood'] =='Not assigned']=res[res['Neighbourhood'] == 'Not assigned'].index
res

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Harbourfront,Regent Park"
M6A,North York,"Lawrence Heights,Lawrence Manor"
M7A,Queen's Park,Queen's Park
M9A,Etobicoke,Islington Avenue
M1B,Scarborough,"Rouge,Malvern"
M3B,North York,Don Mills North
M4B,East York,"Woodbine Gardens,Parkview Hill"
M5B,Downtown Toronto,"Ryerson,Garden District"


**Let's make it look a little nicer by restting the index of the dataframe.**

In [36]:
df=res.reset_index()
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


**Let's see how many rows and columns there are.**

In [37]:
df.shape
#The number of rows will be seen first followed by the number of columns.

(103, 3)

In [38]:
df["Address"]=df['Borough'] +", " +df['Postcode'] 
df

Unnamed: 0,Postcode,Borough,Neighbourhood,Address
0,M3A,North York,Parkwoods,"North York, M3A"
1,M4A,North York,Victoria Village,"North York, M4A"
2,M5A,Downtown Toronto,"Harbourfront,Regent Park","Downtown Toronto, M5A"
3,M6A,North York,"Lawrence Heights,Lawrence Manor","North York, M6A"
4,M7A,Queen's Park,Queen's Park,"Queen's Park, M7A"
5,M9A,Etobicoke,Islington Avenue,"Etobicoke, M9A"
6,M1B,Scarborough,"Rouge,Malvern","Scarborough, M1B"
7,M3B,North York,Don Mills North,"North York, M3B"
8,M4B,East York,"Woodbine Gardens,Parkview Hill","East York, M4B"
9,M5B,Downtown Toronto,"Ryerson,Garden District","Downtown Toronto, M5B"


**We now need to upload a url to acquire latitudes and longitudes, and we'll use Google's API to do so.**

You can obtain an API through Google from this website: https://developers.google.com/maps/documentation/javascript/get-api-key

In [62]:
url2 = []
for i in range(0, len(df)):
    url='https://maps.googleapis.com/maps/api/geocode/json?address={},+Toronto+Ontario,+CA&key=AIzaSyAxpPAqgEQ_28-R5wLQwL9L1LTFvBIAXeA'.format(df['Address'][i])
    url2.append(url)

In [63]:
df['Latitude']=0
df['Longitude']=0
for i, url in enumerate(url2):
    r = requests.get(url2[i], params='params')
    results = r.json()['results']
    if results:
        location = results[0]['geometry']['location']
    else:
        print ("no results")
    df['Latitude'].iloc[i] = location['lat']
    df['Longitude'].iloc[i]= location['lng']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results
no results

In [64]:
df

Unnamed: 0,Postcode,Borough,Neighbourhood,Address,Latitude,Longitude
0,M3A,North York,Parkwoods,"North York, M3A",43.753259,-79.329656
1,M4A,North York,Victoria Village,"North York, M4A",43.753259,-79.329656
2,M5A,Downtown Toronto,"Harbourfront,Regent Park","Downtown Toronto, M5A",43.753259,-79.329656
3,M6A,North York,"Lawrence Heights,Lawrence Manor","North York, M6A",43.753259,-79.329656
4,M7A,Queen's Park,Queen's Park,"Queen's Park, M7A",43.753259,-79.329656
5,M9A,Etobicoke,Islington Avenue,"Etobicoke, M9A",43.753259,-79.329656
6,M1B,Scarborough,"Rouge,Malvern","Scarborough, M1B",43.753259,-79.329656
7,M3B,North York,Don Mills North,"North York, M3B",43.753259,-79.329656
8,M4B,East York,"Woodbine Gardens,Parkview Hill","East York, M4B",43.753259,-79.329656
9,M5B,Downtown Toronto,"Ryerson,Garden District","Downtown Toronto, M5B",43.753259,-79.329656
