# Segmenting and Clustering Neighborhoods in Toronto

***

<font color=green> Section 1 of the assignment</font>

Scraping the Wikipedia page and loading the data into a pandas DataFrame

In [1]:
#Importing the libraries I am going to use
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd


In [2]:
# URL of the Wikipedia page that lists the Canadian postal codes starting with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Download the page inside a `with` block so the HTTP connection is always closed,
# even if reading fails. `page` holds the raw HTML bytes, which BeautifulSoup
# accepts in the same way as an open response object.
with urllib.request.urlopen(url) as response:
    page = response.read()

In [3]:
# Build the BeautifulSoup parse tree from the fetched page, using the lxml parser
soup = BeautifulSoup(markup=page, features="lxml")

In [4]:
# Collect every <table> element of the parsed page in 'all_tables' and display
# them, so the table holding the postal codes can be identified by inspection
all_tables = soup.find_all(name="table")
all_tables

[<table class="wikitable sortable">
 <tbody><tr>
 <th>Postal Code
 </th>
 <th>Borough
 </th>
 <th>Neighbourhood
 </th></tr>
 <tr>
 <td>M1A
 </td>
 <td>Not assigned
 </td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M2A
 </td>
 <td>Not assigned
 </td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M3A
 </td>
 <td>North York
 </td>
 <td>Parkwoods
 </td></tr>
 <tr>
 <td>M4A
 </td>
 <td>North York
 </td>
 <td>Victoria Village
 </td></tr>
 <tr>
 <td>M5A
 </td>
 <td>Downtown Toronto
 </td>
 <td>Regent Park, Harbourfront
 </td></tr>
 <tr>
 <td>M6A
 </td>
 <td>North York
 </td>
 <td>Lawrence Manor, Lawrence Heights
 </td></tr>
 <tr>
 <td>M7A
 </td>
 <td>Downtown Toronto
 </td>
 <td>Queen's Park, Ontario Provincial Government
 </td></tr>
 <tr>
 <td>M8A
 </td>
 <td>Not assigned
 </td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M9A
 </td>
 <td>Etobicoke
 </td>
 <td>Islington Avenue, Humber Valley Village
 </td></tr>
 <tr>
 <td>M1B
 </td>
 <td>Scarborough
 </td>
 <td>Malvern, Rouge
 </td></tr>
 <tr>
 <td>M2B

In [5]:
# Looking through the output of "all_tables" we can see that the class of our chosen
# table is "wikitable sortable". We use this to get BeautifulSoup to bring back only
# this particular table and keep it in a variable called "right_table".
right_table = soup.find('table', class_='wikitable sortable')
if right_table is None:
    # Fail with a clear message instead of an AttributeError further down
    raise ValueError("No table with class 'wikitable sortable' was found on the page")


def _cell_text(cell):
    """Return the first text node of a table cell without its trailing newline.

    A cell that contains no text node at all yields an empty string.
    """
    first_text = cell.find(string=True)
    return "" if first_text is None else first_text.rstrip("\n")


# There are three columns in the table that we want to scrape, so we set up three
# empty lists (Postalcode, Borough, Neighbourhood) to store the data in.
Postalcode = []
Borough = []
Neighbourhood = []

# Walk over the table rows. The header row is made of <th> cells, so it has no <td>
# cells and is skipped by the length check.
for row in right_table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) == 3:
        Postalcode.append(_cell_text(cells[0]))
        Borough.append(_cell_text(cells[1]))
        # The neighbourhood text ends with a newline just like the other two cells,
        # so it gets the same clean-up (otherwise values such as "Parkwoods\n" are stored).
        Neighbourhood.append(_cell_text(cells[2]))

# Define the Toronto dataframe from the three lists (column order as in the table)
torontodf = pd.DataFrame({
    'Postalcode': Postalcode,
    'Borough': Borough,
    'Neighbourhood': Neighbourhood,
})
torontodf



Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [6]:
# The author considers the data sufficiently clean (no duplicate Postalcode, no empty
# Neighborhood), so the only clean-up is to discard the rows whose Borough is 'Not assigned'.
is_unassigned = torontodf["Borough"] == 'Not assigned'
unassigned_labels = torontodf.loc[is_unassigned].index
# Drop those rows from torontodf itself (in place), then show the remaining (rows, columns)
torontodf.drop(index=unassigned_labels, inplace=True)
torontodf.shape

(103, 3)

<font color=green> This ends section 1 of the assignment </font> 

***
    

***

<font color=red> Section 2 of the assignment</font>

Getting the latitude and longitude of each postal code

In [7]:
# Load the postal-code coordinates (CSV served at this URL) into a pandas dataframe
geospatial_csv_url = "http://cocl.us/Geospatial_data"
postcodes = pd.read_csv(geospatial_csv_url)
# Preview the first five rows
postcodes.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [8]:
# Attach the coordinates to the Toronto dataframe, joining on Postalcode.
# The geospatial data names the key column "Postal Code", so rename it to match
# torontodf first (reassigning rather than mutating the frame in place).
postcodes = postcodes.rename(columns={"Postal Code": "Postalcode"})
# A left join keeps exactly the rows of torontodf, in their original order. An outer
# join would also add rows with no Borough/Neighbourhood for any postal code that
# appears only in the geospatial data, and its row order is not guaranteed to
# follow torontodf.
torontopostcodedf = pd.merge(torontodf, postcodes, how='left', on='Postalcode')
torontopostcodedf.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


<font color=red> This ends section 2 of the assignment </font> 

***

***

<font color=Blue> Section 3 of the assignment</font>

Explore and cluster the neighborhoods in Toronto

In [None]:
# Importing the necessary libraries


The Foursquare API credentials are defined in the hidden cell that follows.

In [None]:
# The code was removed by Watson Studio for sharing.