Title: Segmenting and Clustering Neighborhoods in Toronto

In [58]:
import pandas as pd
import numpy as np
import requests 
from bs4 import BeautifulSoup

import os

Retrieving Data from Wikipedia URL and Creating Pandas Dataframe

In [59]:
# Download the Wikipedia page listing Toronto ("M") postal codes and turn its
# first sortable wikitable into a DataFrame with one row per postal code.
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(wiki_url, timeout=30)
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
soup = BeautifulSoup(response.text, 'html.parser')

wiki_table = soup.find('table', {'class': 'wikitable sortable'})
if wiki_table is None:
    raise ValueError("No 'wikitable sortable' table found at " + wiki_url)
# html.parser only exposes <tbody> when the markup contains it, so fall back
# to the <table> element itself when it is absent.
table = wiki_table.tbody or wiki_table

rows = table.find_all('tr')
# The header row supplies the column names; cell text ends with a newline.
columns = [th.text.replace('\n', '') for th in rows[0].find_all('th')]

# Collect every data row first and build the frame once: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on each iteration.
records = []
for row in rows[1:]:
    tds = row.find_all('td')
    if not tds:
        # Rows without any <td> (e.g. repeated header rows) carry no data.
        continue
    records.append([td.text.replace('\n', '') for td in tds])

# Raises if a row's cell count does not match the header, as before.
df = pd.DataFrame(records, columns=columns)

# Sort by postal code so the rows line up when compared with the coordinates table.
df = df.sort_values(by='Postal Code').reset_index(drop=True)
df.head(10)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M1B,Scarborough,"Malvern, Rouge"
2,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae
6,M1J,Scarborough,Scarborough Village
7,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
8,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
9,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"


In [60]:
# Display the last 10 rows of the sorted table (unassigned boroughs not yet filtered out).
df.tail(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
170,M9N,York,Weston
171,M9P,Etobicoke,Westmount
172,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
173,M9S,Not assigned,Not assigned
174,M9T,Not assigned,Not assigned
175,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
176,M9W,Etobicoke,"Northwest, West Humber - Clairville"
177,M9X,Not assigned,Not assigned
178,M9Y,Not assigned,Not assigned
179,M9Z,Not assigned,Not assigned


Cleaning the DataFrame According to the Assignment Instructions

In [61]:
# Only process the cells that have an assigned borough; ignore rows whose
# borough is "Not assigned".
# Note: every "Not assigned" neighbourhood also has a "Not assigned" borough,
# so filtering on Borough alone is sufficient.
#
# Use an explicit boolean filter instead of `df['Borough'].replace(..., inplace=True)`:
# mutating a column selection in place is chained assignment, which warns in
# recent pandas and does not modify `df` at all under Copy-on-Write.
has_borough = df['Borough'].notna() & (df['Borough'] != "Not assigned")
df = df[has_borough]

# The assignment asks for the shape of the cleaned dataframe to be printed.
print(df.shape)

# Sort by postal code so the rows line up when compared with the coordinates table.
df = df.sort_values(by='Postal Code').reset_index(drop=True)
df.head(10)

(103, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [62]:
# Display the last 10 rows of the cleaned table.
df.tail(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
93,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
94,M9B,Etobicoke,"West Deane Park, Princess Gardens, Martin Grov..."
95,M9C,Etobicoke,"Eringate, Bloordale Gardens, Old Burnhamthorpe..."
96,M9L,North York,Humber Summit
97,M9M,North York,"Humberlea, Emery"
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
102,M9W,Etobicoke,"Northwest, West Humber - Clairville"


Getting the Latitude and Longitude Coordinates of Each Neighborhood

In [63]:
# Load the latitude/longitude of every postal code from the CSV in the
# course repository.
#
# Read the raw file directly instead of scraping GitHub's HTML rendering of
# it: the rendered table depends on GitHub's CSS class names, and the old
# parsing loop indexed a fourth cell inside its `len(...) == 3` branch, which
# would raise IndexError if that branch were ever taken.
coords_url = "https://raw.githubusercontent.com/cneves20/Coursera_Capstone/main/Geospatial_Coordinates.csv"

df2 = pd.read_csv(
    coords_url,
    header=0,  # first line is the header row; replaced by the names below
    names=['Postal Code', 'Latitude', 'Longitude'],
    # Keep coordinates as text, exactly as the scraped cells were; convert
    # with pd.to_numeric before doing arithmetic or plotting with them.
    dtype=str,
)

# Sort by postal code so the rows line up when compared with the borough table.
df2 = df2.sort_values(by='Postal Code').reset_index(drop=True)
df2.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.8066863,-79.1943534
1,M1C,43.7845351,-79.1604971
2,M1E,43.7635726,-79.1887115
3,M1G,43.7709921,-79.2169174
4,M1H,43.773136,-79.2394761
5,M1J,43.7447342,-79.2394761
6,M1K,43.7279292,-79.2620294
7,M1L,43.7111117,-79.2845772
8,M1M,43.716316,-79.2394761
9,M1N,43.692657,-79.2648481


In [64]:
# Display the last 10 rows of the coordinates table.
df2.tail(10)

Unnamed: 0,Postal Code,Latitude,Longitude
93,M9A,43.6678556,-79.5322424
94,M9B,43.6509432,-79.5547244
95,M9C,43.6435152,-79.5772008
96,M9L,43.7563033,-79.5659633
97,M9M,43.7247659,-79.5322424
98,M9N,43.706876,-79.5181884
99,M9P,43.696319,-79.5322424
100,M9R,43.6889054,-79.5547244
101,M9V,43.7394164,-79.5884369
102,M9W,43.7067483,-79.5940544


Joining Both Dataframes

In [67]:
# Attach coordinates to each postal code: merge the borough table with the
# coordinates table on their shared 'Postal Code' column (pandas' default
# inner join), then order the result by postal code with a fresh 0..n-1 index.
merged_inner = (
    df.merge(df2, on='Postal Code')
      .sort_values(by='Postal Code')
      .reset_index(drop=True)
)
merged_inner.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.8066863,-79.1943534
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.7845351,-79.1604971
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7635726,-79.1887115
3,M1G,Scarborough,Woburn,43.7709921,-79.2169174
4,M1H,Scarborough,Cedarbrae,43.773136,-79.2394761
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.5181884
99,M9P,Etobicoke,Westmount,43.696319,-79.5322424
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.6889054,-79.5547244
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.7394164,-79.5884369


In [66]:
# Display the last 10 rows of the merged table.
merged_inner.tail(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
93,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.6678556,-79.5322424
94,M9B,Etobicoke,"West Deane Park, Princess Gardens, Martin Grov...",43.6509432,-79.5547244
95,M9C,Etobicoke,"Eringate, Bloordale Gardens, Old Burnhamthorpe...",43.6435152,-79.5772008
96,M9L,North York,Humber Summit,43.7563033,-79.5659633
97,M9M,North York,"Humberlea, Emery",43.7247659,-79.5322424
98,M9N,York,Weston,43.706876,-79.5181884
99,M9P,Etobicoke,Westmount,43.696319,-79.5322424
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.6889054,-79.5547244
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.7394164,-79.5884369
102,M9W,Etobicoke,"Northwest, West Humber - Clairville",43.7067483,-79.5940544
