# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

The needed libraries are imported, among them requests to fetch the file and BeautifulSoup to parse it

In [18]:
# importing libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as BS

# installing geocoder
!pip3 install geocoder --user

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


## First Part: Retrieving the Table and Processing it

Using request to fetch the file

In [19]:
#loading text from the web page

# source url
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# performing the request
file = requests.get(url).text


Parsing the text with Beautiful Soup and retrieving the data table

In [20]:
# parsing data with Beautiful Soup
parsable_file = BS(file, 'lxml')

# retrieving the table
data_table = parsable_file.find('table')

Converting the table into a dataframe

In [21]:
# converting the table into a list
list = pd.read_html(str(data_table), header=0)
list

[    Postcode           Borough  \
 0        M1A      Not assigned   
 1        M2A      Not assigned   
 2        M3A        North York   
 3        M4A        North York   
 4        M5A  Downtown Toronto   
 5        M5A  Downtown Toronto   
 6        M6A        North York   
 7        M6A        North York   
 8        M7A      Queen's Park   
 9        M8A      Not assigned   
 10       M9A         Etobicoke   
 11       M1B       Scarborough   
 12       M1B       Scarborough   
 13       M2B      Not assigned   
 14       M3B        North York   
 15       M4B         East York   
 16       M4B         East York   
 17       M5B  Downtown Toronto   
 18       M5B  Downtown Toronto   
 19       M6B        North York   
 20       M7B      Not assigned   
 21       M8B      Not assigned   
 22       M9B         Etobicoke   
 23       M9B         Etobicoke   
 24       M9B         Etobicoke   
 25       M9B         Etobicoke   
 26       M9B         Etobicoke   
 27       M1C       

In [22]:
# converting the list into a dataframe
df = list[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Ignoring the rows with a not assigned borough

In [23]:
# ignoring the rows with a Borough equal to "Not assigned"
df = df[df.Borough != "Not assigned"].reset_index()
df.head()

Unnamed: 0,index,Postcode,Borough,Neighbourhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,Harbourfront
3,5,M5A,Downtown Toronto,Regent Park
4,6,M6A,North York,Lawrence Heights


Combining in the same row the neighbourhoods corresponding to the same postcode

In [24]:
# grouping together all the neighbourhoods corresponding to the same postcode
df = df.groupby(by=['Postcode','Borough'])['Neighbourhood'].apply(','.join).reset_index() 
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Managing the not assigned neighbourhoods

In [25]:
# substituting the corresponding borough value into the not assigned neighbourhoods
for i, neighbourhood in enumerate(df.Neighbourhood):
    if neighbourhood == 'Not assigned':
        df.Neighbourhood[i] = df.Borough[i]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Showing the final dataframe shape

In [26]:
df.shape

(103, 3)

## Second Part: Retrieving the coordinates of each Postal Code

Trying to use geocoder

In [None]:
# import geocoder
import geocoder 

# initializing coordinate variables 
latitude = []
longitude = []

# initializing counter
i=0

# initializing counter length
length = len(df)

# initializing postal code
postal_code = df.loc[:,'Postcode']

# looping until I get the coordinates
for i in range(len(df)):
    lat_lng_coords = None
    print("entered the for loop "+str(i))
    while(lat_lng_coords is None) and i<length:
        print("entered the while loop "+str(i))
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code[i]))
        print("went past geocoder "+str(i))
        lat_lng_coords = g.latlng
    latitude.append(lat_lng_coords[0])
    longitude.append(at_lng_coords[1])
    
print(latitude)
print(longitude)



**Unfortunately, the geocoder gets stuck and the code never reaches the print("went past geocoder "+str(i)) instruction**

Using the csv instead to retrieve coordinates

In [36]:
# loading the geospatial csv into a dataframe and reading it
geosp_url = 'https://cocl.us/Geospatial_data'
df_geosp=pd.read_csv(geosp_url)
df_geosp.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [38]:
# checking all the 103 Postal Codes present in the original dataframe are present
df_geosp.shape

(103, 3)

In [39]:
# joining the two dataframes using the postal code as key
df_final = df.join(df_geosp.set_index('Postal Code'), on='Postcode')

In [40]:
# checking the final result
df_final.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848
