This notebook is part of the Data Science Capstone project.

1) Scrape the data from the wiki page and convert the required table into a dataframe.

In [1]:
import os

import folium
import pandas as pd
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

print("Libraries imported")

Libraries imported


In [2]:
# Web scraping
# pd.read_html returns a list with one DataFrame per table it finds on the page.
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
scrape = pd.read_html(wiki_url)

print("No of tables in the webpage: ", len(scrape))
print("\nSelecting the first table")

# The postal-code table is taken to be the first one in the list.
df = scrape[0]
df.head()

No of tables in the webpage:  3

Selecting the first table


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [3]:
# Sanity checks on the scraped table before any rows are dropped.

borough_assigned = df["Borough"] != "Not assigned"
neighborhood_missing = df["Neighborhood"] == "Not assigned"

# Rows that have a borough but no neighborhood name
print("No of assigned boroughs with unassigned neighborhood :",
      (borough_assigned & neighborhood_missing).sum())

# Rows that have no borough at all
print("No of unassigned boroughs :",
      (~borough_assigned).sum())

No of assigned boroughs with unassigned neighborhood : 0
No of unassigned boroughs : 77


In [4]:
# Keep only the rows whose borough is assigned, renumber the index from 0,
# and report how many rows are left.

has_borough = df["Borough"].ne("Not assigned")
can_df = df.loc[has_borough].reset_index(drop=True)
print("No of rows of new DataFrame : ", len(can_df))

No of rows of new DataFrame :  103


2) Load the geospatial coordinates data and merge it with the neighborhood data.

In [5]:
# Load the coordinates data
#
# The CSV location used to be a hard-coded absolute path on one machine.
# It can now be overridden by setting the GEOSPATIAL_COORDINATES_CSV
# environment variable; when the variable is not set, the original path is
# used, so existing behaviour is unchanged.

coordinates_csv = os.environ.get(
    "GEOSPATIAL_COORDINATES_CSV",
    "C:\\Users\\91814\\Downloads\\Geospatial_Coordinates.csv",
)
coordinates = pd.read_csv(coordinates_csv)
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [6]:
# Attach latitude/longitude to every postal code.
# A left join keeps all rows of can_df, matched on the "Postal Code" column.

toronto_df = can_df.merge(coordinates, how="left", on="Postal Code")
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [7]:
# Get the latitude and longitude values for Toronto

geolocator = Nominatim(user_agent="loc_finder")

# NOTE: this rebinds `coordinates`, which until this cell held the DataFrame
# loaded from the geospatial CSV. After running this cell, the merge cell can
# only be re-run once the CSV-loading cell has been run again.
coordinates = geolocator.geocode("Toronto, Ontario")

# geocode() returns None when the service finds no match. Fail here with a
# clear message instead of an AttributeError in the map cells that read
# coordinates.latitude / coordinates.longitude.
if coordinates is None:
    raise ValueError("Geocoding 'Toronto, Ontario' returned no result")

coordinates

Location(Toronto, Golden Horseshoe, Ontario, M5H 2N2, Canada, (43.6534817, -79.3839347, 0.0))

In [8]:
# Generate map and plot the neighborhoods

# Base map centred on the latitude/longitude held in `coordinates`.
toronto_center = [coordinates.latitude, coordinates.longitude]
map_toronto = folium.Map(location=toronto_center, zoom_start=10)

# One circle marker per row of toronto_df; the popup shows "<neighborhood>, <borough>".
marker_rows = zip(
    toronto_df['Latitude'],
    toronto_df['Longitude'],
    toronto_df['Borough'],
    toronto_df['Neighborhood'],
)
for lat, long, borough, neighborhood in marker_rows:
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color="Teal",
        fill=True,
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

3) Clustering the neighborhoods based on latitude and longitude values

In [9]:
# Setting the number of clusters and fitting the data

k=5

# n_init is pinned explicitly: the library default for this parameter changed
# between scikit-learn releases, so relying on it can give different labels
# (and a FutureWarning) depending on the installed version. 10 is the value
# older releases used by default.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=4)

# Cluster on the two coordinate columns only.
kmeans.fit(toronto_df[["Latitude","Longitude"]])
kmeans.labels_

array([0, 0, 4, 2, 4, 1, 3, 0, 0, 4, 2, 1, 3, 0, 0, 4, 4, 1, 3, 0, 4, 4,
       3, 0, 4, 4, 3, 2, 2, 0, 4, 4, 3, 2, 2, 0, 4, 4, 3, 2, 2, 0, 4, 4,
       0, 2, 1, 0, 4, 1, 1, 3, 2, 1, 0, 2, 1, 1, 0, 2, 1, 2, 2, 1, 1, 3,
       2, 2, 4, 1, 1, 3, 2, 2, 4, 4, 1, 1, 3, 4, 4, 1, 3, 4, 4, 3, 4, 4,
       1, 1, 3, 4, 4, 1, 1, 3, 4, 4, 1, 4, 0, 1, 1])

In [10]:
# Store each row's k-means label in a "Cluster" column of toronto_df.

toronto_df = toronto_df.assign(Cluster=kmeans.labels_)
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster
0,M3A,North York,Parkwoods,43.753259,-79.329656,0
1,M4A,North York,Victoria Village,43.725882,-79.315572,0
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,4
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,2
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,4


In [11]:
# Generate map and plot the clustered neighborhood

map_toronto = folium.Map(location=[coordinates.latitude, coordinates.longitude], zoom_start=10)

# Palette for the cluster markers. Colours are handed out in the order in
# which cluster labels first appear in toronto_df, not by label value.
col_series=["Blue","Red","Yellow","Green","Black","Pink","Purple","Grey","Teal"]

for i, clust in enumerate(toronto_df["Cluster"].unique()):
    filtered = toronto_df[toronto_df["Cluster"] == clust]
    # Wrap around the palette so that having more clusters than colours
    # reuses colours instead of raising IndexError.
    cluster_color = col_series[i % len(col_series)]
    for lat, long, borough, neighborhood in zip(filtered['Latitude'], filtered['Longitude'],
                                                filtered['Borough'], filtered['Neighborhood']):
        # Popup text is "<neighborhood>, <borough>".
        label = '{}, {}'.format(neighborhood, borough)
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, long],
            radius=5,
            popup=label,
            color=cluster_color,
            fill=True,
            fill_opacity=0.7,
            parse_html=False).add_to(map_toronto)

map_toronto