In [1]:
!pip install BeautifulSoup4
!pip install requests



In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text

soup = BeautifulSoup(source, "lxml")

# print(soup.prettify())


In [3]:
table = soup.find("table", {"class":"wikitable sortable"})
table_rows = table.find_all('tr')

# print(table_rows)

In [4]:
empty_list=[]
for row in table_rows:
    empty_list.append([t.text.strip() for t in row.find_all("td")])

print(empty_list)

[[], ['M1A', 'Not assigned', 'Not assigned'], ['M2A', 'Not assigned', 'Not assigned'], ['M3A', 'North York', 'Parkwoods'], ['M4A', 'North York', 'Victoria Village'], ['M5A', 'Downtown Toronto', 'Regent Park, Harbourfront'], ['M6A', 'North York', 'Lawrence Manor, Lawrence Heights'], ['M7A', 'Downtown Toronto', "Queen's Park, Ontario Provincial Government"], ['M8A', 'Not assigned', 'Not assigned'], ['M9A', 'Etobicoke', 'Islington Avenue, Humber Valley Village'], ['M1B', 'Scarborough', 'Malvern, Rouge'], ['M2B', 'Not assigned', 'Not assigned'], ['M3B', 'North York', 'Don Mills'], ['M4B', 'East York', 'Parkview Hill, Woodbine Gardens'], ['M5B', 'Downtown Toronto', 'Garden District, Ryerson'], ['M6B', 'North York', 'Glencairn'], ['M7B', 'Not assigned', 'Not assigned'], ['M8B', 'Not assigned', 'Not assigned'], ['M9B', 'Etobicoke', 'West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale'], ['M1C', 'Scarborough', 'Rouge Hill, Port Union, Highland Creek'], ['M2C', 'Not assigned'

In [5]:
df = pd.DataFrame(empty_list, columns=["PostalCode", "Borough", "Neighbourhood"])

print(df.head(15))

   PostalCode           Borough                                Neighbourhood
0        None              None                                         None
1         M1A      Not assigned                                 Not assigned
2         M2A      Not assigned                                 Not assigned
3         M3A        North York                                    Parkwoods
4         M4A        North York                             Victoria Village
5         M5A  Downtown Toronto                    Regent Park, Harbourfront
6         M6A        North York             Lawrence Manor, Lawrence Heights
7         M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government
8         M8A      Not assigned                                 Not assigned
9         M9A         Etobicoke      Islington Avenue, Humber Valley Village
10        M1B       Scarborough                               Malvern, Rouge
11        M2B      Not assigned                                 Not assigned

In [6]:
df = df[~df['PostalCode'].isnull()]  # filter null list items

print(df.head(10))

   PostalCode           Borough                                Neighbourhood
1         M1A      Not assigned                                 Not assigned
2         M2A      Not assigned                                 Not assigned
3         M3A        North York                                    Parkwoods
4         M4A        North York                             Victoria Village
5         M5A  Downtown Toronto                    Regent Park, Harbourfront
6         M6A        North York             Lawrence Manor, Lawrence Heights
7         M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government
8         M8A      Not assigned                                 Not assigned
9         M9A         Etobicoke      Islington Avenue, Humber Valley Village
10        M1B       Scarborough                               Malvern, Rouge


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 180 entries, 1 to 180
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   PostalCode     180 non-null    object
 1   Borough        180 non-null    object
 2   Neighbourhood  180 non-null    object
dtypes: object(3)
memory usage: 5.6+ KB


In [8]:
df.shape

(180, 3)

In [9]:
df.drop(df[df['Borough']=="Not assigned"].index,axis=0, inplace=True)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
10,M1B,Scarborough,"Malvern, Rouge"
12,M3B,North York,Don Mills
13,M4B,East York,"Parkview Hill, Woodbine Gardens"
14,M5B,Downtown Toronto,"Garden District, Ryerson"


In [10]:
df1 = df.reset_index()
df1.head(10)

Unnamed: 0,index,PostalCode,Borough,Neighbourhood
0,3,M3A,North York,Parkwoods
1,4,M4A,North York,Victoria Village
2,5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,6,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,10,M1B,Scarborough,"Malvern, Rouge"
7,12,M3B,North York,Don Mills
8,13,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,14,M5B,Downtown Toronto,"Garden District, Ryerson"


In [11]:
df2 = df1.groupby("PostalCode").agg(lambda x: ",".join(x))
# More than one neighborhood can exist in one postal code area.
# For example, in the table on the Wikipedia page, you will notice 
# that M5A is listed twice and has two neighborhoods: Harbourfront 
# and Regent Park. These two rows will be combined into one row 
# with the neighborhoods separated with a comma as shown in row 11 
# in the above table.

df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
10,M1B,Scarborough,"Malvern, Rouge"
12,M3B,North York,Don Mills
13,M4B,East York,"Parkview Hill, Woodbine Gardens"
14,M5B,Downtown Toronto,"Garden District, Ryerson"


In [12]:
df2.loc[df2['Neighbourhood']=="Not assigned",'Neighbourhood']=df2.loc[df2['Neighbourhood']=="Not assigned",'Borough']

df2.reset_index().head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [13]:
df2.shape

(103, 2)

In [14]:
df.to_csv("df2.csv", index=False)
print("Q1 ends here \n-----------------------------------------------------------------------")
print("Q2 starts here")

PermissionError: [Errno 13] Permission denied: 'df2.csv'

In [15]:
import pandas as pd
data = pd.read_csv("Geospatial_Coordinates.csv")
data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [16]:
data.rename(columns={'Postal Code': 'PostalCode'}, inplace=True)
data.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
## join 2 tables (df2 and data)

new_df = pd.merge(df2,data,on=['PostalCode','PostalCode'],how="inner")

new_df.head(15)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [18]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 0 to 102
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PostalCode     103 non-null    object 
 1   Borough        103 non-null    object 
 2   Neighbourhood  103 non-null    object 
 3   Latitude       103 non-null    float64
 4   Longitude      103 non-null    float64
dtypes: float64(2), object(3)
memory usage: 4.8+ KB


In [19]:
# column_order = ['PostalCode', 'Borough', 'Neighbourhood','Latitude', 'Longitude']
# print(column_order)

In [20]:
# new_df=new_df[column_order]
# new_df.head(15)

In [21]:
sorted_df = new_df.sort_values(by=["Borough", "Latitude", "Longitude"], 
                               ascending=[True, True, True],
                              ignore_index=True)

sorted_df

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.672710,-79.405678
1,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049
2,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.383160
3,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",43.696948,-79.411307
4,M4S,Central Toronto,Davisville,43.704324,-79.388790
...,...,...,...,...,...
98,M6N,York,"Runnymede, The Junction North",43.673185,-79.487262
99,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512
100,M6M,York,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",43.691116,-79.476013
101,M6C,York,Humewood-Cedarvale,43.693781,-79.428191


In [22]:
print("Q2 ends here \n------------------------------------------------------------------------")
print("Q3 starts here - Clustering Visualization")

Q2 ends here 
------------------------------------------------------------------------
Q3 starts here - Clustering Visualization


In [None]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [None]:
map_toronto = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

map_toronto

In [None]:
df_for_Q3 = sorted_df[(sorted_df["Borough"] == "Downtown Toronto") | 
                      (sorted_df["Borough"] == "Central Toronto") | 
                      (sorted_df["Borough"] == "East Toronto") |
                      (sorted_df["Borough"] == "West Toronto")] 

df_for_Q3

In [None]:
df_for_Q3.info()

In [None]:
df = df_for_Q3.drop(["Borough","PostalCode","Neighbourhood"], axis=1)
df.info()

In [None]:
# Standardization (Data Pre-processing)

from sklearn.preprocessing import StandardScaler

X = df.values
X = np.nan_to_num(X)
cluster_dataset = StandardScaler().fit_transform(X)
cluster_dataset

In [85]:
# Modeling

num_clusters = 4

k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12)
k_means.fit(cluster_dataset)
labels = k_means.labels_

print(labels)

[2 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 0 0 0 0 0 3 3 3 3
 3 3]


In [86]:
# Insights

df["Label"] = labels  # Adding [labels] to df
df

Unnamed: 0,Latitude,Longitude,Label
0,43.67271,-79.405678,2
1,43.686412,-79.400049,1
2,43.689574,-79.38316,1
3,43.696948,-79.411307,1
4,43.704324,-79.38879,1
5,43.711695,-79.416936,1
6,43.712751,-79.390197,1
7,43.715383,-79.405678,1
8,43.72802,-79.38879,1
9,43.628947,-79.39442,2


In [91]:
df.groupby("Label").mean()

Unnamed: 0_level_0,Latitude,Longitude
Label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,43.669436,-79.324654
1,43.705639,-79.398114
2,43.654764,-79.383083
3,43.655066,-79.445472


In [94]:
print("4 clusters with their k-mean GET!")

4 clusters with their k-mean GET!
