In [49]:
# Scrape the web for toronto's data 
# import libraries

from bs4 import BeautifulSoup
import requests as requests 
import pandas as pd

In [50]:
URL='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response= requests.get(URL)
soup= BeautifulSoup(response.text, 'html.parser')

#table = soup.find('table', {'class':'wikitable sortable jquery-tablesorter'})

print(soup)


<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of postal codes of Canada: M - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"7d7f7700-9037-4d5c-b82b-6c37b4edbcf9","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":975466835,"wgRevisionId":975466835,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Communications in Ontario","

In [51]:
# Classify Tables, Rows And Columns

table = soup.find('table', {'class':'wikitable sortable'}).tbody

rows = table.find_all('tr')

columns = [v.text.replace('\n', '' ) for v in rows[0].find_all('th')]

print(columns)

['Postal Code', 'Borough', 'Neighbourhood']


In [52]:
# Create a DataFrame And Populate It 

df= pd.DataFrame(columns=columns)

for i in range(1,len(rows)):
    tds = rows[i].find_all('td')
    
    if len(tds) == 3:
        values= [tds[0].text.replace('\n',''), tds[1].text.replace('\n',''), tds[2].text.replace('\n','')]
    else:
        values= [td.text.replace('\n','') for td in tds]
        
    df = df.append(pd.Series(values, index=columns), ignore_index=True)
    

df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [53]:
# Deal with not assigned values

import numpy as np

# Replace Not assigned with NAN
df.replace("Not assigned", np.nan, inplace= True)

# Identify Missing Values using isnull() or notnull()
missing_data= df.isnull()
missing_data

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,False,True,True
1,False,True,True
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,False
7,False,True,True
8,False,False,False
9,False,False,False


<h4>Count missing values in each column</h4>
<p>
Using a for loop in Python, we can quickly figure out the number of missing values in each column. As mentioned above, "True" represents a missing value, "False"  means the value is present in the dataset.  In the body of the for loop the method  ".value_counts()"  counts the number of "True" values. 
</p>

In [54]:
for column in missing_data.columns.values.tolist():
    print(column)
    print(missing_data[column].value_counts())
    print("")

Postal Code
False    180
Name: Postal Code, dtype: int64

Borough
False    103
True      77
Name: Borough, dtype: int64

Neighbourhood
False    103
True      77
Name: Neighbourhood, dtype: int64



<p> According to the above findings there are 77 missing values in Districts And Neigbourhoods. I haven't found any particular Neighbourhood with a assigned District but a null value so I will proceed with taking out all the rows with Not Assigned Values.</p>

In [55]:
# Simply drop whole row with NaN in "Neighbourhoods" and 'Districts' Column

df.dropna(subset=['Borough','Neighbourhood'], axis=0, inplace=True)

# reset index, because we dropped two rows 
df.reset_index(drop=True, inplace=True)

df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [56]:
# The Shape of the Data Frame
df.shape

(103, 3)

In [57]:
# Import file containing the co-ordinates containing longitudes and latitudes into a dataframe df_cord
other_path= 'http://cocl.us/Geospatial_data'
df_cord =  pd.read_csv(other_path)

In [58]:
df_cord

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [69]:
left= df
right= df_cord
pd.merge_ordered(left, right)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


<p> 1. The above data frame contains all the co ordinates of all the postal codes in different boroughs, single and multiple neighbourhoods are enlisted in the neighborhoods depending upon the number of neighbourhoods located in a borough with respect to their postal codes as demanded in the assignment.<br> 2. The next step is to implement K- Means clustering on the derived data frame.</p>