# Peer Graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

In [15]:
# Import Libraries
#
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
##

 ### PART 1 START
 We're going to scrape our web source and clean our table to start.

In [16]:
source_1 = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fetch the page. The timeout stops a stalled connection from hanging the
# notebook, and raise_for_status() fails loudly on an HTTP error instead of
# parsing an error page that may not contain the table we need.
response = requests.get(source_1, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html5lib')

# Eliminate hyperlinks: unwrap() replaces each <a> tag with its contents,
# modifying the parsed tree in place (no reassignment needed).
for link in soup.table.find_all('a'):
    link.unwrap()


Now we'll feed the table, rendered as HTML text, into a pandas DataFrame.

In [17]:
# Reading table into pandas.
# read_html returns a list of DataFrames (one per <table> found); cells that
# read "Not assigned" are loaded as NaN. The markup is wrapped in StringIO
# because passing a literal HTML string to read_html is deprecated in recent
# pandas releases; file-like input is accepted by older releases as well.
table_html = soup.table.prettify()
panda_soup = pd.read_html(StringIO(table_html), flavor='bs4', na_values="Not assigned")


In [18]:
# Let's check it out.
# Work on a copy so that in-place changes to df never alter the parsed
# table that stays available as panda_soup[0].
df = panda_soup[0].copy()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


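Let's also examine the shape we're starting with.
`df.shape` would work just as well when the cells are run sequentially. I'm reading the shape from `panda_soup[0]` because the cells below modify `df`. Note that `panda_soup[0]` is only an independent master copy if `df` was created with `.copy()`; otherwise both names refer to the same DataFrame, and in-place changes to `df` show up here too.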

In [19]:
# (rows, columns) of the first table returned by pd.read_html
panda_soup[0].shape

(287, 3)

#### Handling our missing values

In [20]:
# Step 1: get rid of every row that has no Borough.
# Step 2: replace each missing Neighbourhood with that row's Borough name.
#
# The fill is done column-to-column and assigned back: calling
# DataFrame.fillna(Series) matches the Series index against the frame's
# *column labels* (so nothing would be filled here), and fillna returns a new
# object rather than modifying df.
df = (
    df.dropna(subset=['Borough'])
      .assign(Neighbourhood=lambda frame: frame['Neighbourhood'].fillna(frame['Borough']))
)

# Preview only the first rows instead of dumping the whole frame
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


Here we'll concatenate all the neighborhoods that exist within the same postcode

In [21]:
# Collapse rows that share a Postcode/Borough pair into a single row whose
# Neighbourhood is the comma-separated list of the group's names.
postcode_groups = df.groupby(['Postcode', 'Borough'])
joined_names = postcode_groups['Neighbourhood'].apply(', '.join)
df = joined_names.reset_index()

In [22]:
# Preview the first five rows of the merged table
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Finally, as per the project requirements, here is the ending shape

In [23]:
# (rows, columns) of the cleaned table
df.shape

(103, 3)

## PART 1 END

--- --- --- 

## Part 2 START

In [24]:
!wget https://cocl.us/Geospatial_data

--2020-03-12 19:02:58--  https://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 169.48.113.194, 158.85.108.86, 158.85.108.83
Connecting to cocl.us (cocl.us)|169.48.113.194|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-03-12 19:02:59--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.29.197
Connecting to ibm.box.com (ibm.box.com)|107.152.29.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-03-12 19:02:59--  https://ibm.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Reusing existing connection to ibm.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.ent.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr

In [25]:
# Load the coordinates file downloaded in the previous cell
gp_data = pd.read_csv(filepath_or_buffer='Geospatial_data')


Now that we've downloaded and read in the data, we'll rename its `Postal Code` column to `Postcode` so the key column has the same name in both tables

In [26]:
# Give the key column the same name it has in df so the two can be joined
gp_data = gp_data.rename(columns={'Postal Code': 'Postcode'})

Now we'll do a full outer join using pandas' merge function on the Postcode key

In [27]:
# Full outer join on Postcode: keeps postcodes that appear in either table
new_frame = df.merge(gp_data, on='Postcode', how='outer')


This creates the DataFrame we need for the result of Part 2

In [28]:
# Preview the first twelve rows of the joined table
new_frame.head(n=12)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


## Part 2 END

---

## Part 3 START

In [3]:
# folium is not always preinstalled; uncomment the line below to install it if needed
#!pip install folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 17.6MB/s eta 0:00:01
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1


In [12]:
# Import required libraries

import folium
from sklearn.neighbors import KNeighborsClassifier

In [32]:
# Base map centred on the Toronto coordinates (latitude, longitude)
Toronto_lat, Toronto_long = 43.6487, -79.38544
map_center = [Toronto_lat, Toronto_long]
main_map = folium.Map(location=map_center, zoom_start=10)

In [33]:
# Only rows with both coordinates can be placed on the map; the outer join can
# leave postcodes that matched no coordinates with NaN Latitude/Longitude.
plottable = new_frame.dropna(subset=['Latitude', 'Longitude'])

# Add one circle marker per postcode, labelled "<postcode>, <borough>"
for lat, lng, borough, postcode in zip(plottable['Latitude'], plottable['Longitude'],
                                       plottable['Borough'], plottable['Postcode']):
    # parse_html=True makes the popup treat the label as text, not markup
    label = folium.Popup('{}, {}'.format(postcode, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(main_map)


In [34]:
# Last expression in the cell: displays the map object built above
main_map