<h2>Segmenting and Clustering Neighborhood Notebook</h2>

In [1]:
import pandas as pd
import numpy as np
import requests

In [2]:
#Beautifulsoup to scrape web data
from bs4 import BeautifulSoup

In [3]:
# Download the Wikipedia page listing Canadian postal codes that begin with "M".
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails here rather than letting an HTTP error page reach the parser.
source = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
source.raise_for_status()

In [4]:
# Parse the downloaded HTML text with the lxml parser.
soup = BeautifulSoup(source.text, features='lxml')

In [5]:
# Locate the first element with class "wikitable"; its rows are read in the next cell.
table = soup.find(class_='wikitable')

# Containers for the scraped table: `columns` holds the header labels, `data` the body rows.
columns = []
data = []

In [6]:
# Walk every row of the table and collect the text of its header/data cells.
# `data` and `columns` are rebuilt from scratch here so that re-running this cell
# does not append the same rows a second time.
data = []
columns = []
for index, tr in enumerate(table.find_all('tr')):
    # strip() removes surrounding whitespace from each cell's text.
    section = [cell.text.strip() for cell in tr.find_all(['th', 'td'])]

    # The first row of the table holds the column headers.
    if index == 0:
        columns = section
    else:
        data.append(section)

<h2>Create Dataframe</h2>

In [7]:
# Assemble the scraped rows into a DataFrame, labelling columns with the table's header row.
canada_df = pd.DataFrame(data, columns=columns)
canada_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


<h2>Data Cleanup</h2>

<li>Remove Boroughs that are 'Not assigned'</li>

In [8]:
# Collect the index labels of rows whose Borough is 'Not assigned', then drop those rows in place.
indexNames = canada_df.loc[canada_df['Borough'].eq('Not assigned')].index
canada_df.drop(index=indexNames, inplace=True)
canada_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


<li>Group by neighbourhood &amp; remove duplicates</li>

In [9]:
# Remove exact duplicate rows. drop_duplicates() returns a new DataFrame instead of
# modifying canada_df, so the result must be assigned back for the removal to take effect.
canada_df = canada_df.drop_duplicates()
canada_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [10]:
# If a row has a borough but a 'Not assigned' neighbourhood, use the borough name as the
# neighbourhood. The assignment goes through .loc on canada_df itself so the frame is
# really updated; replace(..., inplace=True) on a selected column operates on an
# intermediate object and was being handed a whole Series as the replacement value.
unassigned = canada_df['Neighbourhood'] == 'Not assigned'
canada_df.loc[unassigned, 'Neighbourhood'] = canada_df.loc[unassigned, 'Borough']
canada_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


<li> Shape of Dataframe </li>

In [11]:
 # Report the (rows, columns) dimensions of the cleaned DataFrame.
 canada_df.shape

(210, 3)

<h2> New Dataframe with latitude & longitude </h2>

In [12]:
# Load the postal-code coordinates CSV from its URL and preview the first rows.
df_coord = pd.read_csv(filepath_or_buffer="http://cocl.us/Geospatial_data")
df_coord.head(n=12)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [13]:
# Rename the 'Postal Code' column to 'Postcode' so it matches the key column in canada_df.
df_coord.rename({'Postal Code': 'Postcode'}, axis='columns', inplace=True)

In [14]:
# Join the borough/neighbourhood table with the coordinates on the shared 'Postcode' column.
# An inner join keeps only the postcodes that appear in both frames.
result = pd.merge(canada_df, df_coord, how='inner', on='Postcode')

In [15]:
# Preview the first ten rows of the merged frame.
result.head(n=10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
4,M6A,North York,Lawrence Manor,43.718518,-79.464763
5,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
6,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
7,M1B,Scarborough,Rouge,43.806686,-79.194353
8,M1B,Scarborough,Malvern,43.806686,-79.194353
9,M3B,North York,Don Mills North,43.745906,-79.352188
