<h2>Segmenting and Clustering Neighborhood Notebook</h2>

In [46]:
import pandas as pd

In [47]:
import numpy as np
import requests

In [48]:
#Beautifulsoup to scrape web data
from bs4 import BeautifulSoup

In [49]:
source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

In [50]:
soup = BeautifulSoup(source.text, 'lxml')

In [51]:
#using soup object, iterate the .wikitable to get the data from the HTML page and store it into a list
data = []
columns = []
table = soup.find(class_='wikitable')

In [52]:
for index, tr in enumerate(table.find_all('tr')):
        section = []
        for td in tr.find_all(['th','td']):
            section.append(td.text.rstrip())
        
        #First row of data is the header\n,
        if (index == 0):
            columns = section
        else:
            data.append(section)
    

<h2>Create Dataframe</h2>

In [53]:
canada_df = pd.DataFrame(data = data,columns = columns)
canada_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


<h2>Data Cleanup</h2>

<li> Remove Boroughs that are 'Not assigned' </li>

In [54]:
indexNames = canada_df[canada_df['Borough'] == 'Not assigned' ].index
canada_df.drop(indexNames , inplace=True)

In [55]:
canada_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


<li>Group by neighbourhood & remove duplicate</li>

In [56]:
canada_df["Neighbourhood"] = canada_df.groupby("Postcode")["Neighbourhood"].transform(lambda neigh: ', '.join(neigh))

In [57]:
#remove duplicates
    canada_df.drop_duplicates()
    canada_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,"Lawrence Heights, Lawrence Manor"
6,M6A,North York,"Lawrence Heights, Lawrence Manor"
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,"Rouge, Malvern"
11,M1B,Scarborough,"Rouge, Malvern"
13,M3B,North York,Don Mills North


In [58]:
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough
    canada_df['Neighbourhood'].replace("Not assigned", canada_df["Borough"],inplace=True)
    canada_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,"Lawrence Heights, Lawrence Manor"
6,M6A,North York,"Lawrence Heights, Lawrence Manor"


<li>Number of Rows in Dataframe</li>

In [59]:
 canada_df.shape

(210, 3)