## Segmenting and Clustering Neighborhoods in Toronto

### Load data from URL

In [1]:
!pip install BeautifulSoup4

import urllib.request
from bs4 import BeautifulSoup

# The `oldid` query parameter requests one specific revision of the Wikipedia
# page, so the scraped table does not change between runs.
url = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1011037969'

# Open the response as a context manager so the underlying HTTP connection is
# closed as soon as the document has been parsed.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, 'html.parser')



### Parse the data from the HTML table

In [2]:
# Build one record per table row whose borough is assigned.
# Each record has the keys 'PostalCode', 'Borough' and 'Neighborhood'.
table_contents = []

# Only the first <table> element of the page is parsed.
table = soup.find('table')
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError('No <table> element found in the downloaded page')

for row in table.find_all('tr'):
    # Text of every <td> in the row, in column order, with newlines removed.
    values = [td.text.replace('\n', '') for td in row.find_all('td')]

    # Rows without three <td> cells (for example a header row built from
    # <th> cells) cannot produce a complete record, so they are skipped
    # rather than appended with a missing 'Neighborhood'.
    if len(values) < 3:
        continue

    postal_code, borough, neighborhood = values[:3]

    # Postal codes without an assigned borough are dropped entirely.
    if borough == 'Not assigned':
        continue

    # A missing neighborhood falls back to the borough name.
    if neighborhood == 'Not assigned':
        neighborhood = borough

    # Append only once the record is complete.
    table_contents.append({
        'PostalCode': postal_code,
        'Borough': borough,
        'Neighborhood': neighborhood,
    })

### Convert data to dataframe

In [3]:
import pandas as pd
# One DataFrame row per parsed record; the dict keys become the column names.
df = pd.DataFrame(data=table_contents)

In [4]:
# Display the scraped table as the cell output.
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


### Show the number of rows and columns in the DataFrame

In [5]:
# (number of rows, number of columns) of the scraped table.
df.shape

(103, 3)

### Add the latitude and longitude of each postal code

In [15]:
# Collapse rows that share a postal code and borough into a single row whose
# 'Neighborhood' value is the ', '-joined list of the group's neighborhoods.
df_postcode = (
    df.groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(lambda names: ', '.join(names))
    .reset_index()
)
df_postcode.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [18]:
# Coordinates table, indexed by its 'Postal Code' column so it can be joined
# against the 'PostalCode' column of df_postcode.
locgeo_df = pd.read_csv(
    'https://cocl.us/Geospatial_data',
    index_col='Postal Code',
)

# Left join: every row of df_postcode is kept and gains the columns of
# locgeo_df for its postal code.
toronto_data = df_postcode.join(
    other=locgeo_df,
    on='PostalCode',
    how='left',
)

In [19]:
# Display the postal-code table together with the joined Latitude and
# Longitude columns.
toronto_data

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
