# Segmenting and Clustering Neighborhoods in Toronto

## Loi Dinh

In [21]:
from bs4 import BeautifulSoup
import requests
import re
import json
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
import lxml.html as lh
import pandas as pd
import numpy as np

### Scrape Data from Wikipedia

In [22]:
# Wikipedia page with the "M" list of Canadian postal codes that this notebook scrapes.
url = (
    "https://en.wikipedia.org/wiki/"
    "List_of_postal_codes_of_Canada:_M"
)

In [23]:
# Download the page HTML. A timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text

# Parse the HTML with the parser that ships with the standard library.
soup = BeautifulSoup(source, 'html.parser')

In [24]:
# Locate the postal-code table. A CSS selector matches any table that carries
# both classes, whereas class_='wikitable sortable' only matches that exact
# class-attribute string and breaks if classes are added or reordered.
my_table = soup.select_one("table.wikitable.sortable")
if my_table is None:
    # Fail here with a clear message rather than with an AttributeError
    # on None in the cell that iterates over the rows.
    raise RuntimeError("Could not find a 'wikitable sortable' table at " + url)

In [25]:
# Column buffers filled from the table: A = postcode, B = borough,
# C = neighbourhood (these names are consumed by the DataFrame cell below).
A = []
B = []
C = []
for row in my_table.find_all('tr'):
    cells = row.find_all('td')
    # Only rows with exactly three data cells are kept; rows that contain
    # no <td> elements (e.g. the header row) are skipped.
    if len(cells) == 3:
        # Strip surrounding whitespace/newlines from every cell, not just the
        # last one, so later equality checks such as == 'Not assigned' and the
        # postcode-based merge are not defeated by stray whitespace.
        postcode, borough, neighbourhood = (cell.text.strip() for cell in cells)
        A.append(postcode)
        B.append(borough)
        C.append(neighbourhood)

### Create a DataFrame from the Scraped Data

In [26]:
# Assemble the three scraped lists into a single frame, one column per list.
df = pd.DataFrame({
    'Postcode': A,
    'Borough': B,
    'Neighbourhood': C,
})


### Drop rows where Borough is 'Not assigned'

In [27]:

# Keep only the rows whose borough is assigned, then renumber the index from 0.
df = (
    df.loc[df['Borough'].ne('Not assigned')]
      .reset_index(drop=True)
)

### Merge rows with the same Postcode

In [28]:
# Collapse rows that share a postcode and borough into one row whose
# neighbourhood names are joined with commas.
df = (
    df.groupby(['Postcode', 'Borough'], as_index=False)
      .agg({'Neighbourhood': ','.join})
)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


### Replace 'Not assigned' neighbourhoods with the borough name

In [29]:
# Boolean mask of the rows whose neighbourhood is still unassigned.
na_rows = df['Neighbourhood'].eq('Not assigned')
# For those rows, use the borough name as the neighbourhood.
df['Neighbourhood'] = df['Neighbourhood'].mask(na_rows, df['Borough'])
# Show the rows that were just filled in.
df.loc[na_rows]

Unnamed: 0,Postcode,Borough,Neighbourhood


### Shape of the DataFrame

In [30]:
# Display the (rows, columns) dimensions of the cleaned frame.
df.shape

(103, 3)

# Part 2: Add Latitude and Longitude Coordinates

In [31]:
# URL of the CSV file holding the coordinates for each postal code.
geocoder = "https://cocl.us/Geospatial_data"
# Load the CSV into a frame; the preview printed in the next cell shows the
# columns 'Postal Code', 'Latitude' and 'Longitude'.
coords = pd.read_csv(geocoder)

In [32]:
# Print the frame's dimensions followed by a preview of its first rows.
print(f'{coords.shape} \n {coords.head()}')

(103, 3) 
   Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476


### Merge the two DataFrames

In [35]:
# Index both frames by postal code so they can be aligned on it.
df_temp, coords_temp = df.set_index('Postcode'), coords.set_index('Postal Code')
# Place the frames side by side, keeping only postal codes present in both.
df_coords = pd.concat(
    objs=[df_temp, coords_temp],
    axis='columns',
    join='inner',
)

In [37]:
# Name the postcode index and move it back into a regular 'Postcode' column.
# Building a new frame (rather than mutating with inplace=True) and skipping
# the step when the column already exists keeps this cell safe to re-run;
# otherwise a second run fails because 'Postcode' cannot be inserted twice.
if 'Postcode' not in df_coords.columns:
    df_coords = df_coords.rename_axis('Postcode').reset_index()

In [39]:
# Preview the first rows of the merged frame (neighbourhood data plus coordinates).
df_coords.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
