# Segmenting and Clustering Neighborhoods in the City of Toronto

In [3]:
# Install pandas with the interpreter that runs this kernel (sys.executable),
# so the package is installed for the same Python that executes the import below.
import sys
!{sys.executable} -m pip install pandas
import pandas as pd

# Install lxml the same way, then import lxml.html for reading data from HTML
# (pd.read_html is used further down to parse the Wikipedia page).
!{sys.executable} -m pip install lxml
import lxml.html

Collecting pandas
  Downloading pandas-1.0.5-cp38-cp38-macosx_10_9_x86_64.whl (10.2 MB)
[K     |████████████████████████████████| 10.2 MB 3.3 MB/s eta 0:00:01    |███████████████▋                | 5.0 MB 3.4 MB/s eta 0:00:02
[?25hCollecting pytz>=2017.2
  Downloading pytz-2020.1-py2.py3-none-any.whl (510 kB)
[K     |████████████████████████████████| 510 kB 3.3 MB/s eta 0:00:01
Collecting numpy>=1.13.3
  Downloading numpy-1.19.1-cp38-cp38-macosx_10_9_x86_64.whl (15.3 MB)
[K     |████████████████████████████████| 15.3 MB 3.0 MB/s eta 0:00:01
Installing collected packages: pytz, numpy, pandas
Successfully installed numpy-1.19.1 pandas-1.0.5 pytz-2020.1
Collecting lxml
  Downloading lxml-4.5.2-cp38-cp38-macosx_10_9_x86_64.whl (4.5 MB)
[K     |████████████████████████████████| 4.5 MB 2.4 MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.2


## 1. Scraping the neighborhood table from Wikipedia and exploring the dataset

In [4]:
# Fetch the Wikipedia list of Canadian postal codes beginning with "M" and
# parse every HTML table on the page into its own DataFrame.
path = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
Toronto_temp = pd.read_html(io=path)

# The postal code / borough / neighbourhood table is the first one returned.
Toronto_data = Toronto_temp[0]
Toronto_data.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [6]:
# Row labels of the postal codes whose Borough is 'Not assigned'.
indexs = Toronto_data.loc[Toronto_data['Borough'] == 'Not assigned'].index

# Remove those rows by rebinding Toronto_data to a new frame rather than
# dropping in place: the frame parsed from the page (still referenced by
# Toronto_temp[0]) is the same object, and an in-place drop would alter it too,
# leaving no unmodified copy of the scraped table in the kernel.
Toronto_data = Toronto_data.drop(index=indexs)
Toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
# Select the rows whose Neighbourhood is still 'Not assigned' and report the
# shape of that selection; the first element is the number of such rows.
Toronto_data.loc[Toronto_data['Neighbourhood'].eq('Not assigned')].shape

(0, 3)

#### The result above shows that no remaining row has 'Not assigned' in the Neighbourhood column.
### Next, rename the Postal Code column to PostalCode and reset the DataFrame index so that it uses the pandas default numbering 0, 1, 2, ... again.

In [15]:
# Rename 'Postal Code' to 'PostalCode' and renumber the rows from 0 in a single
# chained expression. Both steps return new frames, so nothing is mutated in
# place and the frames displayed by earlier cells stay as they were shown.
# drop=True discards the old row labels instead of keeping them as a column.
Toronto_data = (
    Toronto_data
    .rename(columns={'Postal Code': 'PostalCode'})
    .reset_index(drop=True)
)
Toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## 2. Get the latitude and the longitude coordinates of each neighborhood

The geocoder did not work: the code ran for a long time without returning any coordinates, so I used the provided CSV file instead.

In [46]:
# Read the provided CSV, which lists a latitude and longitude per postal code.
url = 'http://cocl.us/Geospatial_data'
geodata = pd.read_csv(filepath_or_buffer=url)
geodata.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [52]:
# Give the coordinate table the same key column name as Toronto_data.
# rename() returns a new frame, so the frame is not modified in place.
geodata = geodata.rename(columns={'Postal Code': 'PostalCode'})

# Attach the coordinates to each postal code. The join key and join type are
# spelled out instead of relying on merge's defaults (all shared column names,
# inner join), so the result cannot change silently if either frame later gains
# another column with a common name. An inner join keeps only postal codes
# present in both frames.
Toronto_merged = Toronto_data.merge(geodata, how='inner', on='PostalCode', copy=False)

In [53]:
# Display the merged table (postal code, borough, neighbourhood, coordinates).
# The output is abbreviated: rows between the first and last few are shown as "...".
Toronto_merged

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
