# Segmenting and Clustering Neighborhoods in Toronto

Step 1: Import all necessary libraries 

In [2]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Lift the display limits so frames are shown with every column and every row
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): pandas 1.0 deprecated this import path in favour of pandas.json_normalize;
# confirm against the pandas version installed in the target environment.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# The shell command below is active (not commented out), so the pinned folium
# release is installed through conda every time this cell runs.
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  54.43 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  35.91 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  39.02 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  45.23 MB/s
Libraries imported.


Step 2: Scrape the postal-code table from the Wikipedia page

In [97]:
# Wikipedia page listing the Canadian postal codes that start with "M".
source = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page using the URL defined above rather than a duplicated literal.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook on an HTTP error instead of letting
# an error page flow into the parsing cells.
response = requests.get(source, timeout=30)
response.raise_for_status()
html_doc = response.text

In [29]:
from bs4 import BeautifulSoup

# Parse the downloaded markup into a navigable tree using the 'html.parser' backend.
soup = BeautifulSoup(html_doc, features='html.parser')

In [98]:
# Column containers for the postal code, borough and neighbourhood values.
PostalCodeList, BoroughList, NeighborhoodList = [], [], []

# Locate the <tbody> element of the parsed page that holds the table cells.
tbody = soup.find(name='tbody')


In [99]:
# Flatten every <td> of the table body into its whitespace-stripped text.
cell_texts = [td.text.strip() for td in tbody.find_all('td')]

# The cells are consumed in repeating groups of three (postal code, borough,
# neighbourhood). An incomplete group would leave the columns with different
# lengths, so fail here with a clear message.
if len(cell_texts) % 3 != 0:
    raise ValueError(
        'Expected the number of <td> cells to be a multiple of 3, got {}'.format(len(cell_texts))
    )

# Rebuild the three lists from scratch instead of appending to lists created
# in another cell, so that re-running this cell does not duplicate every row.
PostalCodeList = cell_texts[0::3]
BoroughList = cell_texts[1::3]
NeighborhoodList = cell_texts[2::3]

dataDic = { "PostalCode":PostalCodeList, "Borough":BoroughList, "Neighborhood": NeighborhoodList }

In [100]:
# Assemble the column lists into a frame and print a preview of the first rows.
df = pd.DataFrame(data=dataDic)
print(df.head())

            Borough      Neighborhood PostalCode
0      Not assigned      Not assigned        M1A
1      Not assigned      Not assigned        M2A
2        North York         Parkwoods        M3A
3        North York  Victoria Village        M4A
4  Downtown Toronto      Harbourfront        M5A


# Remove rows whose Borough is 'Not assigned'


In [101]:

# Keep only the rows with a real borough, then renumber the index from 0.
has_borough = df['Borough'] != 'Not assigned'
df = df.loc[has_borough].reset_index(drop=True)
print(df.head())


            Borough      Neighborhood PostalCode
0        North York         Parkwoods        M3A
1        North York  Victoria Village        M4A
2  Downtown Toronto      Harbourfront        M5A
3  Downtown Toronto       Regent Park        M5A
4        North York  Lawrence Heights        M6A


The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood

In [102]:
# Display the first five rows of the frame.
df.head(n=5)

Unnamed: 0,Borough,Neighborhood,PostalCode
0,North York,Parkwoods,M3A
1,North York,Victoria Village,M4A
2,Downtown Toronto,Harbourfront,M5A
3,Downtown Toronto,Regent Park,M5A
4,North York,Lawrence Heights,M6A


Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

## Step 3: Set the Neighborhood to the Borough name when it is not assigned

In [103]:
# Per-column reducers applied to each postal-code group: keep the first postal
# code and borough, and merge all neighbourhood names into one comma-separated string.
aggregate_fun = {
    'PostalCode': 'first',
    "Borough": 'first',
    "Neighborhood": lambda names: ','.join(names),
}

# Collapse rows sharing a postal code into a single row, then drop the
# group-key index so the result is numbered from 0 again.
df_new = (
    df.groupby(df['PostalCode'])
      .aggregate(aggregate_fun)
      .reset_index(drop=True)
)


In [104]:
# Rows whose neighbourhood is still the 'Not assigned' placeholder take the
# borough name instead. The write goes through df_new.loc so it lands in the
# frame itself: the Series yielded by iterrows() may be a copy, in which case
# assigning to it leaves df_new unchanged.
not_assigned = df_new['Neighborhood'] == 'Not assigned'
df_new.loc[not_assigned, 'Neighborhood'] = df_new.loc[not_assigned, 'Borough']

In [105]:
# Spot-check a single row by position; in the recorded run this row is
# postal code M7A (Queen's Park).
row_position = 85
print(df_new.iloc[row_position])

PostalCode               M7A
Borough         Queen's Park
Neighborhood    Queen's Park
Name: 85, dtype: object


Shape (rows, columns) of the final postal-code dataframe

In [106]:
# Report the size of the aggregated frame as (rows, columns).
n_rows, n_cols = df_new.shape
(n_rows, n_cols)


(103, 3)