# Segmenting and Clustering Neighborhoods in Toronto

In [54]:
import requests # library to handle requests
from bs4 import BeautifulSoup
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# library for transforming a JSON file into a pandas DataFrame
from pandas.io.json import json_normalize

import folium # plotting library


### 1. Web Scraping: Using Beautiful Soup to get the necessary data from the Wikipedia page on Toronto postal codes

In [55]:
# Download the Wikipedia postal-code list. The URL carries explicit
# `oldid`/`direction` parameters, i.e. it asks for one specific past revision.
url = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&direction=prev&oldid=926287641'
response = requests.get(url, timeout=30)  # never hang forever on a stalled connection
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
data = response.text
soup = BeautifulSoup(data, "html5lib")

In [56]:
#soup

In [57]:
# Grab the first <table> element of the parsed page.
table = soup.find('table')

In [58]:
# Spot-check the text of a single <td> cell to confirm the table was parsed.
# (find_all is the documented name; findAll is its legacy alias.)
table.find_all('td')[7].text

'North York'

In [130]:
# Turn the flat list of <td> cells into one record per postal code.
#
# A cell whose text is exactly three characters long is treated as a postal
# code; the two cells that follow it are read as its borough and neighborhood.
# When a postal code shows up more than once, its neighborhoods are joined
# with ', ' and the borough of the first occurrence is kept.
table = soup.find('table')
tabledata = table.find_all('td')

# Accumulate plain dicts and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and row-by-row growth is quadratic.
records = {}  # PostalCode -> record; dicts keep first-seen order
for row, cell in enumerate(tabledata):
    postal_code = cell.text
    if len(postal_code) != 3:
        continue
    if row + 2 >= len(tabledata):
        # Truncated trailing record: there is no borough/neighborhood to read.
        break

    neighborhood = tabledata[row + 2].text
    if postal_code in records:
        records[postal_code]['Neighborhood'] += ', ' + neighborhood
    else:
        records[postal_code] = {
            'PostalCode': postal_code,
            'Borough': tabledata[row + 1].text,
            'Neighborhood': neighborhood,
        }

table_contents = pd.DataFrame(
    list(records.values()),
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)
df = table_contents.copy()

### Data Wrangling

In [131]:
# Remove the newline characters carried over from the HTML cells, then turn
# every cell matching 'Not assigned' into NaN.
df = (
    df
    .replace('\n', '', regex=True)
    .replace('Not assigned', np.nan, regex=True)
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Harbourfront, Regent Park"


In [132]:
# Discard every row that still contains a missing value, then renumber the
# remaining rows from 0.
df = df.dropna().reset_index(drop=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M9A,Etobicoke,Islington Avenue


In [133]:
# Map the run-together borough strings onto short, readable borough labels.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}
df['Borough'] = df['Borough'].replace(borough_fixes)

In [134]:
# Report the size of the cleaned table as (rows, columns).
df.shape

(102, 3)

### 2. Getting the Coordinates for each Postal Code

In [124]:
import geocoder # import geocoder

In [None]:
# Try to geocode every postal code with the `geocoder` package.
#
# Lookups are retried a bounded number of times so the cell always terminates,
# and results go into their own frame (`geocoder_coords`). `df` is left
# untouched so the CSV-based merge further down still works on it.
MAX_GEOCODE_ATTEMPTS = 3  # stop retrying a postal code after this many failures

geocoded_rows = []
for postal_code in df['PostalCode'].values:
    lat_lng_coords = None
    for _ in range(MAX_GEOCODE_ATTEMPTS):
        result = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = result.latlng
        if lat_lng_coords:
            break

    if lat_lng_coords:
        latitude, longitude = lat_lng_coords[0], lat_lng_coords[1]
    else:
        # No usable answer (None or empty) after all attempts: record as missing.
        latitude, longitude = np.nan, np.nan

    geocoded_rows.append(
        {'PostalCode': postal_code, 'Latitude': latitude, 'Longitude': longitude}
    )

geocoder_coords = pd.DataFrame(
    geocoded_rows, columns=['PostalCode', 'Latitude', 'Longitude']
)

#### The geocoder lookup wasn't reliable, so the coordinates are read from a CSV file instead.

In [135]:
# Load the postal-code coordinates from the local CSV and rename its columns
# so the key column is called 'PostalCode', matching the scraped table.
coordinate_columns = ['PostalCode', 'Latitude', 'Longitude']
geo = pd.read_csv('Geospatial_Coordinates.csv')
geo.columns = coordinate_columns
geo.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merging data from different dataframes to get the required dataset.

In [136]:
# Left join on the postal code: every scraped row is kept, and indicator=True
# adds a '_merge' column recording where each row was found.
df = pd.merge(df, geo, how='left', on='PostalCode', indicator=True)

In [138]:
# Remove the '_merge' indicator column added by the merge.
# errors='ignore' keeps this cell re-runnable: a second run, after the column
# is already gone, no longer raises KeyError.
df = df.drop(columns='_merge', errors='ignore')
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242


In [140]:
# Display the merged table (pandas abbreviates long frames in the rendered output).
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.654260,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
...,...,...,...,...,...
97,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
98,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
99,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558
100,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So...",43.636258,-79.498509


### 3. Visualisation and Clustering

In [141]:
# Geocode the city name; the resulting point is used below as the map's
# initial location.
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="Toronto")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when the service returns no match.
    raise ValueError('Could not geocode {!r}'.format(address))

latit = location.latitude
longit = location.longitude
print('The geographical coordinates of {} are {}, {}.'.format(address, latit, longit))

The geograpical coordinate of Manhattan are 43.6534817, -79.3839347.


In [143]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latit, longit], zoom_start=11)

# The coordinates were attached with a left merge, so a postal code with no
# match has NaN latitude/longitude; such rows cannot be placed on the map and
# are skipped here.
located = df.dropna(subset=['Latitude', 'Longitude'])

# add one circle marker per postal code, labelled "<neighborhood>, <borough>"
for lat, lng, borough, neighborhood in zip(
    located['Latitude'], located['Longitude'], located['Borough'], located['Neighborhood']
):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        # NOTE(review): parse_html looks like a Popup option; presumably
        # CircleMarker ignores it — confirm before removing.
        parse_html=False,
    ).add_to(map_toronto)

map_toronto