# SEGMENTING NEIGHBORHOODS IN TORONTO

## 1. Importing libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner
from bs4 import BeautifulSoup
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans


print('Libraries imported.')

Libraries imported.


## 2.Importing the Data from Wikipedia, and creating the dataframe that will store the data

In [4]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source,'lxml')

In [5]:
neighborhoods = pd.DataFrame(columns=['PostalCode','Borough','Neighborhood'])

## 3.Storing the data on the dataframe

In [6]:
sopa = soup.findAll('table',class_='wikitable sortable')
i = -1

for div in sopa:
    rows = div.findAll('tr')
    for row in rows:
        if i>-1:
            ro = row.findAll('td')
            j=0
            tds = []
            for r in ro:
                tds.insert(j,r.text.strip())
                j+=1
            neighborhoods.loc[i]=tds
        i+=1

In [7]:
neighborhoods.shape

(288, 3)

In [8]:
neighborhoods.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## 4. Deleting the cells (rows) that are not assigned to a Borough

In [9]:
j=0
tds=[]
for i in neighborhoods.index:
    if neighborhoods.loc[i,'Borough']=="Not assigned":
        tds.insert(j,i)
        j+=1
neighborhoods = neighborhoods.drop(tds)
neighborhoods = neighborhoods.reset_index(drop=True)

In [10]:
neighborhoods.shape

(211, 3)

In [11]:
neighborhoods.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


## 5. If do not have a neighborhood assigned, we'll assign the same borough

In [12]:
for i in neighborhoods.index:
    if neighborhoods.loc[i,'Neighborhood']=="Not assigned":
        neighborhoods.loc[i,'Neighborhood'] = neighborhoods.loc[i,'Borough']

In [13]:
neighborhoods.shape

(211, 3)

In [14]:
neighborhoods.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


## 6. Grouping under a single postal code. Changing principal dataframe (Neighborhoods to Data)

In [15]:
data = neighborhoods.copy()

In [16]:
for i in neighborhoods.index:
    tds = []
    sub = []
    for j in neighborhoods.index:
        n=0
        if neighborhoods.loc[i,'PostalCode']==neighborhoods.loc[j,'PostalCode']:
            tds.insert(n,j)
            sub.insert(n,neighborhoods.loc[j,'Neighborhood'])
            n+=1
    seperator = ', '
    data.loc[i,'Neighborhood'] = seperator.join(sub)

In [17]:
data = data.drop_duplicates(subset='PostalCode', keep='first')

In [18]:
data = data.reset_index(drop=True)

In [22]:
data.shape

(103, 3)

In [23]:
data

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


## 7. Importing the csv file (with Long and Lat Information)

In [25]:
import io

url = "https://cocl.us/Geospatial_data"
s = requests.get(url).content
ds = pd.read_csv(io.StringIO(s.decode('utf-8')))
ds.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## 8. Adding Latitude and Longitude Information to the Dataframe

In [26]:
for i in data.index:
    for j in ds.index:
        if data.loc[i,'PostalCode']==ds.loc[j,'Postal Code']:
            data.loc[i,'Latitude'] = ds.loc[j,'Latitude']
            data.loc[i,'Longitude'] = ds.loc[j,'Longitude']

In [27]:
data.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [28]:
data.shape

(103, 5)

## 9. Importing libraries for map

In [29]:
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  51.73 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  35.91 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  38.65 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  48.16 MB/s


## 10. Use geopy library to get the latitude and longitude values of Toronto

In [32]:
address = 'Toronto, TO'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6524203, -79.3834045.


## 11. Create a map of Toronto with neighborhoods superimposed on top

In [34]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(data['Latitude'], data['Longitude'], data['Borough'], data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto