# Neighborhoods in Toronto
Applied Data Science Capstone Week 3 Assignment

## 1. First Part of the Assignment: Scrape Wikipedia

#### Import the relevant libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.request import urlopen
from bs4 import BeautifulSoup
%matplotlib inline

#### Fetch the HTML content of the url and parse it with BeautifulSoup, find and reference the table with the codes

In [2]:
# Wikipedia list of Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Read the page inside a context manager so the HTTP response is closed
# as soon as the body has been downloaded (or the download fails).
with urlopen(url) as response:
    html = response.read()

soup = BeautifulSoup(html, 'lxml')

# The postal codes live in the first table carrying the "wikitable" class.
# Fail with an explicit message when the page contains no such table.
wikitables = soup.find_all('table', class_='wikitable')
if not wikitables:
    raise IndexError('No table with class "wikitable" found at {}'.format(url))
table = wikitables[0]

#### Fill the initial dataframe with all uncleansed table rows
Note: this iterates over the table rows three times (once per column). That is less efficient than a single loop, but more readable than one big `for` loop.

- Disabling the recursive search in `findChildren` (`recursive=False`) returns only the direct children of each row, not the anchors (**a**) nested inside them.
- `get_text()` strips the HTML tags
- `strip()` removes leading and trailing whitespace from the remaining string

In [3]:
def _cell_text(row, position):
    """Return the stripped text of the direct child of `row` at `position`."""
    return row.findChildren(recursive=False)[position].get_text().strip()


# One pass over the table rows per column; the position of each name in the
# list is the position of the matching cell inside a row.
df = pd.DataFrame({
    column: [_cell_text(row, position) for row in table.select('tr')]
    for position, column in enumerate(['PostalCode', 'Borough', 'Neighborhood'])
})

#### Drop the first row containing the header information

In [4]:
# Row 0 holds the scraped header cells, so keep every row from position 1 on.
df = df.iloc[1:, :]

#### Drop the 'Not assigned' boroughs:

In [5]:
# Keep only the rows whose borough is something other than 'Not assigned'.
has_borough = df['Borough'].ne('Not assigned')
df = df[has_borough]

#### Replace the Neighborhood with the Borough if it is not assigned

In [6]:
# Where the neighborhood is the placeholder 'Not assigned', fall back to the
# borough name. `Series.where` keeps the values for which the condition holds
# and takes the replacement elsewhere. `assign` returns a new frame, so nothing
# is written into a filtered slice, and the column-wise form also works when
# the frame has no rows.
neighborhood_known = df['Neighborhood'] != 'Not assigned'
df = df.assign(Neighborhood=df['Neighborhood'].where(neighborhood_known, df['Borough']))

#### Group By Postal Code and Borough and concatenate Neighborhood

In [7]:
# Collapse the rows that share a postal code and borough into one row whose
# Neighborhood is the comma-separated list of the individual names.
neighborhoods_by_code = df.groupby(['PostalCode', 'Borough'])['Neighborhood']
df = neighborhoods_by_code.agg(', '.join).reset_index()
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [8]:
# Dimensions (rows, columns) of the grouped frame.
df.shape

(103, 3)

## 2. Second part of the Assignment: Fetch Geo location data

#### Unable to fetch locations from geocoder... therefore I had to use the CSV

    !pip install geocoder
    import geocoder # import geocoder
    
    def fetchFromGeoCoder(pc, max_attempts=10):
        """Return the `latlng` value geocoder.google reports for a postal code.
        pc: postal code, formatted into the query '<pc>, Toronto, Ontario'.
        max_attempts: upper bound on the number of lookups before giving up.
        Raises RuntimeError when every attempt came back without coordinates.
        """
        # Bounded retry: an unbounded loop never terminates when the service
        # keeps answering without coordinates.
        for _ in range(max_attempts):
            g = geocoder.google('{}, Toronto, Ontario'.format(pc))
            lat_lng_coords = g.latlng
            if lat_lng_coords is not None:
                return lat_lng_coords
        raise RuntimeError(
            'No coordinates for {} after {} attempts'.format(pc, max_attempts))
    
    df['latlng'] = df.apply(lambda x: fetchFromGeoCoder(x['PostalCode']), axis=1)

#### Fetch the geospatial data

In [9]:
# CSV providing the coordinates that are later merged in by postal code.
geospatial_csv_url = 'http://cocl.us/Geospatial_data'
locations = pd.read_csv(geospatial_csv_url)

#### Rename the postal code column so it matches the merge key

In [10]:
# Align the key column name with the scraped frame ('PostalCode'). Rebinding
# the result instead of using inplace=True leaves the loaded object untouched.
locations = locations.rename(columns={'Postal Code': 'PostalCode'})

#### Merge the two data sets

In [11]:
# Left join: every row of the scraped frame is kept and gains the columns of
# `locations` for its postal code.
df = df.merge(locations, how='left', on='PostalCode')

In [12]:
# Preview the merged frame, which now includes the Latitude/Longitude columns.
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


## 3. Third part of the Assignment: Clustering the data

In [16]:
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 geopy --yes
import folium
from geopy.geocoders import Nominatim

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
folium                    0.5.0                      py_0    conda-forge
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.18.1-py_0 conda-forge

geographiclib- 100% |################################| Time: 0:00:00  21.96 MB/s
geopy-1.18.1-p 100% |################################| Time: 0:00:00  31.02 MB/s


In [17]:
# Distinct borough names present in the merged frame.
df['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       "Queen's Park", 'Mississauga', 'Etobicoke'], dtype=object)

#### Fetch the center of Toronto

In [20]:
geolocator = Nominatim(user_agent="ds-learning")
location = geolocator.geocode('Toronto, Ontario')
# geocode() yields None when the service finds no match. Fail here with a
# clear message rather than with an AttributeError on the attribute access.
if location is None:
    raise ValueError("Nominatim found no match for 'Toronto, Ontario'")
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


#### Display the map of Toronto

In [28]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row, with a popup reading
# "Borough: Neighborhood (PostalCode)".
marker_fields = zip(df['Latitude'], df['Longitude'], df['Borough'],
                    df['Neighborhood'], df['PostalCode'])
for lat, lng, borough, neighborhood, postal_code in marker_fields:
    popup_text = '{}: {} ({})'.format(borough, neighborhood, postal_code)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

#### Subselection of Boroughs containing Toronto in their names

In [32]:
# Rows whose borough name contains the substring 'Toronto', re-indexed from 0.
in_toronto_borough = df['Borough'].str.contains('Toronto')
toronto_data = df[in_toronto_borough].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [36]:
# Closer view (zoom 12) of the same centre point for the Toronto-only subset.
map_toronto_center = folium.Map(location=[latitude, longitude], zoom_start=12)

# One circle marker per row of the subset, with the neighborhood as popup text.
subset_fields = zip(toronto_data['Latitude'], toronto_data['Longitude'],
                    toronto_data['Neighborhood'])
for lat, lng, neighborhood in subset_fields:
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(neighborhood, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto_center)

map_toronto_center

In [43]:
# The code was removed by Watson Studio for sharing.