# Coursera IBM Data Science Professional Specialization Capstone Project

## Description
This notebook will be used to complete the final capstone project.

In [5]:
import pandas as pd
import numpy as np

In [2]:
# Smoke test: confirm the kernel executes cells
greeting = 'Hello Capstone Project Course!'
print(greeting)

Hello Capstone Project Course!


## Scrape Wikipedia for Toronto neighbourhoods

In [None]:
# Install BeautifulSoup (package name: beautifulsoup4) from the anaconda channel.
# --yes answers conda's confirmation prompt so the cell does not block on input.
!conda install -c anaconda beautifulsoup4 --yes

In [1]:
from bs4 import BeautifulSoup
import requests

# Download the Wikipedia page that lists Toronto's neighbourhoods.
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of parsing an error page.
r = requests.get("https://en.wikipedia.org/wiki/List_of_neighbourhoods_in_Toronto", timeout=30)
r.raise_for_status()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results could differ between machines.
soup = BeautifulSoup(r.content, 'html.parser')

In [3]:
# Sanity check: the page <title> should name the Toronto neighbourhood list
page_title = soup.title
print(page_title)

<title>List of neighbourhoods in Toronto - Wikipedia</title>


In [93]:
# Get coordinates for a neighbourhood
# Returns a LatLng array
def get_coordinates(neighbourhood):
    """Look up a neighbourhood's coordinates on its own Wikipedia article.

    Parameters
    ----------
    neighbourhood : bs4 element (or any object with an ``a`` attribute)
        The list item for the neighbourhood; its first ``<a>`` must carry the
        ``href`` of the neighbourhood's article.

    Returns
    -------
    list
        ``[latitude, longitude]`` holding the text of the first
        ``<span class="latitude">`` / ``<span class="longitude">`` found on
        the article, with ``None`` for whichever is absent. ``[None, None]``
        is returned without any request when the item has no usable link.
    """
    from urllib.parse import urljoin

    base_uri = 'https://en.wikipedia.org'

    # Some list items may have no link at all; there is no page to look up then.
    link = neighbourhood.a
    if link is None or not link.get('href'):
        return [None, None]

    # urljoin resolves site-relative hrefs against base_uri and leaves
    # already-absolute hrefs intact (plain concatenation would corrupt them).
    r_neigh = requests.get(urljoin(base_uri, link['href']), timeout=30)
    soup_neigh = BeautifulSoup(r_neigh.content, 'html.parser')

    def _span_text(css_class):
        # find() returns None when the span is missing; report that as None
        # rather than hiding every possible error behind a bare except.
        span = soup_neigh.find('span', class_=css_class)
        return span.get_text() if span is not None else None

    return [_span_text('latitude'), _span_text('longitude')]

# Build dataframe
column_names = ['District', 'Neighbourhood', 'Latitude', 'Longitude']

# Collect one record per neighbourhood and build the frame once at the end.
# Growing a DataFrame row by row copies it on every iteration, and
# DataFrame.append no longer exists in pandas 2.0+.
records = []

# Iterate through districts: each district is introduced by an <h3> heading
for district in soup.find_all('h3'):
    # Get district name
    heading = district.find('span', class_='mw-headline')
    if heading is None:
        # The first <h3> without a headline span ends the scan
        break
    district_name = heading.get_text()
    print('Getting data for {}'.format(district_name))

    # Get neighbourhoods in district: the <li> items of the table inside the
    # next class-less <div> sibling that follows the heading
    for neighbourhood in district.find_next_sibling('div', class_=None).table.find_all('li'):
        lat_lng = get_coordinates(neighbourhood)
        neighbourhood_name = neighbourhood.get_text()
        records.append({'District': district_name,
                        'Neighbourhood': neighbourhood_name,
                        'Latitude': lat_lng[0],
                        'Longitude': lat_lng[1]})

# columns= fixes the column order and still yields the right (empty) frame
# when nothing was scraped
neigh = pd.DataFrame(records, columns=column_names)

neigh

Getting data for Old Toronto
Getting data for East York
Getting data for Etobicoke
Getting data for North York
Getting data for Scarborough
Getting data for York


Unnamed: 0,District,Neighbourhood,Latitude,Longitude
0,Old Toronto,Alexandra Park,43°39′N,79°24′W
1,Old Toronto,The Annex,43°40′12″N,79°24′14″W
2,Old Toronto,Baldwin Village,43°39′22″N,79°23′36″W
3,Old Toronto,Cabbagetown,43°39′59″N,79°21′46″W
4,Old Toronto,CityPlace,43°38′24″N,79°23′43″W
5,Old Toronto,Chinatown,43°39′10″N,79°23′53″W
6,Old Toronto,Church and Wellesley,43°39′56.50″N,79°22′51.44″W
7,Old Toronto,Corktown,43°39′20″N,79°21′35″W
8,Old Toronto,Discovery District,43°39′29″N,79°23′17″W
9,Old Toronto,Distillery District,,


In [101]:
# Persist the scraped table to disk as CSV (the row index is written too)
raw_csv_path = 'toronto_neigh.csv'
neigh.to_csv(raw_csv_path)

In [95]:
# Report the number of distinct districts and the number of neighbourhood rows
n_districts = len(neigh['District'].unique())
n_neighbourhoods = neigh.shape[0]
print('The dataframe has {} districts and {} neighbourhoods.'.format(n_districts, n_neighbourhoods))

The dataframe has 6 districts and 212 neighbourhoods.


In [96]:
# Install geopy (imported further down for the Nominatim geocoder) from the
# conda-forge channel; --yes answers conda's confirmation prompt.
!conda install -c conda-forge geopy --yes

Collecting package metadata: ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\Desmond\Anaconda3

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.3.9           |           py37_0         149 KB  conda-forge
    conda-4.6.14               |           py37_0         2.1 MB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    geopy-1.19.0               |             py_0          53 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.3 MB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.49-py_0
  geopy              conda-forge/noarch::geopy-1.19.0-py_0

The following packages will be UPDATED:





  current version: 4.6.11
  latest version: 4.6.14

Please update conda by running

    $ conda update -n base -c defaults conda


'ET' is not recognized as an internal or external command,
operable program or batch file.


## Get coordinates for Toronto

In [2]:
from geopy.geocoders import Nominatim

In [3]:
# Geocode the city itself; its coordinates are used later to centre the map
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="tr_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; stop with a clear
# message rather than an AttributeError on the attribute reads below.
if location is None:
    raise ValueError('Could not geocode {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.653963, -79.387207.


## Wrangle data

In [82]:
# Reload the scraped table; the first CSV column holds the row index
df = pd.read_csv('toronto_neigh.csv', index_col=0)

# Convert latitude and longitude strings to decimal degrees.
# The slices rely on fixed character positions: characters 0-1 are the
# degrees, 3-4 the minutes, and the seconds (when present) run from
# character 6 up to, but excluding, the last two characters.
# The statements must stay in this order: once the 7-character rows have been
# replaced by numbers, .str yields NaN for them, so the second statement of
# each pair only matches rows that are still strings.

# Latitude, 7-character form: degrees and minutes only (no seconds)
df.loc[df['Latitude'].str.len() == 7, 'Latitude'] = (
    pd.to_numeric(df['Latitude'].str[0:2])
    + pd.to_numeric(df['Latitude'].str[3:5]) / 60
)

# Latitude, longer form: degrees, minutes and seconds
df.loc[df['Latitude'].str.len() > 7, 'Latitude'] = (
    pd.to_numeric(df['Latitude'].str[0:2])
    + pd.to_numeric(df['Latitude'].str[3:5]) / 60
    + np.nan_to_num(pd.to_numeric(df['Latitude'].str[6:-2])) / 3600
)

# Longitude, 7-character form: degrees and minutes only (no seconds).
# Every term is negated, i.e. each value is treated as a western longitude.
df.loc[df['Longitude'].str.len() == 7, 'Longitude'] = (
    - pd.to_numeric(df['Longitude'].str[0:2])
    - pd.to_numeric(df['Longitude'].str[3:5]) / 60
)

# Longitude, longer form: degrees, minutes and seconds (negated as above)
df.loc[df['Longitude'].str.len() > 7, 'Longitude'] = (
    - pd.to_numeric(df['Longitude'].str[0:2])
    - pd.to_numeric(df['Longitude'].str[3:5]) / 60
    - np.nan_to_num(pd.to_numeric(df['Longitude'].str[6:-2])) / 3600
)

df.head()

Unnamed: 0,District,Neighbourhood,Latitude,Longitude
0,Old Toronto,Alexandra Park,43.65,-79.4
1,Old Toronto,The Annex,43.67,-79.4039
2,Old Toronto,Baldwin Village,43.6561,-79.3933
3,Old Toronto,Cabbagetown,43.6664,-79.3628
4,Old Toronto,CityPlace,43.64,-79.3953


In [83]:
# List the rows whose latitude is still missing after the conversion
missing_lat = df['Latitude'].isnull()
df[missing_lat]

Unnamed: 0,District,Neighbourhood,Latitude,Longitude
9,Old Toronto,Distillery District,,
10,Old Toronto,The Entertainment District,,
19,Old Toronto,Little Japan[5],,
78,Old Toronto,Little Tibet,,
109,Etobicoke,Humberwood,,
193,Scarborough,Steeles,,


In [84]:
# Clean Neighbourhood name
# Row 19 carries a footnote marker ("Little Japan[5]"). Keep only the text
# before the first "[": unlike slicing off the last three characters, this is
# safe to re-run and does not depend on the marker's length.
# A single .loc[row, column] assignment writes to df itself; the chained form
# df.loc[19]['Neighbourhood'] = ... may assign to a temporary copy instead.
df.loc[19, 'Neighbourhood'] = df.loc[19, 'Neighbourhood'].partition('[')[0]

df.loc[19]

District          Old Toronto
Neighbourhood    Little Japan
Latitude                  NaN
Longitude                 NaN
Name: 19, dtype: object

In [85]:
# Get coordinates for missing coordinates
# Geocode every row that still has no latitude, using "<name>, Toronto, Canada"
for i, row in df[df['Latitude'].isnull()].iterrows():
    location = geolocator.geocode(row['Neighbourhood'] + ', Toronto, Canada')

    # geocode() returns None when nothing matches; such rows stay NaN
    if location is not None:
        # Write with a single .loc[row, column] call so the value lands in df
        # itself; df.loc[label]['col'] = ... may assign to a temporary copy.
        df.loc[i, 'Latitude'] = location.latitude
        df.loc[i, 'Longitude'] = location.longitude

In [86]:
# Show the rows the geocoder could not resolve (latitude still missing)
df.loc[df['Latitude'].isnull()]

Unnamed: 0,District,Neighbourhood,Latitude,Longitude
19,Old Toronto,Little Japan,,


In [87]:
# Drop the rows whose location cannot be found (row 19 in the recorded run).
# Selecting by missing coordinates instead of a hard-coded label keeps the cell
# safe to re-run (a second drop of label 19 would raise KeyError) and avoids
# the positional `axis` argument, which pandas 2.0+ no longer accepts.
df.dropna(subset=['Latitude', 'Longitude'], inplace=True)

In [88]:
# Display the dataframe after the coordinate conversion, geocoding and row drop
df

Unnamed: 0,District,Neighbourhood,Latitude,Longitude
0,Old Toronto,Alexandra Park,43.65,-79.4
1,Old Toronto,The Annex,43.67,-79.4039
2,Old Toronto,Baldwin Village,43.6561,-79.3933
3,Old Toronto,Cabbagetown,43.6664,-79.3628
4,Old Toronto,CityPlace,43.64,-79.3953
5,Old Toronto,Chinatown,43.6528,-79.3981
6,Old Toronto,Church and Wellesley,43.6657,-79.381
7,Old Toronto,Corktown,43.6556,-79.3597
8,Old Toronto,Discovery District,43.6581,-79.3881
9,Old Toronto,Distillery District,43.6503,-79.3595


In [89]:
# Final check: this should list no rows once every latitude is filled in
df[df['Latitude'].isna()]

Unnamed: 0,District,Neighbourhood,Latitude,Longitude


In [94]:
# Persist the cleaned table to disk as CSV (the row index is written too)
clean_csv_path = 'toronto_neigh_clean.csv'
df.to_csv(clean_csv_path)

## Create map

In [10]:
# Install folium (imported in the next cell to draw the map) from the
# conda-forge channel; -y answers conda's confirmation prompt.
!conda install -c conda-forge folium -y

Collecting package metadata: ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [93]:
import folium

# The stray `df.read_csv('')` call that used to open this cell is removed:
# DataFrame has no read_csv method, so it raised AttributeError before any map
# was built. `df` is already in memory from the wrangling cells above.

# Base map centred on the Toronto coordinates geocoded earlier.
# NOTE(review): the name `map` shadows Python's built-in map(); it is kept so
# that any other cell referring to `map` continues to work.
map = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers: one circle per neighbourhood, with a "<neighbourhood>, <district>" popup
for lat, lng, district, neighbourhood in zip(df['Latitude'], df['Longitude'], df['District'], df['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, district)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map)

# Last expression of the cell: renders the interactive map
map