## Capstone Project Notebook


## Scraping the Wikipedia page and forming the dataframe

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import StringIO

# Download the Wikipedia list of Canadian postal codes starting with "M" and
# load the first HTML table on the page into a dataframe.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of silently
# parsing an error page.
res = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0]
# read_html is given a file-like object: passing the raw HTML string directly
# is deprecated in newer pandas releases, while StringIO works on old and new.
df = pd.read_html(StringIO(str(table)), header = 0)[0]


## Cleaning the dataframe

In [2]:
# Keep only the rows whose borough has actually been assigned
df = df.loc[df['Borough'].ne('Not assigned')]


In [3]:
# Combining neighbourhoods with same postal code
# Using comma to separate multiple neighbourhoods with the same postal code.
# Duplicates are removed with Series.unique(), which keeps the order in which
# the neighbourhoods first appear; joining a set() instead produced an
# arbitrary order that could differ from one kernel run to the next.
df = df.groupby(['Postcode','Borough'],as_index=False,sort = False).agg(lambda x:', '.join(x.unique()))


In [6]:
# Replacing 'Not assigned' neighbourhoods with their respective boroughs.
# The assignment goes through df.loc on the frame itself: an in-place replace
# on a `df.loc[:, col]` selection operates on an intermediate object and is
# not guaranteed to write back to df.
unassigned = df['Neighbourhood'] == 'Not assigned'
df.loc[unassigned, 'Neighbourhood'] = df.loc[unassigned, 'Borough']
# Show a preview rather than the whole frame.
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [5]:
# Shape of the cleaned dataframe, displayed as a (rows, columns) tuple
df.shape

(103, 3)

## Building a dataframe from the CSV file of geographical coordinates

In [7]:
# Load the coordinate table, then relabel its columns so that the postal-code
# column carries the same name ('Postcode') as in the scraped dataframe.
lat_long = pd.read_csv('http://cocl.us/Geospatial_data')
lat_long.columns = [
    'Postcode',
    'Latitude',
    'Longitude',
]


In [8]:
# Join the coordinates onto the cleaned postal-code table via the shared
# 'Postcode' column, giving one frame with postal code, borough,
# neighbourhood, latitude and longitude.
merged = df.merge(lat_long, on='Postcode')
merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [9]:
# (rows, columns) of the merged dataframe
merged.shape

(103, 5)

## Importing the libraries needed for exploration and clustering

In [10]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# None removes the display limit, so every column / every row is rendered
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if geopy is not installed yet
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle HTTP requests
# NOTE(review): newer pandas releases expose this as pandas.json_normalize — confirm against the installed version
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means for the clustering stage
from sklearn.cluster import KMeans

# Shell command: installs folium pinned to version 0.5.0 from the conda-forge channel
!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

# Printed once every import above has succeeded
print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  56.94 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  36.05 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  39.29 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  46.16 MB/s
Libraries imported.


In [11]:
# Report the number of distinct boroughs and the number of rows in the merged frame
borough_count = len(merged['Borough'].unique())
row_count = merged.shape[0]
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count))

The dataframe has 11 boroughs and 103 neighborhoods.


#### Creating a map of Toronto with neighborhoods superimposed on top.

In [12]:
# Getting geographical coordinates of Toronto
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail here with a
# clear message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [13]:
# Base map centred on the geocoded Toronto coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Appearance shared by every marker
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle marker per row of `merged`, with a "<neighbourhood>, <borough>" popup
for lat, lng, borough, neighbourhood in zip(merged['Latitude'], merged['Longitude'], merged['Borough'], merged['Neighbourhood']):
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    folium.CircleMarker([lat, lng], popup=label, **marker_style).add_to(map_toronto)

map_toronto

Utilizing the Foursquare API to explore the neighborhoods and segment them.

#### Define Foursquare Credentials and Version

In [14]:
# The code was removed by Watson Studio for sharing.

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET: <redacted>


#### Explore the Neighborhoods in Toronto, Working Only With the Boroughs Whose Name Contains "Toronto"

In [15]:
# Restrict the merged data to boroughs whose name contains 'Toronto',
# renumbering the rows from zero
in_toronto = merged['Borough'].str.contains('Toronto')
toronto_data = merged.loc[in_toronto].reset_index(drop=True)
toronto_data.head()



Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
