### This notebook will be mainly used for IBM Coursera Data Science Professional Certification ###

In [3]:
import pandas as pd
import numpy as np

In [5]:
# Smoke-test cell: confirms the kernel is alive by emitting a greeting.
print("Hello Capstone Project Course!", end="\n")

Hello Capstone Project Course!


# Capstone Project #

## Segmenting and Clustering Neighbourhoods in Toronto ##

### Part 1: explore and cluster the neighbourhoods in Toronto ###

In [11]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
# Download the Wikipedia page listing Canadian postal codes starting with "M".
# A timeout keeps the cell from hanging forever, and raise_for_status() makes an
# HTTP error surface here instead of as a confusing parse failure further down.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
website_text = response.text

# The page is HTML, so use an HTML parser (the 'xml' parser is meant for XML
# documents and additionally requires lxml to be installed).
soup = BeautifulSoup(website_text, 'html.parser')

table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    # Fail with a clear message if the page layout no longer matches.
    raise ValueError("Could not find a 'wikitable sortable' table on the page")
table_rows = table.find_all('tr')

# One list of stripped cell texts per <tr>. Rows without <td> cells (e.g. the
# header row, which uses <th>) produce an empty list and therefore an all-null
# DataFrame row, which is filtered out below.
data = [[t.text.strip() for t in row.find_all('td')] for row in table_rows]

# pandas is imported as `pd` in this notebook; the bare name `pandas` is not bound.
df = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])
df = df[~df['PostalCode'].isnull()]
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
176,M5Z,Not assigned,Not assigned
177,M6Z,Not assigned,Not assigned
178,M7Z,Not assigned,Not assigned
179,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


#### To ignore the rows whose Borough is 'Not assigned', I will use the following code: ####

In [12]:
# Keep only the rows whose Borough is assigned, then renumber the index from 0.
df_toronto_drop_NA = df.loc[df['Borough'].ne('Not assigned')].reset_index(drop=True)

#### To combine neighbourhoods that share one postal code area into a single row, I will use the following code: ####

In [13]:
# Collapse rows sharing the same (PostalCode, Borough) pair into one row,
# concatenating the remaining string column values with a comma.
toronto_join = (
    df_toronto_drop_NA
    .groupby(by=['PostalCode', 'Borough'], as_index=False)
    .agg(','.join)
)

#### To replace Neighbourhood values that are 'Not assigned' with the name of their Borough, I will use the following code: ####

In [17]:
# Boolean mask of rows whose Neighbourhood is the literal "Not assigned".
df_mask = toronto_join['Neighbourhood'].eq("Not assigned")
# For those rows, take the Borough value instead; other rows are left as they are.
toronto_join['Neighbourhood'] = toronto_join['Neighbourhood'].mask(df_mask, toronto_join['Borough'])

toronto_join

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


#### Number of rows and columns of my dataframe: ####

In [22]:
# (rows, columns) of the cleaned postal-code table.
toronto_shape = toronto_join.shape

print("The data has Rows {}, Columns {}".format(toronto_shape[0], toronto_shape[1]))

The data has Rows 103, Columns 3


### Part 2: Get the latitude and the longitude coordinates of each neighbourhood ###

In [33]:
# Download the geographical coordinates of each postal code as a DataFrame.

Geospatial_Coordinates = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')

In [36]:
# Use the single-word column name "PostalCode" so it matches toronto_join
# (and is easier to reference later, e.g. from SQL).

Geospatial_Coordinates = Geospatial_Coordinates.rename(
    index=str,
    columns={"Postal Code": "PostalCode"},
)
# Inner join: keep only postal codes present in both tables.
Neighbourhood = toronto_join.merge(Geospatial_Coordinates, how='inner', on='PostalCode')
Neighbourhood

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


### Part 3: Explore and cluster the neighborhoods in Toronto ###

#### 3.1 Download the remaining dependencies that I need. ####

In [39]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip3 install folium
import folium # map rendering library

print('Libraries imported.')

Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 4.7 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1
Libraries imported.


#### 3.2 Get the latitude and longitude values of Toronto ####

In [40]:
# Look up the coordinates of Toronto through the Nominatim geocoding service.
address = "Toronto, ON"

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on `.latitude`.
if location is None:
    raise ValueError("Could not geocode address: {!r}".format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto city are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto city are 43.6534817, -79.3839347.


#### 3.3 Create a map of the whole Toronto City with neighborhoods superimposed on top. ####

In [46]:
# Base map centred on the geocoded Toronto coordinates (markers are added later).
toronto_map = folium.Map(
    zoom_start=10,
    location=[latitude, longitude],
)
toronto_map

The image above is a map of Toronto; you can see the PNG version of the map here: https://ibb.co/S6NJ9wS

_Disclaimer: According to GitHub, when you add Jupyter Notebook or IPython Notebook files with a .ipynb extension to GitHub, they render as static HTML files in your repository. The interactive features of the notebook, such as custom JavaScript plots, do not work in your repository on GitHub, which is why I can't show the folium maps on GitHub._

#### 3.4 Add markers to the map. ####

In [47]:
# Draw one circle marker per row of `Neighbourhood` on the city-wide map.
# Each popup reads "<Neighbourhood>, <Borough>".
for lat, lng, borough, neighborhood in zip(
        Neighbourhood['Latitude'],
        Neighbourhood['Longitude'],
        Neighbourhood['Borough'],
        Neighbourhood['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html=True makes the popup treat the label as text rather than markup.
    label = folium.Popup(label, parse_html=True)
    # Note: `parse_html` is a Popup option, not a CircleMarker/path option,
    # so it is intentionally not passed to CircleMarker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(toronto_map)

toronto_map

The image above is the map with a marker for every neighbourhood. You can see the PNG version of the map here: https://ibb.co/g4BPprk

_Disclaimer: According to GitHub, when you add Jupyter Notebook or IPython Notebook files with a .ipynb extension to GitHub, they render as static HTML files in your repository. The interactive features of the notebook, such as custom JavaScript plots, do not work in your repository on GitHub, which is why I can't show the folium maps on GitHub._

### Map of a part of Toronto City ###
I'm going to work with only the boroughs that contain the word "Toronto".

#### 3.5 Work with only the boroughs that contain the word "Toronto". ####

In [48]:
# "cend" = [C]entral Toronto, [E]ast Toronto, [N]orth Toronto, [D]owntown Toronto
# The filter is a substring match: every row whose Borough contains "Toronto" is kept.

df_toronto_cend = (
    Neighbourhood
    .loc[Neighbourhood['Borough'].str.contains("Toronto")]
    .reset_index(drop=True)
)
df_toronto_cend.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


#### 3.6 Plot again the map and the markers for this region. ####

In [49]:
# Fresh, more zoomed-in map centred on Toronto, showing only the rows of
# `df_toronto_cend` (boroughs whose name contains "Toronto").
map_toronto_cend = folium.Map(location=[latitude, longitude], zoom_start=12)
for lat, lng, borough, neighborhood in zip(
        df_toronto_cend['Latitude'],
        df_toronto_cend['Longitude'],
        df_toronto_cend['Borough'],
        df_toronto_cend['Neighbourhood']):
    # Popup text reads "<Neighbourhood>, <Borough>".
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # `parse_html` belongs to Popup only; it is not a CircleMarker option,
    # so it is not passed to the marker itself.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto_cend)

map_toronto_cend

The image above is the map with markers for CEND Toronto. You can see the PNG version of the map here: https://ibb.co/4MrPFVn

_Disclaimer: According to GitHub, when you add Jupyter Notebook or IPython Notebook files with a .ipynb extension to GitHub, they render as static HTML files in your repository. The interactive features of the notebook, such as custom JavaScript plots, do not work in your repository on GitHub, which is why I can't show the folium maps on GitHub._