# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto
## Part 1: Scraping data from a Wikipedia page

In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd

In [2]:
# Download the Wikipedia page that lists the postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error response instead of
# letting an error page be parsed as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text

In [3]:
# Parse the downloaded page and locate its first <table> element.
# The page is HTML, so an HTML parser is used here; 'xml' selects an XML
# parser, which is intended for XML documents. 'html.parser' is the parser
# bundled with Python, so no extra package is needed.
soup = BeautifulSoup(source, 'html.parser')
table = soup.find('table')

# find() returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError in a later cell.
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

In [4]:
# Start from an empty dataframe with the three target columns:
# PostalCode, Borough and Neighborhood.
columns_name = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]
df = pd.DataFrame(data=None, columns=columns_name)

In [5]:
# Read PostalCode, Borough and Neighborhood from every table row.
# The rows are collected in a plain list and the dataframe is built once at the
# end. Appending with df.loc[len(df)] made this cell depend on the current
# contents of `df`: running it twice duplicated rows, and running it after `df`
# had been filtered could overwrite existing index labels.
records = []
for tr_cell in table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    # Rows with fewer than three <td> cells (for example a row made only of
    # <th> header cells) are skipped; only the first three cells are kept.
    if len(row_data) >= 3:
        records.append(row_data[:3])

df = pd.DataFrame(records, columns=columns_name)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [6]:
# Keep only the rows that have an assigned borough.
df = df[df['Borough'] != 'Not assigned']

# Where the neighborhood is 'Not assigned', use the borough name instead.
# This is done row by row *before* grouping, so that a placeholder sharing a
# postal code with real neighborhoods cannot end up inside the joined string
# (e.g. "Foo, Not assigned").
neighborhood_filled = df['Neighborhood'].where(
    df['Neighborhood'] != 'Not assigned', df['Borough']
)

# Merge the neighborhoods that share a postal code and borough into a single
# comma-separated row. sort=False keeps groups in order of first appearance.
df_group = (
    df.assign(Neighborhood=neighborhood_filled)
      .groupby(['PostalCode', 'Borough'], sort=False)['Neighborhood']
      .agg(', '.join)
      .reset_index()
)

df_group.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


In [7]:
# Show the (rows, columns) dimensions of the grouped dataframe
df_group.shape

(103, 3)

## Part 2: Getting the latitude and longitude coordinates of each neighborhood

In [8]:
# Load the coordinates table from the provided CSV url
geospatial_csv = 'http://cocl.us/Geospatial_data'
coord = pd.read_csv(geospatial_csv)
coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
# Give the key column the same name as in df_group so the frames can be joined
coord = coord.rename(columns={'Postal Code': 'PostalCode'})

# Join the grouped neighborhoods with their coordinates on the postal code
df_can = df_group.merge(coord, on='PostalCode')

df_can.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


## Part 3: Explore and cluster neighborhoods in Toronto

In [10]:
# Keep only the rows whose borough name contains 'Toronto'
in_toronto = df_can['Borough'].str.contains('Toronto')
df_toronto = df_can[in_toronto]

df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [11]:
#!conda install -c conda-forge folium=0.5.0 --yes (Note: Uncomment this line if folium is not already installed)
import folium

In [12]:
# Centre the map on the mean latitude/longitude of the Toronto rows
lat_mean = df_toronto['Latitude'].mean()
lng_mean = df_toronto['Longitude'].mean()
map_toronto = folium.Map(location=[lat_mean, lng_mean], zoom_start=10)

# Draw one circle marker per row, with a "neighborhood, borough" popup
marker_rows = zip(
    df_toronto['Latitude'],
    df_toronto['Longitude'],
    df_toronto['Borough'],
    df_toronto['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    popup_text = f'{neighborhood}, {borough}'
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        # NOTE(review): parse_html is used as a Popup option above; it looks
        # like it has no effect on CircleMarker - confirm and remove.
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

In [13]:
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [14]:
# Number of clusters (centroids) for k-means
k = 5

# Cluster on the coordinates only. The two columns are selected by name
# instead of dropping the others: this keeps 'Cluster Labels' out of the
# features when the cell is re-run, and avoids passing `axis` positionally to
# drop(), which pandas 2.x no longer accepts.
toronto_clustering = df_toronto[['Latitude', 'Longitude']]

# Fit the k-means model. random_state fixes the initialisation; n_init is set
# explicitly so the result does not depend on the scikit-learn version's default.
kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(toronto_clustering)

# Attach the labels as the first column. A 'Cluster Labels' column left over
# from a previous run is removed first, because insert() raises if the column
# already exists. drop() also returns a new frame, so the insert below does not
# touch the frame df_toronto was filtered from.
df_toronto = df_toronto.drop(columns='Cluster Labels', errors='ignore')
df_toronto.insert(0, 'Cluster Labels', kmeans.labels_)

df_toronto.head()

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
4,0,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
9,0,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
15,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [15]:
# Centre the cluster map on the mean latitude/longitude of the Toronto rows
lat_mean = df_toronto['Latitude'].mean()
lng_mean = df_toronto['Longitude'].mean()

map_clusters = folium.Map(location=[lat_mean, lng_mean], zoom_start=10)

# Build one colour per cluster: k evenly spaced samples of the rainbow
# colormap, converted to hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add one marker per row, coloured by its cluster label
for lat, lng, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to k-1, so they index the palette directly. The earlier
    # `cluster-1` sent label 0 to rainbow[-1], relying on negative indexing.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters