In [4]:
'''
Capstone project: segmentation and clustering of neighbourhoods in Toronto.
'''
# One-time environment setup; uncomment on a fresh environment.
#!pip install wikipedia
#!conda install -c conda-forge folium=0.5.0 --yes
#!conda install -c conda-forge geopy --yes
#!pip install beautifulsoup4
#!pip install lxml

import pandas as pd
# Disable truncation so displayed frames show every column and row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import numpy as np
import wikipedia as wp

import json # library to handle JSON files
import requests # library to handle requests
# json_normalize is exposed at the pandas top level since pandas 1.0 and the
# pandas.io.json path is deprecated; prefer the new location and fall back to
# the legacy one so older installs keep working.
try:
    from pandas import json_normalize # transform JSON file into a pandas dataframe
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#Map rendering
import folium

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
from bs4 import BeautifulSoup
from IPython.display import display_html
print("Imported Libraries")

from io import StringIO  # local import: only this cell needs it

# Wikipedia page listing the Canadian postal codes that start with "M".
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Use a timeout so a stalled connection cannot hang the notebook, and raise on
# HTTP errors so an error page is never parsed as if it were the data.
response = requests.get(URL, timeout=30)
response.raise_for_status()
wiki_source = response.text
soup = BeautifulSoup(wiki_source,'lxml')

# The first <table> element of the page is taken as the postal-code table.
wiki_table = str(soup.table)

#display_html(wiki_table,raw=True)

# Wrap the markup in StringIO: recent pandas deprecates passing literal HTML
# strings to read_html, while file-like objects are accepted by all versions.
wiki_df = pd.read_html(StringIO(wiki_table))[0]

# Drop postal codes that have no borough assigned.
df = wiki_df[wiki_df['Borough'] != 'Not assigned']

# One row per (Postcode, Borough); neighbourhood names are comma-joined.
# sort=False keeps the order in which postcodes appear in the table.
df_group = (
    df.groupby(['Postcode','Borough'],sort=False)
      .agg(','.join)
      .reset_index()
)

# Where the neighbourhood is 'Not assigned', fall back to the borough name.
# This must be written through .loc: iterrows() yields a copy of each row, so
# assigning to `row[...]` inside a loop never reaches df_group.
unassigned = df_group['Neighbourhood'] == 'Not assigned'
df_group.loc[unassigned, 'Neighbourhood'] = df_group.loc[unassigned, 'Borough']
print('CSV Grouping Completed')

Collecting wikipedia
  Downloading https://files.pythonhosted.org/packages/67/35/25e68fbc99e672127cc6fbb14b8ec1ba3dfef035bf1e4c90f78f24a80b7d/wikipedia-1.4.0.tar.gz
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/dsxuser/.cache/pip/wheels/87/2a/18/4e471fd96d12114d16fe4a446d00c3b38fb9efcb744bd31f4a
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    altair-4.0.1               |             py_0         575 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0     

In [5]:
# Sanity check: (rows, columns) of the grouped postcode table.
df_group.shape

(103, 3)

In [6]:
# Preview only the first rows: display.max_rows is set to None in this
# notebook, so a bare `df_group` would render the entire table.
df_group.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


In [10]:
# Question 2: get latitude/longitude values for every postal code.
# (A local copy, e.g. 'Geospatial_Coordinates.csv', can be read instead of the URL.)
df_group_lat_long = (
    pd.read_csv('https://cocl.us/Geospatial_data')
      # Align the key column name with the grouped postcode table.
      .rename(columns={'Postal Code': 'Postcode'})
)
df_group_lat_long.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# Attach borough/neighbourhood names to the coordinates, matching rows on
# the shared 'Postcode' column (default inner join).
df_data_merge = df_group_lat_long.merge(df_group, on='Postcode')
df_data_merge.head()

Unnamed: 0,Postcode,Latitude,Longitude,Borough,Neighbourhood
0,M1B,43.806686,-79.194353,Scarborough,"Rouge,Malvern"
1,M1C,43.784535,-79.160497,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


In [13]:
# Q3: clustering Toronto neighbourhoods.
# Keep only rows whose borough name contains the literal text 'Toronto'.
is_toronto_borough = df_data_merge['Borough'].str.contains('Toronto', regex=False)
df_toronto = df_data_merge[is_toronto_borough]
df_toronto.head()

Unnamed: 0,Postcode,Latitude,Longitude,Borough,Neighbourhood
37,M4E,43.676357,-79.293031,East Toronto,The Beaches
41,M4K,43.679557,-79.352188,East Toronto,"The Danforth West,Riverdale"
42,M4L,43.668999,-79.315572,East Toronto,"The Beaches West,India Bazaar"
43,M4M,43.659526,-79.340923,East Toronto,Studio District
44,M4N,43.72802,-79.38879,Central Toronto,Lawrence Park


In [15]:
# Base map centred on the hard-coded coordinates below.
map_toronto = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

# Draw one circle marker per row, with a "neighbourhood, borough" popup.
marker_rows = df_toronto[['Latitude', 'Longitude', 'Borough', 'Neighbourhood']]
for latitude, longitude, borough_name, neighbourhood_name in marker_rows.itertuples(index=False, name=None):
    popup_text = '{}, {}'.format(neighbourhood_name, borough_name)
    marker = folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)
map_toronto

In [18]:
# Creating clusters using K-means: choose the cluster count and the features.
k_clusters = 5

# Select the coordinate columns explicitly instead of dropping the others:
# - `drop(labels, 1)` relies on the positional `axis` argument, which newer
#   pandas versions no longer accept;
# - dropping by name would let the 'Cluster Labels' column leak into the
#   features if this cell is re-run after the labels were inserted.
toronto_clusters = df_toronto[['Latitude', 'Longitude']]

# Preview only; display.max_rows is unlimited in this notebook.
toronto_clusters.head(10)

Unnamed: 0,Latitude,Longitude
37,43.676357,-79.293031
41,43.679557,-79.352188
42,43.668999,-79.315572
43,43.659526,-79.340923
44,43.72802,-79.38879
45,43.712751,-79.390197
46,43.715383,-79.405678
47,43.704324,-79.38879
48,43.689574,-79.38316
49,43.686412,-79.400049


In [20]:
# run k-means clustering
# n_init is pinned to 10 (the historical default) so results do not change
# with the scikit-learn version: newer releases switch the default to 'auto'.
# random_state fixes the centroid initialisation for reproducibility.
kmeans = KMeans(n_clusters=k_clusters, n_init=10, random_state=0).fit(toronto_clusters)

# check cluster labels generated for each row in the dataframe
kmeans.labels_


array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       1, 1, 3, 3, 3, 4, 4, 4, 3, 2, 3, 3, 2, 2, 2, 4, 0], dtype=int32)

In [21]:
# Put the k-means label of each row in the first column.
# DataFrame.insert raises ValueError when the column already exists, so any
# previous 'Cluster Labels' column is removed first; this keeps the cell
# re-runnable. The .copy() makes df_toronto an independent frame before it is
# mutated in place.
df_toronto = df_toronto.loc[:, df_toronto.columns != 'Cluster Labels'].copy()
df_toronto.insert(0, 'Cluster Labels', kmeans.labels_)

In [22]:
# Show the first rows of df_toronto.
df_toronto.head()

Unnamed: 0,Cluster Labels,Postcode,Latitude,Longitude,Borough,Neighbourhood
37,0,M4E,43.676357,-79.293031,East Toronto,The Beaches
41,0,M4K,43.679557,-79.352188,East Toronto,"The Danforth West,Riverdale"
42,0,M4L,43.668999,-79.315572,East Toronto,"The Beaches West,India Bazaar"
43,0,M4M,43.659526,-79.340923,East Toronto,Studio District
44,1,M4N,43.72802,-79.38879,Central Toronto,Lawrence Park


In [26]:
# create map
map_clusters = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

# set color scheme for the clusters: one hex colour per cluster, evenly
# spaced along matplotlib's rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, k_clusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighbourhood'], df_toronto['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels are 0-based, so index the palette directly; the previous
    # `cluster-1` only worked because index -1 wraps to the last colour.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters