# Segmenting and Clustering Neighborhoods in Toronto

## Scraping the Wikipedia page with BeautifulSoup

Import all necessary libraries

In [13]:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request
import re

Read the whole page using the standard `urllib` Python library

In [7]:
# Wikipedia page listing the Canadian postal codes that start with "M".
POSTAL_CODES_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Download the raw page bytes. The `with` block closes the HTTP response as
# soon as the body has been read instead of leaving it open.
with urllib.request.urlopen(POSTAL_CODES_URL) as response:
    contents = response.read()

Select the most suitable cells

In [8]:
# Parse the downloaded page with the parser bundled with Python.
soup = BeautifulSoup(contents, features='html.parser')

# CSS selector: every <p> that is a direct child of a table cell (<td>).
cells = soup.select('td > p')

We can see that the postal code can be easily obtained from the HTML `b` tag in each cell; unfortunately, we can't say the same for the borough and the neighborhood. However, the neighborhood is always in parentheses, and Beautiful Soup can convert a tag to text, so we can write a regular expression that splits the text into those two components.

In [9]:
# Splits text of the form "Borough(Neighborhood ...)" into two groups:
#   group(1) -> everything before the opening parenthesis
#   group(2) -> the text between the parentheses
# The pattern is a raw string so that `\(` and `\)` reach the regex engine as
# escaped parentheses; in a normal string literal they are invalid escape
# sequences, which newer Python versions warn about.
span_parser = re.compile(r'(.*)\(+(.*)\)+')

And finally we can form the data for the dataframe

In [10]:
def _split_names(text):
    """Turn 'A / B' into 'A,B': split on '/', trim each part, re-join with commas."""
    return ','.join(part.strip() for part in text.split('/'))


# One dictionary per accepted table cell; consumed later to build the dataframe.
dataframe_data = []
for cell in cells:
    span = cell.span
    code_tag = cell.b
    # Skip cells that lack the expected <span>/<b> children instead of failing
    # with AttributeError on None.
    if span is None or code_tag is None:
        continue
    # Only cells whose <span> has no <i> child are kept.
    # NOTE(review): presumably the italic entries are the unassigned postal
    # codes -- confirm against the page.
    if span.i is not None:
        continue

    postal_code = code_tag.string
    span_text = span.getText()
    match = span_parser.match(span_text)
    if match is None:
        # No parenthesised part: the same text is used for both columns.
        borough = neighborhood = span_text
    else:
        borough = match.group(1)
        neighborhood = match.group(2)

    dataframe_data.append({
        'Postal Code': postal_code,
        'Borough': _split_names(borough),
        'Neighborhood': _split_names(neighborhood),
    })

In [11]:
# Build the table in one shot from the accumulated row dictionaries; each
# dictionary key becomes a column.
df = pd.DataFrame(data=dataframe_data)

# Show the first five rows as a quick sanity check.
df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park,Harbourfront"
3,M6A,North York,"Lawrence Manor,Lawrence Heights"
4,M7A,"Queen's Park,Ontario Provincial Government","Queen's Park,Ontario Provincial Government"


Print the number of rows of this dataframe.

In [12]:
# `.shape` is a (rows, columns) tuple for the dataframe.
df.shape

(103, 3)

In [None]:
# NOTE(review): this cell relies on names that are not defined in any cell shown
# above -- `w_df`, `KMeans`, `folium`, `np`, `cm` and `colors`. Confirm they are
# imported/defined before this cell runs on a fresh kernel.

# set number of clusters
kclusters = 3

# Keep only rows with fewer than 45 venues in total and at least one grocery
# store. `.copy()` keeps the result independent of `w_df`.
w_df_2 = w_df.loc[(w_df['total_venues'] < 45) & (w_df['Grocery Store'] > 0)].copy()
print(w_df_2.shape)

# Feature matrix for clustering: everything except the columns listed here.
# `columns=` is used instead of a positional axis argument, which pandas 2.0
# no longer accepts.
kl_clustering = w_df_2.drop(
    columns=['lat', 'lng', 'dist_from_cbd', 'sum_crimes', 'total_venues', 'Grocery Store']
)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(kl_clustering)

# add clustering labels
kl_merged = w_df_2.copy()
kl_merged["Cluster Labels"] = kmeans.labels_

# NOTE(review): the map centre has a southern latitude, which does not match
# the Toronto analysis in the earlier cells -- confirm this cell belongs here.
map_clusters = folium.Map(location=[-33.299276, 151.407568], zoom_start=8.4)

# set color scheme for the clusters: one evenly spaced rainbow colour each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(kl_merged['lat'], kl_merged['lng'], kl_merged.index, kl_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    # Labels are 0-based, so they index the colour list directly rather than
    # relying on `cluster - 1` wrapping around to the last colour for label 0.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters