## Segmenting and Clustering Neighborhoods in Toronto

### Task 1 : Web scraping and Data Wrangling

In [1]:
import pandas as pd
import numpy as np

import json
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# scraping
import requests # library to handle requests
from urllib.request import urlopen
import lxml.html as lh

# geocoders
from geopy.geocoders import Nominatim

# visualization libraries
import matplotlib.cm as cm
import matplotlib.colors as colors
#!conda install -c conda-forge folium=0.5.0
import folium  # map rendering library

# import k-means from clustering stage
from sklearn.cluster import KMeans

print('Libraries imported.')

Libraries imported.


#### Scraping Wikipedia page, creating Pandas Dataframe

In [2]:
# credits : https://towardsdatascience.com/web-scraping-html-tables-with-python-c9baba21059
# codes below allows us to get the Neigbourhod & Postal codes data of the HTML table

import lxml.html as lh
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
#Create a handle, page, to handle the contents of the website
page = requests.get(url)
#Store the contents of the website under doc
doc = lh.fromstring(page.content)
#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

# parse the first row as our header
tr_elements = doc.xpath('//tr')
#Create empty list
col=[]
i=0
#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print('%d: "%s"'%(i,name))
    col.append((name,[]))

1: "Postcode"
2: "Borough"
3: "Neighbourhood
"


In [3]:
#Since out first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 3, the //tr data is not table data we want 
    if len(T)!=3:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1
        
# check the length of each column. Ideally, they should all be the same.
[len(C) for (title,C) in col]

[287, 287, 287]

In [4]:
# create the DataFrame
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)
# remove newlines
df = df.replace('\n','', regex=True)

In [5]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Cleaning data

In [6]:
#dropping the "Not Assigned" borough
df = df[df.Borough != 'Not assigned']
df = df.sort_values(by=['Postcode','Borough'])

df.reset_index(inplace=True)
df.drop('index',axis=1,inplace=True)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Rouge
1,M1B,Scarborough,Malvern
2,M1C,Scarborough,Highland Creek
3,M1C,Scarborough,Rouge Hill
4,M1C,Scarborough,Port Union
5,M1E,Scarborough,Guildwood
6,M1E,Scarborough,Morningside
7,M1E,Scarborough,West Hill
8,M1G,Scarborough,Woburn
9,M1H,Scarborough,Cedarbrae


In [7]:
#Consolidating the neighbourhoods that share the postcode

df_postcodes = df['Postcode']
df_postcodes.drop_duplicates(inplace=True)
df1 = pd.DataFrame(df_postcodes)
df1['Borough'] = '';
df1['Neighbourhood'] = '';


df1.reset_index(inplace=True)
df1.drop('index', axis=1, inplace=True)
df.reset_index(inplace=True)
df.drop('index', axis=1, inplace=True)

for i in df1.index:
    for j in df.index:
        if df1.iloc[i, 0] == df.iloc[j, 0]:
            df1.iloc[i, 1] = df.iloc[j, 1]
            df1.iloc[i, 2] = df1.iloc[i, 2] + ', ' + df.iloc[j, 2]
            
for i in df1.index:
    s = df1.iloc[i, 2]
    if s[0] == ',':
        s =s [1:]
    df1.iloc[i,2 ] = s
    
df1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
print('There are {} rows in the dataframe.'.format(df.shape[0]))

There are 210 rows in the dataframe.


### End of Task 1  (Week 3 ~ Capstone Assignment)