# Toronto Neighborhood Analysis
## Clustering Coursera Capstone Project - Week 3 

### Eduardo Palomero-López

### PART 1: Converting Wikipedia Table Data into DataFrame
### PART 2: Add Latitude and Longitude Data

In [2]:
pip install beautifulsoup4

Collecting beautifulsoup4
[?25l  Downloading https://files.pythonhosted.org/packages/66/25/ff030e2437265616a1e9b25ccc864e0371a0bc3adb7c5a404fd661c6f4f6/beautifulsoup4-4.9.1-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 3.5MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2 (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/6f/8f/457f4a5390eeae1cc3aeab89deb7724c965be841ffca6cfca9197482e470/soupsieve-2.0.1-py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.9.1 soupsieve-2.0.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install lxml

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/55/6f/c87dffdd88a54dd26a3a9fef1d14b6384a9933c455c54ce3ca7d64a84c88/lxml-4.5.1-cp36-cp36m-manylinux1_x86_64.whl (5.5MB)
[K     |████████████████████████████████| 5.5MB 6.0MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
from bs4 import BeautifulSoup as bsoup
from urllib.request import urlopen as uReq
import requests
import lxml
import pandas as pd
from pandas import DataFrame
import numpy as np

In [5]:
URL='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M' 

In [6]:
r=requests.get(URL)

### Use BeautifulSoup to parse the data in Wikipedia

In [7]:
parsed_web=bsoup(r.text,"html.parser")
# uncomment next line to check parsing results
#parsed_web

In [8]:
# get table from parsed data
Table=parsed_web.table
# uncomment next line to check results
#Table

In [9]:
results=Table.find_all('tr')
number_rows=len(results)
print('Total rows=',number_rows,'; so total number of data rows (removing header row)=', number_rows-1)

Total rows= 181 ; so total number of data rows (removing header row)= 180


In [10]:
header=results[0].text.split()
header

['Postal', 'Code', 'Borough', 'Neighborhood']

__Check that 'Postal' and 'Code' have been split and need to merge__

In [11]:
header=[header[0]+header[1],header[2],header[3]]
header

['PostalCode', 'Borough', 'Neighborhood']

__Check data__

In [12]:
results[7].text

"\nM7A\n\nDowntown Toronto\n\nQueen's Park, Ontario Provincial Government\n"

In [13]:
results[7].text.split('\n')

['',
 'M7A',
 '',
 'Downtown Toronto',
 '',
 "Queen's Park, Ontario Provincial Government",
 '']

In [14]:
PostalCode=results[7].text.split('\n')[1]
PostalCode

'M7A'

In [15]:
Borough=results[7].text.split('\n')[3]
Borough

'Downtown Toronto'

In [16]:
Neighborhood=results[7].text.split('\n')[5]
Neighborhood

"Queen's Park, Ontario Provincial Government"

In [17]:
# Loop to extract data

Data =[]
n=1
while n < number_rows :
    Postcode=results[n].text.split('\n')[1]
    Borough=results[n].text.split('\n')[3]
    Neighborhood=results[n].text.split('\n')[5]
    Data.append((Postcode, Borough,Neighborhood))
    n=n+1

df=pd.DataFrame(Data, columns=['PostalCode', 'Borough', 'Neighbourhood'])
df.head(5)



Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [18]:
df.shape

(180, 3)

In [19]:
# Remove rows Borough='Not assigned'
df1=df[~df.Borough.str.contains("Not assigned")]
df1=df1.reset_index(drop=True)
print(df1.shape)
print(df1.head())

(103, 3)
  PostalCode           Borough                                Neighbourhood
0        M3A        North York                                    Parkwoods
1        M4A        North York                             Victoria Village
2        M5A  Downtown Toronto                    Regent Park, Harbourfront
3        M6A        North York             Lawrence Manor, Lawrence Heights
4        M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government


In [26]:
distinct_PostalCode = df1['PostalCode'].nunique()
distinct_borough = df1['Borough'].nunique()
distinct_neighbourhood= df1['Neighbourhood'].nunique()
print('Different Postal Codes : ' + str(distinct_PostalCode))
print('Different Boroughs  : '+ str(distinct_borough))
print('Different Neighbourhoods  :' + str(distinct_neighbourhood))

Different Postal Codes : 103
Different Boroughs  : 10
Different Neighbourhoods  :99


In [21]:
df1.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


__Check if any Neighborhood='Not assigned'__

In [22]:
((df1['Neighbourhood'] == 'Not assigned').groupby)

<bound method Series.groupby of 0      False
1      False
2      False
3      False
4      False
       ...  
98     False
99     False
100    False
101    False
102    False
Name: Neighbourhood, Length: 103, dtype: bool>

__There is no data with Neighborhood='Not assigned'__

In [24]:
df1.shape

(103, 3)

## --------------------END OF PART 1----------------------------------

### PART 2: Add Latitude and Longitude Data

In [29]:
# Get coordinates from csv file provided
df_codes=pd.read_csv('http://cocl.us/Geospatial_data')
df_codes.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [30]:
df_codes.shape

(103, 3)

In [32]:
df_codes.columns = ['PostalCode', 'Latitude', 'Longitude']
df_codes.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [33]:
df1sorted=df1.sort_values('PostalCode')
df1sorted.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
6,M1B,Scarborough,"Malvern, Rouge"
12,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
18,M1E,Scarborough,"Guildwood, Morningside, West Hill"
22,M1G,Scarborough,Woburn
26,M1H,Scarborough,Cedarbrae


In [34]:
NBRHs=pd.merge(df1sorted,df_codes, how='right', on = 'PostalCode')
NBRHs.head(12)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [35]:
NBRHs.shape

(103, 5)