# IBM Applied Data Science Capstone Assignment 2


Segmenting and Clustering Neighborhoods in Toronto

## Part 1: Scrape Wikipedia page and build DataFrame

#### Import libraries

In [2]:
import requests
import pandas as pd
import lxml.html as lh

#### Data Integration

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops on an HTTP error instead of
# silently parsing an error page into an empty table.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Create list of HTML objects: every table row (<tr>) found in the document
olist = lh.fromstring(response.content).xpath('//tr')


def fetch(row):
    """Return the row's stripped text content, split into one string per line."""
    return row.text_content().strip().split('\n')


def valid(fields):
    """Keep rows with exactly three fields whose first field starts with 'M'."""
    return len(fields) == 3 and fields[0].startswith('M')


# Create list of lists
rows = list(filter(valid, map(fetch, olist)))

# Create Pandas DataFrame
df = pd.DataFrame(rows, columns=['PostalCode', 'Borough', 'Neighborhood'])

#### Data Preparation

In [4]:
# Keep only rows whose Borough is assigned, then merge Neighborhood on
# PostalCode and Borough: one row per pair, neighborhood names comma-joined.
df = (
    df[df['Borough'] != 'Not assigned']
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(','.join)
    .reset_index()
)

# Where the Neighborhood is still 'Not assigned', fall back to the Borough name
df['Neighborhood'] = df['Neighborhood'].mask(
    df['Neighborhood'] == 'Not assigned', df['Borough']
)

#### Check Dataframe

In [5]:
# Show Results: preview the first rows (at most 12) of the prepared dataframe
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [6]:
# Show the rows whose Borough is Queen's Park
df.loc[df['Borough'].eq("Queen's Park")]

Unnamed: 0,PostalCode,Borough,Neighborhood
93,M9A,Queen's Park,Queen's Park


In [7]:
# Show the dataframe dimensions as a (rows, columns) tuple
df.shape

(103, 3)

## Part 2: Add geospatial data from CSV file

#### Data Integration

In [12]:
# Import geospatial data from CSV file
url = "http://cocl.us/Geospatial_data"
df_geo = pd.read_csv(url)

# Fix column names. The assignment is positional, so it raises if the file
# does not have exactly three columns.
df_geo.columns = ['PostalCode', 'Latitude', 'Longitude']

# Inner join on key PostalCode. The result is written back to `df`, so any
# coordinate columns left by a previous run of this cell are dropped first;
# otherwise re-running the cell would produce Latitude_x/Latitude_y (and
# Longitude_x/Longitude_y) duplicates instead of a clean five-column frame.
df = pd.merge(
    df.drop(columns=['Latitude', 'Longitude'], errors='ignore'),
    df_geo,
    how='inner',
    on='PostalCode',
)

#### Check Dataframe

In [16]:
# Show Results: preview the first rows (at most 12), now with Latitude/Longitude
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848
