# Segmenting and Clustering Neighborhoods in Toronto 

Import packages we will use

In [1]:
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Preprocessing

### Web Scraping
We will web scrape our data using BeautifulSoup. 

In [2]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of silently
# parsing an error page.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(WIKI_URL, timeout=30)
response.raise_for_status()

source = response.text
soup = BeautifulSoup(source, 'lxml')

In [3]:
# Pick the first <table> whose class attribute is "wikitable sortable".
toronto_table = soup.find('table', attrs={'class': 'wikitable sortable'})

We will use a for loop to go through the table rows and build a list of (postal code, borough, neighborhood) tuples, skipping any row that does not contain exactly three data cells. 

In [4]:
# Build a list of (postal code, borough, neighbourhood) tuples from the table.
# Fail loudly if the table was not found: silently continuing would leave
# table_con empty and only surface later as a confusing DataFrame error.
if toronto_table is None:
    raise ValueError(
        "Could not find the 'wikitable sortable' table on the page; "
        "the page layout may have changed."
    )

table_con = []
for row in toronto_table.find_all('tr'):
    col = row.find_all('td')
    # Keep only rows with exactly three <td> cells; anything else (presumably
    # the header row, which would have no <td> cells) is skipped.
    if len(col) == 3:
        table_con.append(tuple(cell.text.strip() for cell in col))

Now we will convert our list to an array. 

In [5]:
# Turn the list of row tuples into a NumPy array (one row per tuple).
toronto_array = np.array(table_con)

### Data Cleaning
The array will then be converted to a dataframe.

In [6]:
# Wrap the scraped array in a DataFrame, naming the three columns up front.
column_names = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(toronto_array, columns=column_names)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [7]:
# Count how many entries in each column still hold the 'Not assigned' placeholder.
for column in ('Borough', 'Neighborhood'):
    placeholder_count = df[column].eq('Not assigned').sum()
    print(column + ': ', placeholder_count)

Borough:  77
Neighborhood:  78


All "Not assigned" values will be replaced with NaN (`np.nan`) for convenience. 

In [8]:
# Swap the 'Not assigned' placeholder for NaN so pandas treats it as missing.
df = df.replace(to_replace='Not assigned', value=np.nan)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,
9,M8A,,


Check to verify replacement is correct. 

In [9]:
# Number of missing values per column.
df.isna().sum()

PostalCode       0
Borough         77
Neighborhood    78
dtype: int64

Assume that the neighborhood is the same as the borough when the neighborhood is missing but the borough is present.  

In [10]:
# Where the neighbourhood is missing, reuse the borough name.
# fillna only touches missing Neighborhood entries and takes the value from
# the same row of Borough; rows where Borough is missing too stay NaN.
# This replaces a row-by-row loop that tested values with `is np.nan`, an
# identity check that only works when the stored value is that exact object.
df['Neighborhood'] = df['Neighborhood'].fillna(df['Borough'])

df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
9,M8A,,


In [11]:
# Re-count the missing values per column after filling in neighbourhoods.
df.isna().sum()

PostalCode       0
Borough         77
Neighborhood    77
dtype: int64

Drop all rows in which there are missing values for borough and neighborhood. 

In [12]:
# Keep only the rows with no missing values, then confirm none remain.
df = df.dropna()
df.isna().sum()

PostalCode      0
Borough         0
Neighborhood    0
dtype: int64

Group records that share a postal code into a single row, with the neighborhood names separated by commas.

In [13]:
# Collapse rows sharing a postal code and borough into one row whose
# Neighborhood is the comma-separated list of the group's neighbourhoods.
by_postal_code = df.groupby(['PostalCode', 'Borough'])
joined_neighborhoods = by_postal_code['Neighborhood'].agg(', '.join)
df = joined_neighborhoods.reset_index()
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood]], Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


Dimension of dataframe

In [14]:
# (rows, columns) of the grouped DataFrame.
df.shape

(103, 3)

## Generating Coordinates
Install and import geocoder

In [16]:
!pip install geocoder
print('Geocoder installed!')
import geocoder

Requirement not upgraded as not directly required: geocoder in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages
Requirement not upgraded as not directly required: requests in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: ratelim in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: six in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: click in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: future in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: chardet<3.1.0,>=3.0.2 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from requests->geocoder)
Requirement not upgraded as not directly required: i

Get the appropriate latitude and longitude for each postal code. 

In [17]:
# Look up a (latitude, longitude) pair for every postal code.
MAX_ATTEMPTS = 5          # bounded so a persistently failing lookup cannot hang the notebook
RETRY_DELAY_SECONDS = 1   # short pause between attempts to avoid hammering the service

latitude = []
longitude = []

for postal_code in df['PostalCode']:
    lat_lng_coords = None

    # Retry the lookup a few times; g.latlng is treated as a failed lookup
    # whenever it is empty or None.
    for _ in range(MAX_ATTEMPTS):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        if lat_lng_coords:
            break
        time.sleep(RETRY_DELAY_SECONDS)

    if lat_lng_coords:
        latitude.append(lat_lng_coords[0])
        longitude.append(lat_lng_coords[1])
    else:
        # Record NaN rather than crashing, so both lists stay aligned
        # one-to-one with the rows of df.
        latitude.append(np.nan)
        longitude.append(np.nan)

Convert the list to a dataframe and combine it with our previous dataframe (the one with columns PostalCode, Borough, Neighborhood). 

In [18]:
# Turn each coordinate list into a single-column frame, then attach both
# side by side to the postal-code frame.
df_lat = pd.DataFrame({'Latitude': latitude})
df_long = pd.DataFrame({'Longitude': longitude})

coordinate_frames = [df, df_lat, df_long]
df = pd.concat(coordinate_frames, axis=1)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood]], Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [20]:
# (rows, columns) of the DataFrame after the Latitude and Longitude columns were added.
df.shape

(103, 5)