# Segmenting and Clustering Neighborhoods in Toronto

By Eduardo Faria

This notebook is an assignment for a Data Science course. 
The objective of this assignment is to explore, segment, and cluster the neighborhoods in the city of Toronto.

In [1]:
import pandas as pd
import numpy as np

## Task 1 - Read data from web and prepare dataframe

In [2]:
# Install package BeautifulSoup
#!pip install beautifulsoup4
#!pip install lxml

In [3]:
import requests
from bs4 import BeautifulSoup

### Read postal code data from wikipedia
Uses `requests` and BeautifulSoup to fetch and parse the data from the web page.

In [4]:
# Wikipedia page with the postal code data of Toronto
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of silently
# handing an error page to the parser.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')
#print(soup.prettify())

In [5]:
# Locate the first <table> element carrying the 'wikitable' CSS class.
site_table = soup.find('table', class_='wikitable')
# soup.find returns None when nothing matches; stop here with a clear message
# rather than with an AttributeError in the parsing cell.
if site_table is None:
    raise ValueError("No table with class 'wikitable' was found on the page")
#print(site_table.prettify())

### Create a dictionary and clean the data  
- Ignore cells with a borough that is 'Not assigned'.
- If a cell has a borough but its neighborhood is 'Not assigned', the neighborhood is set to the same value as the borough.

In [6]:
# Read the postal code table scraped from Wikipedia into three parallel
# column lists, cleaning each row on the way.
postalCode_list = []
borough_list = []
neighborhood_list = []
for line in site_table.find_all('tr'):
    # Text of every data cell in this row, trimmed of surrounding whitespace.
    # A row without any <td> cell yields an empty list.
    cells = [col.text.strip(' \t\n\r') for col in line.find_all('td')]
    # Pad so that missing columns read as empty strings, then keep the first three.
    postalCode, borough, neighborhood = (cells + ['', '', ''])[:3]

    # Ignore rows with no borough or with a borough that is 'Not assigned'
    if not borough or borough == 'Not assigned':
        continue
    # A missing or 'Not assigned' neighborhood takes the name of its borough
    if not neighborhood or neighborhood == 'Not assigned':
        neighborhood = borough
    # Change the multiple-neighborhood separator ' /' to ',' if needed
    neighborhood = neighborhood.replace(' /', ',')
    # Append the row data
    postalCode_list.append(postalCode)
    borough_list.append(borough)
    neighborhood_list.append(neighborhood)

data = {
    'PostalCode': postalCode_list,
    'Borough': borough_list,
    'Neighborhood': neighborhood_list
}

### Create pandas DataFrame

In [7]:
# Build the main DataFrame from the parsed column lists and preview the first ten rows
neighborhoods = pd.DataFrame(data=data)
neighborhoods.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Merge rows with same postal code

In [8]:
# Merge rows with same postal code.
# Rows sharing a PostalCode are collapsed into one: the first Borough is kept
# and the Neighborhood values are joined with ', '. sort=False keeps the
# postal codes in their order of first appearance.
# The result is bound to a fresh frame instead of editing rows in place:
# assigning through `neighborhoods.iloc[i].Neighborhood = ...` only changes a
# temporary copy, and dropping rows while iterating makes positions and labels
# disagree. Re-running this cell on already merged data is a no-op.
neighborhoods = (
    neighborhoods
    .groupby('PostalCode', sort=False, as_index=False)
    .agg({'Borough': 'first', 'Neighborhood': ', '.join})
)
neighborhoods.head(10)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Shape of the dataframe with neighborhood information

In [9]:
# Tuple of (number of rows, number of columns) in `neighborhoods`
neighborhoods.shape

(103, 3)

## Task 2 - Getting Latitudes and Longitudes

### Note: The Geocoder library did not work reliably, so a CSV file with the latitudes and longitudes is used instead

In [10]:
# install Geocoder API to get Latitude and longitude
#!pip install geocoder
#import geocoder

# initialize variable to None
#lat_lng_coords = None

# loop until the coordinates are returned
#while(lat_lng_coords is None):
#  g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#  lat_lng_coords = g.latlng

#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

### Read Coordinates from csv

In [11]:
# Load the postal code -> latitude/longitude table and preview its first rows
geospatial_csv_url = 'https://cocl.us/Geospatial_data'
coords = pd.read_csv(geospatial_csv_url)
coords.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Include latitude and longitude in the main Dataframe 'neighborhoods' 

In [12]:
# Find the coordinates of each neighborhood by postal code in one vectorized
# lookup instead of filtering `coords` once per row.
coords_by_code = coords.set_index('Postal Code')[['Latitude', 'Longitude']]
matched = coords_by_code.reindex(neighborhoods['PostalCode'])

# Keep the fail-fast behaviour of the per-row lookup (a ValueError when a
# postal code has no coordinates), but name the offending codes.
missing = matched['Latitude'].isna()
if missing.any():
    raise ValueError(
        'No coordinates found for postal codes: {}'.format(matched.index[missing].tolist())
    )

latitude = matched['Latitude'].tolist()
longitude = matched['Longitude'].tolist()
# Include Latitude and Longitude in the Dataframe. Plain lists are assigned,
# so values are placed by position rather than aligned on the index.
neighborhoods['Latitude'] = latitude
neighborhoods['Longitude'] = longitude
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
