# Segmenting and Clustering Neighborhoods in Toronto

## I. Data munging

In [2]:
import pandas as pd
import numpy as np
import lxml.html as LH

### I.1 Fetch raw data

In [35]:
# Scrape the Wikipedia page and keep its first HTML table as a DataFrame
url_wiki = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# read_html returns one DataFrame per <table> found; header=0 uses the first row as column names
wiki_tables = pd.read_html(url_wiki, header=0)
raw_data = wiki_tables[0]
raw_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [36]:
# Report the (rows, columns) dimensions of the freshly scraped table.
print("raw data shape:", raw_data.shape)

raw data shape: (288, 3)


### I.2 Clean raw data

In [37]:
# Turn the "Not assigned" placeholder into a missing value, then discard every
# row that contains at least one missing value.
raw_data = raw_data.replace({"Not assigned": np.nan}).dropna(axis=0)
raw_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [38]:
# Report the (rows, columns) dimensions left after the "Not assigned" rows were dropped.
print("cleaned raw data shape:", raw_data.shape)

cleaned raw data shape: (210, 3)


### I.3 Append locations

#### 1ST OPTION (very long...)

In [15]:
#!conda install -c conda-forge geocoder --yes
import geocoder

def appendLoc(x, max_retries=5, retry_delay=1.0):
    """Geocode ``x`` within Toronto and return ``(latitude, longitude)``.

    Sends the query ``"<x>, Toronto, Ontario"`` to ``geocoder.google`` and
    retries while the response carries no coordinates. Unlike an unbounded
    retry loop, it gives up after ``max_retries`` attempts so that a
    permanently failing request cannot hang the notebook.

    Parameters
    ----------
    x : str
        Value formatted into the query (the notebook passes postal codes).
    max_retries : int, default 5
        Maximum number of geocoding requests to send; must be at least 1.
    retry_delay : float, default 1.0
        Seconds to wait between two consecutive attempts (0 disables the wait).

    Returns
    -------
    tuple
        ``(latitude, longitude)`` taken from the first response that has them.

    Raises
    ------
    ValueError
        If ``max_retries`` is smaller than 1.
    RuntimeError
        If none of the attempts returned coordinates.
    """
    import time  # local import: only the retry pause needs it

    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    query = '{}, Toronto, Ontario'.format(x)
    g = None

    for attempt in range(1, max_retries + 1):
        g = geocoder.google(query)
        lat_lng_coords = g.latlng

        # A missing (None) or empty result means this attempt failed.
        if lat_lng_coords:
            latitude = lat_lng_coords[0]
            longitude = lat_lng_coords[1]
            return latitude, longitude

        # Pause before retrying, but not after the final attempt.
        if attempt < max_retries and retry_delay > 0:
            time.sleep(retry_delay)

    # NOTE(review): `status` is presumably set by geocoder on its responses
    # (e.g. a denied request) -- read defensively in case it is absent.
    raise RuntimeError(
        "no coordinates for '{}' after {} attempt(s) (last status: {})".format(
            query, max_retries, getattr(g, "status", None)
        )
    )

Solving environment: done

# All requested packages already installed.



In [None]:
# Geocode every postal code, then expand the resulting (latitude, longitude)
# tuples into a two-column DataFrame aligned on raw_data's index.
# Series.map(pd.Series) would only produce a Series whose elements are Series
# objects, which the index-based merge in the next cell cannot turn into two
# coordinate columns.
tmp = pd.DataFrame(
    raw_data.Postcode.map(appendLoc).tolist(),
    index=raw_data.index,
    columns=["Latitude", "Longitude"],
)

In [None]:
# Join the geocoded coordinates onto the cleaned table by row index,
# then give all five columns their final names.
raw_data = pd.merge(raw_data, tmp, left_index=True, right_index=True)
raw_data.columns = [
    "Postcode",
    "Borough",
    "Neighbourhood",
    "Latitude",
    "Longitude",
]
raw_data.head()

*The 1st option was abandoned because the computation takes far too long.*

#### 2ND OPTION (fast)

In [39]:
# Location of the CSV file that the next cell reads to obtain the coordinates of each postal code.
url_loc = "https://cocl.us/Geospatial_data"

In [40]:
# Read the coordinate table and rename its columns so that the key column is
# called "Postcode", exactly as in raw_data.
tmp = pd.read_csv(url_loc, sep=",", encoding="utf-8")
tmp.columns = ["Postcode", "Latitude", "Longitude"]

# Left join: every row of raw_data is kept and receives the coordinates
# recorded for its postal code.
raw_data = pd.merge(raw_data, tmp, how="left", on="Postcode")
raw_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763


In [41]:
# Report the (rows, columns) dimensions after the Latitude/Longitude columns were merged in.
print("data shape:", raw_data.shape)

data shape: (210, 5)
