# Toronto Neighborhoods

- imports

In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim
import folium
import json
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

- will use 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
- check for status code and headers

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
result = requests.get(url)
print(url)
print(result.status_code)
print(result.headers)

https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
200
{'Date': 'Mon, 18 May 2020 14:00:00 GMT', 'Vary': 'Accept-Encoding,Cookie,Authorization', 'Server': 'ATS/8.0.7', 'Content-Type': 'text/html; charset=UTF-8', 'X-Content-Type-Options': 'nosniff', 'P3P': 'CP="See https://en.wikipedia.org/wiki/Special:CentralAutoLogin/P3P for more info."', 'Content-language': 'en', 'Last-Modified': 'Thu, 07 May 2020 17:47:29 GMT', 'Content-Encoding': 'gzip', 'Age': '88936', 'X-Cache': 'cp5008 hit, cp5007 hit/55', 'X-Cache-Status': 'hit-front', 'Server-Timing': 'cache;desc="hit-front"', 'Strict-Transport-Security': 'max-age=106384710; includeSubDomains; preload', 'Set-Cookie': 'WMF-Last-Access=19-May-2020;Path=/;HttpOnly;secure;Expires=Sat, 20 Jun 2020 12:00:00 GMT, WMF-Last-Access-Global=19-May-2020;Path=/;Domain=.wikipedia.org;HttpOnly;secure;Expires=Sat, 20 Jun 2020 12:00:00 GMT, GeoIP=IN:AP:Madanapalle:13.55:78.50:v4; Path=/; secure; Domain=.wikipedia.org', 'X-Client-IP': '175.101.143.2

- will get data + clean it

In [3]:
soup = BeautifulSoup(result.content, 'html.parser')
table = soup.find('table')
trs = table.find_all('tr')
rows = []
for tr in trs:
    i = tr.find_all('td')
    if i:
        rows.append(i)
        
lst = []
for row in rows:
    postalcode = row[0].text.rstrip()
    borough = row[1].text.rstrip()
    neighborhood = row[2].text.rstrip()
    if borough != 'Not assigned':
        if neighborhood == 'Not assigned':
            neighborhood = borough
        lst.append([postalcode, borough, neighborhood])

# lst

- convert into a dataframe

In [4]:
cols = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(lst, columns=cols)
print(df.shape)
# df[df.duplicated(['PostalCode'], keep=False)] - this would have shown the duplicate PostalCodes

(103, 3)


- custom groupby / agg to merge Neighborhoods
    - groupby PostalCode, keep the first Borough and join() Neighborhoods

In [5]:
df = df.groupby('PostalCode').agg(
    {
        'Borough':'first', 
        'Neighborhood': ', '.join,}
    ).reset_index()

- will check if 'M5A' example is done correctly

In [6]:
df.loc[df['PostalCode'] == 'M5A']

Unnamed: 0,PostalCode,Borough,Neighborhood
53,M5A,Downtown Toronto,"Regent Park, Harbourfront"


- will check the df.shape

In [7]:
df.shape

(103, 3)

- will read geo data from a CSV
- need to rename a column so the merge (below) will work

In [8]:
dfgeo = pd.read_csv("https://cocl.us/Geospatial_data")
dfgeo.rename(columns={'Postal Code': 'PostalCode'}, inplace=True)

- need to merge 2 dataframes based on a PostalCode column

In [9]:
df2 = pd.merge(df, dfgeo, on="PostalCode", how='left')

In [15]:
df2.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
