# Toronto Exploration

In [1]:
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Lift pandas' display truncation so frames are rendered with every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

#### First, scrape the Wiki page to create a table of all Toronto neighborhoods

In [2]:
# Download the Wikipedia page that lists the "M" (Toronto) postal codes and parse its HTML.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
page = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of silently parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

# Sanity check: the page title should name the postal-code list.
soup.title

<title>List of postal codes of Canada: M - Wikipedia</title>

In [3]:
# CSS selector copied from the browser inspector for the table that holds the data we are interested in.
# It is position-based (nth-child), so it stops matching if the page layout changes.
table_selector = '#mw-content-text > div > table:nth-child(6)'

# select_one returns None when nothing matches, which lets us fail with a clear message
# instead of an IndexError on an empty result list.
table = soup.select_one(table_selector)
if table is None:
    raise RuntimeError(f'No table matched the selector {table_selector!r}; the page layout may have changed')

# Then make a list of all cells within the <td> tags (the matched tag can be searched directly,
# so there is no need to serialize it and parse it a second time)
cells = table.find_all('td')
len(cells)

180

In [4]:
# To extract the right information from each cell, look at a single cell's text via BeautifulSoup's get_text().
# get_text() is called on the tag itself: re-parsing str(cell) without naming a parser makes the result depend
# on whichever parser happens to be installed and triggers a BeautifulSoup warning.
text = cells[4].get_text()
text

'\nM5ADowntown Toronto(Regent Park / Harbourfront)\n\n'

In [5]:
# Layout of the cell text: index 0 is the newline character '\n', so the 3-character postal code
# occupies indices 1-3. The borough name follows, up to the opening parenthesis, and the
# neighborhood names sit between the parentheses separated by ' / '.
open_paren, close_paren = text.index('('), text.index(')')
postal_code, borough = text[1:4], text[4:open_paren]
# Turn the ' / ' separators into a comma-separated list
neighborhoods = ', '.join(text[open_paren + 1:close_paren].split(' / '))
print(f"The {borough} borough's neighborhoods where the postal code is {postal_code} include the following: {neighborhoods}")

The Downtown Toronto borough's neighborhoods where the postal code is M5A include the following: Regent Park, Harbourfront


#### Next, let's try geocoding with the Google Maps API to find the latitude and longitude of a neighborhood

In [6]:
# The Google Maps API key is read from the environment so the credential is never stored in the notebook.
# A key that was previously committed in this cell should be treated as leaked and rotated.
API = os.environ.get('GOOGLE_MAPS_API_KEY')
if not API:
    raise RuntimeError('Set the GOOGLE_MAPS_API_KEY environment variable to a Google Maps Geocoding API key')

postal_code = 'M5M'
url = 'https://maps.googleapis.com/maps/api/geocode/json'
# Passing the query through `params` lets requests handle the URL encoding.
params = {'address': f'{postal_code}, Toronto, Ontario', 'key': API}

response = requests.get(url, params=params, timeout=30)
response.raise_for_status()  # surface HTTP failures instead of decoding an error body
results = response.json()
results

{'results': [{'address_components': [{'long_name': 'M5M',
     'short_name': 'M5M',
     'types': ['postal_code', 'postal_code_prefix']},
    {'long_name': 'Toronto',
     'short_name': 'Toronto',
     'types': ['locality', 'political']},
    {'long_name': 'Toronto Division',
     'short_name': 'Toronto Division',
     'types': ['administrative_area_level_2', 'political']},
    {'long_name': 'Ontario',
     'short_name': 'ON',
     'types': ['administrative_area_level_1', 'political']},
    {'long_name': 'Canada',
     'short_name': 'CA',
     'types': ['country', 'political']}],
   'formatted_address': 'Toronto, ON M5M, Canada',
   'geometry': {'bounds': {'northeast': {'lat': 43.751579, 'lng': -79.402656},
     'southwest': {'lat': 43.718839, 'lng': -79.4337391}},
    'location': {'lat': 43.7332825, 'lng': -79.4197497},
    'location_type': 'APPROXIMATE',
    'viewport': {'northeast': {'lat': 43.751579, 'lng': -79.402656},
     'southwest': {'lat': 43.718839, 'lng': -79.4337391}}},
  

In [7]:
# Drill into the first geocoding match to see where the lat/lng pair lives in the JSON response
first_match = results['results'][0]
first_match['geometry']['location']

{'lat': 43.7332825, 'lng': -79.4197497}

In [8]:
# Pull the coordinate pair out of the first geocoding match and report it
location = results['results'][0]['geometry']['location']
lat, lng = location['lat'], location['lng']
print(f'The latitude and longitude of {postal_code} is {lat}, {lng}')

The latitude and longitude of M5M is 43.7332825, -79.4197497


In [9]:
# Build the neighborhood table: parse every assigned postal-code cell, geocode the postal code,
# and collect one record per cell. Records are gathered in a list and turned into a DataFrame once,
# because DataFrame.append was removed in pandas 2.0 and growing a frame row by row is quadratic.
column_names = ['Postal Code', 'Borough', 'Neighborhoods', 'Latitude', 'Longitude']
geocode_url = 'https://maps.googleapis.com/maps/api/geocode/json'
records = []

for cell in cells:
    cell_html = str(cell)
    # Only keep the relevant cells, and avoid 'M7x' codes, which are not neighborhoods
    if 'Not assigned' in cell_html or 'M7' in cell_html:
        continue

    text = cell.get_text()
    try:
        open_paren = text.index('(')
        close_paren = text.index(')')
    except ValueError:
        # No parenthesized neighborhood list in this cell: skip it
        continue

    postal_code = text[1:4]
    # NOTE(review): multi-part borough labels come out concatenated (the borough list later in the
    # notebook shows 'EtobicokeNorthwest' and 'East YorkEast Toronto') - confirm against the cell
    # markup and split them if separate values are wanted.
    borough = text[4:open_paren]
    neighborhoods = text[open_paren+1 : close_paren].replace(' / ', ', ')

    # Make the API call to get the coordinates of the postal code
    response = requests.get(
        geocode_url,
        params={'address': f'{postal_code}, Toronto, Ontario', 'key': API},
        timeout=30,
    )
    response.raise_for_status()
    results = response.json()
    if not results.get('results'):
        # Covers responses with no match (e.g. quota or key problems) with a readable error
        raise RuntimeError(f"Geocoding {postal_code} returned no match (status: {results.get('status')})")
    location = results['results'][0]['geometry']['location']

    records.append({'Postal Code': postal_code,
                    'Borough': borough,
                    'Neighborhoods': neighborhoods,
                    'Latitude': location['lat'],
                    'Longitude': location['lng']})
    print(postal_code, end=' ')

df = pd.DataFrame(records, columns=column_names)

print("\n\nData loaded successfully")
# Preview only the first rows rather than dumping the whole table
df.head(10)

M3A M4A M5A M6A M9A M1B M3B M4B M5B M6B M9B M1C M3C M4C M5C M6C M9C M1E M4E M5E M6E M1G M4G M5G M6G M1H M2H M3H M4H M5H M6H M1J M2J M3J M4J M5J M6J M1K M2K M3K M4K M5K M6K M1L M2L M3L M4L M5L M6L M9L M1M M2M M3M M4M M5M M6M M9M M1N M2N M3N M4N M5N M6N M9N M1P M2P M4P M5P M6P M9P M1R M2R M4R M5R M6R M9R M1S M4S M5S M6S M1T M4T M5T M1V M4V M5V M8V M9V M1W M4W M5W M8W M9W M1X M4X M5X M8X M4Y M8Y M8Z 

Data loaded successfully


Unnamed: 0,Postal Code,Borough,Neighborhoods,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
5,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
6,M3B,North York,Don Mills,43.745906,-79.352188
7,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
8,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
9,M6B,North York,Glencairn,43.709577,-79.445073


In [10]:
# We should also drop M5W since it's also not a neighborhood.
# Filtering on the postal code itself (rather than dropping a hard-coded row label) removes the
# intended row regardless of row order, and re-running this cell does not remove a second row.
df = df[df['Postal Code'] != 'M5W'].reset_index(drop=True)
df.loc[88:92]

Unnamed: 0,Postal Code,Borough,Neighborhoods,Latitude,Longitude
88,M1W,Scarborough,"Steeles West, L'Amoreaux West",43.799525,-79.318389
89,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
90,M8W,Etobicoke,"Alderwood, Long Branch",43.602414,-79.543484
91,M9W,EtobicokeNorthwest,"Clairville, Humberwood, Woodbine Downs, West H...",43.706748,-79.594054
92,M1X,Scarborough,Upper Rouge,43.836125,-79.205636


In [11]:
# Persist the table above as a CSV file so it can be reused later
filename = 'Toronto neighborhoods.csv'
df.to_csv(path_or_buf=filename)

#### Let's see all the boroughs on a map

In [13]:
# Distinct borough names found in the table
borough_column = df['Borough']
boroughs = np.unique(borough_column.to_numpy())
boroughs

array(['Central Toronto', 'Downtown Toronto', 'East Toronto', 'East York',
       'East YorkEast Toronto', 'Etobicoke', 'EtobicokeNorthwest',
       'North York', 'Scarborough', 'West Toronto', 'York'], dtype=object)

In [22]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

# Get Toronto's latitude and longitude
response = requests.get(
    'https://maps.googleapis.com/maps/api/geocode/json',
    params={'address': 'Toronto, Ontario', 'key': API},
    timeout=30,
)
response.raise_for_status()
results = response.json()
if not results.get('results'):
    # Gives a readable error instead of an IndexError when the lookup returns no match
    raise RuntimeError(f"Geocoding Toronto returned no match (status: {results.get('status')})")
latitude = results['results'][0]['geometry']['location']['lat']
longitude = results['results'][0]['geometry']['location']['lng']

# Create a map of Toronto
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Set color scheme: one evenly spaced rainbow color per borough
colors_array = cm.rainbow(np.linspace(0, 1, len(boroughs)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# Look colors up by borough name instead of scanning the borough array for every marker
borough_color = dict(zip(boroughs, rainbow))

# Add markers to the map: one circle per postal code, colored by borough, with the postal code as popup
for lat, lon, borough, postal in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Postal Code']):
    label = folium.Popup(str(postal), parse_html=True)
    color = borough_color[borough]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

#### For simplicity, I'm going to stop here for this week's project, assuming that the clusters are based directly on each neighborhood's borough