# Segmenting and Clustering Neighborhoods in Toronto

## Part 1 - Read the Toronto Neighborhood data from Wikipedia

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Wikipedia page that lists the Canadian postal codes starting with "M"
wiki_page = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fetch the HTML page; the timeout keeps the cell from hanging on a stalled connection
wiki_data = requests.get(wiki_page, timeout=30)

# Stop here with a clear HTTP error instead of parsing an error page further down
wiki_data.raise_for_status()

# <Response [200]> means the page was read OK
print(wiki_data)

<Response [200]>


Parse the HTML page using BeautifulSoup.

In [4]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser is
# installed (and warns about it), so the parse tree could differ between machines.
soup = BeautifulSoup(wiki_data.text, 'html.parser')

Use the hint code provided to scrape the table data and skip unassigned postal codes.

In [5]:
# Collect one dict per table cell that has an assigned borough
table_contents = []

# Only the first <table> of the page is scraped
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

# find_all is the supported spelling; findAll is a deprecated alias
for td in table.find_all('td'):
    span_text = td.span.text

    # Ignore cells whose borough is "Not assigned"
    if span_text == 'Not assigned':
        continue

    # The text before the first '(' is the borough; the piece after it holds the
    # neighborhoods, separated by ' /' and closed by ')'
    span_parts = span_text.split('(')
    neighborhood = (
        span_parts[1]
        .strip(')')
        .replace(' /', ',')
        .replace(')', ' ')
        .strip(' ')
    )

    table_contents.append({
        'PostalCode': td.p.text[:3],  # the postal code is the first three characters of the <p> text
        'Borough': span_parts[0],
        'Neighborhood': neighborhood,
    })


Convert the list to a DataFrame

In [6]:
# One row per scraped cell; the columns come from the dict keys
df = pd.DataFrame(table_contents)

Clean up the Borough names

In [7]:
# A few boroughs were scraped with extra text glued onto the name. Map each of those
# exact values to a clean label; boroughs not listed here are left unchanged.
df['Borough'] = df['Borough'].replace(
    {
        'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
        'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
        'EtobicokeNorthwest': 'Etobicoke Northwest',
        'East YorkEast Toronto': 'East York/East Toronto',
        'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
    }
)


In [10]:
# Preview the cleaned table (up to the first 12 rows)
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


If a cell has a borough but its neighborhood is "Not assigned", the neighborhood should be the same as the borough.
Verify that no neighborhood is left as "Not assigned".

In [11]:
# Show every row whose neighborhood is the literal "Not assigned"; an empty result
# means there is nothing left to fix.
df[df['Neighborhood'] == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood


In [12]:
# (rows, columns) of the cleaned table
df.shape

(103, 3)

## Part 2 - Getting Latitude and Longitude Coordinates of each Neighborhood

In [13]:
# use geocoder to get the coordinates of the neighborhoods
!pip install --user geocoder

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 8.9 MB/s  eta 0:00:01
Collecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [14]:
import geocoder 

# function to get the latitude/longitude coordinates using the postal code
def get_coordinates(postal_code, max_attempts=5):
    """Look up the latitude/longitude of a Toronto postal code with geocoder.

    Args:
        postal_code: Postal code to look up, e.g. "M5A". It is combined with
            ", Toronto, Ontario" to form the query string.
        max_attempts: Maximum number of lookups to try before giving up.

    Returns:
        The ``latlng`` value reported by geocoder for the first successful lookup.

    Raises:
        RuntimeError: If none of the ``max_attempts`` lookups produced coordinates.
    """
    query = '{}, Toronto, Ontario'.format(postal_code)

    # An unbounded retry hangs the notebook when the provider never answers,
    # so the number of attempts is capped.
    # NOTE(review): geocoder.google may need an API key to return results - confirm.
    for _ in range(max_attempts):
        lat_lng_coords = geocoder.google(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords

    raise RuntimeError(
        'geocoder returned no coordinates for {!r} after {} attempts'.format(query, max_attempts)
    )



Add empty columns

In [None]:
# df['Latitude'] = 0;
# df['Longitude'] = 0;
# df.head()

Test the get_coordinates function with one call.
geocoder never returned good coordinates.  Commenting this code out in favor of the CSV file.

In [15]:
# coords = get_coordinates("M5A")
# print(coords)

Code to apply the get_coordinates function to all rows.
But only None is returned, so commenting that code out since processing never completed.

In [16]:
# df[['Latitude', 'Longitude']] = df['PostalCode'].apply(lambda x: pd.Series([get_coordinates(x)[0],get_coordinates(x)[1]]))
# print("processing complete")  

Use the provided csv file instead of using geocoder

Download the csv file

In [17]:
# Download the provided coordinates file: -q silences wget's progress output and
# -O sets the name of the local file it is saved to.
!wget -q -O 'Geospatial_Coordinates.csv' https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv
print('Data downloaded!')  # printed unconditionally; it does not check that wget succeeded

Data downloaded!


Read the csv file

In [18]:
# Load the downloaded coordinates file and preview its first five rows
coords_df = pd.read_csv('Geospatial_Coordinates.csv')
coords_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Rename the "Postal Code" column to match df.

In [19]:
# Rename by column name rather than by position, so a different column order in the
# CSV cannot silently mislabel the data. Re-running the cell is a harmless no-op
# because rename ignores keys that are no longer present.
coords_df = coords_df.rename(columns={'Postal Code': 'PostalCode'})
coords_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [20]:
# Left join: keep every scraped postal code and attach the latitude/longitude of the
# matching PostalCode from the coordinates table.
df_merged = df.merge(coords_df, how="left", on="PostalCode")
df_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


## Part 3: Explore the neighborhoods

Explore how many different Boroughs there are in Toronto

In [21]:
# Report the number of distinct boroughs and the number of rows (one per postal code)
print(
    'The dataframe has {} boroughs and {} postal codes.'.format(
        df['Borough'].nunique(dropna=False), len(df)
    )
)

The dataframe has 15 boroughs and 103 postal codes.


Get the geographical coordinates of Toronto.

In [22]:
from geopy.geocoders import Nominatim # to convert an address into latitude and longitude values

!pip install -U folium
import folium # map rendering library

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 5.2 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1


In [23]:
address = 'Toronto Canada'

# Resolve the address to coordinates through the Nominatim geocoding service
geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; fail with a clear message instead of
# an AttributeError on the attribute reads below.
if location is None:
    raise ValueError('Nominatim found no match for {!r}'.format(address))

toronto_latitude = location.latitude
toronto_longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(toronto_latitude, toronto_longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


Create a map of Toronto and superimpose all of the neighborhoods.

In [24]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[toronto_latitude, toronto_longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Borough'], df_merged['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto