# Segmenting and Clustering Neighborhoods in Toronto

## Part 1 - Read the Toronto Neighborhood data from Wikipedia

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Wikipedia page that lists the Toronto ("M") postal codes
wiki_page = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Get the html page; the timeout keeps a stalled connection from hanging the kernel
wiki_data = requests.get(wiki_page, timeout=30)

# Fail here on a 4xx/5xx response rather than parsing an error page further down
wiki_data.raise_for_status()

# response code 200 means the data was read OK
print(wiki_data)

<Response [200]>


Parse the HTML page using BeautifulSoup.

In [4]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the parse tree can differ per machine.
soup = BeautifulSoup(wiki_data.text, "html.parser")

Use the hint code provided to scrape the table data and skip unassigned postal codes.

In [5]:
# Build one {'PostalCode', 'Borough', 'Neighborhood'} record for every table
# cell whose borough is assigned.
table_contents = []
table = soup.find('table')
for row in table.find_all('td'):
    span_text = row.span.text

    # Ignore cells with a borough that is Not assigned.
    if span_text == 'Not assigned':
        continue

    # The span text is split on '(': the part before the first '(' is the
    # borough, the part after it holds the neighborhoods separated by ' /'.
    parts = span_text.split('(')
    borough = parts[0]

    if len(parts) > 1:
        neighborhood = (
            parts[1]
            .strip(')')
            .replace(' /', ',')
            .replace(')', ' ')
            .strip(' ')
        )
    else:
        # No parenthesised neighborhood list: fall back to the borough name
        # instead of raising IndexError.
        neighborhood = borough

    table_contents.append({
        'PostalCode': row.p.text[:3],  # first three characters of the cell's <p> text
        'Borough': borough,
        'Neighborhood': neighborhood,
    })

#print(table_contents)


Convert the list to a DataFrame

In [6]:
# Name the columns explicitly so the frame has the expected schema even when
# the scrape produced no records (otherwise later df['Borough'] lookups fail).
df = pd.DataFrame(table_contents, columns=['PostalCode', 'Borough', 'Neighborhood'])

Clean up the Borough names

In [7]:
# Scraped borough labels that have extra text run together with the borough
# name, mapped to the readable label that should replace them.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}

# Only exact matches are replaced; every other borough value is left as is.
df['Borough'] = df['Borough'].replace(borough_fixes)


In [8]:
# Preview the first five rows of the cleaned table
df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
Verify that no rows still have a Not assigned neighborhood.

In [9]:
# Select the rows whose neighborhood is still the 'Not assigned' placeholder;
# an empty result means there are none.
unassigned_mask = df['Neighborhood'].eq('Not assigned')
df[unassigned_mask]

Unnamed: 0,PostalCode,Borough,Neighborhood


In [10]:
# (number of rows, number of columns) of the cleaned table
df.shape

(103, 3)

## Part 2 - Getting Latitude and Longitude Coordinates of each Neighborhood

In [11]:
# use geocoder to get the coordinates of the neighborhoods
!pip install --user geocoder

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


In [12]:
import geocoder 

# function to get the latitude/longitude coordinates using the postal code
def get_coordinates(postal_code, max_attempts=5):
    """Look up the latitude/longitude of a Toronto postal code with geocoder.

    The query sent to ``geocoder.google`` is ``'<postal_code>, Toronto, Ontario'``.
    The lookup is retried while the result's ``latlng`` is ``None``, but only
    up to ``max_attempts`` times so a geocoder that never answers cannot hang
    the notebook.

    Parameters
    ----------
    postal_code : str
        Postal code to look up, e.g. ``"M5A"``.
    max_attempts : int or None, default 5
        Maximum number of lookups to try. Pass ``None`` to retry indefinitely
        until coordinates are returned.

    Returns
    -------
    The ``latlng`` value of the first lookup that is not ``None`` (callers
    index it as ``[latitude, longitude]`` -- presumably a two-element list),
    or ``None`` when every attempt failed.
    """
    attempts = 0

    # loop until you get the coordinates or run out of attempts
    while max_attempts is None or attempts < max_attempts:
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        if g.latlng is not None:
            return g.latlng
        attempts += 1

    return None



Add empty columns

In [13]:
# df['Latitude'] = 0;
# df['Longitude'] = 0;
# df.head()

Test the get_coordinates function with one call.
geocoder never returned valid coordinates, so this code is commented out in favor of the CSV file.

In [14]:
# coords = get_coordinates("M5A")
# print(coords)

Code to apply the get_coordinates function to all rows.
But only None is returned, so this code is commented out because processing never completed.

In [15]:
# df[['Latitude', 'Longitude']] = df['PostalCode'].apply(lambda x: pd.Series([get_coordinates(x)[0],get_coordinates(x)[1]]))
# print("processing complete")  

Use the provided csv file instead of using geocoder

Download the csv file

In [16]:
# Download the coordinates CSV with requests (already imported above) instead of
# shelling out to wget, so the cell does not depend on an external binary and
# the success message is only printed when the download actually succeeded.
coords_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv'
coords_response = requests.get(coords_url, timeout=30)

# Raise on a 4xx/5xx response rather than saving an error page as the CSV
coords_response.raise_for_status()

with open('Geospatial_Coordinates.csv', 'wb') as csv_file:
    csv_file.write(coords_response.content)

print('Data downloaded!')

Data downloaded!


Read the csv file

In [17]:
# Load the downloaded coordinates file into a DataFrame and preview it
coords_path = 'Geospatial_Coordinates.csv'
coords_df = pd.read_csv(coords_path)
coords_df.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Rename the "Postal Code" column to match df.

In [18]:
# Rename by label instead of overwriting every column name positionally, so a
# change in the CSV's column order or count cannot silently mislabel the data.
coords_df = coords_df.rename(columns={'Postal Code': 'PostalCode'})
coords_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [19]:
# Left-join the coordinates onto the neighborhood table by postal code, keeping
# every row of df; the merged frame is displayed, not stored.
df.merge(coords_df, how="left", on="PostalCode")

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto Business,Enclave of M4L,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
