# Segmenting and Clustering Neighborhoods in Toronto

### Import required libraries

In [1]:
#!pip install beautifulsoup4
#!pip install lxml

from bs4 import BeautifulSoup
import requests
import pandas as pd

## Get Canadian Postal Codes (M) from Wikipedia

### In a separate notebook, we parsed the Wikipedia page using Beautiful Soup

We did a lot of clean-up and saved the result to a CSV file named 'Postal.csv'.  
Now we pick it up again, so we can continue with the task of getting the geo-data  
and combining everything into a single DataFrame.

In [2]:
# Load the cleaned postal-code table that the scraping notebook saved to disk,
# then preview its first rows.
df_zip = pd.read_csv(filepath_or_buffer='Postal.csv')
df_zip.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [3]:
# Sanity check: print the (rows, columns) tuple of the postal-code table.
print(df_zip.shape)

(103, 3)


## Get geospatial coordinates of postal codes and combine into a DataFrame

### This is how we would look up geospatial coordinates using the Geocoder Python library

In [4]:
!pip install geocoder



In [5]:
import geocoder # import geocoder

In [6]:
# Demonstration of how coordinates would be looked up with the Geocoder library.
# The lookup is disabled by default (the notebook takes its coordinates from a
# CSV file instead); flip USE_GEOCODER to True to actually query the service.
USE_GEOCODER = False
# Upper bound on lookups per postal code, so a service that keeps returning
# nothing cannot stall the cell forever.
MAX_ATTEMPTS = 10

for index in df_zip.index:
    # `index` is an index *label*, so use label-based .loc (positional .iloc
    # only happens to agree while the frame has its default 0..n-1 index).
    Postcode = df_zip.loc[index, 'Postcode']
    Borough = df_zip.loc[index, 'Borough']
    Neighbourhood = df_zip.loc[index, 'Neighbourhood']

    # Defaults that remain in place when the lookup is disabled or fails.
    lat_lng_coords = None
    latitude = 0
    longitude = 0

    count = 0
    if USE_GEOCODER:
        # Retry while there is no usable answer AND the retry budget is not
        # exhausted. Any falsy result (None or an empty list) counts as a miss.
        while not lat_lng_coords and count < MAX_ATTEMPTS:
            g = geocoder.google('{}, Toronto, Ontario'.format(Postcode))
            lat_lng_coords = g.latlng
            count += 1

        if lat_lng_coords:
            latitude = lat_lng_coords[0]
            longitude = lat_lng_coords[1]

    # Uncomment to inspect each row together with its looked-up coordinates:
    # print(Postcode, Borough, Neighbourhood, latitude, longitude)

### And this is how we actually get the geospatial data

In [7]:
!wget -q -O 'Geospatial_Coordinates.csv' https://cocl.us/Geospatial_data

In [8]:
# Read the downloaded coordinates and rename the key column so that it carries
# the same name ('Postcode') as the key column of df_zip.
df_geo = (
    pd.read_csv('Geospatial_Coordinates.csv')
    .rename(columns={'Postal Code': 'Postcode'})
)

Before we set the indexes, look at the shapes

In [9]:
# Show the (rows, columns) of both tables while 'Postcode' is still an
# ordinary column in each of them.
print('Before setting Index')
print('    df_zip: {}'.format(df_zip.shape))
print('    df_geo: {}'.format(df_geo.shape))

Before setting Index
    df_zip: (103, 3)
    df_geo: (103, 3)


It is helpful to make `Postcode` the index of each DataFrame, so that the two can be joined on it

In [10]:
# Move 'Postcode' from a column into the index of both tables so that the join
# can align rows by postal code.
# Each step is guarded: once 'Postcode' is already the index it is no longer a
# column, and an unguarded set_index would raise KeyError if this cell were
# run a second time.
if 'Postcode' in df_geo.columns:
    df_geo = df_geo.set_index('Postcode')
if 'Postcode' in df_zip.columns:
    df_zip = df_zip.set_index('Postcode')
print('df_zip:',df_zip.shape)
print('df_geo:',df_geo.shape)

df_zip: (103, 2)
df_geo: (103, 2)


In [11]:
# Preview the first rows of the coordinates table.
df_geo.head(n=5)

Unnamed: 0_level_0,Latitude,Longitude
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


### Now join the two DataFrames on their matching index, adding the coordinate columns

In [12]:
# Index-on-index left join: every row of df_zip is kept and receives the
# columns of df_geo for the matching index value.
df_combined = df_zip.join(df_geo, how='left')
print(df_combined.shape)

(103, 4)


In [13]:
# Persist the combined table; the index (Postcode) is written out as the
# first column of the CSV file.
df_combined.to_csv('Toronto.csv', index=True)

In [15]:
# Turn 'Postcode' back into an ordinary column.
# Guarded so that re-running the cell is a no-op: an unguarded second
# reset_index would insert a spurious 'index' column holding the old row
# numbers instead of failing loudly.
if df_combined.index.name == 'Postcode':
    df_combined = df_combined.reset_index()

In [16]:
# Preview the first rows of the combined postal-code / coordinates table.
df_combined.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [18]:
# Final check: display the (rows, columns) tuple of the combined table.
df_combined.shape

(103, 5)