# Segmenting and Clustering Neighborhoods in Toronto
## by Dalia Y. Domínguez 

In [55]:
#Import the libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup

Use the requests library to download the webpage https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M.
Save the text of the response in a variable named html_data and parse it into a BeautifulSoup object named beautiful_soup.

In [56]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes and parse it.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(url, timeout=30)  # a timeout keeps a stalled connection from hanging the notebook
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
html_data = response.text
beautiful_soup = BeautifulSoup(html_data, 'html.parser')

The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood

In [57]:
# Start with an empty frame that already carries the three target columns.
torontoPC = pd.DataFrame(
    data=None,
    columns=["PostalCode", "Borough", "Neighborhood"],
)
torontoPC

Unnamed: 0,PostalCode,Borough,Neighborhood


We only process the cells that have an assigned borough.
If more than one neighborhood exists in one postal code area, the neighborhoods are combined into one row, separated by commas, as shown.
If a cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough.

In [58]:
# Collect one record per table cell and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the whole frame for every row. Rebuilding the frame from a fresh
# list also makes this cell safe to re-run (no duplicated rows).
postal_code_records = []
for row in beautiful_soup.table.find_all("tr"):
    for postalCodeInf in row.find_all("td"):
        # Skip cells that lack the postal-code (<b>) or description (<span>) tag.
        if postalCodeInf.span is None or postalCodeInf.b is None:
            continue
        cell_text = postalCodeInf.span.text
        # Only postal codes with an assigned borough are kept.
        if cell_text == 'Not assigned':
            continue
        # The cell text has the form "Borough(Neighborhood / Neighborhood)".
        text_parts = cell_text.split('(')
        borough = text_parts[0]
        if len(text_parts) > 1:
            # Turn the "/" separators into commas and drop the closing ")".
            neighborhood = text_parts[1].replace('/', ',')[:-1]
        else:
            # No parenthesised neighborhood list: fall back to the borough name.
            neighborhood = borough
        # A borough with a "Not assigned" neighborhood uses the borough name.
        if neighborhood == 'Not assigned':
            neighborhood = borough
        postal_code_records.append(
            {"PostalCode": postalCodeInf.b.text, "Borough": borough, "Neighborhood": neighborhood}
        )

torontoPC = pd.DataFrame(postal_code_records, columns=["PostalCode", "Borough", "Neighborhood"])

# Some cells run the borough name together with extra notes; map those to clean names.
torontoPC['Borough'] = torontoPC['Borough'].replace({
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
})

Data Frame:

In [59]:
# Show the scraped postal-code table as the cell's output.
torontoPC

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto Business,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


Shape of the data frame (rows, columns):

In [60]:
# (rows, columns) of the frame.
torontoPC.shape

(103, 3)

Now that you have built a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name, in order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood. 

In [None]:
import geocoder  # imported here because this geocoding attempt is optional

# Give up on a postal code after this many tries instead of looping forever.
MAX_GEOCODE_ATTEMPTS = 5

geocoded_rows = []
# for every postal code:
for postal_code in torontoPC["PostalCode"]:
    # Reset for each postal code; otherwise the first successful lookup
    # would be reused for every following code.
    lat_lng_coords = None
    for _ in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        if lat_lng_coords:
            break

    if lat_lng_coords:
        latitude = lat_lng_coords[0]
        longitude = lat_lng_coords[1]
    else:
        # No answer within the attempt budget: leave the coordinates missing.
        latitude = None
        longitude = None

    # Keep one row per postal code (previously only the last lookup survived the loop).
    geocoded_rows.append({"Postal Code": postal_code, "Latitude": latitude, "Longitude": longitude})

# Build the frame once, with the column layout: Postal Code, Latitude, Longitude.
location = pd.DataFrame(geocoded_rows, columns=["Postal Code", "Latitude", "Longitude"])

Important Note: There is a limit on how many times you can call geocoder.google function. It is 2500 times per day. 
Given that this package can be very unreliable, in case you are not able to get the geographical coordinates of the neighborhoods using the Geocoder package, use the "Geospatial Dataset" CSV file, which has the geographical coordinates of each postal code.

In [61]:
# Load the postal-code coordinates from the CSV file.
# read_csv opens and closes the file itself and decodes it as UTF-8 by default,
# so the result does not depend on the platform's default text encoding.
location = pd.read_csv('Geospatial_Coordinates.csv')

In [62]:
# Show the loaded coordinates table as the cell's output.
location

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


Sort the Toronto postal codes by PostalCode so that the rows are in the same order as the location data frame:

In [63]:
# Sort by postal code and renumber the rows from 0 in one chained expression.
torontoPC = (
    torontoPC
    .sort_values(by=['PostalCode'])
    .reset_index(drop=True)  # discard the pre-sort index
)
torontoPC

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village , St. Phillips , Martin Grov..."
101,M9V,Etobicoke,"South Steeles , Silverstone , Humbergate , Jam..."


Add the location of each postal code to torontoPC data frame:

In [64]:
# Join on the postal code itself instead of assigning columns by row position,
# so a coordinate cannot land on the wrong postal code when the two frames
# differ in row order or in which codes they contain.
coordinates = (
    location[['Postal Code', 'Latitude', 'Longitude']]
    .rename(columns={'Postal Code': 'PostalCode'})
)
torontoPC = (
    torontoPC
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')  # lets the cell be re-run
    # how='left' keeps every torontoPC row in its current order;
    # validate raises if a postal code appears more than once in location.
    .merge(coordinates, on='PostalCode', how='left', validate='many_to_one')
)
torontoPC.head(11)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff , Cliffside West",43.692657,-79.264848
