# Tornoto Neighborhood Scrape

In [58]:
# html_doc = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"


In [1]:
# import pandas library to work with dataframe
import pandas as pd
import numpy as np


In [2]:
# installing beautifulsoup4 package for scraping wikipedia page
# also install proper needed parsers to work with beautifulsoup4

!pip install BeautifulSoup4
!pip install lxml
!pip install html5lib



In [3]:
import bs4
import requests # library to handle requests
import json # library to handle JSON files
from bs4 import BeautifulSoup

In [4]:
# URL on wikipedia to scrap, thus we can extract postal codes, Borough and Neighbrhood 
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html_doc = requests.get(url).text

In [5]:
soup = BeautifulSoup(html_doc, 'html.parser')

In [6]:
# extract the postal code table
postal_data = []
table = soup.find('table', attrs={'class':'wikitable sortable'})
table_body = table.find('tbody')

rows = table_body.find_all('tr')
for row in rows:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    postal_data.append([ele for ele in cols if ele]) # Get rid of empty values

# the resulted is a list, so let us convert it to dataframe    
df = pd.DataFrame(data=postal_data)

# assign proper headers to columns
df.columns = ['PostalCode', 'Borough', 'Neighborhood']

# Ignore cells with a borough that is Not assigned. 
df_new = df [df.Borough !=  "Not assigned"]

# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough
for index, row in df_new.iterrows():
    if row["Neighborhood"] == "Not assigned" :
        row["Neighborhood"] = row["Borough"]

# let us sort dataframe by column 'PostalCode', thus we can handle the next process
df_new.sort_values(by=['PostalCode'])
# delete first row conducted by parser as empty
df_new = df_new.iloc[1:]


# More than one neighborhood can exist in one postal code area
# let us comine these neighbors into one row
df_unique = df_new.PostalCode.unique()
dictOfPostal = dict.fromkeys(df_unique , '')


# read the latitude and longtude from csv file
geocsv = pd.read_csv("Geospatial_Coordinates.csv")

# create dictioanry for latitude property
dic_latitude = {}
geo_latitude = geocsv.iloc [:, 0:2]
for index, row in geo_latitude.iterrows():
    dic_latitude[row['Postal_Code']] = row['Latitude']

# create dictioanry for longtitude property
dic_longitude = {}
geo_longitude = geocsv.iloc [:, [0,2]]
for index, row in geo_longitude.iterrows():
    dic_longitude[row['Postal_Code']] = row['Longitude']

# the main concatenation of joining latitude & longtide from dictionaris with other properties from dataframe
old_postal_code = ''
old_neighborhood = ''
new_neighborhood = ''
for index, row in df_new.iterrows():
    
    if old_postal_code != row['PostalCode'] :
        old_postal_code = old_postal_code
        dictOfPostal[row['PostalCode']] = [ row['Borough'],  row['Neighborhood'], dic_latitude[row['PostalCode']], dic_longitude[row['PostalCode']] ]
    elif old_postal_code == row['PostalCode'] and old_postal_code != '':
        new_neighborhood = old_neighborhood + ', ' + row['Neighborhood']
        dictOfPostal[old_postal_code] = [ row['Borough'],  new_neighborhood, dic_latitude[old_postal_code], dic_longitude[old_postal_code] ]
    old_postal_code = row['PostalCode']
    old_neighborhood = row['Neighborhood']

print ('Our dataframe now is ' + str (df_new.PostalCode.unique().shape[0]) + " rows")

# finally assign each key to its values
dic_values_list = list(dictOfPostal.values())
dic_keys_list = list(dictOfPostal.keys())
dic_list = []
for i in range (0, len(dic_keys_list)):
    dic_list.append( ( dic_keys_list[i], dic_values_list[i][0], dic_values_list[i][1], dic_values_list[i][2], dic_values_list[i][3] ))
    
    
# here we are convert list to dataframe and give it proper headers
df_results = pd.DataFrame(dic_list)
df_results.columns = ['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']

#print results 
df_results






Our dataframe now is 103 rows


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.654260,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
