## 1. Scraping and Data Arrangement - Toronto Project

### 1.1 Scraping from Wikipedia

In [29]:
#importing base libraries 
#to get data
import requests

#for scraping
from bs4 import BeautifulSoup

#base python libraries
import pandas as pd
import numpy as np

In [30]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error response instead of
# letting an error page be parsed as if it were the article.
# NOTE: despite its name, `wikipedia_url` ends up holding the page's HTML text
# (a str), not the URL; the name is kept because the parsing cell reads it.
wikipedia_response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
wikipedia_response.raise_for_status()
wikipedia_url = wikipedia_response.text

In [31]:
# Parse the downloaded HTML string into a navigable BeautifulSoup tree
# using the 'html.parser' backend.
canada_data = BeautifulSoup(markup=wikipedia_url, features='html.parser')

In [32]:
# Locate the first <table> carrying the 'wikitable' CSS class, then gather
# every <tr> row element found inside it.
info_table = canada_data.find('table', attrs={'class': 'wikitable'})
rows_selected = info_table.find_all(name='tr')

In [33]:
# Build one list of cell strings ('Postcode', 'Borough', 'Neighbourhood') per
# table row. Splitting a row's text on newlines leaves an empty string at each
# end, so the first and last pieces are sliced off.
canada_info = [table_row.text.split('\n')[1:-1] for table_row in rows_selected]

canada_info[:]

[['Postcode', 'Borough', 'Neighbourhood'],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', 'Downtown Toronto', "Queen's Park"],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', "Queen's Park", 'Not assigned'],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', 'Not assigned'],
 ['M8B', 'Not assigned', 'Not assigned'],
 ['M9B', 'Etobicoke', 'Cloverdale'],
 ['M9B', 'Etobicoke', 'Islington'],
 ['M9B',

In [34]:
# Lift pandas' display limits so frames are rendered without truncating
# columns or rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [35]:
# Turn the scraped table (a list of lists) into a DataFrame.
# The first scraped row is the header; its last label is overwritten with the
# 'Neighborhood' spelling that the later cells use as the column name.
header_row, *data_rows = canada_info
header_row[-1] = 'Neighborhood'  # header_row is the same list object as canada_info[0]
canada_df = pd.DataFrame(data_rows, columns=header_row)

canada_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### 1.2. Data Arrangement 

In [36]:
# Row labels where the borough / the neighborhood holds the placeholder
# string 'Not assigned'.
not_assigned_boroughs = canada_df.index[canada_df['Borough'] == 'Not assigned']
not_assigned_neighborhoods = canada_df.index[canada_df['Neighborhood'] == 'Not assigned']

# Rows where BOTH fields are unassigned. Index.intersection() is the explicit
# set operation; `index_a & index_b` is no longer a set intersection in current
# pandas (it is applied element-wise and fails on indexes of different length).
not_assigned_neighborhoods_and_borough = not_assigned_boroughs.intersection(not_assigned_neighborhoods)

print('initial rows and columns:', canada_df.shape)
print('boroughs miss value:', not_assigned_boroughs.shape[0])
print('neighborhoods miss value:', not_assigned_neighborhoods.shape[0])
print('boroughs and neighborhoods miss value:', not_assigned_neighborhoods_and_borough.shape[0])

initial rows and columns: (287, 3)
boroughs miss value: 77
neighborhoods miss value: 78
boroughs and neighborhoods miss value: 77


In [37]:
# Drop the rows whose borough is 'Not assigned' and renumber the index from 0.
# The filter is rebuilt from the column itself rather than from row numbers
# saved by an earlier cell: those numbers go stale once the index has been
# reset, so re-running the cell would have removed valid rows. Written this
# way the cell is idempotent.
canada_df = canada_df[canada_df['Borough'] != 'Not assigned'].reset_index(drop=True)
canada_df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Not assigned
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [38]:
# If a row has a borough but a 'Not assigned' neighborhood, the neighborhood
# takes the borough's name.

# Recomputed here because the DataFrame's index was reset in the previous cell.
not_assigned_neighborhoods = canada_df.index[canada_df['Neighborhood'] == 'Not assigned']

# A single label-based .loc assignment replaces the per-row chained assignment
# (canada_df['Neighborhood'][j] = ...), which triggers SettingWithCopyWarning
# and is not guaranteed to write back into canada_df.
canada_df.loc[not_assigned_neighborhoods, 'Neighborhood'] = canada_df.loc[not_assigned_neighborhoods, 'Borough']

canada_df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


Combine the rows that share the same postal code into a single row, joining their neighborhoods with commas.

In [43]:
# Collapse rows that share a postcode into one row per postcode.
group = canada_df.groupby('Postcode')

# Neighborhoods of the same postcode are concatenated into one
# comma-separated string.
grouped_neighborhoods = group['Neighborhood'].agg(', '.join)

# Take the first borough listed for each postcode. set(x).pop() returned an
# arbitrary element, so the result could differ between runs if a postcode
# were ever listed under more than one borough.
grouped_boroughs = group['Borough'].first()

# Both Series are indexed by Postcode, so concat lines them up by label;
# reset_index() turns that index back into a regular 'Postcode' column,
# giving the column order Postcode, Borough, Neighborhood.
grouped_df = pd.concat([grouped_boroughs, grouped_neighborhoods], axis=1).reset_index()

grouped_df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


Shape verification:

In [47]:
# Display the (rows, columns) tuple of the grouped frame as a sanity check.
grouped_df.shape

(103, 3)

***

## 2. Reading the geographical coordinates of each postal code from a CSV file

In [46]:
# Load the table of coordinates (one 'Postal Code' row with its Latitude and
# Longitude) straight from the hosted CSV.
geospatial_csv_url = 'https://cocl.us/Geospatial_data'
cor_df = pd.read_csv(geospatial_csv_url)
cor_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [51]:
# Attach the coordinate columns to every grouped row by looking its
# 'Postcode' up in the coordinates table, which is keyed by 'Postal Code'.
coordinates_by_postcode = cor_df.set_index('Postal Code')
canada_df2 = grouped_df.join(coordinates_by_postcode, on='Postcode')
canada_df2.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
