# Neighborhoods in Toronto
## Week 3
## Crawling data


In [84]:
import requests
from bs4 import BeautifulSoup
import pandas as pd



In [85]:
# Download the Wikipedia list of Toronto ("M") postal codes and collect the
# raw text of every row in the first table body on the page.
url = r"https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# requests applies no timeout by default; without one a stalled connection
# would block the notebook indefinitely.
page = requests.get(url, timeout=30)
# Stop on HTTP errors (4xx/5xx) instead of silently parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('tbody')
if table is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise ValueError("No <tbody> element found at " + url)
rows = table.select('tr')
# One string per <tr>; cell values are separated by newlines in the row text.
row_data = [r.get_text() for r in rows]

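Build a DataFrame from `row_data`: split each row's text on newlines into the Postcode, Borough and Neighborhood columns, and drop the header row.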

In [86]:
# Target column names for the scraped table.
columns = ['Postcode', 'Borough', 'Neighborhood']

# Each scraped row is a single newline-delimited string; put them in a
# one-column frame and split that column into separate fields.
df_temp = pd.DataFrame(row_data)
split_rows = df_temp[0].str.split('\n', expand=True)

# Discard the split pieces at positions 0 and 4, keeping the three middle
# fields, then label them.
df = split_rows.drop(columns=split_rows.columns[[0, 4]])
df.columns = columns

# The first row is removed (it is the table's header row rather than data,
# judging by the displayed output, which starts at index 1).
df = df.drop(index=df.index[0])
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


Drop the rows whose borough is `Not assigned`.



In [87]:
# Keep only the rows that have a real borough; .copy() gives an independent
# frame rather than a slice of the unfiltered one.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough].copy()
df.head()



Unnamed: 0,Postcode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


Combine the neighborhoods that share the same postal code into a single comma-separated row.

In [88]:
# One row per (Postcode, Borough) pair, with the neighborhoods of that pair
# joined by commas. sort=False keeps the groups in first-appearance order.
df1 = (
    df.groupby(['Postcode', 'Borough'], sort=False)
      .agg(','.join)
      .reset_index()  # turn the group keys back into ordinary columns
)
df1.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned


Where a row has a borough but its neighborhood is `Not assigned`, set the neighborhood to the same value as the borough (in the rows shown below this is M7A, Queen's Park).

In [89]:
# Fill unassigned neighborhoods with the borough of the same row.
# Only the Neighborhood column is touched, and the replacement value comes
# from each row's own Borough instead of a hard-coded "Queen's Park", so any
# borough with an unassigned neighborhood is handled correctly.
neighborhood_assigned = df1['Neighborhood'] != 'Not assigned'
df2 = df1.assign(
    # where() keeps values where the condition is True and takes the
    # corresponding Borough value where it is False.
    Neighborhood=df1['Neighborhood'].where(neighborhood_assigned, df1['Borough'])
)
df2.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


Shape of the cleaned DataFrame, followed by loading the latitude/longitude table for each postal code.

In [90]:
# Only the last expression of a cell is displayed automatically, so the shape
# is printed explicitly; as a bare expression here it would never be shown.
print(df2.shape)

# Latitude/longitude lookup table, one row per postal code.
df_lat_lon = pd.read_csv(r'config/Geospatial_Coordinates.csv')
df_lat_lon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Create a DataFrame with latitude and longitude by merging the coordinates onto the neighborhoods by postal code.

In [91]:
# Give the coordinate table the same key column name as df2 so the two
# frames can be joined on it.
df_lat_lon = df_lat_lon.rename(columns={'Postal Code': 'Postcode'})

# Inner join (the pandas default): only postcodes present in both frames
# are kept.
df3 = df2.merge(df_lat_lon, on='Postcode')

# Save the combined table to CSV.
df3.to_csv(r'data/toronto.csv')
df3.head()


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
