# Neighborhoods Dataframe

Scrapes the Wikipedia page https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M to obtain the table of postal codes, and transforms that table into a pandas DataFrame.

Adds the geographical coordinates of each postal code, loaded from https://cocl.us/Geospatial_data

### Import Libraries

In [1]:
import pandas as pd; pd.set_option('display.max_columns', None)
from urllib.request import urlopen

def BeautifulTablesFromPage(article):
    """Extract every sortable HTML table of a page as a pandas DataFrame.

    Args:
        article: HTML markup of the page, in any form ``BeautifulSoup``
            accepts (typically a ``str``).

    Returns:
        A list with one DataFrame per ``<table>`` carrying the CSS class
        ``sortable``, in document order. The list is empty when the page
        has no such table. Cell texts are whitespace-stripped strings.
        Columns are labelled with the table's ``<th>`` texts when their
        count matches the table width; otherwise pandas' default integer
        labels are kept so that one irregular table does not prevent the
        others from being returned.
    """
    from pandas import DataFrame
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(article, 'html.parser')

    all_tables_content = []
    for table in soup.find_all('table', class_='sortable'):
        table_headings = [th.text.strip() for th in table.find_all('th')]

        # Rows without any <td> (e.g. a header-only row) carry no data.
        table_content = []
        for tr in table.find_all('tr'):
            tds = tr.find_all('td')
            if tds:
                table_content.append([td.text.strip() for td in tds])

        if table_content:
            df = DataFrame(table_content)
            # Assigning a header list of the wrong length would raise, so
            # only label the columns when the counts agree.
            if len(table_headings) == df.shape[1]:
                df.columns = table_headings
        else:
            # Header-only table: keep the headings on an empty frame.
            df = DataFrame(columns=table_headings)

        all_tables_content.append(df)

    # Return after the loop so that every table is collected, not just the
    # first one, and so that a page without tables yields [] instead of None.
    return all_tables_content

### Scraping

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Save the page locally. The download is started before the output file is
# opened so a failed request does not leave a truncated file behind, and the
# file is written as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with urlopen(url) as response, \
        open('List_of_postal_codes_of_Canada:_M', 'w', encoding='utf-8') as art:
    art.write(response.read().decode())

# Load the article back from the local copy, closing the handle afterwards.
with open('List_of_postal_codes_of_Canada:_M', encoding='utf-8') as art:
    article = art.read()

# Parse the sortable tables of the page into DataFrames.
all_tables = BeautifulTablesFromPage(article)

# Work on an independent copy of the first table found on the page.
postal_codes_of_Canada = all_tables[0].copy()

### Data Manipulation

In [4]:
# Number of distinct postal codes; compare it with the number of rows to
# confirm that no postal code is duplicated.
len(postal_codes_of_Canada['Postal Code'].unique())


# Drop the rows whose Borough is 'Not assigned' and renumber the index.
# reset_index returns a fresh frame, so the assignment below acts on the
# frame itself rather than on a slice of the original table.
postal_codes_of_Canada = postal_codes_of_Canada[
    postal_codes_of_Canada['Borough'] != 'Not assigned'
].reset_index(drop=True)

# Where Neighbourhood is 'Not assigned', replace it with the Borough.
# This is done with a single .loc[rows, column] assignment: chained indexing
# such as df.loc[i]['Neighbourhood'] = value may write to a temporary copy
# and leave the DataFrame unchanged.
unassigned = postal_codes_of_Canada['Neighbourhood'] == 'Not assigned'
postal_codes_of_Canada.loc[unassigned, 'Neighbourhood'] = (
    postal_codes_of_Canada.loc[unassigned, 'Borough']
)

### Import Geospatial Coordinate Data

In [8]:
# Load the geospatial CSV from the remote URL into a DataFrame.
# NOTE(review): the merge step expects a 'Postal Code' column in this file;
# the remaining columns appear to be Latitude and Longitude - confirm.
Geospatial_Coordinates = pd.read_csv('https://cocl.us/Geospatial_data')

### Merge Tables

In [10]:
# Join the postal-code table with its coordinates: both frames are indexed by
# 'Postal Code', merged on that key, and the key is then restored as a
# regular column.
target_locations = (
    postal_codes_of_Canada
    .set_index('Postal Code')
    .merge(Geospatial_Coordinates.set_index('Postal Code'), on='Postal Code')
    .reset_index()
)

### Neighborhoods Dataframe

In [11]:
# Preview the first 11 rows of the merged table.
target_locations.head(11)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
