In [1]:
import pandas as pd
import bs4 as bs
import requests

## Reading in the page, and Pulling the Table from the HTML code

In [2]:
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

## Fetching the page; the 5-second timeout keeps the cell from hanging on a dead connection.
page_data = requests.get(wiki_url, timeout = 5)

## requests does not raise on 4xx/5xx responses by itself, so fail here instead of parsing an error page.
page_data.raise_for_status()

page_content = bs.BeautifulSoup(page_data.content, "html.parser")

## find() returns None when nothing matches; stop with a clear message here
## rather than letting a later cell fail with an AttributeError on None.
data_table = page_content.find('table',{'class':'wikitable sortable'})

if data_table is None:
    raise ValueError("No 'wikitable sortable' table found at " + wiki_url)


## Selecting the Variable Names From the Table

In [3]:
## Every header cell (<th>) inside the selected table.
columns = data_table.find_all('th')

In [4]:
## Collecting the raw text of each header cell, in table order.
variable_names = [header_cell.text for header_cell in columns]




In [5]:
## Displaying the scraped header text. Column names will need cleaning:
## the output below shows 'Neighbourhood\n' with a trailing newline.
variable_names

['Postcode', 'Borough', 'Neighbourhood\n']

## Selecting and Cleaning the Rows from the Table

In [6]:
## Accumulator for the text of each row (filled in the next cell).
parsed_rows = []

## Every <tr> of the table, header row included, copied into a plain list.
raw_rows = list(data_table.find_all('tr'))

In [7]:
## Text content of every table row.
## The list is rebuilt from scratch (not appended to) so that re-running this cell
## does not add the same rows a second time.
parsed_rows = [r.text for r in raw_rows]
    
    

In [8]:
## Swapping every newline in the row text for a comma.
## Note: a comma that is already part of a cell's text is indistinguishable from these inserted ones.
no_newline_rows = [','.join(row_text.split('\n')) for row_text in parsed_rows]

In [9]:
## Parsing Text Strings into Rows.
## Each row's text is newline-delimited (one cell per line), so it is split on the newlines directly.
## Splitting a comma-joined copy instead would also break apart any cell whose own text contains
## a comma; for comma-free text both approaches give the same lists.
parsed_into_rows = [row_text.split('\n') for row_text in parsed_rows]

In [10]:
## Inspecting the parsed rows: each one is a list of cell strings with an empty string at both ends.
parsed_into_rows

[['', 'Postcode', 'Borough', 'Neighbourhood', ''],
 ['', 'M1A', 'Not assigned', 'Not assigned', ''],
 ['', 'M2A', 'Not assigned', 'Not assigned', ''],
 ['', 'M3A', 'North York', 'Parkwoods', ''],
 ['', 'M4A', 'North York', 'Victoria Village', ''],
 ['', 'M5A', 'Downtown Toronto', 'Harbourfront', ''],
 ['', 'M5A', 'Downtown Toronto', 'Regent Park', ''],
 ['', 'M6A', 'North York', 'Lawrence Heights', ''],
 ['', 'M6A', 'North York', 'Lawrence Manor', ''],
 ['', 'M7A', "Queen's Park", 'Not assigned', ''],
 ['', 'M8A', 'Not assigned', 'Not assigned', ''],
 ['', 'M9A', 'Etobicoke', 'Islington Avenue', ''],
 ['', 'M1B', 'Scarborough', 'Rouge', ''],
 ['', 'M1B', 'Scarborough', 'Malvern', ''],
 ['', 'M2B', 'Not assigned', 'Not assigned', ''],
 ['', 'M3B', 'North York', 'Don Mills North', ''],
 ['', 'M4B', 'East York', 'Woodbine Gardens', ''],
 ['', 'M4B', 'East York', 'Parkview Hill', ''],
 ['', 'M5B', 'Downtown Toronto', 'Ryerson', ''],
 ['', 'M5B', 'Downtown Toronto', 'Garden District', ''],


In [11]:
## Removing Empty Elements: keep only the non-empty strings of every parsed row.
final_data = [[cell for cell in pr if cell != ""] for pr in parsed_into_rows]

## Formatting into Pandas DataFrame and Cleaning Columns

In [12]:
## The first parsed row is the table header, so it supplies the column names.
column_names = final_data[0]

In [13]:
## Creating one list per table column from the data rows (everything after the header row).
data_rows = final_data[1:]

first_column = [row[0] for row in data_rows]
second_column = [row[1] for row in data_rows]
third_column = [row[2] for row in data_rows]

In [14]:
## Mapping the first three column names to their value lists, in table order.
pd_dict = {
    column_names[position]: values
    for position, values in enumerate((first_column, second_column, third_column))
}

In [15]:
## Building the DataFrame; `columns` fixes the column order to that of the header row.
full_neighborhood_data = pd.DataFrame(data = pd_dict, columns = column_names)

In [16]:
## Placeholder text the table uses for unassigned boroughs and neighbourhoods.
## Written as a literal so the filtering below does not depend on which row
## happens to come first in the scraped table.
bad_value = 'Not assigned'

In [17]:
## Keeping only the rows whose borough differs from the placeholder value.
has_real_borough = full_neighborhood_data['Borough'].ne(bad_value)
data_with_good_buroughs = full_neighborhood_data.loc[has_real_borough, :]

In [18]:
## Previewing the first rows that remain after the borough filter.
data_with_good_buroughs.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


## Aggregating Postcodes with Multiple Neighborhoods

In [19]:
## Counting the rows per postcode; the index of the counts holds every distinct postcode once.
Postcode_counts = data_with_good_buroughs['Postcode'].value_counts()
unique_postcodes = Postcode_counts.index.tolist()

In [20]:
new_rows = [] ## One joined neighbourhood string per entry of unique_postcodes, in the same order

for m in unique_postcodes:

    ## Data for a given post code
    data_this_post_code = data_with_good_buroughs.loc[data_with_good_buroughs.Postcode == m,:]

    ## Borough used as a stand-in for 'Not assigned' neighbourhoods (first row of this post code)
    borough = data_this_post_code['Borough'].iloc[0]

    neighborhoods = [] ## Neighbourhood names to join for this post code
    borough_used = False

    for name in data_this_post_code['Neighbourhood']:

        if name != bad_value:
            neighborhoods.append(name)

        elif not borough_used:
            ## First 'Not assigned' entry: give it the borough name.
            neighborhoods.append(borough)
            borough_used = True

        ## Any further 'Not assigned' entries are dropped, so neither the borough name
        ## nor the placeholder text shows up a second time in the joined string.

    new_rows.append(', '.join(neighborhoods)) ## Joining all neighborhoods together
       
            
                     
        



## Creating Final data

In [21]:
## One row per postcode, paired with its joined neighbourhood string.
postcode_frame_columns = ('Postcode', 'Neighbourhood')

postcodes_and_neighborhoods = pd.DataFrame(
    dict(zip(postcode_frame_columns, (unique_postcodes, new_rows))),
    columns = postcode_frame_columns,
)

## Data With Postcodes and Neighborhoods

In [22]:
## Distinct (Postcode, Borough) pairs taken from the filtered table.
borough_key_columns = ['Postcode', 'Borough']
postcodes_and_boroughs = data_with_good_buroughs[borough_key_columns].drop_duplicates()

## Data with Unique Postcodes and Boroughs

In [23]:
## Attaching the joined neighbourhood strings to each (Postcode, Borough) pair.
final_neighborhood_data = pd.merge(
    left = postcodes_and_boroughs,
    right = postcodes_and_neighborhoods,
    how = 'inner',
    on = 'Postcode',
)

## Inner joining the two previous data sets to obtain the final data set

In [24]:
## Presenting the postcodes that are listed in the assignment submission image for verification.
## The rows will appear in a different order than shown on that image, but they correspond
## to the correct borough and postal code.
neighborhoods_to_look_at = ['M5G','M2H','M4B','M1J','M4G','M4M','M1R','M9V','M9L','M5V','M1B','M5A']

in_verification_sample = final_neighborhood_data['Postcode'].isin(neighborhoods_to_look_at)

final_neighborhood_data.loc[in_verification_sample, :]

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
6,M1B,Scarborough,"Rouge, Malvern"
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
23,M4G,East York,Leaside
24,M5G,Downtown Toronto,Central Bay Street
27,M2H,North York,Hillcrest Village
32,M1J,Scarborough,Scarborough Village
50,M9L,North York,Humber Summit
54,M4M,East Toronto,Studio District
71,M1R,Scarborough,"Maryvale, Wexford"


In [25]:
## Dimensions of our final data set, as (rows, columns).
final_neighborhood_data.shape

(103, 3)