### **Segmenting and Clustering Neighborhoods in Toronto (First Step)**

##### **Import Resources**

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import requests

##### **Part 1: Scrape Wikipedia Page**

In [2]:
#define link
link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#get page from link
page_text= requests.get(link).text
page = bs(page_text,'xml')

##### **Part 2: Extract & Clean Table and Ignore Borough "Not Assigned"**

In [3]:
#find table
table = page.find('table')

Postcode = []
Borough = []
Neighbourhood = []

#clean table & ignore borough with "not assigned"
for tr_cell in table.find_all('tr'):
    
    counter = 1
    Postcode_var = -1
    Borough_var = -1
    Neighbourhood_var = -1
    
    for td_cell in tr_cell.find_all('td'):
        if counter == 1: 
            Postcode_var = td_cell.text
        if counter == 2: 
            Borough_var = td_cell.text
            tag_a_Borough = td_cell.find('a')
        if counter == 3: 
            Neighbourhood_var = str(td_cell.text).strip()
            tag_a_Neighbourhood = td_cell.find('a')
            
        counter +=1
        
    if (Borough_var == 'Not assigned'): 
        continue
    if(Postcode_var == -1 or Borough_var == -1 or Neighbourhood_var == -1):
        continue    
        
    Postcode.append(Postcode_var)
    Borough.append(Borough_var)
    Neighbourhood.append(Neighbourhood_var)

##### **Part 3: Create Unique Postcodes and Merge Neighbourhoods**

In [4]:
#define unique postcodes
unique_p = set(Postcode)

Postcode_u = []
Borough_u = []
Neighbourhood_u = []

#combine multiple neighbourhoods
for postcode_unique_element in unique_p:
    p_var = ''; b_var = ''; n_var = ''; 
    for postcode_idx, postcode_element in enumerate(Postcode):
        if postcode_unique_element == postcode_element:
            p_var = postcode_element;
            b_var = Borough[postcode_idx]
            if n_var == '': 
                n_var = Neighbourhood[postcode_idx]
            else:
                n_var = n_var + ', ' + Neighbourhood[postcode_idx]
    Postcode_u.append(p_var)
    Borough_u.append(b_var)
    Neighbourhood_u.append(n_var)

##### **Part 4: Create Pandas Dataframe and Save CSV for Second Step**

In [7]:
#Pandas Dataframe
toronto = {'Postcode':Postcode_u, 'Borough':Borough_u, 'Neighbourhood':Neighbourhood_u}
df = pd.DataFrame.from_dict(toronto)
df.to_csv('toronto_part1.csv')
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M9W,Etobicoke,Northwest
1,M5S,Downtown Toronto,"Harbord, University of Toronto"
2,M3J,North York,"Northwood Park, York University"
3,M2H,North York,Hillcrest Village
4,M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol..."


In [10]:
df.shape

(103, 3)