# Segmenting and Clustering Neighborhoods in Toronto

### Importing libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Extracting data from Wikipedia

In [2]:
# Download the raw HTML of the Wikipedia page that lists Toronto ("M") postal codes.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error instead of
# letting an error page be parsed as if it were the postal-code table.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

### Parsing data using BeautifulSoup

In [3]:
# Parse the downloaded HTML into a navigable tree using the html5lib parser.
soup = BeautifulSoup(markup=data, features="html5lib")

In [4]:
# Sanity check: display the class of the parsed document object.
type(soup)

bs4.BeautifulSoup

### Creating the dataframe according to the following requirements

1. The dataframe will consist of three columns: __PostalCode__, __Borough__ and __Neighborhood__.
2. Only process the cells that have an assigned borough. Ignore cells with a borough that is __Not assigned__.
3. More than one neighborhood can exist in one postal code area; such neighborhoods will be combined into one row, separated by commas.
4. If a cell has a borough but a __Not assigned__ neighborhood, then the neighborhood will be the same as the borough.

In [5]:
# Build the postal-code table from the first <tbody> found in the page.
#
# Each <td> cell is expected to contain a <p> whose text starts with the
# three-character postal code, and a <span> whose text has the form
# "Borough(Neighborhood / Neighborhood ...)" or just "Not assigned".
#
# Rows are collected in a plain list and converted to a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0, and appending row by row
# copies the whole frame on every iteration.
records = []

for row in soup.find("tbody").find_all("tr"):
    for col in row.find_all("td"):
        span = col.find("span")
        para = col.find("p")
        # Skip cells without the expected markup rather than failing with
        # AttributeError on None.
        if span is None or para is None:
            continue

        spn = span.text
        # Requirement 2: ignore cells whose borough is not assigned.
        if spn == "Not assigned":
            continue

        # The postal code is the first three characters of the paragraph text.
        pcd = para.text[:3]

        # Everything before the first "(" is the borough; the segment after it
        # holds the neighborhoods, separated by " /" in the page markup.
        parts = spn.split('(')
        brgh = parts[0]
        if len(parts) > 1:
            nbr = parts[1].strip(')').replace(' /', ',').replace(')', ' ').strip(' ')
        else:
            # No parenthesised neighborhood at all: treat it as not assigned.
            nbr = "Not assigned"

        # Requirement 4: an unassigned neighborhood falls back to the borough.
        if nbr == "Not assigned":
            nbr = brgh

        records.append({"PostalCode": pcd, "Borough": brgh, "Neighborhood": nbr})

# Passing `columns` fixes the column order and still yields the three expected
# (empty) columns when no record was collected.
toronto_data = pd.DataFrame(records, columns=["PostalCode", "Borough", "Neighborhood"])

In [6]:
# Some borough strings come out of the scrape fused with extra text from the
# same cell; map each of those exact strings to its clean borough name.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto',
    'EtobicokeNorthwest': 'Etobicoke',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}

# Values that are not keys of the mapping are left unchanged.
toronto_data['Borough'] = toronto_data['Borough'].replace(to_replace=borough_fixes)

### Use the `.head()` method to display the first 5 rows of the dataframe

In [7]:
# Preview the first five rows of the cleaned table.
toronto_data.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


### Use the `.shape` attribute to show the number of rows (and columns) of the dataframe

In [8]:
# Display the dataframe's dimensions as a (rows, columns) tuple.
toronto_data.shape

(103, 3)