# Segmenting and Clustering Neighborhoods in Toronto

### Importing libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Extracting data from Wikipedia

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
data  = requests.get(url).text

### Parsing data using BeautifulSoup

In [3]:
soup = BeautifulSoup(data, 'html5lib')

In [4]:
type(soup)

bs4.BeautifulSoup

### Creating dataframe according to mentioned attributes

1. The dataframe will consist of three columns: __PostalCode__, __Borough__ and __Neighborhood__
2. Only process the cells that have an assigned borough. Ignore cells with a borough that is __Not assigned__.
3. More than one neighborhood can exist in one postal code area and will be combined into one row with the neighborhoods separated with a comma
4. If a cell has a borough but a Not assigned  neighborhood, then the neighborhood will be the same as the borough.

In [5]:
toronto_data = pd.DataFrame(columns=["PostalCode", "Borough", "Neighborhood"])

for row in soup.find("tbody").find_all("tr"):
    for col in row.find_all("td"):
        nbr = None
        
        spn = col.find("span").text
        if(spn != "Not assigned"):
            pcd = col.find("p").text[:3]
            
            brgh = spn.split('(')[0]
            nbr = (((((spn).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
            
            if(nbr != "Not assigned"):
                nbr = brgh
        
            toronto_data = toronto_data.append({"PostalCode":pcd, "Borough":brgh, "Neighborhood":nbr}, ignore_index=True)

In [6]:
toronto_data['Borough']=toronto_data['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade' : 'Downtown Toronto',
                                        'East TorontoBusiness reply mail Processing Centre969 Eastern' : 'East Toronto',
                                        'EtobicokeNorthwest' : 'Etobicoke',
                                        'East YorkEast Toronto' : 'East York/East Toronto',
                                        'MississaugaCanada Post Gateway Processing Centre' : 'Mississauga'})

### use of .head method to print top 5 rows of your dataframe

In [7]:
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,North York
1,M4A,North York,North York
2,M5A,Downtown Toronto,Downtown Toronto
3,M6A,North York,North York
4,M7A,Queen's Park,Queen's Park


### use of .shape method to print the number of rows of your dataframe

In [8]:
toronto_data.shape

(103, 3)

### Extracting geographical coordinates of each postal code from given csv file

In [9]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Renaming Postal Code to PostalCode for merging both the dataframe

In [10]:
df_latlng.rename(columns = {'Postal Code' : 'PostalCode'}, inplace='True')
df_latlng.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### use of .shape method to print the number of rows of geographical dataframe

In [11]:
df_latlng.shape

(103, 3)

### Merging neighborhood and geographical dataframe into df_toronto

In [12]:
df_toronto = pd.merge(toronto_data, df_latlng, on='PostalCode')
df_toronto.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,North York,43.753259,-79.329656
1,M4A,North York,North York,43.725882,-79.315572
2,M5A,Downtown Toronto,Downtown Toronto,43.65426,-79.360636
3,M6A,North York,North York,43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Etobicoke,43.667856,-79.532242
6,M1B,Scarborough,Scarborough,43.806686,-79.194353
7,M3B,North York,North York,43.745906,-79.352188
8,M4B,East York,East York,43.706397,-79.309937
9,M5B,Downtown Toronto,Downtown Toronto,43.657162,-79.378937


### use of .shape method to print the number of rows of toronto dataframe

In [13]:
df_toronto.shape

(103, 5)