## Segmenting and Clustering Neighborhoods in Toronto ##

### Part 1 - Applied Data Science Capstone Project ###

In this part we scrape a table from Wikipedia containing the PostalCode, Borough, and Neighborhood information for Toronto.

In [1]:
# Import main libraries for this project.

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Wikipedia page and parse its HTML into a BeautifulSoup object.

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as the article.
response.raise_for_status()
extracting_data = response.text
soup = BeautifulSoup(extracting_data, "lxml")


In [3]:
# Now extract the table and process its information.
#
# Each <td> of the first table on the page is expected to carry the postal
# code in a <p> tag and a "Borough(Neighborhood / Neighborhood ...)" string
# in a <span> tag. Cells whose span reads "Not assigned" are skipped.

table_contents = []
table = soup.find("table")
if table is None:
    raise ValueError("No <table> element found on the page; its layout may have changed.")

for row in table.find_all("td"):
    # Skip cells without the expected <p>/<span> markup rather than failing
    # with AttributeError on None.
    if row.p is None or row.span is None:
        continue

    span_text = row.span.text
    if span_text == "Not assigned":
        continue

    # Text before the first "(" is the borough; the text between the first
    # and second "(" holds the " /"-separated neighborhoods.
    parts = span_text.split("(")
    borough = parts[0]
    if len(parts) > 1:
        neighborhood = parts[1].strip(")").replace(" /", ",").replace(")", " ").strip(" ")
    else:
        # No parenthesised list: fall back to the borough name instead of
        # raising IndexError.
        neighborhood = borough.strip()

    cell = {
        "PostalCode": row.p.text[:3],  # first three characters of the <p> text
        "Borough": borough,
        "Neighborhood": neighborhood,
    }
    table_contents.append(cell)

In [4]:
# Build a DataFrame from the scraped rows and replace the Borough labels
# listed below with their cleaned-up equivalents.

df = pd.DataFrame(table_contents).assign(
    Borough=lambda frame: frame["Borough"].replace(
        {
            "Downtown TorontoStn A PO Boxes25 The Esplanade": "Downtown Toronto Stn A",
            "East TorontoBusiness reply mail Processing Centre969 Eastern": "East Toronto Business",
            "EtobicokeNorthwest": "Etobicoke Northwest",
            "East YorkEast Toronto": "East York/East Toronto",
            "MississaugaCanada Post Gateway Processing Centre": "Mississauga",
        }
    )
)

In [5]:
# Drop any rows whose Borough is "Not assigned", then collapse the rows that
# share a PostalCode/Borough pair into one row with a comma-separated
# Neighborhood string.

df = (
    df.loc[df["Borough"].ne("Not assigned")]
    .groupby(["PostalCode", "Borough"])["Neighborhood"]
    .apply(", ".join)
    .reset_index()
)


In [6]:
# Display the DataFrame as the cell's output (the rendering of long frames
# is truncated by pandas, hence the "..." row).

df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [7]:
# Print the shape of the DataFrame as a (rows, columns) tuple.

print(df.shape)

(103, 3)
