# Week 3 Project: Segmenting and Clustering Neighborhoods in the City of Toronto

## Task: Transform the data in the table on the Wikipedia page into the above pandas dataframe

#### Import required libraries

In [7]:
import pandas as pd

from bs4 import BeautifulSoup
import requests

#### Download Wikipedia webpage

In [8]:
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
wiki_data = BeautifulSoup(requests.get(wiki_url).text, 'lxml')

# print wiki_data
# print(wiki_data)

#### Scraping data and transform into dataframe

In [9]:
# create dataframe
toronto_df = pd.DataFrame(columns = ['Postalcode','Borough','Neighborhood'])

# Locate table in Wiki page
wiki_table = wiki_data.find('div', class_='mw-parser-output').table.tbody

# Initialize
postcode = 999
borough = 999
neighborhood = 999

# Loop to find postcode, borough, neighborhood 
for tr in wiki_table.find_all('tr'):
    i = 0
    for td in tr.find_all('td'):
        if i == 0:
            postcode = td.text
            i = i + 1
        elif i == 1:
            borough = td.text
            i = i + 1
        elif i == 2:
            neighborhood = td.text.strip('\n')
    toronto_df = toronto_df.append({'Postalcode': postcode,'Borough': borough,'Neighborhood': neighborhood},ignore_index=True)

toronto_df.head(15)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,999,999,999
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned


In [10]:
# Ignore cells with a borough that is not assigned
toronto_df = toronto_df[toronto_df.Borough!=999]
toronto_df = toronto_df[toronto_df.Borough!='Not assigned']
toronto_df.reset_index(drop = True, inplace = True)

# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough
i = 0
for i in range(0,toronto_df.shape[0]):
    if toronto_df.iloc[i][2] == 'Not assigned':
        toronto_df.iloc[i][2] = toronto_df.iloc[i][1]

# Combine neighborhoods that belong to the same borough into one row with a comma       
toronto_df = toronto_df.groupby(['Postalcode','Borough'])['Neighborhood'].apply(', '.join).reset_index()


toronto_df.head(15)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [13]:
# Save dataframe to csv for later use
toronto_df.to_csv('toronto_df.csv',index=False)

In [14]:
# Use the .shape method to print the number of rows of dataframe
toronto_df.shape

(103, 3)