# Segmenting and Clustering Neighbourhoods in the city of Toronto

##### Installing beautifulsoup4, lxml, geocoder, html5lib, folium

In [64]:
#!conda install -c conda-forge beautifulsoup4 --yes

In [65]:
#!conda install -c conda-forge lxml --yes

In [66]:
#!conda install -c conda-forge geocoder --yes

In [67]:
#!conda install -c conda-forge html5lib --yes

In [68]:
#!conda install -c conda-forge folium=0.5.0 --yes 

In [1]:
#importing the required libraries
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
# Show every column and every row when a DataFrame is displayed instead of truncating the output
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# NOTE(review): requests is already imported above, so this second import is redundant
import requests # library to handle requests
# NOTE(review): newer pandas releases expose this function as pandas.json_normalize; confirm the import path against the installed version
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

##### Step 1: Using requests and BeautifulSoup, scrape and extract the required information from the webpage: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [2]:
# Download the Wikipedia page listing the Toronto ("M") postal codes.
# The timeout keeps the cell from hanging on a stalled connection, and raise_for_status()
# stops the notebook on an HTTP error instead of handing an error page to the parser.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

In [3]:
# Parse the downloaded HTML with the lxml parser so the page can be searched by tag
soup = BeautifulSoup(source, features='lxml')

In [4]:
# Locate the first <table> on the page, which holds the postal code listing.
postcode_table = soup.find('table')
if postcode_table is None:
    raise ValueError('No <table> element found in the downloaded page')

# Work inside <tbody> when the markup has one; otherwise fall back to the table itself.
toronto_pc = postcode_table.find('tbody')
if toronto_pc is None:
    toronto_pc = postcode_table

rows = toronto_pc.find_all('tr')        # every row of the table, header row included
col_name = toronto_pc.find_all('th')    # header cells hold the column names

# The first entry is the header; every following entry is one data row.
toronto_pclist = [[col.text.strip() for col in col_name]]

for row in rows:
    cols = [entry.text.strip() for entry in row.find_all('td')]
    # Rows without <td> cells (the header row) carry no data and are skipped.
    # Empty cells are kept in place so that a blank value cannot shift the
    # remaining values of that row into the wrong column.
    if cols:
        toronto_pclist.append(cols)


In [5]:
# Sanity check: show the container type that holds the scraped rows
type(toronto_pclist)

list

### Required dataframe

In [6]:
# Turn the scraped rows into a frame, then promote its first row to column labels
raw_table = pd.DataFrame(toronto_pclist)
toronto_df = raw_table.iloc[1:].copy()    # every row below the header row
toronto_df.columns = raw_table.iloc[0]    # the header row supplies the column names
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


#### Cleaning Stage
#### 1. Drop all the rows with 'Not assigned' Boroughs

In [7]:
# Keep only the rows whose Borough is known and renumber the index from zero
has_borough = toronto_df['Borough'].ne('Not assigned')
toronto_df = toronto_df.loc[has_borough].reset_index(drop=True)
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### 2. Combine the neighbourhoods with the same postal code

In [8]:
# One row per (Postcode, Borough) pair; the remaining text values of each group are comma-joined
by_postcode = toronto_df.groupby(['Postcode', 'Borough'], as_index=False)
toronto_df = by_postcode.agg(', '.join)
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### 3. Replace the 'Not assigned' neighbourhoods with the name of the corresponding Borough

In [9]:
# Select the Neighbourhood entries that still hold the 'Not assigned' placeholder
unassigned_mask = toronto_df['Neighbourhood'].eq('Not assigned')
not_assigned_neigh = toronto_df.loc[unassigned_mask, 'Neighbourhood']
# Printing the Series shows the index label of every such row
print('Index ', not_assigned_neigh + ' neighbourhood')


Index  85    Not assigned neighbourhood
Name: Neighbourhood, dtype: object


In [11]:
# Show every row whose neighbourhood is 'Not assigned' (the 'Queen's Park' Borough in this scrape).
# The row labels come from not_assigned_neigh instead of a hard-coded 85, which is only
# valid for one particular snapshot of the scraped page.
for row_label in not_assigned_neigh.index:
    print(toronto_df.loc[row_label])

0
Postcode                  M7A
Borough          Queen's Park
Neighbourhood    Not assigned
Name: 85, dtype: object


In [12]:
# Where the neighbourhood is the 'Not assigned' placeholder, copy the Borough name into it
unassigned_mask = toronto_df['Neighbourhood'].eq('Not assigned')
toronto_df.loc[unassigned_mask, 'Neighbourhood'] = toronto_df.loc[unassigned_mask, 'Borough']
toronto_df.head()


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [13]:
# Re-check the rows that previously had no neighbourhood (the 'Queen's Park' Borough in this scrape).
# not_assigned_neigh still holds the row labels collected before the replacement, so no
# hard-coded label such as 85 is needed.
for row_label in not_assigned_neigh.index:
    print(toronto_df.loc[row_label])

0
Postcode                  M7A
Borough          Queen's Park
Neighbourhood    Queen's Park
Name: 85, dtype: object


#### Number of rows in the processed dataframe 

In [14]:
# Report how many rows remain in the frame after the cleaning steps
row_count = len(toronto_df)
print('Number of rows in the dataframe after cleaning:', row_count)

Number of rows in the dataframe after cleaning: 103
