# SEGMENTING NEIGHBORHOODS IN TORONTO

## 1. Importing libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner
from bs4 import BeautifulSoup
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans


print('Libraries imported.')

Libraries imported.


## 2.Importing the Data from Wikipedia, and creating the dataframe that will store the data

In [4]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source,'lxml')

In [5]:
neighborhoods = pd.DataFrame(columns=['PostalCode','Borough','Neighborhood'])

## 3.Storing the data on the dataframe

In [6]:
sopa = soup.findAll('table',class_='wikitable sortable')
i = -1

for div in sopa:
    rows = div.findAll('tr')
    for row in rows:
        if i>-1:
            ro = row.findAll('td')
            j=0
            tds = []
            for r in ro:
                tds.insert(j,r.text.strip())
                j+=1
            neighborhoods.loc[i]=tds
        i+=1

In [7]:
neighborhoods.shape

(288, 3)

In [8]:
neighborhoods.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## 4. Deleting the cells (rows) that are not assigned to a Borough

In [9]:
j=0
tds=[]
for i in neighborhoods.index:
    if neighborhoods.loc[i,'Borough']=="Not assigned":
        tds.insert(j,i)
        j+=1
neighborhoods = neighborhoods.drop(tds)
neighborhoods = neighborhoods.reset_index(drop=True)

In [10]:
neighborhoods.shape

(211, 3)

In [11]:
neighborhoods.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


## 5. If do not have a neighborhood assigned, we'll assign the same borough

In [12]:
for i in neighborhoods.index:
    if neighborhoods.loc[i,'Neighborhood']=="Not assigned":
        neighborhoods.loc[i,'Neighborhood'] = neighborhoods.loc[i,'Borough']

In [13]:
neighborhoods.shape

(211, 3)

In [14]:
neighborhoods.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


## 6. Grouping under a single postal code. Changing principal dataframe (Neighborhoods to Data)

In [15]:
data = neighborhoods.copy()

In [16]:
for i in neighborhoods.index:
    tds = []
    sub = []
    for j in neighborhoods.index:
        n=0
        if neighborhoods.loc[i,'PostalCode']==neighborhoods.loc[j,'PostalCode']:
            tds.insert(n,j)
            sub.insert(n,neighborhoods.loc[j,'Neighborhood'])
            n+=1
    seperator = ', '
    data.loc[i,'Neighborhood'] = seperator.join(sub)

In [17]:
data = data.drop_duplicates(subset='PostalCode', keep='first')

In [18]:
data = data.reset_index(drop=True)

In [22]:
data.shape

(103, 3)

In [23]:
data

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"
