# Lab: Segmenting and Clustering Neighborhoods in Toronto

## Import Required Libraries

In [119]:
import numpy as np 
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json 

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium 

!conda install -c anaconda beautifulsoup4 --yes
from bs4 import BeautifulSoup

import urllib.request

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geopy


The following packages will be UPDATED:

  ca-certificates      anaconda::ca-certificates-2020.1.1-0 --> conda-forge::ca-certificates-2020.4.5.1-hecc5488_0

The following packages will be SUPERSEDED by a higher-priority channel:

  certifi               anaconda::certifi-2020.4.5.1-py36_0 --> conda-forge::certifi-2020.4.5.1-py36h9f0ad1d_0
  openssl               anaconda::openssl-1.1.1g-h7b6447c_0 --> conda-forge::openssl-1.1.1g-h516909a_0


Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - beautifulsoup4


The following packages will be SUPERSEDED by a higher

## Scrape Data from Wikipedia

### Pass Wikipedia page to BeautifulSoup:

In [120]:
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, "html")


### Pull table from Wikipedia page:

In [121]:
#finds 'wikitable sortable' table items in url
my_table=soup.find('table', class_='wikitable sortable')
my_table

#finds all rows of table
rows=my_table.findAll('tr')
rows

#create empty arrays to store data
A=[]
B=[]
C=[]

#loop through each row in table, and populate empty arrays with data
for row in rows:
    cells=row.findAll('td')
    if len(cells)==3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))

rows[0:5]


[<tr>
 <th>Postal Code
 </th>
 <th>Borough
 </th>
 <th>Neighborhood
 </th></tr>,
 <tr>
 <td>M1A
 </td>
 <td>Not assigned
 </td>
 <td>
 </td></tr>,
 <tr>
 <td>M2A
 </td>
 <td>Not assigned
 </td>
 <td>
 </td></tr>,
 <tr>
 <td>M3A
 </td>
 <td>North York
 </td>
 <td>Parkwoods
 </td></tr>,
 <tr>
 <td>M4A
 </td>
 <td>North York
 </td>
 <td>Victoria Village
 </td></tr>]

## Convert arrays to pandas dataframe:

In [122]:
df=pd.DataFrame(A)
df['Borough']=B
df['Neighborhood']=C
df.columns=['PostalCode','Borough','Neighborhood']

#remove '\n' from all entries
df['PostalCode']=df['PostalCode'].str.replace(r'\n', '')
df['Borough']=df['Borough'].str.replace(r'\n', '')
df['Neighborhood']=df['Neighborhood'].str.replace(r'\n', '')

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


### Ignore cells with a borough that is unassigned:

In [123]:
df=df[df['Borough'] != 'Not assigned'].reset_index(drop=True)
    #also resets index

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


### Check for duplicate postal code listings:

In [124]:
codes=df['PostalCode'].unique()
a=len(codes)
b=df.shape[0]
if a-b != 0:
    print('there are duplicate postal codes')
else:
    print('no duplicates')

no duplicates


#### Show any entries with multiple neighborhoods have neighborhoods delimited by commas:

In [125]:
df['Neighborhood']=df['Neighborhood'].str.replace(r' /', ',')
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### For any cells with assigned Boroughs, but unassigned Neighborhoods, make the Neighborhood the same as the Borough:

In [164]:
#check if any blanks exist
len(df['Borough'])

for i in range(0,len(df['Borough'])):
    if pd.isna(df.iloc[i,2]) == False:
        df.iloc[i,2]=df.iloc[i,1]
 
df.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,North York
1,M4A,North York,North York
2,M5A,Downtown Toronto,Downtown Toronto
3,M6A,North York,North York
4,M7A,Downtown Toronto,Downtown Toronto


In [165]:
#print number of rows in dataframe
df.shape[0]

103