# CLUSTERING NEIGHBORHOODS IN TORONTO

## Step 1. Webscraping to Extract Data

**Get the data from online**

In [29]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [30]:
# Source page: Wikipedia's list of Canadian postal codes starting with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Bound the request so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_data = response.text

soup = BeautifulSoup(html_data, 'html5lib')
# Printing the <title> is a quick sanity check that the expected page was fetched.
print(soup.title)

<title>List of postal codes of Canada: M - Wikipedia</title>


**Find which table on the page contains the data I am looking for**

In [31]:
# Gather every <table> element of the parsed page; the count shows how many
# candidate tables have to be checked.
tables = soup.find_all(name='table')
len(tables)

3

In [32]:
# Locate the postal-code table by searching each table's markup for a known code.
# The first table containing the marker is used. If no table contains it we stop
# with a clear error instead of leaving `t_index` undefined (or stale from an
# earlier run of this cell).
t_index = next(
    (index for index, candidate in enumerate(tables) if "M3A" in str(candidate)),
    None,
)
if t_index is None:
    raise ValueError('No table on the page contains the postal code "M3A".')
print(t_index)

0


In [62]:
# Use the index found by the "M3A" search rather than a hard-coded position, so
# the notebook keeps working if the page gains or reorders tables.
table = tables[t_index]

**Extract data from the chosen table and make Dataframe**

In [35]:
# Walk over every <td> data cell of the chosen table and turn each assigned cell
# into one record. The code relies on each cell having a <p> whose first three
# characters are the postal code and a <span> of the form "Borough(Neighborhoods)".
table_contents = []
for td in table.find_all('td'):
    span_text = td.span.text
    if span_text == 'Not assigned':
        continue
    pieces = span_text.split('(')
    table_contents.append({
        'PostalCode': td.p.text[:3],
        'Borough': pieces[0],
        # drop the closing parenthesis and turn " /" separators into commas
        'Neighborhood': pieces[1].strip(')').replace(' /', ','),
    })
df = pd.DataFrame(table_contents)

# List the borough strings longer than 20 characters so they can be inspected
# before the clean-up mapping below is applied.
for borough_name in df['Borough']:
    if len(borough_name) > 20:
        print(borough_name)

# Map the fused borough strings to readable names.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}
df['Borough'] = df['Borough'].replace(borough_fixes)

df

East YorkEast Toronto
MississaugaCanada Post Gateway Processing Centre
Downtown TorontoStn A PO Boxes25 The Esplanade
East TorontoBusiness reply mail Processing Centre969 Eastern


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto Business,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


**Check whether there are any duplicated values in 'PostalCode'**

In [36]:
# Flag every postal code that repeats an earlier row, then tally the flags;
# a result containing only False means every postal code is unique.
is_repeated_code = df['PostalCode'].duplicated()
is_repeated_code.value_counts()

False    103
Name: PostalCode, dtype: int64

**Dataframe shape**

In [37]:
# Summarise the frame: number of distinct borough values and number of rows.
n_boroughs = df['Borough'].nunique(dropna=False)
n_rows = len(df)
print('The dataframe has {} boroughs and {} neighborhoods.'.format(n_boroughs, n_rows))

The dataframe has 15 boroughs and 103 neighborhoods.


## Step 2. Make a Geospatial Dataset

**Use a geocoding library (first geocoder, then geopy) to get the latitude and longitude values**

**- First try**

In [10]:
!git clone https://github.com/DenisCarriere/geocoder
!cd geocoder
!python setup.py install
import geocoder # import geocoder

postal_code = df['PostalCode']
# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

fatal: destination path 'geocoder' already exists and is not an empty directory.
python: can't open file 'setup.py': [Errno 2] No such file or directory


AttributeError: module 'geocoder' has no attribute 'google'

**- The second try**

In [None]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

geolocator = Nominatim(user_agent="ny_explorer")

# Look up each postal code and print its coordinates.
for postal_code in df['PostalCode']:
    location = geolocator.geocode(postal_code)
    if location is None:
        # geocode() returns None when the service finds no match; report it and
        # move on instead of failing on `None.latitude`.
        print('No coordinates found for {}.'.format(postal_code))
        continue
    latitude = location.latitude
    longitude = location.longitude
    print('The geograpical coordinate of {} are {}, {}.'.format(postal_code, latitude, longitude))

### *Given that these geocoding packages can be very unreliable,*
**use the provided CSV file of geospatial coordinates instead**

In [38]:
# Pre-computed latitude/longitude for each postal code, published as a CSV.
url2 = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv"

# Rename the key column so it matches the 'PostalCode' column of the scraped frame.
geo_data = pd.read_csv(url2).rename(columns={'Postal Code': 'PostalCode'})
geo_data.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [39]:
# Index the scraped frame by postal code so it can be joined on that key.
df_m1 = df.set_index(keys='PostalCode')
df_m1.head()

Unnamed: 0_level_0,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Queen's Park,Ontario Provincial Government


In [40]:
# Index the coordinates by postal code as well, giving both frames the same key.
geo_data_m2 = geo_data.set_index(keys='PostalCode')
geo_data_m2.head()

Unnamed: 0_level_0,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [41]:
# Inner join on the shared PostalCode index keeps only codes present in both
# frames; the index is then turned back into an ordinary column.
joined = df_m1.join(geo_data_m2, how="inner").reset_index()
joined.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


**Get all the rows of the data frame whose Borough contains "Toronto"**

In [43]:
# Boolean mask: True where the borough name contains the literal text "Toronto".
is_toronto = joined['Borough'].str.contains('Toronto', regex=False)
is_toronto

0      False
1      False
2       True
3      False
4      False
       ...  
98     False
99      True
100     True
101    False
102    False
Name: Borough, Length: 103, dtype: bool

In [45]:
# Keep only the rows whose borough name contains "Toronto" and renumber them from 0.
in_toronto = joined['Borough'].str.contains('Toronto', regex=False)
toronto_df = joined.loc[in_toronto].reset_index(drop=True)
toronto_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
5,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
6,M6G,Downtown Toronto,Christie,43.669542,-79.422564
7,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
8,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259
9,M4J,East York/East Toronto,The Danforth East,43.685347,-79.338106


## Step 3. Create a map of the neighborhoods and explore them

**Create a map of the neighborhoods**

In [16]:
# !conda install -c conda-forge folium=0.5.0 --yes
import folium 

In [46]:
# downtown toronto latitude longitude: 43.6548° N, 79.3883° W
map_toronto = folium.Map(location=[43.6548, -79.3883], zoom_start=11)

# Add one blue circle per row of toronto_df; its popup reads "Neighborhood,Borough".
for lat, lon, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighborhood']):
    label = '{},{}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # The misspelled `parese_html=False` keyword was dropped: CircleMarker has no
    # such option (HTML parsing is configured on the Popup above).
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

# Display the finished map as the cell's output.
map_toronto

### The map might not be visible on GitHub. Please check out the image file in the repository.

**Use KMeans for the clustering of the neighbourhoods**

In [61]:
from sklearn.cluster import KMeans

# Number of clusters to form.
k = 5

# Cluster on position only; .copy() gives an independent frame so later changes
# to toronto_df do not affect the clustering input.
toronto_clustering = toronto_df.loc[:, ['Latitude', 'Longitude']].copy()
toronto_clustering.head()

Unnamed: 0,Latitude,Longitude
0,43.65426,-79.360636
1,43.657162,-79.378937
2,43.651494,-79.375418
3,43.676357,-79.293031
4,43.644771,-79.373306


In [54]:
# Build the estimator with a fixed seed so the cluster assignment is repeatable,
# then fit it on the latitude/longitude pairs.
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(toronto_clustering)

# Peek at the labels assigned to the first ten rows.
kmeans.labels_[:10]

array([4, 4, 4, 1, 4, 4, 0, 4, 3, 1], dtype=int32)

In [55]:
# Attach the cluster label of every row as the first column of toronto_df.
# DataFrame.insert raises ValueError if the column already exists, so on a re-run
# of this cell the existing column is overwritten instead of inserted again.
if 'Cluster Labels' in toronto_df.columns:
    toronto_df['Cluster Labels'] = kmeans.labels_
else:
    toronto_df.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_df.head()

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,4,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,1,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


**Visualize the cluster in the map**

In [58]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [60]:
# create map
map_toronto_cl = folium.Map(location=[43.6548, -79.3883], zoom_start=11)

# set color scheme for the clusters: k evenly spaced colours from the rainbow
# colormap, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, cluster in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Cluster Labels']):
    label = folium.Popup('Cluster'+str(cluster), parse_html=True)
    # Index the palette directly with the label (the labels shown above start at 0);
    # `cluster-1` only worked because index -1 wraps around to the last colour.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
    ).add_to(map_toronto_cl)

map_toronto_cl

### The map might not be visible on GitHub. Please check out the image file in the repository.