<h2>Segmenting and Clustering Neighborhood Notebook</h2>

In [2]:
import pandas as pd
import numpy as np
import requests

In [3]:
#Beautifulsoup to scrape web data
from bs4 import BeautifulSoup

In [4]:
source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

In [5]:
soup = BeautifulSoup(source.text, 'lxml')

In [6]:
#using soup object, iterate the .wikitable to get the data from the HTML page and store it into a list
data = []
columns = []
table = soup.find(class_='wikitable')

In [8]:
for index, tr in enumerate(table.find_all('tr')):
        section = []
        for td in tr.find_all(['th','td']):
            section.append(td.text.rstrip())
        
        #First row of data is the header\n,
        if (index == 0):
            columns = section
        else:
            data.append(section)

<h2>Create Dataframe</h2>

In [9]:
canada_df = pd.DataFrame(data = data,columns = columns)
canada_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


<h2>Data Cleanup </h2>

<li>Remove Boroughs that are 'Not assigned'</li>

In [10]:
indexNames = canada_df[canada_df['Borough'] == 'Not assigned' ].index
canada_df.drop(indexNames , inplace=True)
canada_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


<li>Group by neighbourhood & remove duplicate</li>

In [28]:
#remove duplicates
    canada_df.drop_duplicates()
    canada_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [12]:
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough
    canada_df['Neighbourhood'].replace("Not assigned", canada_df["Borough"],inplace=True)
    canada_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


<li>Shape of Dataframe </li>

In [29]:
canada_df.shape

(420, 3)

<h2>New Dataframe with Latitude & Longitude</h2.

In [30]:
df_coord = pd.read_csv("http://cocl.us/Geospatial_data")
df_coord.head(12)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [31]:
df_coord.rename(columns={'Postal Code':'Postcode'}, inplace=True)

In [32]:
#join two dataframe together on Postcode
  result = pd.merge(canada_df, df_coord, how='inner', on = 'Postcode')

In [33]:
result.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M3A,North York,Parkwoods,43.753259,-79.329656
2,M4A,North York,Victoria Village,43.725882,-79.315572
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636


<h2>Explore and cluster the neighborhoods in Toronto</h2>

In [21]:
!conda install -c conda-forge geopy

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    certifi-2019.11.28         |   py36h9f0ad1d_1         149 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1

In [17]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [18]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [19]:
# Matplotlib and associated plotting modules
    import matplotlib.cm as cm
    import matplotlib.colors as colors
    from sklearn.cluster import KMeans

In [23]:
!conda install -c conda-forge folium=0.5.0
import folium

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    branca-0.4.0               |             py_0          26 KB  conda-forge
    altair-4.0.1               |             py_0         575 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         673 KB

The following NEW packages will be INSTALLED:

    altair:  4.0.1-py_0 conda-forge
    branca:  0.4.0-py_0 conda-forge
    folium:  0.5.0-py_0 conda-forge
    vincent: 0.4.4-py_1 conda-forge


Downloading and Extracting Packages
vincent-0.4.4        | 28 KB     | #####

In [35]:
# Step_1 examine the resulting dataframe
    neighborhoods = result[['Borough','Neighbourhood','Latitude','Longitude']]
    

In [36]:
print('The dataframe has {} boroughs and {} neighbourhood.'.format(len(neighborhoods['Borough'].unique()), neighborhoods.shape[0]))

The dataframe has 10 boroughs and 420 neighbourhood.


In [38]:
 # Step_2 Use geopy library to get the latitude and longitude values of Toronto
    address = 'Toronto, ON, Canada'
    geolocator = Nominatim(user_agent="to_explorer") #why it's to_explorer?
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    print('The geograpical coordinate of Toronto, ON are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, ON are 43.653963, -79.387207.


In [39]:
# Step_3 create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10.4)


In [42]:
# add markers to map
for lat, lng, borough, neighbourhood in zip(neighborhoods['Latitude'],neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighbourhood']):
        label = '{}, {}'.format(neighbourhood, borough)
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=3,
            popup=label,
            color='#E42222',
            fill=True,
            fill_color='#CB9D5B',
            fill_opacity=0.7,
            parse_html=False).add_to(map_toronto)  
        map_toronto

In [43]:
map_toronto