<h1> Assignment - Part 1 </h1>

<b> In the next section we will be scraping data for each of Toronto's neighbourhoods. </b>

In [1]:
#Installing packages
#!pip install bs4
from bs4 import BeautifulSoup 
import requests 
import pandas as pd

<h4>Scraping data from Wikipedia</h4>

In [2]:
# Revision-pinned (oldid) Wikipedia URL; the author chose it because it makes the scraping easier.
url = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1011037969"

# Fail fast: a timeout stops the cell from hanging on a dead connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were the table.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text  # raw HTML of the page

soup = BeautifulSoup(data, 'html5lib')  # parse the HTML
tables = soup.find_all('table')  # every <table> element found on the page



<h4>Extracting scraped data to a data frame</h4>

In [4]:
# Build the frame from a list of records in one go.
# DataFrame.append (used previously) was removed in pandas 2.0 and copied the
# whole frame on every iteration, so growing the frame row-by-row is avoided.
columns = ["Postal Code", "Borough", "Neighbourhood"]
records = []
for row in tables[0].tbody.find_all("tr"):
    cells = row.find_all("td")
    if len(cells) < len(columns):
        continue  # skip rows without enough <td> cells (e.g. a row with no data cells)
    records.append({name: cell.text.strip() for name, cell in zip(columns, cells)})

neighbourhood_data = pd.DataFrame(records, columns=columns)

# Remove rows whose borough is "Not assigned". A boolean mask (instead of an
# in-place drop) keeps the cell idempotent; the original row index is preserved.
neighbourhood_data = neighbourhood_data[neighbourhood_data["Borough"] != "Not assigned"]
neighbourhood_data



Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [5]:
# Sanity check: (rows, columns) of the scraped table after the "Not assigned" boroughs were removed.
neighbourhood_data.shape

(103, 3)

<h1> Assignment - Part 2</h1>

<b> In the next section we will be importing the corresponding coordinate data for each postal code. </b>

In [6]:
#!conda install -c conda-forge geocoder --yes
import geocoder #import package

ModuleNotFoundError: No module named 'geocoder'

In [14]:
# Pull coordinate info for every postal code via the ArcGIS geocoder.
# The lookup is retried a bounded number of times: the previous unbounded
# `while` loop would spin forever if a code could never be resolved.
MAX_GEOCODE_ATTEMPTS = 5

latitude = []
longitude = []
for code in neighbourhood_data['Postal Code']:
    query = '{}, Toronto, Ontario'.format(code)
    latlng = None
    for _attempt in range(MAX_GEOCODE_ATTEMPTS):
        latlng = geocoder.arcgis(query).latlng
        print(code, latlng)  # one line per attempt, so retries are visible in the output
        if latlng is not None:
            break
    if latlng is None:
        # Stop loudly rather than leave the two lists shorter than the frame.
        raise RuntimeError(
            'No coordinates returned for {} after {} attempts'.format(code, MAX_GEOCODE_ATTEMPTS)
        )
    latitude.append(latlng[0])
    longitude.append(latlng[1])

M3A [43.75245000000007, -79.32990999999998]
M4A [43.73057000000006, -79.31305999999995]
M5A [43.65512000000007, -79.36263999999994]
M6A [43.72327000000007, -79.45041999999995]
M7A [43.66253000000006, -79.39187999999996]
M9A [43.662630000000036, -79.52830999999998]
M1B [43.811390000000074, -79.19661999999994]
M3B [43.74923000000007, -79.36185999999998]
M4B [43.70718000000005, -79.31191999999999]
M5B [43.65739000000008, -79.37803999999994]
M6B [43.70687000000004, -79.44811999999996]
M9B [43.65034000000003, -79.55361999999997]
M1C [43.78574000000003, -79.15874999999994]
M3C [43.72168000000005, -79.34351999999996]
M4C [43.68970000000007, -79.30681999999996]
M5C [43.65215000000006, -79.37586999999996]
M6C [43.69211000000007, -79.43035999999995]
M9C [43.64857000000006, -79.57824999999997]
M1E [43.765750000000025, -79.17469999999997]
M4E [43.67709000000008, -79.29546999999997]
M5E [43.64536000000004, -79.37305999999995]
M6E [43.68784000000005, -79.45045999999996]
M1G [43.76812000000007, -79.2

In [7]:
# The coordinates were also pulled through geocoder above, but the values from
# the provided CSV are appended here instead to ensure consistency for grading.

# One row per (postal code, borough); neighbourhood names are merged into a
# single comma-separated string.
grouped_names = neighbourhood_data.groupby(['Postal Code', 'Borough'])['Neighbourhood']
df_postcode = grouped_names.apply(', '.join).reset_index()

# Coordinate lookup table, indexed by postal code so it can be joined on that key.
locgeo_df = pd.read_csv('https://cocl.us/Geospatial_data', index_col='Postal Code')

# Attach the Latitude/Longitude columns to each postal code row.
toronto_data = df_postcode.join(locgeo_df, on='Postal Code')
toronto_data

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


In [22]:
# Inspect the column types of the joined frame (the coordinate columns should be numeric).
toronto_data.dtypes

Postal Code       object
Borough           object
Neighbourhood     object
Latitude         float64
Longitude        float64
dtype: object

In [8]:
# Sanity check: (rows, columns) of the joined frame.
toronto_data.shape

(103, 5)

<h1>Assignment - Part 3</h1>

<b> In the next section we will be performing a cluster analysis to group the neighbourhoods together. </b>

In [10]:
import numpy as np # library to handle data in a vectorized manner

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

In [46]:
# set number of clusters
kclusters = 15

# Keep only the numeric coordinate columns for clustering.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas 2.0 no longer accepts.
toronto_grouped_clustering = toronto_data.drop(columns=['Neighbourhood', 'Borough', 'Postal Code'])

# run k-means clustering
# n_init is pinned to 10 (the historical scikit-learn default) so the result
# does not change with the installed scikit-learn version.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# Store the cluster assigned to each row next to its coordinates.
toronto_grouped_clustering.insert(0, 'Cluster Labels', kmeans.labels_)

# check cluster labels generated for the first rows in the dataframe
# (last expression of the cell, so it is actually displayed)
kmeans.labels_[0:10]

In [47]:
#Toronto coordinates (initial centre of the map view)
latitude = 43.70011
longitude = -79.4163

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one hex colour per cluster label,
# evenly spaced over the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add one circle marker per postal code, coloured by its cluster
for lat, lon, cluster in zip(
    toronto_grouped_clustering['Latitude'],
    toronto_grouped_clustering['Longitude'],
    toronto_grouped_clustering['Cluster Labels'],
):
    # Index the palette with the label itself. The earlier `cluster - 1` only
    # worked because index -1 wraps around to the last colour for cluster 0.
    marker_color = rainbow[int(cluster)]
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [47]:
# NOTE(review): this cell does not look runnable as written. It references
# `nbh_count_colormap` and `nbh_locs_map`, neither of which is defined in the
# visible part of this notebook, and it re-creates `map_clusters`, discarding
# the markers added by the previous cell. Confirm whether it belongs here.

#Toronto coordinates
latitude = 43.70011
longitude = -79.4163

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Per-feature style: the fill colour is looked up from the feature's 'nb'
# property through `nbh_count_colormap` (not defined in the visible notebook).
style_function = lambda x: {
    'fillColor': nbh_count_colormap(x['properties']['nb']),
    'color': 'black',
    'weight': 1.5,
    'fillOpacity': 0.7
}

# NOTE(review): the first argument is the Map object created above, not a
# GeoJSON source; the tooltip fields ('neighbourhood', 'nb', 'QP_str') are not
# produced anywhere in the visible cells -- confirm which data was intended.
folium.GeoJson(
    map_clusters,
    style_function=style_function,
    tooltip=folium.GeoJsonTooltip(
        fields=['neighbourhood', 'nb', 'QP_str'],
        aliases=['Neighbourhood', 'Location amount', 'Quote-part'],
        localize=True
    )
).add_to(map_clusters)

# NOTE(review): the colormap is added to `nbh_locs_map` twice (before and after
# the caption is set), and `nbh_locs_map` is a different name from the
# `map_clusters` map built in this cell.
nbh_count_colormap.add_to(nbh_locs_map)
nbh_count_colormap.caption = 'Airbnb location amount'
nbh_count_colormap.add_to(nbh_locs_map)