## Objective 1: Scrape Wikipedia Into Dataframe

### Building code to scrape web content

In [1]:
import pandas as pd

# Wikipedia page listing the Toronto ("M") postal codes. pd.read_html returns
# a list with one DataFrame per HTML table found on the page.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Seed frame carrying the target column names.
test_df = pd.DataFrame(columns=['PostalCode', 'Borough', 'Neighborhood'])

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat stacks the seed frame and every scraped table the same way
# append did, so the resulting columns are unchanged.
test_df = pd.concat([test_df, *pd.read_html(io=WIKI_URL)])

### Clean up the scraped content and reshape it into the expected format

In [2]:
# Cleaning
# Drop the integer-named columns (0-17) and the placeholder 'PostalCode'
# column, then discard rows that are entirely NaN. test_df itself is not
# modified, so this cell can be re-run without a KeyError.
junk_columns = list(range(18)) + ['PostalCode']
postal_df = test_df.drop(columns=junk_columns).dropna(how='all')

# Drop all not assigned boroughs, then rename and reorder columns.
# Building df through a chain ending in reset_index yields a fresh frame with
# a clean 0..n-1 index, which avoids SettingWithCopyWarning on later writes.
df = (
    postal_df[postal_df['Borough'] != 'Not assigned']
    .rename(columns={'Postal Code': 'PostalCode'})
    [['PostalCode', 'Borough', 'Neighborhood']]
    .reset_index(drop=True)
)

# Reassign all not assigned neighborhoods to their borough name.
# (A bare df.replace(...) returns a new frame; without assignment the
# replacement is lost, so the values are written back explicitly here.)
unassigned = df['Neighborhood'] == 'Not assigned'
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [3]:
# Sanity check: (rows, columns) of the cleaned frame
df.shape

(103, 3)

## Objective 2: Use Geocoder to Add Latitude/Longitude to Dataframe

### Using the provided geocoder code to get latitude/longitude data

In [6]:
# Install geocoder from the conda-forge channel (-y answers the confirmation prompt)
!conda install -c conda-forge geocoder -y
import geocoder # geocoding client used to look up coordinates for each postal code
print('Import complete!')

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Import complete!


In [14]:
# set new columns

# NaN placeholders (float dtype) make any row the loop did not fill easy to spot
df['Latitude'] = float('nan')
df['Longitude'] = float('nan')

# Upper bound on ArcGIS attempts per postal code, so a persistent outage
# (e.g. repeated read timeouts) cannot hang the notebook indefinitely.
MAX_GEOCODE_ATTEMPTS = 10

# start dataframe loop and get index

# .items() yields (index label, postal code) pairs directly, avoiding a
# full-column boolean-mask lookup for every row.
for i, postal_code in df['PostalCode'].items():

    # initialize your variable to None
    lat_lng_coords = None

    # retry until coordinates come back or the attempt budget is spent
    for _attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        # treat None or an empty result as a failed lookup
        if lat_lng_coords:
            break
    else:
        raise RuntimeError(
            'ArcGIS returned no coordinates for {} after {} attempts'.format(
                postal_code, MAX_GEOCODE_ATTEMPTS
            )
        )

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]

    # set coordinates to index value in dataframe
    df.at[i, 'Latitude'] = latitude
    df.at[i, 'Longitude'] = longitude

df.head()

Status code Unknown from https://geocode.arcgis.com/arcgis/rest/services/World/GeocodeServer/find: ERROR - HTTPSConnectionPool(host='geocode.arcgis.com', port=443): Read timed out. (read timeout=5.0)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.752935,-79.335641
1,M4A,North York,Victoria Village,43.728102,-79.31189
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939


## Objective 3: Exploring and Clustering Neighborhoods

In [29]:
# Install folium, pinned to version 0.5.0, from the conda-forge channel
!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library
# k-means implementation used to cluster the coordinates
from sklearn.cluster import KMeans
import numpy as np
# colormap and color-conversion helpers used to color the map markers
import matplotlib.cm as cm
import matplotlib.colors as colors

print ('Library imported!')

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Library imported!


In [24]:
# set number of clusters
kclusters = 5

# cluster on geographic position only
X = df[['Latitude', 'Longitude']]

# run k-means clustering (random_state fixed so labels are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(X)

# add clustering labels
# df.insert raises ValueError if the column already exists, so drop any
# labels left by a previous run to keep this cell re-runnable.
if 'Cluster Labels' in df.columns:
    df = df.drop(columns='Cluster Labels')
df.insert(0, 'Cluster Labels', kmeans.labels_)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([4, 4, 0, 3, 0, 2, 1, 4, 4, 0])

In [25]:
# Show the frame with the 'Cluster Labels' column that the clustering cell inserted at position 0
df

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,4,M3A,North York,Parkwoods,43.752935,-79.335641
1,4,M4A,North York,Victoria Village,43.728102,-79.311890
2,0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
3,3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
4,0,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.661790,-79.389390
...,...,...,...,...,...,...
98,2,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653340,-79.509766
99,0,M4Y,Downtown Toronto,Church and Wellesley,43.666659,-79.381472
100,0,M7Y,East Toronto,Business reply mail Processing Centre,43.648700,-79.385450
101,2,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.632798,-79.493017


In [37]:
# create map
# Center on the mean coordinate of all neighborhoods rather than on the
# `latitude`/`longitude` variables left over from the geocoding loop, which
# only hold the last postal code processed and may not exist in a fresh kernel.
map_center = [df['Latitude'].mean(), df['Longitude'].mean()]
map_clusters = folium.Map(location=map_center, zoom_start=10.4)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df['Latitude'], df['Longitude'], df['Neighborhood'], df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # k-means labels are 0-based, so index the palette directly
    # (cluster-1 would send label 0 to index -1, i.e. the last color).
    marker_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## It appears that the cluster of neighborhoods closest to Billy Bishop Toronto City Airport is more tightly grouped than any of the other clusters. This is likely due to its proximity to the airport and its location in central Toronto.