In [1]:
import pandas as pd

In [2]:
# Fetch the Wikipedia page listing the "M" postal codes; read_html returns a
# list holding one DataFrame per HTML table it finds on the page.
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
TNT_postals = pd.read_html(wiki_url)
TNT_postals

[            0                 1  \
 0    Postcode           Borough   
 1         M1A      Not assigned   
 2         M2A      Not assigned   
 3         M3A        North York   
 4         M4A        North York   
 5         M5A  Downtown Toronto   
 6         M5A  Downtown Toronto   
 7         M6A        North York   
 8         M6A        North York   
 9         M7A      Queen's Park   
 10        M8A      Not assigned   
 11        M9A         Etobicoke   
 12        M1B       Scarborough   
 13        M1B       Scarborough   
 14        M2B      Not assigned   
 15        M3B        North York   
 16        M4B         East York   
 17        M4B         East York   
 18        M5B  Downtown Toronto   
 19        M5B  Downtown Toronto   
 20        M6B        North York   
 21        M7B      Not assigned   
 22        M8B      Not assigned   
 23        M9B         Etobicoke   
 24        M9B         Etobicoke   
 25        M9B         Etobicoke   
 26        M9B         Etobi

In [3]:
# The first parsed table is the postal-code table; its columns arrive with
# integer labels rather than names.
postal_table = TNT_postals[0]
# Give the three columns readable names (this relabels the frame stored in
# TNT_postals as well, since postal_table is the same object).
postal_table.columns = ['Postcode', 'Borough', 'Neighborhood']
# Row 0 holds the page's own header text ("Postcode", "Borough", ...), so skip it.
TNT = postal_table.iloc[1:]
TNT.head()

Unnamed: 0,Postcode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [4]:
# Keep only rows whose Borough is assigned. The explicit .copy() makes TNT an
# independent frame, so the column assignment below is not a write to a slice.
TNT = TNT[TNT['Borough'] != 'Not assigned'].copy()
# Where Neighborhood is 'Not assigned', fall back to that row's Borough value.
# Series.mask aligns on the index, so each row receives its own Borough. This
# replaces Series.replace(scalar, Series, inplace=True), which is undocumented
# usage and raises ValueError on recent pandas releases.
neighborhood_missing = TNT['Neighborhood'] == 'Not assigned'
TNT['Neighborhood'] = TNT['Neighborhood'].mask(neighborhood_missing, TNT['Borough'])
TNT.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Queen's Park
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


In [5]:
# Collapse repeated postcodes: for each (Postcode, Borough) pair, concatenate
# its neighborhood names into a single comma-separated string.
neighborhoods_per_code = TNT.groupby(['Postcode', 'Borough'])['Neighborhood']
TNT = neighborhoods_per_code.apply(', '.join).reset_index()
TNT.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [6]:
# Dimensions of the cleaned table as (rows, columns); the run recorded below
# produced (103, 3).
TNT.shape

(103, 3)

## End of Submission 1
--------------------------------------------------------------------------
## Start Submission 2

In [19]:
# Load the postcode -> latitude/longitude lookup table from the hosted CSV.
geo_url = 'http://cocl.us/Geospatial_data'
coordinates = pd.read_csv(geo_url)
# Rename the columns so the key column is called 'Postcode', the same name
# used in TNT.
coordinates.columns = ['Postcode', 'Latitude', 'Longitude']
coordinates.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [21]:
# Attach coordinates to every postcode row; a left join keeps all rows of TNT
# even if a postcode has no match in the coordinates table.
TNT_coords = TNT.merge(coordinates, how='left', on='Postcode')
TNT_coords.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## End of Submission 2
-------------------------------------------------------
## Start Submission 3

In [31]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

# Cluster on the coordinate columns only. Selecting them by name replaces the
# three drop(label, 1) calls: passing `axis` positionally to DataFrame.drop was
# deprecated in pandas 1.3 and is rejected by pandas 2.x. The resulting frame
# has the same two float columns in the same order.
TNT_coords_clustering = TNT_coords[['Latitude', 'Longitude']]

# run k-means clustering; random_state is fixed so the labels are reproducible
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(TNT_coords_clustering)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 2, 0, 2], dtype=int32)

In [36]:
!conda install -c conda-forge folium=0.5.0
import folium

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
folium                    0.5.0                      py_0    conda-forge


In [40]:
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

In [45]:
# Base map centred on the Toronto area (latitude, longitude).
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map, one per postcode row, colored by its k-means label.
# kmeans.labels_ is in the same row order as TNT_coords because the model was
# fitted on columns taken from that frame.
for lat, lon, poi, cluster in zip(TNT_coords['Latitude'], TNT_coords['Longitude'],
                                  TNT_coords['Neighborhood'], kmeans.labels_):
    cluster = int(cluster)  # plain int for list indexing and a clean label
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

##### Observation: Downtown Toronto has many neighborhoods packed closely together, but farther from the core the neighborhoods become more evenly spaced.

## End of Submission 3
## End of Assignment