Before we get the data and start exploring it, let's download all the dependencies that we will need.

In [3]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Libraries imported.


<a id='item1'></a>

## 1. Download and Explore Dataset

In [4]:
#Import the postal code table from the Wikipedia page (first table on the page)
# NOTE(review): the column names used below (Postcode / Borough / Neighbourhood)
# are the ones shown in this cell's saved output -- confirm the page still uses them.
df = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",header=0)[0]

#Remove rows whose borough is not assigned
df = df[df.Borough != 'Not assigned']

#Split rows into (Postcode, Borough) pairs that occur once and pairs that repeat
key_cols = ['Postcode', 'Borough']
is_repeated = df.duplicated(key_cols, keep=False)
df_non_dups = df[~is_repeated]
df_dups = df[is_repeated]

#Collapse every repeated pair into a single row whose Neighbourhood is the
#comma-separated list of all its neighbourhoods. The collapsed row keeps the
#index label of the pair's first occurrence. Grouping on both key columns
#means the repeated rows do not have to be adjacent in the table.
if df_dups.empty:
    # Nothing to merge; keep an empty frame with the expected columns
    df_dedup = pd.DataFrame(columns=['Postcode','Borough','Neighbourhood'])
else:
    joined_names = df_dups.groupby(key_cols, sort=False)['Neighbourhood'].transform(', '.join)
    df_dedup = (
        df_dups
        .assign(Neighbourhood=joined_names)
        .drop_duplicates(key_cols, keep='first')
        [['Postcode','Borough','Neighbourhood']]
    )

#Combine data frames to get full list (unique rows first, then the collapsed rows)
df_non_dups = pd.concat([df_non_dups, df_dedup])

#Neighbourhoods that are not assigned take the name of their borough
unassigned = df_non_dups['Neighbourhood'] == 'Not assigned'
df_non_dups['Neighbourhood'] = df_non_dups['Neighbourhood'].mask(unassigned, df_non_dups['Borough'])

df_non_dups.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
14,M3B,North York,Don Mills North


#### Get Latitude and Longitude data

In [5]:
# Read the postal-code coordinates table, rename its key column to 'Postcode'
# (the name used by the neighbourhood table) and make that column the index.
df = (
    pd.read_csv("http://cocl.us/Geospatial_data")
    .rename(columns={'Postal Code': 'Postcode'})
    .set_index('Postcode')
)

df.head()

Unnamed: 0_level_0,Latitude,Longitude
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


Let's assign the latitude and longitude

In [6]:
#Attach coordinates to every neighbourhood row by postal code.
#`df` is the coordinates table indexed by 'Postcode' with 'Latitude' and
#'Longitude' columns (see its head() output above).

#Fail fast if any postal code has no coordinates, as a per-row lookup would
unknown_codes = df_non_dups.loc[~df_non_dups['Postcode'].isin(df.index), 'Postcode']
if not unknown_codes.empty:
    raise KeyError('No coordinates for postal codes: {}'.format(sorted(unknown_codes.unique())))

#Build df2 as a new frame instead of aliasing and mutating df_non_dups, so the
#cell gives the same result every time it is run. Any coordinate columns left
#over from an earlier run are dropped before joining.
df2 = (
    df_non_dups
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    .join(df[['Latitude', 'Longitude']], on='Postcode')
)

df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.7533,-79.3297
3,M4A,North York,Victoria Village,43.7259,-79.3156
8,M7A,Queen's Park,Queen's Park,43.6623,-79.3895
10,M9A,Etobicoke,Islington Avenue,43.6679,-79.5322
14,M3B,North York,Don Mills North,43.7459,-79.3522


Every postal code now has a latitude and longitude, so the neighbourhoods can be placed on a map.

#### Create a map of Toronto with neighbourhoods superimposed on top.

In [8]:
address = 'Toronto, CA'

# Nominatim expects callers to identify themselves; geopy deprecates (and newer
# releases reject) the default user agent, so pass an explicit one.
geolocator = Nominatim(user_agent='toronto_neighbourhood_explorer')
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; report that clearly instead of
# failing later with an AttributeError on None.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [19]:
# Base map centred on the geocoded Toronto coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Draw one circle marker per row of df2, with a "Neighbourhood, Borough" popup
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
for lat, lng, borough, neighbourhood in df2[marker_columns].itertuples(index=False, name=None):
    popup = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

In [None]:
# NOTE(review): `manhattan_merged` is not defined in any cell visible here; this
# looks like a leftover from a different notebook -- confirm it exists or remove the cell.
# Show the rows labelled as cluster 0, keeping column 1 and columns 5 onward.
in_cluster_0 = manhattan_merged['Cluster Labels'] == 0
shown_columns = manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]
manhattan_merged.loc[in_cluster_0, shown_columns]