Part 1

In [1]:
# Question 1 - Create Notebook and Import Packages
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Question 2 - Import Dataframe
wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
canada_postal_code = requests.get(wikipedia_link)

soup = BeautifulSoup(canada_postal_code.content, 'lxml')
wikitable = soup.findAll('table')[0]
df_postal_codes = pd.read_html(str(wikitable))[0]

df_postal_codes.head(5)

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [3]:
# Question 3 - Modify Dataframe to fit the screenshot
# Rename columns
df_postal_codes.rename(columns={ 0:'PostalCode', 1:'Borough',2:'Neighborhood'}, inplace=True)
df_postal_codes2 = df_postal_codes.drop([0])

# Remove Rows with Borogh is 'Not Assinged'
df_postal_codes2 = df_postal_codes2[df_postal_codes2.Borough != 'Not assigned']

# Renaming Neighborhood to match Borough if 'Not Assigned'
df_postal_codes2.loc[df_postal_codes2.Neighborhood == 'Not assigned', ['Neighborhood']] = df_postal_codes2['Borough']

# Combining the Neighborhoods by Borough
df_postal_codes2 = df_postal_codes2.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(','.join).reset_index()
df_postal_codes2.sort_values(by='PostalCode')

df_postal_codes2.shape
df_postal_codes2.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Part 2

In [4]:
# import geocoder info
geocoder_link='http://cocl.us/Geospatial_data'
df_geocoder = pd.read_csv(geocoder_link)
df_geocoder.rename(columns={ 'Postal Code':'PostalCode'}, inplace=True)
df_geocoder.head(5)

# Merge Latitude Longitude Data from Geocoder
df_postal_codes_final = pd.merge(df_postal_codes2, df_geocoder, how='left', left_on='PostalCode',right_on='PostalCode')
df_postal_codes_final.head(5)
#df_postal_codes_final.shape

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Part 3

In [5]:
### Import all necessary libraries

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import numpy as np # library to handle data in a vectorized manner

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Libraries imported.


In [8]:
# Get Venue Information
address = 'Toronto, Ontario'

geolocator = Nominatim()
Location = geolocator.geocode(address)
Latitude = Location.latitude
Longitude = Location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(Latitude, Longitude))

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

CLIENT_ID = 'MVMMAFNRIBOGHRJETZCML3UCI1JUNNSOWUJBQPRLC5XJZ2QP' # your Foursquare ID
CLIENT_SECRET = 'VY5L1RQ01KLNM0DFBPF3S1FBNAU0BG4QZGQMI5EHQSUZVGTR' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            Latitude, 
            Longitude, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)




The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [9]:
# Filter to only Borough for Toronto
df_toronto_only = df_postal_codes_final[df_postal_codes_final.Borough.str.slice(-7,) == 'Toronto']
df_toronto_only.reset_index(drop=True, inplace=True)
df_toronto_only.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [10]:
# Retrive Toronto Venues
toronto_venues = getNearbyVenues(names=df_toronto_only['Neighborhood'],
                                   latitudes=df_toronto_only['Latitude'],
                                   longitudes=df_toronto_only['Longitude']
                                  )

toronto_venues.groupby('Neighborhood').count()

# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

The Beaches
The Danforth West,Riverdale
The Beaches West,India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park,Summerhill East
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
Rosedale
Cabbagetown,St. James Town
Church and Wellesley
Harbourfront,Regent Park
Ryerson,Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide,King,Richmond
Harbourfront East,Toronto Islands,Union Station
Design Exchange,Toronto Dominion Centre
Commerce Court,Victoria Hotel
Roselawn
Forest Hill North,Forest Hill West
The Annex,North Midtown,Yorkville
Harbord,University of Toronto
Chinatown,Grange Park,Kensington Market
CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place,Underground city
Christie
Dovercourt Village,Dufferin
Little Portugal,Trinity
Brockton,Exhibition Place,Parkdale Village
High Park,The Junction South
Parkdale,Roncesvall

Unnamed: 0,Vegetarian / Vegan Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Bakery,Bar,Breakfast Spot,Bubble Tea Shop,...,Restaurant,Seafood Restaurant,Smoke Shop,Steakhouse,Sushi Restaurant,Tapas Restaurant,Tea Room,Thai Restaurant,Toy / Game Store,University
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Get the top 10 Venues from each neighborhood
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped
    
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe for sorted venues
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Art Gallery,Coffee Shop,Japanese Restaurant,Café,Sushi Restaurant,Breakfast Spot,Chinese Restaurant,American Restaurant,Exhibit,Bar
1,Berczy Park,Art Gallery,Coffee Shop,Japanese Restaurant,Café,Sushi Restaurant,Breakfast Spot,Chinese Restaurant,American Restaurant,Exhibit,Bar
2,"Brockton,Exhibition Place,Parkdale Village",Art Gallery,Coffee Shop,Japanese Restaurant,Café,Sushi Restaurant,Breakfast Spot,Chinese Restaurant,American Restaurant,Exhibit,Bar
3,Business reply mail Processing Centre969 Eastern,Art Gallery,Coffee Shop,Japanese Restaurant,Café,Sushi Restaurant,Breakfast Spot,Chinese Restaurant,American Restaurant,Exhibit,Bar
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Art Gallery,Coffee Shop,Japanese Restaurant,Café,Sushi Restaurant,Breakfast Spot,Chinese Restaurant,American Restaurant,Exhibit,Bar
5,"Cabbagetown,St. James Town",Art Gallery,Coffee Shop,Japanese Restaurant,Café,Sushi Restaurant,Breakfast Spot,Chinese Restaurant,American Restaurant,Exhibit,Bar
6,Central Bay Street,Art Gallery,Coffee Shop,Japanese Restaurant,Café,Sushi Restaurant,Breakfast Spot,Chinese Restaurant,American Restaurant,Exhibit,Bar
7,"Chinatown,Grange Park,Kensington Market",Art Gallery,Coffee Shop,Japanese Restaurant,Café,Sushi Restaurant,Breakfast Spot,Chinese Restaurant,American Restaurant,Exhibit,Bar
8,Christie,Art Gallery,Coffee Shop,Japanese Restaurant,Café,Sushi Restaurant,Breakfast Spot,Chinese Restaurant,American Restaurant,Exhibit,Bar
9,Church and Wellesley,Art Gallery,Coffee Shop,Japanese Restaurant,Café,Sushi Restaurant,Breakfast Spot,Chinese Restaurant,American Restaurant,Exhibit,Bar


In [12]:
# Create 5 Cluster for the toronto Neighborhoods
# set number of clusters
kclusters = 5
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

toronto_merged = df_toronto_only

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Art Gallery,Coffee Shop,Japanese Restaurant,Café,Sushi Restaurant,Breakfast Spot,Chinese Restaurant,American Restaurant,Exhibit,Bar
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,0,Art Gallery,Coffee Shop,Japanese Restaurant,Café,Sushi Restaurant,Breakfast Spot,Chinese Restaurant,American Restaurant,Exhibit,Bar
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,0,Art Gallery,Coffee Shop,Japanese Restaurant,Café,Sushi Restaurant,Breakfast Spot,Chinese Restaurant,American Restaurant,Exhibit,Bar
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Art Gallery,Coffee Shop,Japanese Restaurant,Café,Sushi Restaurant,Breakfast Spot,Chinese Restaurant,American Restaurant,Exhibit,Bar
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Art Gallery,Coffee Shop,Japanese Restaurant,Café,Sushi Restaurant,Breakfast Spot,Chinese Restaurant,American Restaurant,Exhibit,Bar


In [13]:
# create map for the toronto clusters
map_clusters = folium.Map(location=[Latitude, Longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters