In [197]:
import os
import warnings

import numpy as np
import pandas as pd
import requests

### Parsing the Wikipedia table with pandas

In [198]:
# Read every HTML table on the Wikipedia page and keep the first one,
# which holds the postcode / borough / neighbourhood listing.
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
wiki_tables = pd.read_html(wiki_url)
df = wiki_tables[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [199]:
# Relabel all three columns so later cells can use 'Postalcode' and
# 'Neighborhood' consistently.
renamed_columns = ["Postalcode", "Borough", "Neighborhood"]
df.columns = renamed_columns
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Dropping rows where Borough is not assigned

In [200]:
# Keep only the rows that have a real borough.  The explicit .copy() makes the
# result an independent frame, so the assignments and in-place edits in the
# following cells do not operate on a slice of the scraped table (which
# triggers pandas' SettingWithCopyWarning).
df = df[df.Borough != 'Not assigned'].copy()

### Merging Neighbourhoods by Postcode

In [201]:
# Collapse rows that share a postal code into a single row whose
# 'Neighborhood' is the comma-separated list of every neighbourhood for that
# code (in original row order).
# NOTE: the earlier loop over `(value_counts() > 1).index` visited *every*
# postal code, because a boolean Series keeps its full index, and it rescanned
# the frame on each pass.  One groupby does the same work once.
neighborhoods_by_postcode = df.groupby('Postalcode')['Neighborhood'].agg(', '.join)

# Keep the first row of each postal code, then attach the joined names to it.
df = df.drop_duplicates(subset='Postalcode', keep='first')
df = df.assign(Neighborhood=df['Postalcode'].map(neighborhoods_by_postcode))
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,"Lawrence Heights, Lawrence Manor"
7,M7A,Downtown Toronto,Queen's Park


### Filling in unassigned Neighbourhoods with the Borough name

In [202]:
# Where a row has a borough but no neighbourhood, reuse the borough name
# as the neighbourhood.
needs_name = df['Neighborhood'] == 'Not assigned'
df.loc[needs_name, 'Neighborhood'] = df.loc[needs_name, 'Borough']

# Inspect the row for one postal code
df[df['Postalcode'] == 'M9A']

Unnamed: 0,Postalcode,Borough,Neighborhood
9,M9A,Queen's Park,Queen's Park


### Checking resulting dataframe shape

In [203]:
# (rows, columns) of the cleaned postal-code table
df.shape

(103, 3)

### Downloading geospatial data into a dataframe

In [204]:
# Load the table of latitude/longitude values per postal code
geo_csv_url = 'https://cocl.us/Geospatial_data'
geod = pd.read_csv(geo_csv_url)
geod.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merging Toronto dataframe with geospatial Dataframe

In [210]:
# Give the coordinate table the same key column name as the Toronto frame
geod = geod.rename(columns={'Postal Code': 'Postalcode'})

# Attach latitude/longitude to each postal code by matching on 'Postalcode'
df = pd.merge(df, geod, left_on='Postalcode', right_on='Postalcode')
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


### Installing the folium library for creating map visualizations

In [213]:
# Shell escape: install folium pinned to 0.5.0 from the conda-forge channel; --yes answers the confirmation prompt
!conda install -c conda-forge folium=0.5.0 --yes 

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-4.0.1               |             py_0         575 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         673 KB

The following NEW packages will be INSTALLED:

    altair:  4.0.1-py_0 conda-forge
    branca:  0.3.1-py_0 conda-forge
    folium:  0.5.0-py_0 conda-forge
    vincent: 0.4.4-py_1 conda-forge


Downloading and Extracting Packages
altair-4.0.1         | 575 KB    | #####

### Toronto neighborhoods map visualization

In [214]:
import folium # map rendering library

# Base map centred on a fixed latitude/longitude in Toronto
toronto_center = [43.65426, -79.360636]
map_toronto = folium.Map(location=toronto_center, zoom_start=10)

# One blue circle per row of df; its popup reads "<borough>: <neighborhood>"
marker_rows = zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood'])
for lat, lng, borough, neighborhood in marker_rows:
    popup_text = '{}: {}'.format(borough, neighborhood)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

In [231]:
# Keep only the rows whose borough name contains "Toronto"
is_toronto_borough = df['Borough'].str.contains('Toronto')
toronto_data = df[is_toronto_borough]
toronto_data.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [237]:
# Rebuild the base map, this time to show only the rows in toronto_data
map_toronto = folium.Map(location=[43.65426, -79.360636], zoom_start=10)

# Appearance shared by every marker
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle per row; its popup reads "<borough>: <neighborhood>"
for lat, lng, borough, neighborhood in zip(
        toronto_data['Latitude'],
        toronto_data['Longitude'],
        toronto_data['Borough'],
        toronto_data['Neighborhood']):
    popup = folium.Popup('{}: {}'.format(borough, neighborhood), parse_html=True)
    folium.CircleMarker([lat, lng], popup=popup, **marker_style).add_to(map_toronto)

map_toronto

### Setting up Foursquare credentials

In [233]:
# Foursquare credentials are read from the environment so that no secret is
# stored in the notebook or its version history.  Export FOURSQUARE_CLIENT_ID
# and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded in this cell has been
# exposed and should be revoked/rotated in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare requests will fail until they are provided.'
    )

In [234]:
# (rows, columns) of the Toronto-only subset
toronto_data.shape

(39, 5)

### Defining support functions

In [239]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of a venue's first category, or None if it has none.

    Parameters
    ----------
    row : mapping-like
        Indexed with ``row['categories']``; when that key is missing,
        ``row['venue.categories']`` is used instead.  The value is expected
        to be a sequence of dicts that each carry a ``'name'`` entry.

    Returns
    -------
    The ``'name'`` of the first category, or ``None`` when the category
    list is empty or ``None``.

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and must not be swallowed (the old bare `except:` hid them).
        categories_list = row['venue.categories']

    if categories_list is None or len(categories_list) == 0:
        return None
    return categories_list[0]['name']
    

# function that returns nearby venues by accessing foursquare
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100, timeout=30):
    """Query the Foursquare "explore" endpoint once per location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Iterated in parallel with ``zip``; each triple describes one location.
    radius : int, default 500
        Sent as the ``radius`` query parameter.
    LIMIT : int, default 100
        Sent as the ``limit`` query parameter.
    timeout : float, default 30
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        'Venue Category' is None for a venue with no categories.  The frame
        is empty (but still has these columns) when nothing was found.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status.

    Notes
    -----
    Reads the module-level ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION``.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']
    explore_url = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print('Retrieving nearby venues for ', name)

        # Let requests build and escape the query string instead of
        # formatting the credentials into the URL by hand.
        query = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; fail loudly on an HTTP error rather than
        # with an opaque KeyError while digging through the payload
        response = requests.get(explore_url, params=query, timeout=timeout)
        response.raise_for_status()
        groups = response.json()['response'].get('groups') or []
        results = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            # a venue may come back without any category
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema intact even
    # when no venue was returned at all.
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)

    return(nearby_venues)

### Retrieving nearby venues of Toronto Neighborhoods

In [240]:
# Fetch nearby venues for every row of toronto_data
toronto_venues = getNearbyVenues(
    names=toronto_data['Neighborhood'],
    latitudes=toronto_data['Latitude'],
    longitudes=toronto_data['Longitude'],
)

Retrieving nearby venues for  Harbourfront
Retrieving nearby venues for  Queen's Park
Retrieving nearby venues for  Ryerson, Garden District
Retrieving nearby venues for  St. James Town
Retrieving nearby venues for  The Beaches
Retrieving nearby venues for  Berczy Park
Retrieving nearby venues for  Central Bay Street
Retrieving nearby venues for  Christie
Retrieving nearby venues for  Adelaide, King, Richmond
Retrieving nearby venues for  Dovercourt Village, Dufferin
Retrieving nearby venues for  Harbourfront East, Toronto Islands, Union Station
Retrieving nearby venues for  Little Portugal, Trinity
Retrieving nearby venues for  The Danforth West, Riverdale
Retrieving nearby venues for  Design Exchange, Toronto Dominion Centre
Retrieving nearby venues for  Brockton, Exhibition Place, Parkdale Village
Retrieving nearby venues for  The Beaches West, India Bazaar
Retrieving nearby venues for  Commerce Court, Victoria Hotel
Retrieving nearby venues for  Studio District
Retrieving nearby ve

### Analyzing retrieved venues data

In [242]:
# Report the size of the venue table, then preview its first rows
venue_table_shape = toronto_venues.shape
print(venue_table_shape)
toronto_venues.head()

(1716, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Harbourfront,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,Harbourfront,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Harbourfront,43.65426,-79.360636,Cooper Koo Family YMCA,43.653191,-79.357947,Gym / Fitness Center
3,Harbourfront,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,Harbourfront,43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


In [243]:
# Number of venue rows per neighbourhood name
toronto_venues['Neighborhood'].value_counts()

First Canadian Place, Underground city                                                                        100
St. James Town                                                                                                100
Ryerson, Garden District                                                                                      100
Design Exchange, Toronto Dominion Centre                                                                      100
Harbourfront East, Toronto Islands, Union Station                                                             100
Commerce Court, Victoria Hotel                                                                                100
Adelaide, King, Richmond                                                                                      100
Stn A PO Boxes 25 The Esplanade                                                                                94
Chinatown, Grange Park, Kensington Market                                               

In [246]:
# Count the distinct values in the 'Venue Category' column
distinct_categories = toronto_venues['Venue Category'].unique()
print('There are {} uniques categories.'.format(len(distinct_categories)))

There are 234 uniques categories.


### Preparing data for Clustering

In [302]:
# one hot encoding: one indicator column per venue category
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# A column called 'Neighborhood' exists here only when some venue's category is
# literally "Neighborhood"; it would clash with the name column inserted below.
# Drop it if present -- a bare `del` raises KeyError when it is absent.
toronto_onehot = toronto_onehot.drop('Neighborhood', axis=1, errors='ignore')

# add neighborhood column back to dataframe
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'].values)

toronto_onehot.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [303]:
# Average the indicator columns per neighbourhood, giving the share of each
# venue category among that neighbourhood's venues.
category_share_by_hood = toronto_onehot.groupby('Neighborhood').mean()
toronto_grouped = category_share_by_hood.reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,...,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.01,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0625,0.0625,0.0625,0.125,0.125,0.125,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [272]:
num_top_venues = 5

# For every neighbourhood, print its highest-frequency venue categories.
for hood in toronto_grouped['Neighborhood']:
    banner = "----" + hood + "----"
    print(banner)

    # Turn the neighbourhood's row into a two-column (venue, freq) table
    hood_row = toronto_grouped[toronto_grouped['Neighborhood'] == hood]
    freq_table = hood_row.T.reset_index()
    freq_table.columns = ['venue', 'freq']

    top_venues = (
        freq_table.iloc[1:]  # skip the first row, which holds the neighbourhood name
        .assign(freq=lambda table: table['freq'].astype(float))
        .round({'freq': 2})
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
        .head(num_top_venues)
    )
    print(top_venues)
    print('\n')

----Adelaide, King, Richmond----
         venue  freq
0  Coffee Shop  0.07
1          Bar  0.04
2   Steakhouse  0.04
3         Café  0.04
4   Restaurant  0.03


----Berczy Park----
            venue  freq
0     Coffee Shop  0.09
1    Cocktail Bar  0.05
2  Farmers Market  0.03
3        Beer Bar  0.03
4      Steakhouse  0.03


----Brockton, Exhibition Place, Parkdale Village----
            venue  freq
0          Bakery  0.09
1     Coffee Shop  0.09
2  Breakfast Spot  0.09
3            Café  0.09
4         Stadium  0.05


----Business Reply Mail Processing Centre 969 Eastern----
           venue  freq
0     Comic Shop  0.07
1     Restaurant  0.07
2            Spa  0.07
3    Pizza Place  0.07
4  Burrito Place  0.07


----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
              venue  freq
0    Airport Lounge  0.12
1   Airport Service  0.12
2  Airport Terminal  0.12
3             Plane  0.06
4     Boat or Ferry  0.06


---

In [273]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, skipping its first entry.

    The first element of ``row`` is ignored; the remaining entries are sorted
    in descending order and the index labels of the first ``num_top_venues``
    of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [275]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    rank = ind + 1
    # Ranks ending in 1, 2 or 3 take 'st', 'nd' or 'rd', except 11-13, which
    # take 'th' like everything else.  This explicit test replaces a bare
    # `except:` used as control flow, which produced "21th" and could also
    # hide unrelated errors.
    if rank % 10 in (1, 2, 3) and rank % 100 not in (11, 12, 13):
        suffix = indicators[rank % 10 - 1]
    else:
        suffix = 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill each row with that neighbourhood's top venue categories
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Steakhouse,Bar,Café,Cosmetics Shop,Hotel,Restaurant,Asian Restaurant,Thai Restaurant,Bakery
1,Berczy Park,Coffee Shop,Cocktail Bar,Seafood Restaurant,Steakhouse,Bakery,Farmers Market,Café,Cheese Shop,Beer Bar,Liquor Store
2,"Brockton, Exhibition Place, Parkdale Village",Bakery,Coffee Shop,Café,Breakfast Spot,Gym,Performing Arts Venue,Pet Store,Nightclub,Climbing Gym,Restaurant
3,Business Reply Mail Processing Centre 969 Eastern,Park,Garden,Light Rail Station,Farmers Market,Spa,Fast Food Restaurant,Burrito Place,Restaurant,Brewery,Auto Workshop
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Lounge,Airport Service,Airport Terminal,Boutique,Airport,Airport Food Court,Airport Gate,Sculpture Garden,Bar,Harbor / Marina


### Running Clustering Algorithm

In [291]:
from sklearn.cluster import KMeans


# set number of clusters
kclusters = 5

# Feature matrix: toronto_grouped without the neighbourhood-name column.
# `axis` is passed by keyword because pandas 2.0 no longer accepts the
# positional form `drop(label, 1)`.
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', axis=1)

# run k-means clustering (random_state is fixed so the labels are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=1).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 2, 1, 1, 1, 0, 3, 1, 1, 1, 1, 1, 1, 4, 1, 1], dtype=int32)

In [299]:
# add clustering labels; assign instead of insert when the column is already
# there, so re-running this cell does not fail with
# "cannot insert Cluster Labels, already exists"
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# merge toronto_data with the per-neighbourhood cluster label and top venues;
# join() returns a new frame, so toronto_data itself is left unchanged
toronto_merged = toronto_data.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() 

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,1,Coffee Shop,Park,Pub,Bakery,Café,Breakfast Spot,Mexican Restaurant,Restaurant,Shoe Store,Brewery
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,1,Coffee Shop,Gym,Park,Fast Food Restaurant,Portuguese Restaurant,Nightclub,Music Venue,Mexican Restaurant,Juice Bar,Italian Restaurant
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,1,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Japanese Restaurant,Bakery,Middle Eastern Restaurant,Fast Food Restaurant,Diner,Plaza
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1,Coffee Shop,Café,Restaurant,Clothing Store,Italian Restaurant,Hotel,Bakery,Cosmetics Shop,Beer Bar,Cocktail Bar
19,M4E,East Toronto,The Beaches,43.676357,-79.293031,4,Trail,Other Great Outdoors,Health Food Store,Pub,Donut Shop,Diner,Discount Store,Dog Run,Doner Restaurant,Yoga Studio


### Creating map visualization of clustering results

In [301]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[43.65426,  -79.360636], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Rows for which the join found no matching neighbourhood carry NaN in
# 'Cluster Labels'; they cannot be coloured, so leave them off the map.
clustered_rows = toronto_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(clustered_rows['Latitude'], clustered_rows['Longitude'], clustered_rows['Neighborhood'], clustered_rows['Cluster Labels']):
    # labels become floats once NaN has appeared in the column; a list index must be an int
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # k-means labels run 0..kclusters-1, so they index the palette directly
    # (the previous `cluster-1` only worked through negative-index wraparound)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters