# Scraping data from Wikipedia

## Finding the Postal Code, Borough and Neighborhood from the table

#### Importing BeautifulSoup and requests

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

#### Reading wikipedia url

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

#### Parsing url with lxml parser

In [3]:
soup = BeautifulSoup(source, 'lxml')

#### Finding table and all the tr's in the table

In [4]:
table = soup.find('table')
table_rows = table.find_all('tr')

#### Empty list which will contain all the table rows

In [5]:
toronto_pc_df = []

#### Appending the wikipedia table data into "can_pc_df" row by row

In [6]:
for tr in table_rows:
    td = tr.find_all('td')
    row = [data.text for data in td]
    row = [content.replace('\n', '') for content in row]
    toronto_pc_df.append(row)

#### Adding column names and creating dataframe

In [7]:
columns = ['Postcode', 'Borough', 'Neighborhood']
toronto_pc_df = pd.DataFrame(toronto_pc_df, columns=columns)

#### Dropping na values

In [8]:
toronto_pc_df.dropna(inplace=True)

#### Eliminating the Boroughs which have "Not assigned" value

In [9]:
toronto_pc_df = toronto_pc_df[toronto_pc_df.Borough != 'Not assigned']

#### Neighborhood which have "Not assigned" value

In [10]:
toronto_pc_df[toronto_pc_df['Neighborhood'] == 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighborhood
8,M7A,Queen's Park,Not assigned


#### Replacing Neighborhood which have "Not assigned" with repsective Borough value

In [11]:
toronto_pc_df['Neighborhood'][toronto_pc_df['Neighborhood'] == 'Not assigned'] = toronto_pc_df['Borough'][toronto_pc_df['Neighborhood'] == 'Not assigned']

#### Merging Neighborhoods which have same Borough

In [12]:
toronto_pc_df = toronto_pc_df.groupby(['Postcode','Borough'])['Neighborhood'].apply(lambda x: ', '.join(set(x))).reset_index()

In [13]:
toronto_pc_df

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Highland Creek, Port Union"
2,M1E,Scarborough,"Morningside, West Hill, Guildwood"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


#### Shape of the df

In [14]:
toronto_pc_df.shape

(103, 3)

## ADDING LATITUDE AND LONGITUDE

In [15]:
geospatial_coordinates_df = pd.read_csv('Geospatial_Coordinates.csv')

In [16]:
geospatial_coordinates_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Column name is different for toronto_pc_df and geospatial_coordinates_df (Postcode and Postal Code)

In [17]:
toronto_pc_df.columns

Index(['Postcode', 'Borough', 'Neighborhood'], dtype='object')

In [18]:
geospatial_coordinates_df.columns

Index(['Postal Code', 'Latitude', 'Longitude'], dtype='object')

In [19]:
toronto_pc_df.rename(columns={'Postcode':'Postal Code'}, inplace=True)

In [20]:
toronto_ll_df = pd.merge(toronto_pc_df, geospatial_coordinates_df, on='Postal Code', how='outer')

In [21]:
toronto_ll_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Highland Creek, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Morningside, West Hill, Guildwood",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Segmenting and Clustering Neighborhoods in Toronto

In [22]:
 # convert an address into latitude and longitude values
from geopy.geocoders import Nominatim

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

from pandas.io.json import json_normalize

print('Libraries imported.')

Libraries imported.


### Getting the latitude and longitude of Toronto

In [23]:
address = 'Toronto, ON, Canada'

geolocator = Nominatim(user_agent="to_explorer", timeout=3)
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, ON, Canada are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, ON, Canada are 43.653963, -79.387207.


### Plotting map of Toronto with the Neighborhoods

In [24]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_ll_df['Latitude'], toronto_ll_df['Longitude'], toronto_ll_df['Borough'], toronto_ll_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Utilizing the Foursquare API to explore the neighborhoods and segment them

#### Foursqaure API credentials

In [25]:
# Foursquare ID
CLIENT_ID = '1JDFUZA2BGP0RNAAYR4ZG0HDC52ECHFGMQEQVFBWQ4YPMDMN'
# Foursquare SECRET
CLIENT_SECRET = 'OPRDMNKORPU0XYSVSWPFIZNA4LEKYH1VK4WLUQ3UW1DNIJOL'
# Foursquare API version
VERSION = '20180605' 

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 1JDFUZA2BGP0RNAAYR4ZG0HDC52ECHFGMQEQVFBWQ4YPMDMN
CLIENT_SECRET:OPRDMNKORPU0XYSVSWPFIZNA4LEKYH1VK4WLUQ3UW1DNIJOL


#### Let's explore a random neighborhood in our dataframe.

In [26]:
location = 45

In [27]:
toronto_ll_df.loc[location, 'Neighborhood']

'Davisville North'

#### Get the neighborhood's latitude and longitude values.

In [28]:
neighborhood_latitude = toronto_ll_df.loc[location, 'Latitude'] # neighborhood latitude value

neighborhood_longitude = toronto_ll_df.loc[location, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_ll_df.loc[location, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Davisville North are 43.7127511, -79.3901975.


#### Now, let's get the top 100 venues that are in Marble Hill within a radius of 500 meters.
First, let's create the GET request URL


In [29]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

In [30]:
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=1JDFUZA2BGP0RNAAYR4ZG0HDC52ECHFGMQEQVFBWQ4YPMDMN&client_secret=OPRDMNKORPU0XYSVSWPFIZNA4LEKYH1VK4WLUQ3UW1DNIJOL&v=20180605&ll=43.7127511,-79.3901975&radius=500&limit=100'

Send the GET request and examine the resutls

In [31]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5dfa924a216785001b163aaf'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Davisville',
  'headerFullLocation': 'Davisville, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 10,
  'suggestedBounds': {'ne': {'lat': 43.7172511045, 'lng': -79.38398344441633},
   'sw': {'lat': 43.708251095499996, 'lng': -79.39641155558367}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4e8e73c30cd6209590ae7be4',
       'name': 'Summerhill Market North',
       'location': {'address': '1054 Mount Pleasant Rd.',
        'crossStreet': 'Sheldrake Blvd',
        'lat': 43.71549914910689,
        'lng': -79.39288125988016,
        'labeledLa

In [32]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [33]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Summerhill Market North,Food & Drink Shop,43.715499,-79.392881
1,Sherwood Park,Park,43.716551,-79.387776
2,Homeway Restaurant & Brunch,Breakfast Spot,43.712641,-79.391557
3,Winners,Clothing Store,43.713236,-79.393873
4,Best Western Roehampton Hotel & Suites,Hotel,43.708878,-79.39088


In [34]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

10 venues were returned by Foursquare.


## Explore Neighborhoods in Toronto

#### Function to repeat the same process to all the neighborhoods.

In [35]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Invoking getNearbyValues() and storing the results is toronto_venues

In [36]:
toronto_venues = getNearbyVenues(names=toronto_ll_df['Neighborhood'],
                                   latitudes=toronto_ll_df['Latitude'],
                                   longitudes=toronto_ll_df['Longitude']
                                  )

Malvern, Rouge
Rouge Hill, Highland Creek, Port Union
Morningside, West Hill, Guildwood
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park, Ionview, Kennedy Park
Golden Mile, Clairlea, Oakridge
Cliffside, Cliffcrest, Scarborough Village West
Birch Cliff, Cliffside West
Wexford Heights, Scarborough Town Centre, Dorset Park
Wexford, Maryvale
Agincourt
Tam O'Shanter, Sullivan, Clarks Corners
Agincourt North, L'Amoreaux East, Steeles East, Milliken
L'Amoreaux West
Upper Rouge
Hillcrest Village
Henry Farm, Oriole, Fairview
Bayview Village
York Mills, Silver Hills
Willowdale, Newtonbrook
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park, Don Mills South
Bathurst Manor, Downsview North, Wilson Heights
York University, Northwood Park
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens, Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
Riverdale, The Danf

#### Size of the resulting dataframe

In [37]:
print(toronto_venues.shape)
toronto_venues.head()

(2237, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Rouge Hill, Highland Creek, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Rouge Hill, Highland Creek, Port Union",43.784535,-79.160497,Scarborough Historical Society,43.788755,-79.162438,History Museum
3,"Morningside, West Hill, Guildwood",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,"Morningside, West Hill, Guildwood",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


#### Venues returned for each neighborhood

In [38]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Agincourt,4,4,4,4,4,4
"Agincourt North, L'Amoreaux East, Steeles East, Milliken",3,3,3,3,3,3
"Alderwood, Long Branch",9,9,9,9,9,9
"Bathurst Manor, Downsview North, Wilson Heights",19,19,19,19,19,19
Bayview Village,4,4,4,4,4,4
Berczy Park,56,56,56,56,56,56
"Birch Cliff, Cliffside West",4,4,4,4,4,4
"Brockton, Parkdale Village, Exhibition Place",23,23,23,23,23,23
Business Reply Mail Processing Centre 969 Eastern,17,17,17,17,17,17


#### Unique categories from all the returned venues

In [39]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 274 uniques categories.


## Analyze Each Neighborhood

In [40]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Shape of new dataframe

In [41]:
toronto_onehot.shape

(2237, 274)

#### Grouping rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [42]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,"Adelaide, King, Richmond",0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.020000,...,0.00,0.00,0.020000,0.000000,0.000000,0.000000,0.000000,0.010000,0.000000,0.0
1,Agincourt,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2,"Agincourt North, L'Amoreaux East, Steeles East...",0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
3,"Alderwood, Long Branch",0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
4,"Bathurst Manor, Downsview North, Wilson Heights",0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
5,Bayview Village,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
6,Berczy Park,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.00,0.00,0.017857,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
7,"Birch Cliff, Cliffside West",0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
8,"Brockton, Parkdale Village, Exhibition Place",0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
9,Business Reply Mail Processing Centre 969 Eastern,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.00,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0


In [43]:
toronto_grouped.shape

(101, 274)

#### Each neighborhood along with the top 5 most common venues

In [44]:
top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(top_venues))
    print('\n')

----Adelaide, King, Richmond----
             venue  freq
0      Coffee Shop  0.07
1             Café  0.05
2  Thai Restaurant  0.04
3       Steakhouse  0.04
4       Restaurant  0.03


----Agincourt----
                       venue  freq
0                     Lounge  0.25
1  Latin American Restaurant  0.25
2             Breakfast Spot  0.25
3               Skating Rink  0.25
4              Metro Station  0.00


----Agincourt North, L'Amoreaux East, Steeles East, Milliken----
           venue  freq
0           Park  0.33
1     Playground  0.33
2    Coffee Shop  0.33
3    Yoga Studio  0.00
4  Metro Station  0.00


----Alderwood, Long Branch----
          venue  freq
0   Pizza Place  0.22
1   Coffee Shop  0.11
2  Skating Rink  0.11
3      Pharmacy  0.11
4           Gym  0.11


----Bathurst Manor, Downsview North, Wilson Heights----
                  venue  freq
0           Coffee Shop  0.11
1                  Park  0.05
2                 Diner  0.05
3         Shopping Mall  0.05
4  Fast F

         venue  freq
0         Café  0.14
1  Pizza Place  0.07
2       Bakery  0.07
3  Coffee Shop  0.07
4   Restaurant  0.07


----Humber Summit----
                 venue  freq
0        Shopping Mall  0.33
1  Empanada Restaurant  0.33
2          Pizza Place  0.33
3    Mobile Phone Shop  0.00
4   Miscellaneous Shop  0.00


----Humewood-Cedarvale----
               venue  freq
0              Field  0.25
1              Trail  0.25
2       Tennis Court  0.25
3       Hockey Arena  0.25
4  Mobile Phone Shop  0.00


----India Bazaar, The Beaches West----
               venue  freq
0               Park  0.09
1     Sandwich Place  0.09
2        Coffee Shop  0.05
3  Fish & Chips Shop  0.05
4            Brewery  0.05


----L'Amoreaux West----
                    venue  freq
0      Chinese Restaurant  0.15
1    Fast Food Restaurant  0.15
2             Pizza Place  0.08
3  Thrift / Vintage Store  0.08
4             Coffee Shop  0.08


----Lawrence Manor East, Bedford Park----
                  ve

4        Pizza Place  0.06


----Toronto Dominion Centre, Design Exchange----
         venue  freq
0  Coffee Shop  0.12
1         Café  0.08
2        Hotel  0.05
3   Restaurant  0.05
4          Bar  0.04


----Toronto Islands, Harbourfront East, Union Station----
         venue  freq
0  Coffee Shop  0.13
1     Aquarium  0.05
2         Café  0.04
3        Hotel  0.04
4      Brewery  0.03


----Trinity, Little Portugal----
              venue  freq
0               Bar  0.11
1  Asian Restaurant  0.05
2       Coffee Shop  0.05
3        Restaurant  0.05
4       Men's Store  0.05


----Underground city, First Canadian Place----
                 venue  freq
0          Coffee Shop  0.10
1                 Café  0.07
2           Restaurant  0.04
3  American Restaurant  0.04
4           Steakhouse  0.04


----University of Toronto, Harbord----
            venue  freq
0            Café  0.14
1             Bar  0.05
2  Sandwich Place  0.05
3       Bookstore  0.05
4      Restaurant  0.05


----Upwoo

### Inserting data into dataframe

#### Function to sort the venues in descending order.

In [45]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

#### Creating a new dataframe and displaying the top 10 venues for each neighborhood.

In [46]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Steakhouse,Thai Restaurant,Salad Place,Asian Restaurant,Burger Joint,Restaurant,Bar,Bakery
1,Agincourt,Latin American Restaurant,Skating Rink,Lounge,Breakfast Spot,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
2,"Agincourt North, L'Amoreaux East, Steeles East...",Park,Coffee Shop,Playground,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
3,"Alderwood, Long Branch",Pizza Place,Coffee Shop,Skating Rink,Pharmacy,Pool,Pub,Sandwich Place,Gym,Airport Terminal,Dessert Shop
4,"Bathurst Manor, Downsview North, Wilson Heights",Coffee Shop,Middle Eastern Restaurant,Frozen Yogurt Shop,Bridal Shop,Sandwich Place,Fast Food Restaurant,Diner,Restaurant,Deli / Bodega,Bank


# Cluster Neighborhoods


Run k-means to cluster the neighborhood into 5 clusters.

In [47]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 4, 2, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0])

Creating a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [48]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_ll_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

In [49]:
# check the last columns!

toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,0.0,Fast Food Restaurant,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant,Gym Pool
1,M1C,Scarborough,"Rouge Hill, Highland Creek, Port Union",43.784535,-79.160497,0.0,Bar,History Museum,Women's Store,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant
2,M1E,Scarborough,"Morningside, West Hill, Guildwood",43.763573,-79.188711,0.0,Intersection,Rental Car Location,Electronics Store,Spa,Breakfast Spot,Pizza Place,Mexican Restaurant,Medical Center,Women's Store,Discount Store
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0.0,Coffee Shop,Korean Restaurant,Insurance Office,Women's Store,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0.0,Hakka Restaurant,Bakery,Caribbean Restaurant,Athletics & Sports,Bank,Gas Station,Thai Restaurant,Fried Chicken Joint,Doner Restaurant,Dog Run


#### There is no data available for some neighbourhood. Dropping that row and chaning floats to int

In [50]:
toronto_merged=toronto_merged.dropna()
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)


In [51]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Cluster Examination

### Cluster 1

In [52]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,0,Fast Food Restaurant,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant,Gym Pool
1,Scarborough,0,Bar,History Museum,Women's Store,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant
2,Scarborough,0,Intersection,Rental Car Location,Electronics Store,Spa,Breakfast Spot,Pizza Place,Mexican Restaurant,Medical Center,Women's Store,Discount Store
3,Scarborough,0,Coffee Shop,Korean Restaurant,Insurance Office,Women's Store,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
4,Scarborough,0,Hakka Restaurant,Bakery,Caribbean Restaurant,Athletics & Sports,Bank,Gas Station,Thai Restaurant,Fried Chicken Joint,Doner Restaurant,Dog Run
6,Scarborough,0,Department Store,Bus Station,Convenience Store,Coffee Shop,Hobby Shop,Chinese Restaurant,Dumpling Restaurant,Dog Run,Doner Restaurant,Donut Shop
7,Scarborough,0,Bakery,Bus Line,Fast Food Restaurant,Park,Soccer Field,Metro Station,Bus Station,Intersection,Women's Store,Donut Shop
8,Scarborough,0,American Restaurant,Motel,Dim Sum Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Women's Store
9,Scarborough,0,College Stadium,Skating Rink,General Entertainment,Café,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
10,Scarborough,0,Indian Restaurant,Pet Store,Vietnamese Restaurant,Chinese Restaurant,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop


### Cluster 2

In [53]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
23,North York,1,Park,Bank,Convenience Store,Bar,Dumpling Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant
25,North York,1,Food & Drink Shop,Park,Women's Store,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant
30,North York,1,Park,Airport,Women's Store,Eastern European Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
31,North York,1,Grocery Store,Park,Bank,Women's Store,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
40,East York,1,Park,Metro Station,Convenience Store,Women's Store,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
44,Central Toronto,1,Park,Swim School,Bus Line,Drugstore,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Dim Sum Restaurant
50,Downtown Toronto,1,Park,Playground,Trail,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Event Space,Dumpling Restaurant,Department Store
64,Central Toronto,1,Trail,Park,Sushi Restaurant,Bus Line,Jewelry Store,Drugstore,Discount Store,Dog Run,Doner Restaurant,Donut Shop
72,North York,1,Park,Bakery,Pub,Japanese Restaurant,Drugstore,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant
74,York,1,Park,Women's Store,Market,Fast Food Restaurant,Gourmet Shop,Golf Course,Ethiopian Restaurant,Empanada Restaurant,Greek Restaurant,Electronics Store


### Cluster 3

In [54]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Scarborough,2,Playground,Convenience Store,Women's Store,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
14,Scarborough,2,Park,Coffee Shop,Playground,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
48,Central Toronto,2,Gym,Playground,Women's Store,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore


### Cluster 4

In [55]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
20,North York,3,Cafeteria,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Women's Store,College Stadium


### Cluster 5

In [56]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
94,Etobicoke,4,Filipino Restaurant,Women's Store,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant
