# Coursera Capstone project on Battle of neighbourhood

##### ~By Debapriyo Dasgupta

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Greeting printed at the start of the notebook.
print('Hello Capstone Project Course!')

Hello Capstone Project Course!


# Data collection process

### Installing BeautifulSoup

In [2]:
!pip -q install requests
!pip -q install html5lib
!pip -q install bs4
print('Packages installed successfully!!!!')

Packages installed successfully!!!!


### Scraping website content

In [3]:
import requests
from bs4 import BeautifulSoup

In [4]:
# Download the Wikipedia list of Toronto ("M") postal codes and parse it.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
req = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page further down.
req.raise_for_status()

soup = BeautifulSoup(req.content,'lxml')

### Loading data in DataFrame

In [5]:
# Locate the first "wikitable" on the page; it holds the postal-code rows.
postal_table = soup.find("table", attrs={"class":"wikitable"})
if postal_table is None:
    raise ValueError('No table with class "wikitable" found on the scraped page')

# Some parsers do not synthesise <tbody>; fall back to the table itself.
postal_table_head = (postal_table.tbody or postal_table).find_all("tr")

postal_data = []
for table_row in postal_table_head:
    cells = [td.text.replace('\n',' ').strip() for td in table_row.find_all("td")]
    # Rows without any <td> cell (the first row in the original scrape) carry
    # no data; skip them wherever they occur instead of popping index 0.
    if cells:
        postal_data.append(cells)

#Converting the list to DataFrame
df_postal_code = pd.DataFrame(data = postal_data, columns=['PostalCode','Borough','Neighborhood'])
df_postal_code.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Cleaning the dataset
#### 1. Deleting the rows where borough is "Not assigned"

In [6]:
# Keep only the postal codes whose borough has been assigned
has_borough = df_postal_code['Borough'].ne('Not assigned')
df_postal_code = df_postal_code.loc[has_borough]
df_postal_code.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


#### 2. Merging different Neighborhoods grouped by PostalCode

In [7]:
# One row per (PostalCode, Borough): the neighborhood names of each group are
# joined with ", ". Aggregating the single column avoids groupby.apply over the
# whole group frame and the in-place reset/rename that followed it.
postal_upd = df_postal_code.groupby(['PostalCode','Borough'])['Neighborhood'].agg(', '.join)
df_postal_upd = postal_upd.reset_index()
df_postal_upd.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### 3. Replacing 'Not assigned' Neighborhood to corresponding Borough values

In [8]:
# Where the neighborhood is 'Not assigned', fall back to the borough name
unassigned = df_postal_upd['Neighborhood'].eq('Not assigned')
df_postal_upd.loc[unassigned, 'Neighborhood'] = df_postal_upd.loc[unassigned, 'Borough']

# Sanity check: this selection should now be empty
df_postal_upd[df_postal_upd['Neighborhood'].eq('Not assigned')]

Unnamed: 0,PostalCode,Borough,Neighborhood


### Finalizing the data after cleaning

In [9]:
#Final dataset: an independent copy, so columns added to df_postal_can later
#(Latitude/Longitude) do not also appear on df_postal_upd
df_postal_can = df_postal_upd.copy()

#Fetching shape of final dataset
df_postal_can.shape

(103, 3)

# Fetching latitude longitude for each postal code

In [10]:
# Preview the cleaned postal-code table before coordinates are attached.
df_postal_can.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [11]:
#Importing the Nominatim geocoder from geopy
#(the install of the separate `geocoder` package below is left commented out)
#!pip -q install geocoder
from geopy.geocoders import Nominatim

print('Geocoder package installed and imported!!!')

Geocoder package installed and imported!!!


In [13]:
## Trying to use geopy package
geolocator = Nominatim(user_agent='ca_postal')
location = geolocator.geocode('M1C, Toronto, Ontario')
# geocode() yields None when nothing matches, so guard the attribute access.
if location is None:
    print('No geocoding result returned for M1C, Toronto, Ontario')
else:
    print('Latitude = {}, Longitude = {}'.format(location.latitude, location.longitude))

### The coordinates are not appropriate: the value printed for M1C is the same
### pair the notebook later prints for 'Toronto, CA', not a postal-code location.

Latitude = 43.653963, Longitude = -79.387207


In [39]:
def getGeocode(postal_code, max_attempts=10):
    """Look up [latitude, longitude] for a Toronto postal code.

    The query '<postal_code>, Toronto, Ontario' is sent to geocoder.google and
    retried while the response carries no coordinates.

    Parameters
    ----------
    postal_code : value formatted into the query string.
    max_attempts : int, default 10
        Upper bound on the number of requests. The original loop had no bound
        and never returned when the service kept answering without coordinates.

    Returns
    -------
    The `latlng` value of the first response that has one, or None if every
    attempt came back empty.

    NOTE(review): `geocoder` is not imported anywhere in the visible cells (its
    install line is commented out); it must be in scope before this is called.
    """
    query = '{}, Toronto, Ontario'.format(postal_code)

    for _ in range(max_attempts):
        lat_lng_coords = geocoder.google(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords

    return None

In [50]:
#print(getGeocode('M1H'))
## This call ran as a never-ending process, hence the geospatial CSV file is used instead

In [12]:
## Postal-code coordinates are read from the geospatial CSV file
geospatial_csv = 'http://cocl.us/Geospatial_data'
df_latlng = pd.read_csv(geospatial_csv)
df_latlng.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
## Merging the latitude longitude information with the dataset.
## The join is keyed on the postal code value itself. The previous element-wise
## comparison of the two columns was only right while both frames had the same
## length and row order.
coords = df_latlng.rename(columns={'Postal Code': 'PostalCode'})[['PostalCode', 'Latitude', 'Longitude']]
df_postal_can = (
    df_postal_can
    # Dropping earlier coordinate columns keeps the cell safe to re-run.
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    # many_to_one makes pandas raise if the CSV lists a postal code twice.
    .merge(coords, on='PostalCode', how='left', validate='many_to_one')
)

In [14]:
# Preview the table now that Latitude and Longitude have been attached.
df_postal_can.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [15]:
# (rows, columns) of the postal-code table after the coordinate merge.
df_postal_can.shape


(103, 5)

# Exploring the data and clustering

In [16]:
#Importing important libraries
import json
from geopy.geocoders import Nominatim
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 --yes 
import folium

print('All necessary packages imported!!!')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    altair-4.0.1               |             py_0         575 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.0 MB

The following NEW packages will be 

### Create a map of Toronto and plot all the data points

In [17]:
#Fetching coordinates for Toronto
address = 'Toronto, CA'
geolocator = Nominatim(user_agent='Can_agent')
location = geolocator.geocode(address)
# The maps below are centred on `location`; fail with a clear message if the
# geocoder returned nothing instead of an AttributeError on None.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
print('The geograpical coordinate of Toronto are {}, {}.'.format(location.latitude, location.longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [18]:
# Base map centred on the geocoded Toronto coordinates
map_canada = folium.Map(location=[location.latitude,location.longitude], zoom_start=10)

# One red circle marker per postal code, with a "<neighborhood>, <borough>" popup
marker_fields = df_postal_can[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, long, borough, neighborhood in marker_fields.itertuples(index=False, name=None):
    popup = folium.Popup('{}, {}'.format(neighborhood,borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat,long],
        radius=5,
        popup=popup,
        color='red',
        fill=True,
        fill_color='#e82b09',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_canada)

map_canada

First, we consider a subset of df_postal_can containing only the rows whose Borough includes the text 'Toronto'. Once the analysis works on this subset, we extend it to the whole dataset.

In [19]:
# Subset of the data: boroughs whose name contains 'Toronto'
is_toronto_borough = df_postal_can['Borough'].str.contains('Toronto', na=False)
df_postal_toronto = df_postal_can[is_toronto_borough].reset_index(drop=True)
df_postal_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


Now lets plot the points of the subset in Toronto map

In [20]:
# Closer-zoom base map for the Toronto-borough subset
map_toronto = folium.Map(location=[location.latitude,location.longitude], zoom_start=12)

# Walk the four marker columns in lock-step and drop one circle per row
marker_columns = ('Latitude', 'Longitude', 'Borough', 'Neighborhood')
for lat, long, borough, neighborhood in zip(*(df_postal_toronto[col] for col in marker_columns)):
    label = folium.Popup('{}, {}'.format(neighborhood,borough), parse_html=True)
    circle_options = dict(
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#e82b09',
        fill_opacity=0.7,
        parse_html=False)
    folium.CircleMarker([lat,long], **circle_options).add_to(map_toronto)

map_toronto

Next we start to explore the neighborhood using the Foursquare API

#### Defining the Foursquare credentials

In [21]:
# The code was removed by Watson Studio for sharing.

Let's start to explore the 1st location 'The Beaches, East Toronto' in the sub-dataframe

In [22]:
# Pull the name and coordinates of the first row of the Toronto subset
neighborhood_name = df_postal_toronto.at[0, 'Neighborhood']
neighborhood_latitude = df_postal_toronto.at[0, 'Latitude']
neighborhood_longitude = df_postal_toronto.at[0, 'Longitude']

coordinates_message = 'Latitude and longitude values of {} are {}, {}.'
print(coordinates_message.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


#### Now, let's get the top 100 venues that are in The Beaches within a radius of 500 meters.

First we create the url for calling Foursquare API and then fetch the data

In [23]:
# Search radius in metres and maximum number of venues requested per call.
radius=500
LIMIT=100
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(
    CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, radius, LIMIT)

# A timeout keeps the cell from hanging, and raise_for_status surfaces HTTP
# errors here rather than as a KeyError when the JSON is unpacked later.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

{'meta': {'code': 200, 'requestId': '5e34dd13882fc7001b77555b'},
 'response': {'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.67682094413784,
          'lng': -79.29394208780985}],
        'distanc

Let's define a function to fetch the category of each venue.

In [24]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue row, or None.

    The category list is read from ``row['categories']``; when that label is
    missing (KeyError) it is read from ``row['venue.categories']`` instead.
    Only KeyError triggers the fallback, so any other failure is not hidden.

    Returns None when the list is None or empty, otherwise the 'name' entry of
    its first element.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        categories_list = row['venue.categories']

    if categories_list is None or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

We start to read the json and cleanly put it in dataframe

In [25]:
# Flatten the venue items of the first result group into a table
venues = results['response']['groups'][0]['items']
filtered_cols = ['venue.name','venue.categories','venue.location.lat','venue.location.lng']
nb_venues = json_normalize(venues).loc[:, filtered_cols]

# Replace each category list with the name of its first category
nb_venues['venue.categories'] = nb_venues.apply(get_category_type, axis=1)

# Keep only the last dotted segment of every column name
nb_venues = nb_venues.rename(columns=lambda col: col.split(".")[-1])

venue_count = nb_venues.shape[0]
print('Number of venues returned from Foursquare is {}'.format(venue_count))
print('Sample venue data is as below:')
nb_venues.head()

Number of venues returned from Foursquare is 4
Sample venue data is as below:


Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,Grover Pub and Grub,Pub,43.679181,-79.297215
3,Upper Beaches,Neighborhood,43.680563,-79.292869


#### Now, we repeat the same to fetch venue data for all the neighborhood in sub dataframe

We define a function to repeat the same process done for 'The Beaches, East Toronto'. We need to capture below feature

1. PostalCode
2. Borough
3. Neighborhood
4. Latitude
5. Longitude
6. VenueName
7. VenueCategory
8. VenueLatitude
9. VenueLongitude

In [26]:
def getNearbyVenues(data, radius=500):
    """Collect Foursquare "explore" venues around every row of `data`.

    Parameters
    ----------
    data : DataFrame with the columns 'PostalCode', 'Borough', 'Neighborhood',
        'Latitude' and 'Longitude'. One request is sent per row.
    radius : value placed in the request's `radius` parameter (default 500).

    Returns
    -------
    DataFrame with one row per venue and the columns PostalCode, Borough,
    Neighborhood, Latitude, Longitude, Venue, Venue Latitude, Venue Longitude,
    Venue Category. It has zero rows (but all nine columns) when nothing was
    found.

    Reads the module-level names CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.

    Postal codes with no venue items are reported with a printed message and
    skipped. A venue with an empty category list gets None as its category
    instead of raising IndexError.
    """
    columns = ['PostalCode',
               'Borough',
               'Neighborhood',
               'Latitude',
               'Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for pCode, name, nh, lat, lng in zip(data['PostalCode'], data['Borough'], data['Neighborhood'], data['Latitude'], data['Longitude']):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; tolerate a payload without groups/items
        payload = requests.get(url).json()
        groups = payload.get('response', {}).get('groups') or [{}]
        items = groups[0].get('items') or []

        if not items:
            # The old message used `+ pCode` (unary plus on a string), which
            # raised TypeError whenever this branch was reached.
            print('There is no venue details for postal code = {}'.format(pCode))
            continue

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                pCode,
                name,
                nh,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero rows,
    # where assigning `.columns` afterwards would raise.
    return pd.DataFrame(venue_rows, columns=columns)

We execute the function __getNearbyVenues__ for the sub dataframe df_postal_toronto and fetch all venue related information per neighborhood

In [27]:
# Collect venues for every row of the Toronto-borough subset
# (getNearbyVenues issues one Foursquare request per row).
df_toronto_venue = getNearbyVenues(data=df_postal_toronto)
df_toronto_venue.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,East Toronto,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,M4E,East Toronto,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


Now that we have a flavor of the dataset based on the subset dataframe, we now implement the same on the whole dataframe __df_postal_can__

In [28]:
# Same collection over the full postal-code table
# (getNearbyVenues issues one Foursquare request per row).
df_venue_details = getNearbyVenues(data=df_postal_can)
df_venue_details.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Scarborough Historical Society,43.788755,-79.162438,History Museum
3,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


In [30]:
# Number of distinct postal codes that appear in the venue table.
len(df_venue_details['PostalCode'].unique())

100

As we can see, the dataset initially had __103__ distinct postal codes, but only __100__ of them have nearby venues according to the Foursquare API response. Later we will see which 3 neighborhoods have no venues suggested by Foursquare.

### Checking for the size of the dataset

In [31]:
# Report how many venue rows were collected in total
venue_row_count = len(df_venue_details)
print('The size of complete venue detail dataset = {}'.format(venue_row_count))

The size of complete venue detail dataset = 2214


We can check how many venues are there per neighborhood

In [32]:
# Count the venue rows recorded under each Neighborhood label.
df_venue_details.groupby('Neighborhood').count()

Unnamed: 0_level_0,PostalCode,Borough,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100,100,100
Agincourt,4,4,4,4,4,4,4,4
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",2,2,2,2,2,2,2,2
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",10,10,10,10,10,10,10,10
"Alderwood, Long Branch",11,11,11,11,11,11,11,11
"Bathurst Manor, Downsview North, Wilson Heights",18,18,18,18,18,18,18,18
Bayview Village,4,4,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",24,24,24,24,24,24,24,24
Berczy Park,55,55,55,55,55,55,55,55
"Birch Cliff, Cliffside West",4,4,4,4,4,4,4,4


We can also check how many distinct categories of venues are there

In [33]:
# Count the distinct venue categories present in the venue table
distinct_categories = df_venue_details['Venue Category'].unique()
print('There are {} uniques categories.'.format(len(distinct_categories)))

There are 270 uniques categories.


# Analyzing each postal code

Let's transform the data to onehot coding so as to analyze the categories

In [34]:
# One indicator column per venue category (no prefix on the column names)
category_dummies = pd.get_dummies(df_venue_details[['Venue Category']], prefix="", prefix_sep="")

# Put PostalCode and Borough in front of the category indicator columns
toronto_onehot = pd.concat(
    [df_venue_details[['PostalCode', 'Borough']], category_dummies],
    axis=1)

toronto_onehot.head()

Unnamed: 0,PostalCode,Borough,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M1B,Scarborough,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M1C,Scarborough,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1C,Scarborough,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M1E,Scarborough,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1E,Scarborough,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


The new shape of the data

In [35]:
# (rows, columns) of the one-hot encoded venue table.
toronto_onehot.shape

(2214, 272)

#### Let's group rows by postal code and by taking the mean of the frequency of occurrence of each category

In [36]:
# Mean of every category indicator per postal code, i.e. the share of that
# postal code's venues falling in each category. The text column Borough is
# dropped explicitly: older pandas discarded it silently during mean(), newer
# pandas raises a TypeError on it.
df_venue_grouped = (
    toronto_onehot
    .drop(columns='Borough')
    .groupby('PostalCode')
    .mean()
    .reset_index()
)
df_venue_grouped.head()

Unnamed: 0,PostalCode,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Let's find the new size

In [37]:
# (rows, columns) after grouping: one row per postal code.
df_venue_grouped.shape

(100, 271)

So there are 100 distinct postal codes or 100 distinct neighborhoods distributed in different venue categories

### We now try to capture top 10 venues of each neighborhood

First we need to write a function to sort the venues of a particular postal code 

In [71]:
def return_top_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first entry of `row` is skipped (in this notebook it holds the postal
    code); the remaining values are sorted in descending order and the index
    labels of the leading ones are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

Now we create a new dataframe and display the top 10 venues for each postal code of each borough

In [72]:
num_top_venues = 10


def _ordinal(rank):
    """Return `rank` with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    if 10 <= rank % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(rank % 10, 'th')
    return '{}{}'.format(rank, suffix)


#Preparing column names dynamically: 'PostalCode', '1st Common venue', '2nd Common venue', ...
columns = ['PostalCode'] + ['{} Common venue'.format(_ordinal(rank))
                            for rank in range(1, num_top_venues + 1)]

#Rank the categories of every postal code, then build the frame in one step
#instead of filling it row by row
top_venues = [return_top_venues(df_venue_grouped.iloc[ind, :], num_top_venues)
              for ind in range(df_venue_grouped.shape[0])]

postalCode_venue_sorted = pd.DataFrame(top_venues, columns=columns[1:], index=df_venue_grouped.index)
postalCode_venue_sorted.insert(0, 'PostalCode', df_venue_grouped['PostalCode'])

postalCode_venue_sorted.head()

Unnamed: 0,PostalCode,1st Common venue,2nd Common venue,3rd Common venue,4th Common venue,5th Common venue,6th Common venue,7th Common venue,8th Common venue,9th Common venue,10th Common venue
0,M1B,Fast Food Restaurant,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Yoga Studio,Dessert Shop
1,M1C,Bar,History Museum,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant
2,M1E,Mexican Restaurant,Breakfast Spot,Electronics Store,Pizza Place,Rental Car Location,Medical Center,Intersection,Yoga Studio,Diner,Discount Store
3,M1G,Coffee Shop,Korean Restaurant,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Eastern European Restaurant
4,M1H,Caribbean Restaurant,Bakery,Fried Chicken Joint,Thai Restaurant,Athletics & Sports,Gas Station,Bank,Hakka Restaurant,Eastern European Restaurant,Dumpling Restaurant


# Cluster the neighborhoods

Now that we have top 10 venues of each neighborhood, lets try to cluster them. 

For clustering we will be using K-means Clustering and lets start by taking cluster=5

In [73]:
# Number of clusters to fit
kcluster = 5

# Feature matrix: the per-category means without the PostalCode label.
# `columns=` replaces the positional axis argument (`drop('PostalCode', 1)`),
# which newer pandas no longer accepts.
df_venue_grouped_cluster = df_venue_grouped.drop(columns='PostalCode')

#Run K-Means on df_venue_grouped_cluster (fixed random_state for repeatable labels)
kmeans = KMeans(n_clusters=kcluster, random_state=0).fit(df_venue_grouped_cluster)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10]

array([0, 4, 4, 4, 4, 4, 4, 4, 4, 4], dtype=int32)

Now let's create a new dataframe that consolidates the postal codes with their corresponding boroughs and neighborhoods, along with the top 10 venue details and the cluster labels

In [92]:
#Add clustering labels to the per-postal-code venue ranking.
#This works on a copy that first drops any existing 'Cluster' column, so the
#cell gives the same result on a fresh kernel and when re-run. The earlier
#in-place insert had to be commented out after its first execution, which left
#'Cluster' missing on Restart & Run All.
#kmeans.labels_ is in df_venue_grouped row order, the same order
#postalCode_venue_sorted was built in.
venues_with_cluster = postalCode_venue_sorted.drop(columns='Cluster', errors='ignore')
venues_with_cluster.insert(0, 'Cluster', kmeans.labels_)

#Left join: postal codes without any venue keep NaN in the joined columns
df_postal_consolidated = df_postal_can.join(venues_with_cluster.set_index('PostalCode'), on='PostalCode')

df_postal_consolidated.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster,1st Common venue,2nd Common venue,3rd Common venue,4th Common venue,5th Common venue,6th Common venue,7th Common venue,8th Common venue,9th Common venue,10th Common venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,0.0,Fast Food Restaurant,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Yoga Studio,Dessert Shop
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,4.0,Bar,History Museum,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,4.0,Mexican Restaurant,Breakfast Spot,Electronics Store,Pizza Place,Rental Car Location,Medical Center,Intersection,Yoga Studio,Diner,Discount Store
3,M1G,Scarborough,Woburn,43.770992,-79.216917,4.0,Coffee Shop,Korean Restaurant,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Eastern European Restaurant
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,4.0,Caribbean Restaurant,Bakery,Fried Chicken Joint,Thai Restaurant,Athletics & Sports,Gas Station,Bank,Hakka Restaurant,Eastern European Restaurant,Dumpling Restaurant


Let's find the distinct cluster values in the consolidated data

In [93]:
# Distinct cluster labels, in order of first appearance (NaN shows up for unmatched postal codes).
pd.unique(df_postal_consolidated['Cluster'])

array([ 0.,  4.,  2., nan,  3.,  1.])

Let's look in detail at the postal codes whose cluster value is NaN

In [94]:
# Rows that received no cluster label from the join.
df_postal_consolidated.loc[df_postal_consolidated['Cluster'].isna()]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster,1st Common venue,2nd Common venue,3rd Common venue,4th Common venue,5th Common venue,6th Common venue,7th Common venue,8th Common venue,9th Common venue,10th Common venue
16,M1X,Scarborough,Upper Rouge,43.836125,-79.205636,,,,,,,,,,,
21,M2M,North York,"Newtonbrook, Willowdale",43.789053,-79.408493,,,,,,,,,,,
93,M9A,Queen's Park,Queen's Park,43.667856,-79.532242,,,,,,,,,,,


To double-check, we can call the Foursquare API with the latitude/longitude of each of these postal codes, one by one, and see whether any venues are returned

In [95]:
radius=500
LIMIT=100
# Coordinates of postal code M9A, one of the rows that has no cluster label.
neighborhood_latitude = 43.667856
neighborhood_longitude = -79.532242
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(
    CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, radius, LIMIT)

# requests has no default timeout; without one an unresponsive server would hang the cell forever.
results = requests.get(url, timeout=10).json()
results

{'meta': {'code': 200, 'requestId': '5e34f0b947b43d002315cc9a'},
  'headerLocation': 'Edenbridge - Humber Valley',
  'headerFullLocation': 'Edenbridge - Humber Valley, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 0,
  'suggestedBounds': {'ne': {'lat': 43.6723560045, 'lng': -79.52603259414254},
   'sw': {'lat': 43.6633559955, 'lng': -79.53845140585746}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': []}]}}

__Note:__ As we can see, we receive the warning "There aren't a lot of results near you. Try something more general, reset your filters, or expand the search area." and the `items` list is empty. We can therefore treat these areas as a separate cluster.

To make the analysis easier, we relabel the NaN cluster value as cluster #5 and look at the unique cluster values now available

In [96]:
# K-Means labels run from 0 to kcluster-1, so kcluster itself (5 here) is the first unused
# label. Deriving it from kcluster keeps the "no venues" group from colliding with a real
# cluster if the number of clusters is ever changed.
no_venue_cluster = kcluster
df_postal_consolidated['Cluster'] = df_postal_consolidated['Cluster'].fillna(no_venue_cluster)
df_postal_consolidated['Cluster'].unique()

array([0., 4., 2., 5., 3., 1.])

Now that we have relabelled NaN as a new cluster (cluster #5), we can convert the 'Cluster' column to int and see what the final data looks like

In [97]:
# The column is float only because it used to hold NaN; cast it in one vectorised step
# (astype) instead of calling np.int32 on every element through apply.
df_postal_consolidated['Cluster'] = df_postal_consolidated['Cluster'].astype(np.int32)
df_postal_consolidated.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster,1st Common venue,2nd Common venue,3rd Common venue,4th Common venue,5th Common venue,6th Common venue,7th Common venue,8th Common venue,9th Common venue,10th Common venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,0,Fast Food Restaurant,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Yoga Studio,Dessert Shop
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,4,Bar,History Museum,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,4,Mexican Restaurant,Breakfast Spot,Electronics Store,Pizza Place,Rental Car Location,Medical Center,Intersection,Yoga Studio,Diner,Discount Store
3,M1G,Scarborough,Woburn,43.770992,-79.216917,4,Coffee Shop,Korean Restaurant,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Eastern European Restaurant
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,4,Caribbean Restaurant,Bakery,Fried Chicken Joint,Thai Restaurant,Athletics & Sports,Gas Station,Bank,Hakka Restaurant,Eastern European Restaurant,Dumpling Restaurant


OK. Now we can plot the neighborhoods on the map to visualize the clusters

In [105]:
# create map
map_clusters = folium.Map(location=[location.latitude, location.longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced colour from the inferno colormap
# per distinct cluster label
cluster_labels = sorted(df_postal_consolidated['Cluster'].unique())
length = len(cluster_labels)
colors_array = cm.inferno(np.linspace(0, 1, length))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Look colours up by the label itself rather than by rainbow[cluster-1], which only worked
# for cluster 0 through negative-index wraparound and assumed labels were exactly 0..length-1.
cluster_colors = dict(zip(cluster_labels, rainbow))

# add markers to the map
for lat, lon, poi, cluster in zip(df_postal_consolidated['Latitude'], df_postal_consolidated['Longitude'], df_postal_consolidated['Neighborhood'], df_postal_consolidated['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    marker_color = cluster_colors[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Now let's see the data for each cluster

#### Cluster 0

In [106]:
# Postal codes labelled as cluster 0.
df_postal_consolidated.loc[df_postal_consolidated['Cluster'].eq(0)]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster,1st Common venue,2nd Common venue,3rd Common venue,4th Common venue,5th Common venue,6th Common venue,7th Common venue,8th Common venue,9th Common venue,10th Common venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,0,Fast Food Restaurant,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Yoga Studio,Dessert Shop
102,M9W,Etobicoke,Northwest,43.706748,-79.594054,0,Drugstore,Rental Car Location,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Yoga Studio,Department Store


#### Cluster 1

In [107]:
# Postal codes labelled as cluster 1.
df_postal_consolidated.loc[df_postal_consolidated['Cluster'].eq(1)]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster,1st Common venue,2nd Common venue,3rd Common venue,4th Common venue,5th Common venue,6th Common venue,7th Common venue,8th Common venue,9th Common venue,10th Common venue
94,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ...",43.650943,-79.554724,1,Golf Course,Yoga Studio,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant


#### Cluster 2

In [108]:
# Postal codes labelled as cluster 2.
df_postal_consolidated.loc[df_postal_consolidated['Cluster'].eq(2)]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster,1st Common venue,2nd Common venue,3rd Common venue,4th Common venue,5th Common venue,6th Common venue,7th Common venue,8th Common venue,9th Common venue,10th Common venue
14,M1V,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St...",43.815252,-79.284577,2,Park,Playground,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant
23,M2P,North York,York Mills West,43.752758,-79.400049,2,Park,Bank,Convenience Store,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
30,M3K,North York,"CFB Toronto, Downsview East",43.737473,-79.464763,2,Park,Snack Place,Airport,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
40,M4J,East York,East Toronto,43.685347,-79.338106,2,Park,Rental Car Location,Intersection,Convenience Store,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2,Park,Swim School,Bus Line,Donut Shop,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore,Dessert Shop
50,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,2,Park,Playground,Trail,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop
74,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512,2,Park,Fast Food Restaurant,Market,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Eastern European Restaurant
90,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944,2,Park,River,Event Space,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Department Store,Donut Shop
98,M9N,York,Weston,43.706876,-79.518188,2,Park,Yoga Studio,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724,2,Park,Bus Line,Sandwich Place,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Falafel Restaurant


#### Cluster 3

In [109]:
# Postal codes labelled as cluster 3.
df_postal_consolidated.loc[df_postal_consolidated['Cluster'].eq(3)]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster,1st Common venue,2nd Common venue,3rd Common venue,4th Common venue,5th Common venue,6th Common venue,7th Common venue,8th Common venue,9th Common venue,10th Common venue
20,M2L,North York,"Silver Hills, York Mills",43.75749,-79.374714,3,Cafeteria,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,College Rec Center


#### Cluster 4

In [110]:
# Postal codes labelled as cluster 4.
df_postal_consolidated.loc[df_postal_consolidated['Cluster'].eq(4)]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster,1st Common venue,2nd Common venue,3rd Common venue,4th Common venue,5th Common venue,6th Common venue,7th Common venue,8th Common venue,9th Common venue,10th Common venue
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,4,Bar,History Museum,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,4,Mexican Restaurant,Breakfast Spot,Electronics Store,Pizza Place,Rental Car Location,Medical Center,Intersection,Yoga Studio,Diner,Discount Store
3,M1G,Scarborough,Woburn,43.770992,-79.216917,4,Coffee Shop,Korean Restaurant,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Eastern European Restaurant
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,4,Caribbean Restaurant,Bakery,Fried Chicken Joint,Thai Restaurant,Athletics & Sports,Gas Station,Bank,Hakka Restaurant,Eastern European Restaurant,Dumpling Restaurant
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476,4,Grocery Store,Playground,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029,4,Hobby Shop,Playground,Coffee Shop,Department Store,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577,4,Bus Line,Bakery,Park,Ice Cream Shop,Bus Station,Metro Station,Intersection,Soccer Field,Cosmetics Shop,Dog Run
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476,4,Motel,American Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Yoga Studio,Dessert Shop
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848,4,College Stadium,Skating Rink,Café,General Entertainment,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
10,M1P,Scarborough,"Dorset Park, Scarborough Town Centre, Wexford ...",43.757410,-79.273304,4,Indian Restaurant,Pet Store,Vietnamese Restaurant,Thrift / Vintage Store,Chinese Restaurant,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dog Run


#### Cluster 5

In [111]:
# Postal codes labelled as cluster 5 (the rows whose venue columns are empty).
df_postal_consolidated.loc[df_postal_consolidated['Cluster'].eq(5)]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster,1st Common venue,2nd Common venue,3rd Common venue,4th Common venue,5th Common venue,6th Common venue,7th Common venue,8th Common venue,9th Common venue,10th Common venue
16,M1X,Scarborough,Upper Rouge,43.836125,-79.205636,5,,,,,,,,,,
21,M2M,North York,"Newtonbrook, Willowdale",43.789053,-79.408493,5,,,,,,,,,,
93,M9A,Queen's Park,Queen's Park,43.667856,-79.532242,5,,,,,,,,,,
