## Segemnting and Clustering of Tonronto Neighborhoods

In [1]:
!pip install beautifulsoup4
!pip install folium



In [3]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium # map rendering library

### Part 1-Load wiki data with BeautifulSoup

In [4]:
# define a function to read wiki data
def get_wikiData():  
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"}

    r = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
    content = r.content
    soup = BeautifulSoup(content)
    #print(soup)

    alls = []
    # get the table data
    tab = soup.find('table', attrs={'class':'wikitable sortable'})
    
    for tr in tab('tr'): # for each row
        # get all the td items and assign them
        tds = tr('td')
        if(len(tds) > 0):
            postalCode = tds[0].text.rstrip()
            borough = tds[1].text.rstrip()
            neighborhood = tds[2].text.rstrip()

            # ignore any unassigned borough
            if(borough != 'Not assigned'):
                # if neighborhood is 'Not assigned', it uses borough name
                if(neighborhood == 'Not assigned'):
                    neighborhood = str(borough)
                # add the new record 
                alls.append([postalCode, borough, neighborhood])
    
    return alls

In [5]:
# get wiki data 
results= get_wikiData()

# convert to DataFrame
neighDF = pd.DataFrame(results,columns=['Postal Code','Borough','Neighborhood'])
#neighDF.to_csv('GTA_neighborhoods.csv', index=False, encoding='utf-8')
neighDF.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
neighDF.shape

(103, 3)

### Part2 - Get Latitude/Longitude data for each neighborhood

Use Geocode to get the latitude/longitude for each postal code area

In [7]:
import geocoder # import geocoder

geoList=[]

for code in neighDF['Postal Code']:
    g = geocoder.arcgis('{}, Toronto, Ontario'.format(code))
    #print(code, g.latlng)
    while (g.latlng is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(code))
        #print(code, g.latlng)
    geoList.append([code,g.latlng[0],g.latlng[1]])
geoDF = pd.DataFrame(geoList, columns=['Postal Code','Latitude','Longitude'])
geoDF.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M3A,43.75245,-79.32991
1,M4A,43.73057,-79.31306
2,M5A,43.65512,-79.36264
3,M6A,43.72327,-79.45042
4,M7A,43.66253,-79.39188
...,...,...,...
98,M8X,43.65319,-79.51113
99,M4Y,43.66659,-79.38133
100,M7Y,43.64869,-79.38544
101,M8Y,43.63278,-79.48945


In [8]:
# Inner-join the two dataframes on key "Postal Code"
neighDF2 = neighDF.merge(geoDF, on='Postal Code', how="inner")
neighDF2

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.65319,-79.51113
99,M4Y,Downtown Toronto,Church and Wellesley,43.66659,-79.38133
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.64869,-79.38544
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.63278,-79.48945


In [9]:
neighDF2.shape

(103, 5)

### Part3 Clustering of Tonronto neighborhoods

Note: Anayalyze the neighborhoods only in the city of Toronto

#### 3.1 Get the boroughs which contains Toronto

In [12]:
neighDF2 = neighDF2[neighDF2['Borough'].str.contains('Toronto') ].sort_values('Postal Code').reset_index(drop=True)
neighDF2

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.67709,-79.29547
1,M4K,East Toronto,"The Danforth West, Riverdale",43.68375,-79.35512
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.66797,-79.31467
3,M4M,East Toronto,Studio District,43.66213,-79.33497
4,M4N,Central Toronto,Lawrence Park,43.72843,-79.38713
5,M4P,Central Toronto,Davisville North,43.71276,-79.38851
6,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.71458,-79.40668
7,M4S,Central Toronto,Davisville,43.7034,-79.38659
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.69048,-79.38318
9,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.68568,-79.40237


#### 3.2 Explore Neighborhoods in Toronto

Define Foursquare Credentials and Version

In [13]:
CLIENT_ID = 'FNWRY3S2LREZ2SBJZTYYIHXBFWK1TPLQCRCJ4YJ3CJLOBWBB' # your Foursquare ID
CLIENT_SECRET = 'JDGNKOY24KFLEZFYWAX3UXN3G3EXB40PR4DLXK5N3QJBOZBF' # your Foursquare Secret
VERSION = '20180604' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)


Your credentails:
CLIENT_ID: FNWRY3S2LREZ2SBJZTYYIHXBFWK1TPLQCRCJ4YJ3CJLOBWBB
CLIENT_SECRET:JDGNKOY24KFLEZFYWAX3UXN3G3EXB40PR4DLXK5N3QJBOZBF


Let's create a function to repeat the same process to all the neighborhoods

In [14]:
def getNearbyVenues(codes, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for code, lat, lng in zip(codes, latitudes, longitudes):
        #print(code)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            code, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postal Code', 
                  'Latitude', 
                  'Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [15]:
# get all the venues in Toronto
toronto_venues = getNearbyVenues(codes=neighDF2['Postal Code'],latitudes=neighDF2['Latitude'],longitudes=neighDF2['Longitude'])
toronto_venues

Unnamed: 0,Postal Code,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4E,43.67709,-79.29547,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,43.67709,-79.29547,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,M4E,43.67709,-79.29547,Grover Pub and Grub,43.679181,-79.297215,Pub
3,M4E,43.67709,-79.29547,Upper Beaches,43.680563,-79.292869,Neighborhood
4,M4K,43.68375,-79.35512,Dairy Queen,43.684223,-79.357062,Ice Cream Shop
...,...,...,...,...,...,...,...
1747,M7Y,43.64869,-79.38544,The Underground Dance Centre,43.649740,-79.388045,Dance Studio
1748,M7Y,43.64869,-79.38544,Baba Geddo,43.649944,-79.389321,Mediterranean Restaurant
1749,M7Y,43.64869,-79.38544,Mr Burrito,43.649992,-79.389282,Burrito Place
1750,M7Y,43.64869,-79.38544,Me Va Me,43.650055,-79.390555,Middle Eastern Restaurant


In [16]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 232 uniques categories.


In [17]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Postal Code'] = toronto_venues['Postal Code'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Postal Code,Accessories Store,Afghan Restaurant,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4K,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
toronto_grouped = toronto_onehot.groupby('Postal Code').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Postal Code,Accessories Store,Afghan Restaurant,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.0,0.039216,0.019608,0.0,0.0,0.0,0.039216,0.0,...,0.0,0.0,0.0,0.0,0.019608,0.0,0.0,0.0,0.0,0.0
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
toronto_grouped.shape

(38, 233)

Define a function to sort the venues in descending order.

In [20]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each postalcode area.

In [21]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postal Code']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Postal Code'] = toronto_grouped['Postal Code']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,Health Food Store,Pub,Trail,Neighborhood,Yoga Studio,Eastern European Restaurant,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm
1,M4K,Bus Line,Cosmetics Shop,Coffee Shop,Park,Intersection,Discount Store,Grocery Store,Ice Cream Shop,Business Service,Fish & Chips Shop
2,M4L,Park,Pizza Place,Italian Restaurant,Pub,Restaurant,Coffee Shop,Sandwich Place,Movie Theater,Fast Food Restaurant,Burrito Place
3,M4M,Pizza Place,Brewery,Diner,Italian Restaurant,Coffee Shop,Bakery,Sushi Restaurant,Arts & Crafts Store,Gastropub,American Restaurant
4,M4N,Bus Line,Swim School,Yoga Studio,Food,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm


#### 3.3 Run k-means to cluster the neighborhood into 5 clusters.

In [22]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Postal Code', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 2, 0, 3, 0, 1, 0])

In [23]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = neighDF2

# merge toronto_grouped with toronto_data to add latitude/longitude for each postalCode
toronto_merged = toronto_merged.merge(neighborhoods_venues_sorted, on='Postal Code', how='inner')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.67709,-79.29547,0,Health Food Store,Pub,Trail,Neighborhood,Yoga Studio,Eastern European Restaurant,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm
1,M4K,East Toronto,"The Danforth West, Riverdale",43.68375,-79.35512,0,Bus Line,Cosmetics Shop,Coffee Shop,Park,Intersection,Discount Store,Grocery Store,Ice Cream Shop,Business Service,Fish & Chips Shop
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.66797,-79.31467,0,Park,Pizza Place,Italian Restaurant,Pub,Restaurant,Coffee Shop,Sandwich Place,Movie Theater,Fast Food Restaurant,Burrito Place
3,M4M,East Toronto,Studio District,43.66213,-79.33497,0,Pizza Place,Brewery,Diner,Italian Restaurant,Coffee Shop,Bakery,Sushi Restaurant,Arts & Crafts Store,Gastropub,American Restaurant
4,M4N,Central Toronto,Lawrence Park,43.72843,-79.38713,2,Bus Line,Swim School,Yoga Studio,Food,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm


#### 3.4 Visualize the clustering on map

In [25]:
# create map of Toronto
latitude = 43.6532
longitude = -79.3832
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    #cluster = int(cluster)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters