## Segmenting and Clustering Neighborhood in Toronto

## *Data Collection using BeautifulSoup4*

## *Install BeautifulSoup4 and lxml before runing the code*
## *Use your own Foursquares crendentials to run the* 

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html_content = requests.get(url).text
soup = BeautifulSoup(html_content, "lxml")
table = soup.find("table")

output_rows = []
for table_row in table.findAll('tr'):
    columns = table_row.findAll('td')
    output_row = []
    for column in columns:
        output_row.append(column.text)
    output_rows.append(output_row)

df=pd.DataFrame(output_rows, columns=['Postcode','Borough','Neighborhood'])
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n


In [2]:
df.shape

(288, 3)

## *Data pre-processing* 

In [3]:
df=df.drop(index=0,axis=0)   #remove null row
df=df[df.Borough != 'Not assigned']   #remove rows with unassigned Borough
df.Neighborhood=df.Neighborhood.str.strip('"\n"')  #remove '\n' from string
df=df.reset_index(drop=True)   #reset index

In [4]:
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [5]:
df.shape

(210, 3)

In [6]:
ungrouped_df=df

In [7]:
ungrouped_df.shape

(210, 3)

In [8]:
ungrouped_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


Replace Neiborhood with 'Not assigned'

In [9]:
ungrouped_df['Neighborhood'].replace('Not assigned', ungrouped_df['Borough'], inplace=True)

In [10]:
ungrouped_df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Queen's Park,Queen's Park
6,M9A,Downtown Toronto,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [11]:
print("All Neighborhood assigned, e.g. Queen's Park to Queen's Park ")

All Neighborhood assigned, e.g. Queen's Park to Queen's Park 


In [12]:
ungrouped_df.shape

(210, 3)

Grouped by postcode and combining Neighborhood

In [13]:
grouped=ungrouped_df.groupby(["Postcode","Borough"])["Neighborhood"].apply(lambda Neighborhood: ",".join(Neighborhood))


In [14]:
grouped.reset_index()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


In [15]:
type(grouped)

pandas.core.series.Series

In [16]:
grouped_df=pd.DataFrame(grouped).reset_index()  #generate new DF for the grouped 

In [17]:
grouped_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [18]:
grouped_df.shape

(103, 3)

## *Read Geo Data*

In [19]:
geo_df=pd.read_csv('Geospatial_Coordinates.csv')

In [20]:
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [21]:
geo_df.shape

(103, 3)

### Sort two DFs in the same order of "Postcode"

In [22]:
geo_sorted=geo_df.sort_values(by="Postal Code")

In [23]:
grouped_sorted=grouped_df.sort_values(by="Postcode")

In [24]:
geo_sorted.shape

(103, 3)

In [25]:
grouped_sorted.shape

(103, 3)

In [26]:
combined_df=pd.concat([grouped_sorted, geo_sorted[["Latitude", "Longitude"]]], axis=1)

In [27]:
combined_df.head(15)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


In [28]:
combined_df.shape

(103, 5)

## Extract Toronto boroughs 

In [29]:
Tor_bor=combined_df[combined_df.Borough.str.contains("Toronto")].reset_index(drop=True) 

In [30]:
Tor_bor

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049


In [31]:
Tor_bor.groupby('Neighborhood').count()

Unnamed: 0_level_0,Postcode,Borough,Latitude,Longitude
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Adelaide,King,Richmond",1,1,1,1
Berczy Park,1,1,1,1
"Brockton,Exhibition Place,Parkdale Village",1,1,1,1
Business Reply Mail Processing Centre 969 Eastern,1,1,1,1
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",1,1,1,1
"Cabbagetown,St. James Town",1,1,1,1
Central Bay Street,1,1,1,1
"Chinatown,Grange Park,Kensington Market",1,1,1,1
Christie,1,1,1,1
Church and Wellesley,1,1,1,1


In [32]:
Tor_bor.groupby('Neighborhood').count().shape

(39, 4)

In [33]:
print('The dataframe has {} boroughs and {} Neighborhood (Postcode)'.format(
        len(Tor_bor['Borough'].unique()),
         len(Tor_bor['Neighborhood'].unique())
    )
)

The dataframe has 4 boroughs and 39 Neighborhood (Postcode)


In [34]:
from geopy.geocoders import Nominatim

In [35]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.653963, -79.387207.


In [36]:
import folium # map rendering library

map_tor = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(Tor_bor['Latitude'], Tor_bor['Longitude'], Tor_bor['Borough'], Tor_bor['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_tor)  
    
map_tor

Define Foursquare Credential and Version
### Change to your own ID to run it

In [63]:
CLIENT_ID = 'your ID' # your Foursquare ID
CLIENT_SECRET = 'your secet' # your Foursquare Secret
VERSION = '20191228' # Foursquare API version

# Queen's Park location
latitude = 43.667856	
longtitude = -79.532242

In [64]:
LIMIT = 200
radius = 1000

# url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, radius, LIMIT)
# url
# version and laittude orders are different, will change the later stage pandas columns 
# use the answer below 

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=your ID&client_secret=your secet&v=20191228&ll=43.667856,-79.387207&radius=1000&limit=200'

In [65]:
results = requests.get(url).json()
results

{'meta': {'code': 400,
  'errorType': 'invalid_auth',
  'errorDetail': 'Missing access credentials. See https://developer.foursquare.com/docs/api/configuration/authentication for details.',
  'requestId': '5e0827d95fb726001ba3f6e3'},
 'response': {}}

In [66]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [67]:
import json # library to handle JSON files
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON convert it to dataframe
print(type(nearby_venues))

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row, combine categories and venue.categories to venue categoris
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

KeyError: 'groups'

In [0]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

In [0]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']

    return(nearby_venues)

In [0]:
tor_venues = getNearbyVenues(names=Tor_bor['Neighborhood'],
                                   latitudes=Tor_bor['Latitude'],
                                   longitudes=Tor_bor['Longitude']
                                  )



In [0]:
print(tor_venues.shape)
tor_venues.head()

In [0]:
tor_venues.groupby("Neighborhood").count().shape

no venues for Queesn's Park

In [0]:
print('There are {} uniques categories.'.format(len(tor_venues['Venue Category'].unique())))

In [0]:
# one hot encoding
tor_onehot = pd.get_dummies(tor_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
tor_onehot['Neighborhood'] = tor_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [tor_onehot.columns[-1]] + list(tor_onehot.columns[:-1])
tor_onehot = tor_onehot[fixed_columns]

tor_onehot.head()

In [0]:
tor_onehot.shape

In [0]:
tor_grouped = tor_onehot.groupby('Neighborhood').mean().reset_index()
tor_grouped   #this dataframe will be used in K-mean fit

In [0]:
tor_grouped.shape

In [0]:
num_top_venues = 5

for hood in tor_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = tor_grouped[tor_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

In [0]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

# return column name,row's index.values are column names.

In [0]:
import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind])) # 1st,2nd,3rd
    except:
        columns.append('{}th Most Common Venue'.format(ind+1)) #4th and after

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = tor_grouped['Neighborhood']

for ind in np.arange(tor_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(tor_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()
neighborhoods_venues_sorted.iloc[1,:].index.values



In [0]:
# set number of clusters
kclusters = 5

tor_grouped_clustering = tor_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tor_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

In [0]:
tor_grouped.tail()

In [0]:
Tor_bor1=Tor_bor[Tor_bor.Neighborhood != "Queen's Park"]

In [0]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster_Labels', kmeans.labels_)

tor_merged = Tor_bor1

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
tor_merged = tor_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

tor_merged.head() # check the last columns!


In [0]:
# tor_merged["Cluster_Labels"]=tor_merged["Cluster_Labels"].astype('Int64')

In [0]:
tor_merged.head() # check the last columns!

In [0]:
tor_merged.tail()

In [0]:
import matplotlib.colors as colors
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(tor_merged['Latitude'], tor_merged['Longitude'], tor_merged['Neighborhood'], tor_merged['Cluster_Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters