# Capstone Project: The Battle of Neighborhoods
## *Opening a Bubble Tea chain in Kuala Lumpur, Malaysia*

### 1. Import libraries

In [1]:
import os # standard library: read API credentials from environment variables

import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

#!pip install geocoder # un-hash to install module if unavailable
import geocoder # library to get geographical coordinates of location
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter # throttles geocoding requests

#!conda install -c conda-forge folium=0.5.0 --yes # un-hash to install module if unavailable
import folium # library for map rendering

from sklearn.cluster import KMeans # Import k-means for clustering

# Matplotlib and plotting modules
import matplotlib.cm as cm 
import matplotlib.colors as colors

### 2. Web scraping names of neighbourhoods in Kuala Lumpur from Wikipedia page

In [2]:
# Suburbs and townships of the Kuala Lumpur metropolitan area:
# GET request from Wikipedia url.
# Note: `url` holds the HTTP response object, not the address string.
# A timeout keeps the cell from hanging forever on a stalled connection.
url = requests.get("https://en.wikipedia.org/wiki/Kuala_Lumpur", timeout=30)

# Fail fast on an HTTP error instead of silently parsing an error page
url.raise_for_status()

# Define object for BeautifulSoup to parse data
soup = BeautifulSoup(url.text, 'html.parser')

In [3]:
# Find relevant table located at bottom of Wikipedia page under "Kuala Lumpur metropolitan area"
table = soup.find('table', {'class': 'nowraplinks navbox-subgroup'})

# soup.find returns None when nothing matches (e.g. the page layout changed);
# stop here with a clear message instead of an AttributeError on the next line.
if table is None:
    raise ValueError(
        "Could not find a table with class 'nowraplinks navbox-subgroup' on the Wikipedia page"
    )

# Locate every link inside the table
rows = table.find_all('a')

# Store the text of each link in a list
wiki_data = [row.text for row in rows]

In [4]:
# Build the neighborhood dataframe in one chain:
#   1. put the scraped names in a single column (default label 0),
#   2. name that column 'Neighborhood',
#   3. drop duplicated names -- keep=False removes EVERY copy of a name that
#      occurs more than once, not just the extra occurrences,
#   4. renumber the rows from 0.
kl_area_names = (
    pd.DataFrame(wiki_data)
    .rename(columns={0: "Neighborhood"})
    .drop_duplicates(keep=False)
    .reset_index(drop=True)
)

neighborhood_count = kl_area_names.shape[0]
print('Number of neighborhoods: ' + str(neighborhood_count))
kl_area_names.head()

Number of neighborhoods: 70


Unnamed: 0,Neighborhood
0,Jinjang
1,Taman Bukit Maluri
2,Bandar Menjalara
3,Bukit Kiara
4,Bukit Tunku


### 3. Geographical coordinates of each neighborhood

In [5]:
# Object for geopy geolocator.
# The public Nominatim service asks for an application-specific user agent
# (not the documentation placeholder) and for throttled requests, so identify
# this notebook and wait at least one second between lookups.
geolocator = Nominatim(user_agent="kl_bubble_tea_capstone", timeout=10)

# swallow_exceptions=False: a failed request raises instead of being turned
# into None, so network errors are not mistaken for "location not found".
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1, swallow_exceptions=False)

# Get each neighborhood's location data (None when the geocoder finds no match)
location_data = [
    geocode(neighborhood + ', Kuala Lumpur')
    for neighborhood in kl_area_names['Neighborhood']
]

# Check first 10 location data points
location_data[0:10]

[Location(Jinjang Utara, Batu, Kuala Lumpur, 52000, Malaysia, (3.21749, 101.6608685, 0.0)),
 Location(Taman Bukit Maluri, Segambut, Kuala Lumpur, 52100, Malaysia, (3.2020528, 101.6329945, 0.0)),
 Location(Bandar Menjalara, Kampung Palimbayan Indah, Kepong, Kuala Lumpur, Malaysia, (3.1941357999999997, 101.63363432715688, 0.0)),
 Location(Bukit Kiara, Kuala Lumpur, 6000, Malaysia, (3.158462, 101.6360029, 0.0)),
 Location(Bukit Tunku, Kuala Lumpur, 50480, Malaysia, (3.1709295, 101.6789455, 0.0)),
 Location(Sri Damansara Timur (U/C), Jalan Lingkaran Tengah 2, Taman Daya, Kampung Selayang Pandang, Petaling Jaya, Majlis Perbandaran Selayang, Kuala Lumpur, 52100, Malaysia, (3.2075844499999997, 101.62853338390299, 0.0)),
 Location(Bukit Damansara, Kuala Lumpur, 50490, Malaysia, (3.151148, 101.657635, 0.0)),
 Location(Lebuhraya Sprint (Hubungan Damansara), Seksyen 12, Bangsar, Kuala Lumpur, 46350, Malaysia, (3.1320107, 101.6498916, 0.0)),
 Location(Jalan Tuanku Abdul Halim, Taman Duta, Kuala Lu

*Note: Location data for a few neighborhoods was unavailable (the geocoder returned no result). These neighborhoods are omitted from the analysis.*

In [6]:
# Show one geocoding result that came back empty (position 9 in the list)
missing_example = location_data[9]
print(missing_example)

None


In [7]:
# Find index of all missing location data points.
# `is None` is an identity check, so it does not depend on how the location
# objects implement equality.
indexPosList = [i for i, location in enumerate(location_data) if location is None]

print(indexPosList)

[9, 17, 21, 28, 47, 52, 59, 63, 68]


In [8]:
# Removing neighborhoods with missing location data.
# Drop the positions computed in indexPosList rather than a hard-coded copy of
# them, so the right rows are removed even if the geocoder results change
# between runs.
kl_neighborhoods = kl_area_names.drop(index=indexPosList).reset_index(drop=True)

print('Number of neighborhoods for analysis: '+ str(kl_neighborhoods.shape[0]))

Number of neighborhoods for analysis: 61


In [9]:
# Get geographical coordinates [latitude, longitude] for the neighborhoods
# that have location data.
# Reuse the results already stored in location_data instead of geocoding every
# neighborhood a second time: this avoids a second round of network requests
# and cannot fail with an AttributeError on a lookup that returns None.
location_lat_lng = [
    [location.latitude, location.longitude]
    for location in location_data
    if location is not None
]

# The coordinates are combined with kl_neighborhoods row by row, so both must
# describe the same number of neighborhoods.
assert len(location_lat_lng) == len(kl_neighborhoods), (
    "coordinate list and kl_neighborhoods are out of sync"
)

# Check first 5 rows
location_lat_lng[0:5]

[[3.21749, 101.6608685],
 [3.2020528, 101.6329945],
 [3.1941357999999997, 101.63363432715688],
 [3.158462, 101.6360029],
 [3.1709295, 101.6789455]]

In [10]:
# Turn the [latitude, longitude] pairs into a two-column dataframe
coordinate_columns = ['Latitude', 'Longitude']
kl_coords = pd.DataFrame(data=location_lat_lng, columns=coordinate_columns)

# Place the coordinates next to the neighborhood names (rows are matched by index)
kl_df = pd.concat(objs=[kl_neighborhoods, kl_coords], axis='columns')

kl_df.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Jinjang,3.21749,101.660869
1,Taman Bukit Maluri,3.202053,101.632994
2,Bandar Menjalara,3.194136,101.633634
3,Bukit Kiara,3.158462,101.636003
4,Bukit Tunku,3.17093,101.678945


### 4. Search for bubble tea shops in each neighborhood's vicinity using Foursquare location data

In [36]:
#C_ID = 'Foursquare client ID' # Foursquare client ID (hidden to prevent account sharing)
#C_SEC = 'Foursquare client secret' # Foursquare client secret (hidden to prevent account sharing)

In [12]:
# Define Foursquare Credentials and Version.
# Use C_ID / C_SEC when an earlier cell has defined them; otherwise read the
# credentials from the environment so the notebook runs on a fresh kernel
# without any secret being stored in it.
CLIENT_ID = globals().get('C_ID') or os.environ.get('FOURSQUARE_CLIENT_ID')
CLIENT_SECRET = globals().get('C_SEC') or os.environ.get('FOURSQUARE_CLIENT_SECRET')

# Stop here with a clear message rather than sending empty credentials to the API
if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        "Foursquare credentials not found: define C_ID and C_SEC, or set the "
        "FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET environment variables"
    )

VERSION = '20200324'

In [13]:
# Setting up query, radius and limit
search_query = 'bubble tea'
radius = 3000
# NOTE(review): the recorded result (3050 rows for 61 neighborhoods) suggests
# the API returns at most 50 venues per request regardless of this value -- confirm.
LIMIT = 100

FOURSQUARE_SEARCH_URL = 'https://api.foursquare.com/v2/venues/search'

venues = []
# Loop over each neighborhood's geographical coordinates data
for lat, long, neighborhood in zip(kl_df['Latitude'], kl_df['Longitude'], kl_df['Neighborhood']):
    # Let requests build and URL-encode the query string (the search text contains a space)
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'll': '{},{}'.format(lat, long),
        'v': VERSION,
        'query': search_query,
        'radius': radius,
        'limit': LIMIT,
    }

    # Send GET request to retrieve venues data; fail on an HTTP error instead
    # of hitting a KeyError on an error payload
    response = requests.get(FOURSQUARE_SEARCH_URL, params=params, timeout=30)
    response.raise_for_status()
    results = response.json()['response']['venues']

    # Keep (neighborhood, venue name, raw category list) for every venue returned
    venues.extend(
        (neighborhood, venue['name'], venue['categories'])
        for venue in results
    )

In [14]:
# Place venue results into dataframe.
# Passing the column names to the constructor (instead of assigning them
# afterwards) also works when the search returned no venues at all.
venues_df = pd.DataFrame(venues, columns=['Neighborhood', 'VenueName', 'VenueCategory'])

print(venues_df.shape)
venues_df.head()

(3050, 3)


Unnamed: 0,Neighborhood,VenueName,VenueCategory
0,Jinjang,Yi Zhong Tang Bubble Tea,"[{'id': '52e81612bcbc57f1066b7a0c', 'name': 'B..."
1,Jinjang,Brem Mall Bubble Tea Shop,[]
2,Jinjang,bubble bubble bubble tea & waffle @ restaurant...,"[{'id': '4bf58dd8d48988d145941735', 'name': 'C..."
3,Jinjang,The Coffee Bean & Tea Leaf,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C..."
4,Jinjang,C' Tea Cafe,"[{'id': '4bf58dd8d48988d142941735', 'name': 'A..."


In [15]:
# Function that extracts the category of the venue
def get_category_type(rowz):
    """Return the name of the first category of a venue row, or None.

    Parameters
    ----------
    rowz : mapping-like (e.g. a dataframe row or a dict)
        Must provide 'VenueCategory' (or, as a fallback,
        'venues_df.VenueCategory'). The value is expected to be a list of
        dicts that each have a 'name' key.

    Returns
    -------
    str or None
        The 'name' of the first category; None when the category list is
        empty or missing. A value that is already a string is returned
        unchanged, so applying the function a second time to an
        already-processed column is harmless.

    Raises
    ------
    KeyError
        If neither key is present in ``rowz``.
    """
    try:
        categories_list = rowz['VenueCategory']
    except KeyError:
        # Only a missing key triggers the fallback; other errors propagate
        categories_list = rowz['venues_df.VenueCategory']

    if categories_list is None:
        return None
    if isinstance(categories_list, str):
        # Already extracted on an earlier run ('' is treated as "no category")
        return categories_list or None
    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']
    
# Replace each row's raw category list with the name of its first category
category_names = venues_df.apply(get_category_type, axis=1)
venues_df['VenueCategory'] = category_names
venues_df.head()

Unnamed: 0,Neighborhood,VenueName,VenueCategory
0,Jinjang,Yi Zhong Tang Bubble Tea,Bubble Tea Shop
1,Jinjang,Brem Mall Bubble Tea Shop,
2,Jinjang,bubble bubble bubble tea & waffle @ restaurant...,Chinese Restaurant
3,Jinjang,The Coffee Bean & Tea Leaf,Coffee Shop
4,Jinjang,C' Tea Cafe,Asian Restaurant


In [16]:
# Keep only the venues whose category is exactly 'Bubble Tea Shop'
is_bubble_tea_shop = venues_df['VenueCategory'].eq('Bubble Tea Shop')
bt_venues = venues_df.loc[is_bubble_tea_shop]
print(bt_venues.shape)
bt_venues.head()

(410, 3)


Unnamed: 0,Neighborhood,VenueName,VenueCategory
0,Jinjang,Yi Zhong Tang Bubble Tea,Bubble Tea Shop
14,Jinjang,Tealive,Bubble Tea Shop
22,Jinjang,Q Q Tea Bar,Bubble Tea Shop
27,Jinjang,Tealive,Bubble Tea Shop
31,Jinjang,Tealive,Bubble Tea Shop


In [17]:
# Count the distinct venue names among the bubble tea shops
bubble_tea_names = bt_venues['VenueName']
bubble_tea_names.nunique()

48

In [18]:
# Number of distinct shop names per neighborhood, as a new dataframe whose
# count column is named 'Unique Shops'
unique_bt = (
    bt_venues
    .groupby('Neighborhood')['VenueName']
    .nunique()
    .reset_index()
    .rename(columns={'VenueName': 'Unique Shops'})
)

unique_bt.head()

Unnamed: 0,Neighborhood,Unique Shops
0,Alam Damai,4
1,Ampang,2
2,Bandar Baru Sentul,4
3,Bandar Malaysia,8
4,Bandar Menjalara,6


In [19]:
# Number of bubble tea venues found per neighborhood, as a new dataframe whose
# count column is named 'Bubble Tea Shop'
bt_df = (
    bt_venues
    .groupby('Neighborhood')['VenueCategory']
    .count()
    .reset_index()
    .rename(columns={'VenueCategory': 'Bubble Tea Shop'})
)

bt_df.head()

Unnamed: 0,Neighborhood,Bubble Tea Shop
0,Alam Damai,6
1,Ampang,6
2,Bandar Baru Sentul,7
3,Bandar Malaysia,11
4,Bandar Menjalara,10


In [20]:
# Merge dataframe to show relationship between neighborhood & no. of bubble tea shop & no. of unique bubble tea shop.
# Match the rows on the 'Neighborhood' key instead of relying on both frames
# having the same row order, and start from the two base columns only so that
# re-running this cell does not append a second 'Unique Shops' column.
bt_df = bt_df[['Neighborhood', 'Bubble Tea Shop']].merge(
    unique_bt[['Neighborhood', 'Unique Shops']],
    on='Neighborhood',
    how='left',
)

bt_df

Unnamed: 0,Neighborhood,Bubble Tea Shop,Unique Shops
0,Alam Damai,6,4
1,Ampang,6,2
2,Bandar Baru Sentul,7,4
3,Bandar Malaysia,11,8
4,Bandar Menjalara,10,6
5,Bandar Sri Permaisuri,6,5
6,Bandar Tasik Selatan,11,8
7,Bangsar,6,2
8,Bangsar South,9,4
9,Brickfields,2,1


### 5. Cluster neighborhoods

In [21]:
# Set number of clusters
k = 3

# Cluster on the numeric columns only. The `columns=` keyword replaces the
# positional axis argument (`drop([...], 1)`), which pandas 2.0 no longer accepts.
cluster_df = bt_df.drop(columns=["Neighborhood"])

# Run k-means clustering (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=k, random_state=0).fit(cluster_df)

# Check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([2, 0, 2, 1, 1, 2, 1, 0, 2, 0], dtype=int32)

In [22]:
# New dataframe: the neighborhood counts plus the k-means label of each row.
# assign() returns a copy, so bt_df itself is left untouched.
bt_cluster_df = bt_df.assign(**{"Cluster Label": kmeans.labels_})

bt_cluster_df.head()

Unnamed: 0,Neighborhood,Bubble Tea Shop,Unique Shops,Cluster Label
0,Alam Damai,6,4,2
1,Ampang,6,2,0
2,Bandar Baru Sentul,7,4,2
3,Bandar Malaysia,11,8,1
4,Bandar Menjalara,10,6,1


In [23]:
# Look up each neighborhood's latitude/longitude by name and attach it to the
# clustered counts, then order the rows by cluster label
coords_by_neighborhood = kl_df.set_index("Neighborhood")
bubble_tea_kl = (
    bt_cluster_df
    .join(coords_by_neighborhood, on="Neighborhood")
    .sort_values(by="Cluster Label")
)

print(bubble_tea_kl.shape)
bubble_tea_kl

(61, 6)


Unnamed: 0,Neighborhood,Bubble Tea Shop,Unique Shops,Cluster Label,Latitude,Longitude
30,Kuala Lumpur City Centre,2,1,0,3.139619,101.693744
58,Taman U-Thant,3,3,0,3.156499,101.729820
54,Taman OUG,5,3,0,3.075488,101.670810
44,Sri Hartamas,5,3,0,3.156072,101.646406
41,Sentul,6,2,0,3.178618,101.695478
38,Perdana Botanical Gardens,3,1,0,3.143426,101.684513
35,Mont Kiara,5,3,0,3.169999,101.652147
33,Medan Tuanku,4,1,0,3.159271,101.698910
31,Kuala Lumpur Sentral,3,2,0,3.135266,101.685760
28,Kampung Sungai Penchala,4,2,0,3.162549,101.625972


### 6. Visualize the clusters formed with Folium

In [26]:
# Create map centered on Kuala Lumpur, Malaysia
kl_map_location = geolocator.geocode('Kuala Lumpur')
if kl_map_location is None:
    # The geocoder returns None when it finds no match
    raise ValueError("Could not geocode 'Kuala Lumpur' to center the map")

map_clusters = folium.Map(location=[kl_map_location.latitude, kl_map_location.longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(bubble_tea_kl['Latitude'], bubble_tea_kl['Longitude'], bubble_tea_kl['Neighborhood'], bubble_tea_kl['Cluster Label']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to k-1, so `cluster - 1` wraps label 0 around to the
    # last color; the modulo makes that wrap explicit while keeping the same
    # color assignment as before.
    marker_color = rainbow[(cluster - 1) % k]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [27]:
# Neighborhoods that k-means assigned to cluster 0
in_cluster_0 = bt_cluster_df['Cluster Label'].eq(0)
cluster_0 = bt_cluster_df.loc[in_cluster_0]
cluster_0

Unnamed: 0,Neighborhood,Bubble Tea Shop,Unique Shops,Cluster Label
1,Ampang,6,2,0
7,Bangsar,6,2,0
9,Brickfields,2,1,0
11,Bukit Kiara,3,2,0
12,Bukit Nanas,5,2,0
14,Chow Kit,5,2,0
16,Damansara Heights,4,1,0
18,Dang Wangi,4,2,0
20,Federal Hill,2,1,0
21,Jalan Cochrane,3,3,0


In [28]:
# Neighborhoods that k-means assigned to cluster 1
in_cluster_1 = bt_cluster_df['Cluster Label'].eq(1)
cluster_1 = bt_cluster_df.loc[in_cluster_1]
cluster_1

Unnamed: 0,Neighborhood,Bubble Tea Shop,Unique Shops,Cluster Label
3,Bandar Malaysia,11,8,1
4,Bandar Menjalara,10,6,1
6,Bandar Tasik Selatan,11,8,1
10,Bukit Jalil,10,9,1
19,Desa Petaling,8,7,1
23,Jinjang,10,5,1
26,Kampung Malaysia,10,8,1
36,Padang Balang,8,7,1
40,Semarak,12,6,1
42,Setapak,9,6,1


In [29]:
# Neighborhoods that k-means assigned to cluster 2
in_cluster_2 = bt_cluster_df['Cluster Label'].eq(2)
cluster_2 = bt_cluster_df.loc[in_cluster_2]
cluster_2

Unnamed: 0,Neighborhood,Bubble Tea Shop,Unique Shops,Cluster Label
0,Alam Damai,6,4,2
2,Bandar Baru Sentul,7,4,2
5,Bandar Sri Permaisuri,6,5,2
8,Bangsar South,9,4,2
13,Bukit Tunku,8,3,2
15,Damansara,8,5,2
17,Damansara Town Centre,6,3,2
24,KL Eco City,8,4,2
27,Kampung Pandan,5,4,2
29,Kerinchi,8,4,2
