In [1]:
# import packages
import os

import folium
import matplotlib.pyplot as plt
import pandas as pd
import requests
from geopy.distance import geodesic

In [2]:
# Load the CSV file holding the geospatial data of the Colombo district and
# preview its first rows.
censusCsvPath = 'colomboCensusProcessed.csv'
geospatialData = pd.read_csv(censusCsvPath)
geospatialData.head()

Unnamed: 0,DS,GN,GN Number,Both sexes,Male,Female,Latitude,Longitude
0,Colombo,Sammanthranapura,,7829,4017,3812,6.978943,79.877983
1,Colombo,Mattakkuliya,,28003,14029,13974,6.971672,79.878683
2,Colombo,Modara,,17757,8794,8963,6.966925,79.871153
3,Colombo,Madampitiya,,12970,6505,6465,6.961663,79.875184
4,Colombo,Mahawatta,,8809,4367,4442,6.958307,79.873084


In [3]:
# Drop the columns this analysis does not need (GN number and population counts).
# The frame is rebound to the result instead of being modified in place.
unwantedColumns = ['GN Number', 'Both sexes', 'Male', 'Female']
geospatialData = geospatialData.drop(columns=unwantedColumns)

In [4]:
# Reference point for Colombo, stored as [latitude, longitude]; it is passed
# as-is to geopy and folium further down.
colomboLocation = [
    6.926523627315386,   # latitude
    79.85483152436545,   # longitude
]

In [5]:
# Add a column holding the geodesic distance (in km) from each GN location to
# the Colombo reference point.
# The column is built in a single assignment so pandas stores it as float64;
# pre-filling it with None and writing the values cell by cell left it as an
# object column, which makes the numeric comparisons further down slower and
# more fragile.
geospatialData['Distance from Colombo'] = [
    geodesic([latitude, longitude], colomboLocation).km
    for latitude, longitude in zip(geospatialData['Latitude'], geospatialData['Longitude'])
]
geospatialData.head()

Unnamed: 0,DS,GN,Latitude,Longitude,Distance from Colombo
0,Colombo,Sammanthranapura,6.978943,79.877983,6.336563
1,Colombo,Mattakkuliya,6.971672,79.878683,5.646016
2,Colombo,Modara,6.966925,79.871153,4.818306
3,Colombo,Madampitiya,6.961663,79.875184,4.489975
4,Colombo,Mahawatta,6.958307,79.873084,4.052534


In [6]:
# Keep only the locations lying strictly between 5 km and 15 km from Colombo:
#   - locations more than 15 km away are not considered good sites for
#     apartment buildings because of traffic conditions;
#   - locations closer than 5 km are dropped because land prices are sky high.
distanceFromColombo = geospatialData['Distance from Colombo']
withinDistanceBand = (distanceFromColombo < 15) & (distanceFromColombo > 5)
geospatialData = geospatialData[withinDistanceBand]
geospatialData.head()

Unnamed: 0,DS,GN,Latitude,Longitude,Distance from Colombo
0,Colombo,Sammanthranapura,6.978943,79.877983,6.336563
1,Colombo,Mattakkuliya,6.971672,79.878683,5.646016
40,Kolonnawa,Halmulla,6.95349,79.89548,5.391848
46,Kolonnawa,Kittampahuwa,6.943663,79.900379,5.378471
48,Kolonnawa,Maha Buthgamuwa B,6.945843,79.909475,6.405504


In [7]:
# Visualize the selected locations on an interactive map.
def addCircleMarker(targetMap, location, color, popup=None):
    """Add a small filled circle marker to ``targetMap``.

    Parameters:
        targetMap: folium map the marker is added to.
        location:  [latitude, longitude] of the marker centre.
        color:     colour used for both the outline and the fill.
        popup:     optional folium.Popup attached to the marker.
    """
    # parse_html is intentionally not passed here: it is an option of
    # folium.Popup, not of CircleMarker.
    folium.CircleMarker(
        location,
        radius=2,
        popup=popup,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.7,
        ).add_to(targetMap)


# create a map object centred on Colombo
colomboMap = folium.Map(location=colomboLocation, zoom_start=11)
# add one blue marker per GN location, labelled with the GN name
for latitude, longitude, gnName in zip(geospatialData['Latitude'], geospatialData['Longitude'], geospatialData['GN']):
    # parse_html=True makes the popup treat the GN name as text, not HTML
    addCircleMarker(
        colomboMap,
        [latitude, longitude],
        'blue',
        popup=folium.Popup(gnName, parse_html=True))
# visualize the base location in red
addCircleMarker(colomboMap, colomboLocation, 'red')
# display the map
colomboMap

In [20]:
# Foursquare API configuration used to look up venues near the selected locations.
# The credentials are read from the environment instead of being written into
# the notebook, so they do not end up in version control or shared copies.
# Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hardcoded here has been exposed
# and should be revoked/rotated in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 50 # A default Foursquare API limit value

if not CLIENT_ID or not CLIENT_SECRET:
    # Warn right here so a missing configuration is noticed before the
    # venue requests are sent.
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will be rejected.')

In [21]:
# Query the Foursquare "explore" endpoint for every selected GN location and
# collect the nearby venues.
EXPLORE_URL = 'https://api.foursquare.com/v2/venues/explore'
SEARCH_RADIUS_M = 1500   # only consider a 1,500 m radius around the location due to reachability
REQUEST_TIMEOUT_S = 10   # do not let a stalled connection hang the notebook

# `venues` holds one list per GN location; each inner list contains tuples of
# (GN, GN latitude, GN longitude, venue name, venue latitude, venue longitude,
#  venue category).
venues = []
# loop through the df to get the nearby venue details
for latitude, longitude, gn in zip(geospatialData['Latitude'], geospatialData['Longitude'], geospatialData['GN']):
    # Query parameters are passed via `params` so requests handles the URL
    # encoding and the credentials are not assembled into a URL string by hand.
    response = requests.get(
        EXPLORE_URL,
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(latitude, longitude),
            'radius': SEARCH_RADIUS_M,
            'limit': LIMIT,
        },
        timeout=REQUEST_TIMEOUT_S)
    # Fail with a clear HTTP error (e.g. rejected credentials, quota exceeded)
    # instead of a KeyError while digging through the JSON payload.
    response.raise_for_status()
    groups = response.json().get('response', {}).get('groups', [])
    results = groups[0]['items'] if groups else []
    # append venue details to the list; venues with an empty category list
    # are skipped because indexing categories[0] would raise IndexError
    venues.append([(
        gn,
        latitude,
        longitude,
        venue['venue']['name'],
        venue['venue']['location']['lat'],
        venue['venue']['location']['lng'],
        venue['venue']['categories'][0]['name'])
        for venue in results if venue['venue']['categories']])

In [23]:
# Flatten the per-location venue lists into a single table, one row per venue.
# The column names are handed to the constructor rather than assigned to
# `.columns` afterwards: when no venues were collected at all this yields an
# empty frame with the expected columns instead of a length-mismatch ValueError.
venueColumnNames = ['GN',
                    'GN Latitude',
                    'GN Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
nearbyVenues = pd.DataFrame(
    [item for venue in venues for item in venue],
    columns=venueColumnNames)
nearbyVenues.head()

Unnamed: 0,GN,GN Latitude,GN Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Sammanthranapura,6.978943,79.877983,Pegasus Reef Hotel,6.982707,79.884876,Hotel Bar
1,Sammanthranapura,6.978943,79.877983,Crow Island Beach,6.973375,79.86941,Beach
2,Sammanthranapura,6.978943,79.877983,MoMo,6.974219,79.887215,Restaurant
3,Sammanthranapura,6.978943,79.877983,Pizza Hut,6.986227,79.889353,Pizza Place
4,Sammanthranapura,6.978943,79.877983,The Royal Boat Restaurant,6.971901,79.885855,Restaurant


In [33]:
# One-hot encode the venue categories: one indicator column per category,
# named after the category itself (no prefix).
venueCategoryDummies = pd.get_dummies(nearbyVenues[['Venue Category']], prefix="", prefix_sep="")
# attach the GN name of every venue row
venueCategoryDummies['GN'] = nearbyVenues['GN']
# move the GN column to the front, keeping the category columns in their order
columnNames = ['GN'] + venueCategoryDummies.columns.drop('GN').tolist()
colomboVenues = venueCategoryDummies[columnNames]
colomboVenues.head()

Unnamed: 0,GN,Accessories Store,Airport,Airport Terminal,Aquarium,Arcade,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Tourist Information Center,Track,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Warehouse Store,Women's Store,Zoo
0,Sammanthranapura,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Sammanthranapura,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Sammanthranapura,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Sammanthranapura,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Sammanthranapura,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Group the venues by GN and average the one-hot columns, so every value is
# the share of that GN's venues belonging to the category; GN is then turned
# back into an ordinary column.
colomboGroupedVenues = colomboVenues.groupby('GN').mean().reset_index()
colomboGroupedVenues.head()

Unnamed: 0,GN,Accessories Store,Airport,Airport Terminal,Aquarium,Arcade,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Tourist Information Center,Track,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Warehouse Store,Women's Store,Zoo
0,Ambathale,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Angulana North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0
2,Angulana South,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0
3,Arangala,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0
4,Aruppitiya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# define a function to get the most common venues
def getMostCommonVenues(row, venueCount):
    """Return the labels of the ``venueCount`` largest values in ``row``.

    The first element of ``row`` is skipped (the callers in this notebook pass
    rows whose first entry is the GN name). The remaining entries are sorted
    in descending order and the index labels of the leading ``venueCount``
    entries are returned as a pandas Index. If fewer entries are available,
    all of them are returned.
    """
    return row.iloc[1:].sort_values(ascending=False).index[:venueCount]

In [40]:
# Create the list of column names for the df that stores the most common venues:
# 'GN' followed by '1st Most Common Venue', '2nd Most Common Venue', ...
indicators = ["st", "nd", "rd"]
# number of most common venue types kept per selected GN location
venueCount = 10


def ordinalSuffix(number):
    """Return the English ordinal suffix ('st', 'nd', 'rd' or 'th') for ``number``."""
    # 11, 12 and 13 (also 111, 212, ...) take 'th' despite ending in 1, 2, 3
    if 11 <= number % 100 <= 13:
        return 'th'
    lastDigit = number % 10
    if 1 <= lastDigit <= len(indicators):
        return indicators[lastDigit - 1]
    return 'th'


columnNames = ['GN'] + [
    '{}{} Most Common Venue'.format(rank, ordinalSuffix(rank))
    for rank in range(1, venueCount + 1)
]

In [41]:
# Build the df of most common venues: one row per GN, holding the GN name
# followed by its `venueCount` top venue categories, under the prepared
# column names.
commonVenueRows = [
    [gn] + list(getMostCommonVenues(colomboGroupedVenues.iloc[position, :], venueCount))
    for position, gn in enumerate(colomboGroupedVenues['GN'])
]
colomboCommonVenues = pd.DataFrame(
    commonVenueRows,
    columns=columnNames,
    index=colomboGroupedVenues.index)
colomboCommonVenues.head()

Unnamed: 0,GN,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Ambathale,Bus Station,Aquarium,Clothing Store,Park,Accessories Store,Nature Preserve,Office,Optical Shop,Outdoors & Recreation,Pakistani Restaurant
1,Angulana North,Bus Station,Department Store,Snack Place,Train Station,Chinese Restaurant,Electronics Store,Platform,Bakery,Accessories Store,Optical Shop
2,Angulana South,Train Station,Department Store,Chinese Restaurant,Bus Station,Accessories Store,Performing Arts Venue,Nature Preserve,Office,Optical Shop,Outdoors & Recreation
3,Arangala,Convenience Store,Market,Video Store,Sri Lankan Restaurant,Recreation Center,Japanese Restaurant,Music Venue,Nature Preserve,Office,Optical Shop
4,Aruppitiya,Chinese Restaurant,Supermarket,Bakery,Indian Restaurant,Grocery Store,Pizza Place,Fast Food Restaurant,Bar,Clothing Store,Shopping Mall
