In [1]:
#Preliminaries
from collections import namedtuple
import requests
import csv
import time
import json
import os
import pandas as pd

#For Interactive GeoVisualization
import folium
from folium import Map
from folium.map import Layer, FeatureGroup, LayerControl, Marker
from folium.plugins import MarkerCluster, FeatureGroupSubGroup, Fullscreen

#To turn our DataFrame to a GeoDataFrame
import geopandas as gpd
from shapely.geometry import Point, Polygon

### Why Google API?

Between Google API and OpenStreetMap, Google Place API returns a more updated data. This probably due to the fact that more resources are employed in Google and has more users. The caveat though is that after a certain number of use, Google starts to charge so I advise developers to read up on the cost matrix and be aare while using the API.

For the purposes of this blog, assuming you have not used Google API for the month (*the free tier refreshes monthly*), this will cost you nothing.

When testing the code, limit your API calls to a few rows so you won't run the error of running your free-tier out.

It goes without saying that in order to continue, one should register for a Google API key. 

In [2]:
#the namedtuple method returns a new subclass of tuple with named fields
#this is a convenient way of assigning names to tuples so you can later access them by name
CityCoords = namedtuple("CityCoords", ["city_name", "lat", "lng", "region"])

In [3]:
#### Google Maps Key
# GMAPS_KEY = GMAPS_KEY

#Google Place API Request
#Note that we {lat}, {long}, {radius}, {keywords} and {key} will be arguments that we need to supply in later
G_PLACES_API = "https://maps.googleapis.com/maps/api/place/nearbysearch/json?location={lat},{lng}&radius={radius}&keyword={keyword}&key={key}"

#Token to skip to next page
G_PLACES_NEXT_PAGE_TOKEN = "https://maps.googleapis.com/maps/api/place/nearbysearch/json?pagetoken={page_token}&key={key}"

In [4]:
#Indicate brands you wish to collect
coffee_brands = ["Coffee Bean and Tea Leaf", "Starbucks", "Tim Hortons", "Coffee Project"]

#### Define the Functions we will be calling on later

In [5]:
def get_cities_coordinates():
    '''
    
    This function reads a json file of places and clones instances from namedtuples.
    
    Depending on the namedtuples defined, the data will be mapped over
    
    _make = method of the namedtuple, and hence of the collections library as well. 
            It creates an instance of the namedtuples object in a namedtuple, 
            the arrangement or order is important
    
    '''
    results = []
    with open('data/philippine_cities.json', 'r') as f:
        data = json.loads(f.read())
        for row in data:
            results.append(
                CityCoords._make(
                        [
                            row["city"],
                            row["lat"],
                            row["long"],
                            row["region"]
                        ]
                    )
                )
            
    return results

In [6]:
def get_nearby_coordinates(lat: float, lng: float, adjustment=0.1):
    '''
    This function will get coordinates of the city center.
    
    Parameters:
        lat: this is the latitutde of the center of the city
        lng: this is the longitude of the center of the city
        adjustment: the number used to adjust (individually, and separately) city centers' longitude and lattitude. 
                    The resulting longitude and latitude pair becomes the coordinates of a "nearby" place
    '''
    for r in [adjustment, -adjustment]:
        yield (lat + r, lng)
    for r in [adjustment, -adjustment]:
        yield (lat, lng + r)


What's the difference between using the yield function versus using a list? The yield function generates a "generator" object which returns each output and is consider memory-efficient, because after returning the output, it does not need to store it unlike a list.

To see the output, call on the generator object and print the values:

In [7]:
#Manila Lat and Lng (14.5547, 121.0244 )
for i in get_nearby_coordinates(14.5547,  121.0244):
    print(i)

(14.6547, 121.0244)
(14.4547, 121.0244)
(14.5547, 121.1244)
(14.5547, 120.9244)


You see that it generates an output efficiently. This is what we will be using to look for nearby locations. 


The reason that we need to the nearby locations from the nearby center is that we want to make sure that we get as many coffee shops by moving our point of reference. The way Google Places API does the search is by scanning the radius from the chosen coordinates. In this case, we may not be able to get all of them at once if we just chose the city's center.

In [8]:
def get_coffee_shops(brands, lat, lng, radius=10000):
    '''
    This function gets all the coffee shops' data per brand and per cities.
    
    Parameters:
        brands: list of coffee brands to look up
        lat:  latitude of point where Google Places will center the calculation of radius on.
        lng: longitude of point where Google Places will center the calculation of radius on.
        radius: radius from the reference point to include in the search    
    '''
    total_results = []
    #Loop through the brands
    for brand in brands:
        API_results =[] #Contains the JSON Objects from Google API Queries
        keyword = brand.lower().replace(" ","+") #Turning into format acceptable by API
        response = requests.get(G_PLACES_API.format(lat=lat, lng=lng, radius=radius,keyword=keyword, key=GMAPS_KEY))
        API_results.extend(response.json()['results'])
        next_page_token = response.json().get("next_page_token")
        while True:
            if next_page_token is None:
                break
            #Sleeping for three seconds to wait for the results prepared by Google
            time.sleep(3)
            response = requests.get(
                        G_PLACES_NEXT_PAGE_TOKEN.format(key=GMAPS_KEY, page_token=next_page_token))
            next_page_token = response.json().get("next_page_token")
            API_results.extend(response.json()["results"])
        total_results.append(API_results) #Per brand, different list
    return total_results

There is a particular trade-off for the choice of radius-parameter. A larger radius may be able to get more numbers in total but there's a chance that some of these will be redundant and therefore adds to your cost unnecessarily (using 20K did not exceeed my free-tier limit). Using too little runs the risk of not getting all of them.

In [9]:
class CoffeeShopSearcher(object):
    '''
    This class will provide a dataframe of counts and lat, long per city and per brand
    
    '''
    def __init__(self, e: CityCoords):
        self.lat = float(e.lat)
        self.lng = float(e.lng)
        self.places_ids = set()
        self.main_dataframe = pd.DataFrame(data=None, columns=['id','brand','name', 'lat', 'lng', 'vicinity'])
        
    def scan(self, lat, lng):
        '''
        This function scans a single point and appends it data on the main dataframe.
        
        '''
        
        results = get_coffee_shops(coffee_brands, lat, lng)
        for i in range(len(results)):
            for j in results[i]:
                self.places_ids.add(j['place_id'])
                self.main_dataframe.loc[len(self.places_ids)] = [j['place_id'], coffee_brands[i], 
                                                                j['name'], j['geometry']['location']['lat'], 
                                                                 j['geometry']['location']['lng'],
                                                                 j['vicinity']]
    def search(self):
        for (lat, lng) in get_nearby_coordinates(self.lat, self.lng):
            self.scan(lat, lng)
        return self.main_dataframe.drop_duplicates(subset='id', keep='first')
    

### Finally Run the Codes

In [10]:
# final_df = pd.DataFrame(data=None, columns=['id','brand','name', 'lat', 'lng', 'vicinity'])
# #Use the Class to search over per City
# cities_coords = get_cities_coordinates()
# for e in cities_coords:
#     df = CoffeeShopSearcher(e).search()
#     final_df= pd.concat([final_df, df])
#     print("Total Coffee Shops Gathered for {}: {}".format(e.city_name,len(df)))

In [11]:
# final_df.to_csv("Coffee Brands Footprint.csv")

### Cleaning the dataset

The data we get from our API request may not entirely be clean. This particularly happens when the keywords you use matches other brands' keywords as well. Take for instance, Coffee Bean and Tea Leaf where coffee may match other coffee shops as well.

In [12]:
final_df = pd.read_csv('Coffee Brands Footprint.csv',usecols=["brand", 'name','lat', 'lng', 'vicinity'])
final_df.head()


Unnamed: 0,brand,name,lat,lng,vicinity
0,Coffee Bean and Tea Leaf,The Coffee Bean and Tea Leaf,15.164039,120.609505,"Aniceto Gueco St, Angeles"
1,Coffee Bean and Tea Leaf,The Coffee Bean & Tea Leaf,15.168922,120.580243,"G/F SM City Clark, Manuel A. Roxas Hwy, Clark ..."
2,Coffee Bean and Tea Leaf,The Coffee Bean & Tea Leaf,15.169827,120.578192,"Tech Hub, SM Clark, Clark Freeport, Angeles"
3,Coffee Bean and Tea Leaf,The Coffee Bean and Tea Leaf,15.167694,120.564239,"Clark Freeport, Mabalacat"
4,Starbucks,Starbucks,15.177889,120.530057,"Manuel A. Roxas Hwy, Clark Freeport, Mabalacat"


In [13]:
#Setting the value of those with the Starbucks name as "Starbucks Brand" etc.
#empty brands for fresh cleaning
final_df.loc[:, "brand"] = None


#Replace keywords in name
#You can use a dictionary to replace the values
#Turn on Regex=True for partial replacement
d = {"&": 'and'}
final_df.replace(to_replace =d.keys(), 
                 value =d.values(), regex=True, inplace=True) 


for i in coffee_brands:
    final_df.loc[final_df.name.str.lower().str.contains(i.lower()),"brand"] = i

#Save the final df to include only brands that are belong to the ones in your list
df = final_df[~final_df.brand.isna()]


The final count of our coffee shops in the Philippines:

In [14]:
df.brand.value_counts()

Starbucks                   432
Coffee Bean and Tea Leaf    147
Tim Hortons                  47
Coffee Project               45
Name: brand, dtype: int64

In [15]:
# trial = get_coffee_shops(coffee_brands, 14.5547, 121.0244, radius=10000)

## GEOVISUALIZATION THROUGH FOLIUM

First, to use Folium, we have to turn our dataframe into a GeoDataframe:

In [16]:
#Make list of Points to use as  Geometry Column
geometry = [Point(xy) for xy in zip(df['lng'], df['lat'])]

#Let us create GeoDataFrame with df data and list of Point Geometries
gdf = gpd.GeoDataFrame(df, geometry=geometry)

In [17]:
#The folium.Map instantiates a folium map object with given parameters
coffee_map = folium.Map(location = [14.5540,120.9752],
                       zoom_start=5, #from experience, this zoom level captures the whole PH
                       tiles='CartoDB dark_matter',
                       control_scale=True,
                       prefer_canvas=True)
#Add Fullscreen Control
Fullscreen(
            title="Full Screen",
            title_cancel = "Exit fullscreen",
            force_separate_button=True).add_to(coffee_map)

coffee_map


In [18]:
#Let's Create a Color Dictionary for the Markers
color_dict = {
    "Starbucks": ' #00704A',
    "Coffee Bean and Tea Leaf": '#362d26',
    "Coffee Project": '#654321',
    "Tim Hortons": '#dd0f2d'
}

In [19]:
#Since we need to add a filter for this, we need to add them by brand:

for brand in coffee_brands:
    df_i = gdf[gdf['brand']==brand]
    df_i.loc[:, "color"] = color_dict[brand]
    
    #Let's add a marker feature - clustering
    marker_cluster = MarkerCluster(control=False) #False so it will not appear as a layer
    marker_cluster.add_to(coffee_map)
    sub_group = FeatureGroupSubGroup(marker_cluster, name=brand, control=True, show=True)
    
    #Popup Contents
    for index, row in df_i.iterrows():
        
        html = """
        
        <h3>{title}</h3><br>
        <b> {brand}</b><br>
          {vicinity}<br>
        
        """
        
        popup_contents = folium.Html(html.format(title = df_i.loc[index, 'name'],
                                                     brand = row.brand,
                                                     vicinity = row.vicinity),
                                         script = True)
    
    
        popup = folium.Popup(popup_contents, max_width=2650)
    
        folium.vector_layers.CircleMarker(radius = 8,
                                                  location = (row.geometry.y,
                                                              row.geometry.x),
                                                  popup = popup,
                                                  color = row.color,
                                                  fill = True,
                                                  fill_color = row.color,
                                                  name = brand,
                                                  control = True,
                                                  overlay = True
                                                 ).add_to(sub_group)
        sub_group.add_to(coffee_map)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


Let's add some control so we can filter according to the coffee brands:

In [20]:
folium.map.LayerControl(collapsed=True, position='topright').add_to(coffee_map)
coffee_map