In [1]:
# Standard library
import json # library to handle JSON files
import math
import os # read API credentials from environment variables
import random

# Third-party
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import requests # library to handle requests
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

In [2]:
# Foursquare credentials are read from the environment so that secrets are
# never stored in (or shared along with) the notebook. Set
# FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE: any credentials that were previously committed in this cell should be
# treated as compromised and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not (CLIENT_ID and CLIENT_SECRET):
    # Say so up front instead of letting the first API call fail obscurely.
    print('Foursquare credentials not found: set FOURSQUARE_CLIENT_ID and '
          'FOURSQUARE_CLIENT_SECRET. API queries will fail until they are set.')

Our region of study will be a square, 25 kilometers by 25 kilometers, centered at 34.052200° N, 118.243700° W. These coordinates were obtained by searching Google for "los angeles lat/lon". 

We will use a spherical earth approximation to compute the latitudes and longitudes of the boundaries of this square, and the latitudes and longitudes of random points within this square. According to this approximation, one degree of latitude is 10<sup>7</sup>/90 meters, and one degree of longitude is cos(latitude)\*10<sup>7</sup>/90 meters.

In [12]:
# Center of the study area (Los Angeles), in decimal degrees.
# The longitude is negative because the point lies west of Greenwich.
la_lat = 34.052200
la_lon = -118.243700
# Half the side length of the square study area
square_half_side = 12500 #meters

In [13]:
def north_east_of_la(north, east, center=None):
    '''Compute the latitude and longitude of a point in our
    study area, given its distance north/south and east/west
    of the center point.

    Uses the spherical-earth approximation in which one degree of
    latitude is 10**7/90 meters and one degree of longitude is
    cos(latitude)*10**7/90 meters. The cosine is evaluated at the
    latitude of the target point, not of the center.

    Inputs:
        north: float; distance north of center point, in meters.
            Use negative for points south of the center.
        east: float; distance east of center point, in meters.
            Use negative for points west of the center.
        center: optional tuple (latitude, longitude), in degrees, to
            measure from. Defaults to the study center (la_lat, la_lon).

    Returns: tuple (latitude, longitude), in degrees
    '''
    if center is None:
        # Looked up at call time so that later changes to the notebook's
        # center constants are honored.
        center = (la_lat, la_lon)
    center_lat, center_lon = center

    lat = center_lat + north*90/10**7
    # math.radians replaces a hand-typed, truncated value of pi.
    lon = center_lon + east*90/(10**7*math.cos(math.radians(lat)))
    return (lat, lon)

In [14]:
# Corners of the square study area. The generator yields them in the
# order NW, NE, SW, SE: the north/south sign varies slowest and the
# east/west sign fastest.
nw_corner, ne_corner, sw_corner, se_corner = (
    north_east_of_la(ns_sign*square_half_side, ew_sign*square_half_side)
    for ns_sign in (1, -1)
    for ew_sign in (-1, 1)
)

In [79]:
# Show the study area: a map centered on the study center with the
# outline of the square drawn on it. The first corner is repeated at the
# end of the list so that the outline closes.
map_of_study = folium.Map(zoom_start=11, location=[la_lat, la_lon])
folium.PolyLine(
    locations=[nw_corner, ne_corner, se_corner, sw_corner, nw_corner]
    ).add_to(map_of_study)
map_of_study

#### Let's create a function to query Foursquare for venues

In [17]:
def get_nearby_venues(x, y, radius=500, limit=100):
    '''
    Query Foursquare for venues near a particular point within the 
    study area, and return the IDs and categories of the venues. 
    Note that if limit > 100, Foursquare will return at most 100 venues.
    
    Inputs: 
        x: (float) x coordinate of the query point, in meters east 
            of the study center point
        y: (float) y coordinate of the query point, in meters north 
            of the study center point
        radius: (float) query radius, in meters
        limit: (int) maximum number of results to return
        
    Returns: a Series with venue IDs as the index and venue
        categories as the values. A venue that Foursquare lists
        without any category gets the value None.

    Raises: KeyError or IndexError if the response does not have the
        expected structure (for example, an API error response).
    '''
        
    (lat, lon) = north_east_of_la(y, x)
    # Let requests assemble and encode the query string rather than
    # building it by hand.
    response = requests.get(
        'https://api.foursquare.com/v2/venues/explore',
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lon),
            'radius': radius,
            'limit': limit,
        })
    results = response.json()["response"]['groups'][0]['items']
    
    # include only relevant information for each nearby venue;
    # collect into a dict first and build the Series once
    category_by_id = {}
    for v in results:
        venue = v['venue']
        venue_categories = venue.get('categories') or []
        category_by_id[venue['id']] = (
            venue_categories[0]['name'] if venue_categories else None)
        
    # An explicit dtype keeps the empty case well defined.
    return pd.Series(category_by_id, dtype=object)

We test `get_nearby_venues` at 50 random points within the study area:

In [18]:
# Query 50 uniformly random points in the study square and print how
# many venues came back for each. limit=150 exceeds the 100-venue cap
# noted in get_nearby_venues, so 100 is the largest count we can see.
for i in range(50):
    sample_x = random.uniform(-square_half_side, square_half_side)
    sample_y = random.uniform(-square_half_side, square_half_side)
    nv = get_nearby_venues(sample_x, sample_y, radius=1000, limit=150)
    print(len(nv))

90
22
63
100
81
46
29
86
12
27
18
17
14
64
10
100
100
100
76
6
24
6
53
39
14
61
23
16
25
18
14
100
24
92
39
56
20
100
100
78
9
26
21
65
6
4
6
13
4
9


We see that several times, Foursquare returned 100 venues. Most likely, this means there were *n* > 100 venues within the query radius. We know that Foursquare is limited to returning 100 venues, but we don't know how Foursquare chooses 100 out of the *n* venues. Its selection rule may cause an unknown bias. Therefore, when a query returns 100 venues, we do not use these results. However, if we simply ignore these results, we are undersampling areas with a high density of venues. To avoid this problem, whenever a query returns 100 venues, we replace it with four more queries with half the radius. These four queries cover the same total area as the original query.

In [21]:
def get_nearby_venues_unbiased(x, y, radius=1000):
    '''
    Query Foursquare for venues near a particular point within the 
    study area, and return the IDs and categories of the venues. 
    Note that Foursquare will return at most 100 venues. 
    
    To ensure that Foursquare does not bias the results with
    unknown selection rules, we ignore the results of queries
    that return 100 venues. Instead, we execute queries for 
    four nearby points, using a radius half as large. This is
    applied recursively, and the combined results are de-duplicated
    by venue ID.

    If the response is not valid JSON or lacks the expected
    structure, the raw response text is printed and the query is
    treated as having returned no venues.
    
    Inputs: 
        x: (float) x coordinate of the query point, in meters east 
            of the study center point
        y: (float) y coordinate of the query point, in meters north 
            of the study center point
        radius: (float) query radius, in meters
        
    Returns: a DataFrame with a row for each venue found, with
        columns 'id' and 'category'. A venue that Foursquare lists
        without any category gets the category None.
    '''
        
    limit = 100
    (lat, lon) = north_east_of_la(y, x)
    # make the GET request; requests assembles and encodes the query string
    response = requests.get(
        'https://api.foursquare.com/v2/venues/explore',
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lon),
            'radius': radius,
            'limit': limit,
        })
    try:
        results = response.json()["response"]['groups'][0]['items']
    except (ValueError, KeyError, IndexError, TypeError):
        # ValueError: body is not JSON. KeyError/IndexError/TypeError:
        # JSON without the expected structure (e.g. an API error).
        print(response.text)
        results = []
        
    if len(results) < limit:
        # include only relevant information for each nearby venue;
        # collect the records first and build the frame once
        records = []
        for v in results:
            venue = v['venue']
            venue_categories = venue.get('categories') or []
            records.append(
                {'id': venue['id'],
                 'category': (venue_categories[0]['name']
                              if venue_categories else None)
                })
        nearby_venues = pd.DataFrame(records, columns=['id', 'category'])
        print(len(nearby_venues))
    else:
        # The result set was capped, so split into four half-radius queries.
        # NOTE(review): there is no minimum radius, so a very dense spot
        # could recurse deeply; consider adding a floor.
        quadrants = [get_nearby_venues_unbiased(x + i*radius/2, y + j*radius/2, radius/2)
                     for i in [-1, 1] for j in [-1, 1]
                    ]
        nearby_venues = (pd.concat(quadrants, axis=0, ignore_index=True)
                         .drop_duplicates(subset=['id'], keep='first')
                        )
    
    return nearby_venues

We call `get_nearby_venues_unbiased` repeatedly at random locations until we have at least 10,000 distinct venues. We use `drop_duplicates` to ensure they are distinct.

In [22]:
#Collect required number of venues
venues_to_find = 10000
radius = 1000

#Query centers lie within a smaller square, so that the query
#circle stays within the study area
query_square_half_side = square_half_side - radius

venues = pd.DataFrame(columns=['id', 'category'])
centers_chosen = 0
while len(venues) < venues_to_find:
    random_x = random.uniform(-query_square_half_side,
                              query_square_half_side)
    random_y = random.uniform(-query_square_half_side,
                              query_square_half_side)
    centers_chosen += 1
    new_venues = get_nearby_venues_unbiased(random_x, random_y, radius)
    #pd.concat replaces DataFrame.append, which is deprecated and no
    #longer exists in pandas 2.0; the result is the same
    venues = (pd.concat([venues, new_venues], ignore_index=True, sort=False)
              .drop_duplicates(subset=['id'], keep='first')
             )
    #Progress: (query centers so far, venues from this center,
    #distinct venues so far)
    print((centers_chosen, len(new_venues), len(venues)))

39
(1, 39, 39)
23
42
44
39
(2, 148, 187)
29
(3, 29, 216)
13
(4, 13, 229)
17
(5, 17, 246)
21
(6, 21, 267)
24
(7, 24, 291)
11
(8, 11, 302)
79
(9, 79, 381)
5
(10, 5, 386)
27
(11, 27, 413)
95
31
21
15
(12, 162, 575)
70
(13, 70, 645)
59
(14, 59, 704)
27
(15, 27, 731)
90
(16, 90, 821)
15
(17, 15, 836)
34
(18, 34, 869)
28
(19, 28, 897)
30
(20, 30, 927)
87
(21, 87, 1014)
12
(22, 12, 1026)
50
72
71
50
(23, 243, 1169)
29
(24, 29, 1198)
33
(25, 33, 1231)
93
9
64
20
(26, 186, 1411)
66
(27, 66, 1477)
8
(28, 8, 1485)
32
(29, 32, 1493)
21
(30, 21, 1513)
82
(31, 82, 1581)
59
(32, 59, 1629)
59
(33, 59, 1685)
88
(34, 88, 1773)
12
(35, 12, 1783)
26
(36, 26, 1805)
50
(37, 50, 1855)
20
(38, 20, 1875)
30
(39, 30, 1889)
21
(40, 21, 1900)
48
(41, 48, 1928)
54
(42, 54, 1982)
39
(43, 39, 2021)
17
(44, 17, 2027)
30
(45, 30, 2039)
28
(46, 28, 2067)
41
(47, 41, 2079)
25
(48, 25, 2093)
17
(49, 17, 2104)
73
(50, 73, 2177)
26
(51, 26, 2203)
16
(52, 16, 2212)
30
(53, 30, 2240)
37
(54, 37, 2250)
42
61
24
48
1
54
79
(55

45
(418, 45, 9852)
89
(419, 89, 9868)
90
(420, 90, 9869)
12
(421, 12, 9876)
81
(422, 81, 9878)
21
(423, 21, 9879)
27
(424, 27, 9884)
36
19
1
72
13
4
56
(425, 201, 9887)
22
(426, 22, 9889)
57
(427, 57, 9897)
32
(428, 32, 9909)
5
(429, 5, 9909)
23
(430, 23, 9916)
17
(431, 17, 9919)
51
(432, 51, 9921)
42
(433, 42, 9921)
51
(434, 51, 9924)
58
60
0
8
(435, 126, 9937)
15
30
53
2
21
37
92
(436, 250, 10016)


In [23]:
# Write the collected venues (id and category) to disk
venues.to_csv('venues.csv')

In [24]:
# Summary of the collected venues: row count, number of distinct values,
# and the most frequent value in each column
venues.describe()

Unnamed: 0,id,category
count,10016,10016
unique,10016,451
top,5ab455de8d0a531174f5a5f5,Mexican Restaurant
freq,1,429


We have found a total of 10,016 venues. It will be useful to know how many are in each category:

In [26]:
# Count the venues in each category and save the tally to disk
categories = venues.loc[:, 'category'].value_counts()
categories.to_csv('categories.csv')

In [54]:
# Give venues a contiguous 0..n-1 index (drop_duplicates left gaps), keeping
# the old labels in an 'index' column. The guard makes the cell safe to
# re-run: an unguarded second reset_index would add a 'level_0' column, and
# a third would raise because 'level_0' already exists.
if 'index' not in venues.columns:
    venues = venues.reset_index()

In the cell below, for each of these 10,016 venues, we query Foursquare for its "next venues," which means the most common venues that a user checks in at after checking in at this venue. This takes about two hours.

Occasionally, while the cell below is executing, Foursquare fails to return a valid result. In this case, the code below will print the JSON and continue. Allow the cell to execute until it finishes. Then replace `for i in venues.index:` with `for i in venues[pd.isna(venues['num next'])].index:` and re-execute the cell. This will repeat the queries that were not successful.

In [58]:
for i in venues.index:
    
#for i in venues[pd.isna(venues['num next'])].index:

    venue_id = venues.loc[i, 'id']
    url = 'https://api.foursquare.com/v2/venues/{}/nextvenues?&client_id={}&client_secret={}&v={}'.format(
            venue_id,
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION)
    response = requests.get(url)
    try:
        json_received = response.json()
    except ValueError:
        # Body was not valid JSON; keep the raw text for the diagnostic print
        json_received = response.text
    try:
        next_info = json_received['response']['nextVenues']
        num_next_venues = next_info['count']
        # Extract every category before touching the frame, so that a
        # malformed response never leaves a half-written row behind.
        next_categories = []
        for next_venue in next_info['items'][:num_next_venues]:
            next_venue_categories = next_venue.get('categories') or []
            next_categories.append(
                next_venue_categories[0]['name']
                if next_venue_categories else None)
    except (KeyError, TypeError):
        # Not the expected structure (e.g. an API error): report and move
        # on. 'num next' stays missing so the row can be retried later.
        print(i)
        print(json_received)
    else:
        for j, next_category in enumerate(next_categories):
            venues.loc[i, 'next' + str(j + 1)] = next_category
        # Written last: a row counts as done (for the retry filter on
        # 'num next') only once all of its next-venue columns are in place.
        venues.loc[i, 'num next'] = num_next_venues
        
    if i%100 == 0:
        # Periodic checkpoint
        print(i)
        venues.to_csv('venuesWithNext.csv')

# Final save, so that rows processed after the last checkpoint are on disk too
venues.to_csv('venuesWithNext.csv')

10000


In [62]:
# Summary of every column, numeric and non-numeric alike
venues.describe(include='all')

Unnamed: 0,index,id,category,num next,next1,next2,next3,next4,next5
count,10016.0,10016,10016,10016.0,4825,3788,3187,2768,2494
unique,,10016,451,,229,227,220,219,221
top,,5ab455de8d0a531174f5a5f5,Mexican Restaurant,,Coffee Shop,Coffee Shop,Coffee Shop,Coffee Shop,Coffee Shop
freq,,1,429,,368,250,241,181,156
mean,5007.591753,,,1.703474,,,,,
std,2891.672768,,,2.116934,,,,,
min,0.0,,,0.0,,,,,
25%,2503.75,,,0.0,,,,,
50%,5007.5,,,0.0,,,,,
75%,7511.25,,,4.0,,,,,


Let's look at venues in the Sports Bar category.

In [82]:
# Worked example: select the venues whose category is 'Sports Bar'
category = 'Sports Bar'
group = venues.loc[venues['category'] == category]
group

Unnamed: 0,index,id,category,num next,next1,next2,next3,next4,next5
436,436,54642cba498e50ec78b74e43,Sports Bar,4.0,Cocktail Bar,Shopping Plaza,Burger Joint,Dumpling Restaurant,
2646,2646,4c33afec452620a10d02240f,Sports Bar,4.0,Baseball Stadium,Baseball Field,Clothing Store,Lounge,
2650,2650,535c92de498eacad03e66b58,Sports Bar,1.0,Baseball Stadium,,,,
3591,3591,4c2035fdb306c928eaac69b7,Sports Bar,2.0,Casino,Outlet Mall,,,
6004,6004,5a29fc9f9d6a191b4ec7caf4,Sports Bar,2.0,Basketball Stadium,Basketball Court,,,
6014,6014,4f31d5efe4b057434ce9c989,Sports Bar,1.0,Basketball Stadium,,,,
7243,7243,561db7a3498e594adb2fa222,Sports Bar,1.0,Basketball Stadium,,,,
8186,8186,5875af81d4ab736b8e785bac,Sports Bar,2.0,College Football Field,Soccer Stadium,,,
9182,9182,52f7dd7111d22027eea4c47f,Sports Bar,5.0,Basketball Stadium,General Entertainment,Movie Theater,Rock Club,Bar
9451,9451,4aff7f64f964a520193922e3,Sports Bar,5.0,Korean Restaurant,Asian Restaurant,Korean Restaurant,Karaoke Bar,Seafood Restaurant


Each row is a venue. Where do people go after visiting this venue? For each venue, Foursquare has given us up to five likely answers. We don't care about the identities of these venues, so we have saved their categories only. These categories are in columns next1 through next5.

We start by counting occurrences of categories in the column next1:

In [83]:
# Number of venues in the group for which each category is listed first
# among the next venues
group['next1'].value_counts()

Basketball Stadium        4
Baseball Stadium          2
College Football Field    2
Casino                    1
Korean Restaurant         1
Cocktail Bar              1
Name: next1, dtype: int64

We repeat this for columns next2 through next5, and combine the results:

In [84]:
MNPV = 5 #Maximum number of results from a nextvenues query (determined by Foursquare)
# One column of category counts per rank (next1 .. next<MNPV>), aligned
# on category name; a category absent at a given rank shows as NaN.
group_scores = pd.concat(
    [group['next{}'.format(rank)].value_counts()
     for rank in range(1, MNPV + 1)],
    axis=1, sort=False)
group_scores

Unnamed: 0,next1,next2,next3,next4,next5
Basketball Stadium,4.0,,,,
Baseball Stadium,2.0,,,,
College Football Field,2.0,,,,
Casino,1.0,,,,
Korean Restaurant,1.0,,1.0,,
Cocktail Bar,1.0,,,,
Asian Restaurant,,1.0,,,
Basketball Court,,1.0,,,
General Entertainment,,1.0,,,
Baseball Field,,1.0,,,


We'd like to add up the total number of times people went to these other venues after a sports bar. Foursquare doesn't give us these numbers, but it does order the next venues using these numbers in descending order. Therefore we give higher weight to those earlier in the list. We make a wild guess that the best weights for columns next1 through next5 are 10, 9, 8, 7, and 6 respectively. Here are the weighted scores: 

In [85]:
WFFR = 10 #Weight given to the first result of a nextvenues query (my choice)
# Same per-rank counts as before, scaled by a weight that drops by one
# per rank: WFFR for next1, WFFR - 1 for next2, and so on.
group_scores = pd.concat(
    [group['next{}'.format(rank)].value_counts()*(WFFR + 1 - rank)
     for rank in range(1, MNPV + 1)],
    axis=1, sort=False)
group_scores

Unnamed: 0,next1,next2,next3,next4,next5
Basketball Stadium,40.0,,,,
Baseball Stadium,20.0,,,,
College Football Field,20.0,,,,
Casino,10.0,,,,
Korean Restaurant,10.0,,8.0,,
Cocktail Bar,10.0,,,,
Asian Restaurant,,9.0,,,
Basketball Court,,9.0,,,
General Entertainment,,9.0,,,
Baseball Field,,9.0,,,


We get the total score for each category by adding across the rows:

In [86]:
# Collapse the per-rank columns into one total score per category
# (missing entries are skipped by sum)
group_scores = group_scores.sum(axis='columns')
group_scores

Basketball Stadium        40.0
Baseball Stadium          20.0
College Football Field    20.0
Casino                    10.0
Korean Restaurant         18.0
Cocktail Bar              10.0
Asian Restaurant           9.0
Basketball Court           9.0
General Entertainment      9.0
Baseball Field             9.0
Shopping Plaza             9.0
Outlet Mall                9.0
Sports Bar                 9.0
Soccer Stadium             9.0
Movie Theater              8.0
Rock Club                 15.0
Clothing Store             8.0
Burger Joint               8.0
Karaoke Bar                7.0
Gastropub                  7.0
Dumpling Restaurant        7.0
Lounge                     7.0
Bar                        6.0
Seafood Restaurant         6.0
American Restaurant        6.0
dtype: float64

Finally, we convert these scores to percentages by dividing by the total, and list the top five in descending order:

In [87]:
NRPC = 5 #Number of results to output per category (my choice)
# Express each score as a percentage of the total, highest first,
# and keep the top NRPC categories
group_percent = (group_scores*100/group_scores.sum()).sort_values(ascending=False)
leaders = group_percent.head(NRPC)
leaders

Basketball Stadium        14.545455
Baseball Stadium           7.272727
College Football Field     7.272727
Korean Restaurant          6.545455
Rock Club                  5.454545
dtype: float64

Thus, we estimate that someone at a sports bar has a 14.5 percent chance of next visiting a basketball stadium, and that the next four most likely categories are baseball stadium, college football field, Korean restaurant, and rock club.

Now we repeat this computation for all categories. We will only include categories where at least 10 venues have next-venue data, with a total of at least 20 next venues.

In [78]:
results_columns = ['# of Venues', '# of Venues With Next',
                   'Total # of Next Venues']
for i in range(1, NRPC + 1):
    results_columns += ['Next Category #' + str(i), 'Pct. #' + str(i)]

# Gather one record per qualifying category and build the frame once at
# the end. Enlarging a frame cell by cell with .loc is slow and leaves
# every column with object dtype.
result_labels = []
result_rows = []
for category, group in venues.groupby('category'):
    num_venues = len(group)
    # Venues in this category for which at least one next venue is known
    num_with_next = int((group['num next'] > 0).sum())
    if num_with_next < 10:
        continue
    total_next = int(group['num next'].sum())
    if total_next < 20:
        continue
    row = {'# of Venues': num_venues,
           '# of Venues With Next': num_with_next,
           'Total # of Next Venues': total_next}
    # Weighted count of each next-venue category: weight WFFR for next1,
    # dropping by one per rank, summed over the ranks
    group_scores = pd.concat(
        [group['next' + str(i + 1)].value_counts()*(WFFR - i)
         for i in range(MNPV)],
        axis=1, sort=False).sum(axis=1)
    group_percent = (group_scores*100/group_scores.sum()).sort_values(ascending=False)
    leaders = group_percent.head(NRPC)
    # Iterate over (category, percent) pairs directly, so nothing depends
    # on the default column labels produced by reset_index
    for rank, (next_category, percent) in enumerate(leaders.items(), start=1):
        row['Next Category #' + str(rank)] = next_category
        row['Pct. #' + str(rank)] = percent
    result_labels.append(category)
    result_rows.append(row)

# Categories with fewer than NRPC leaders get NaN in the unused columns
results_df = pd.DataFrame(result_rows, index=result_labels,
                          columns=results_columns)
    
print(results_df.shape)
results_df

(122, 13)


Unnamed: 0,# of Venues,# of Venues With Next,Total # of Next Venues,Next Category #1,Pct. #1,Next Category #2,Pct. #2,Next Category #3,Pct. #3,Next Category #4,Pct. #4,Next Category #5,Pct. #5
Accessories Store,24,12,40,Clothing Store,28.9941,Sporting Goods Shop,11.8343,Coffee Shop,7.10059,Men's Store,5.02959,Outlet Mall,5.02959
American Restaurant,158,102,397,Coffee Shop,7.01006,Shopping Mall,6.40049,Bar,5.02895,Grocery Store,4.8156,Multiplex,3.6879
Art Gallery,83,42,130,Art Gallery,20.0909,Bar,5.90909,Art Museum,5.81818,Gastropub,3.09091,Coffee Shop,3.09091
Art Museum,19,13,58,Art Museum,28.9979,Coffee Shop,11.7271,Sculpture Garden,7.46269,Historic Site,7.03625,New American Restaurant,7.03625
Arts & Crafts Store,34,17,64,Shopping Mall,12.2411,Coffee Shop,6.96798,Big Box Store,6.59134,Bookstore,5.08475,Flower Shop,4.89642
Asian Restaurant,87,42,130,Coffee Shop,12.9061,Bubble Tea Shop,8.12274,Grocery Store,6.31769,Dessert Shop,6.22744,Big Box Store,6.22744
BBQ Joint,39,18,71,Ice Cream Shop,8.00681,Dessert Shop,7.15503,Coffee Shop,5.96252,Café,5.62181,Shopping Mall,4.08859
Bakery,162,80,271,Coffee Shop,10.1054,Shopping Mall,7.16169,Grocery Store,6.72232,Bakery,4.56942,Café,4.08612
Bank,78,57,191,Grocery Store,14.5601,Coffee Shop,13.8166,Supermarket,9.60347,Pharmacy,9.23172,Shopping Mall,5.70012
Bar,109,91,392,Bar,19.7806,Gastropub,4.51411,Cocktail Bar,3.94984,Lounge,3.44828,Mexican Restaurant,3.41693


Thus, for 122 venue categories, we have recommendations for targeting advertising to people at those venues.