In [2]:
################################################################################                                                                                                                                                              
################################################################################
#
# FILE: price-instrument-construction.ipynb
#
# BY: Dmitry Sedov 
#
# CREATED: Sun Mar 29 2020
#
# DESC: This code constructs an instrument for each restaurant's price from the
#       prices of nearby restaurants.
#
# EXEC:
#      
################################################################################
################################################################################

In [3]:
############################### Libraries ######################################

import sqlalchemy as db
import pandas as pd

import numpy as np

import os
import json

################################################################################

In [4]:
######################### Constants and settings ###############################

# SQLAlchemy connection URL for the PostgreSQL database.
# NOTE(review): {user}, {user_pass}, {host} and {dataname2} are unfilled
# placeholders and nothing visible in this notebook formats them -- confirm how
# the real values are supplied before running.
database = 'postgresql://{user}:{user_pass}@{host}/{dataname2}'

# Folder that receives the exported neighbor dataset.
output_folder_path = '/home/user/projects/urban/data/output/spatial-demand/restaurants-direct' 

# SQL: build a temporary table with each restaurant's id and a PostGIS geography
# point made from its longitude/latitude (SRID 4326), then add a GiST index on
# that point column.
create_restaurants_with_geo_statement = """
CREATE TEMPORARY TABLE restaurants_with_geo AS
    SELECT
        sname_place_id,
        ST_SetSRID(ST_Point(longitude, latitude),4326)::geography AS location
    FROM
        restaurants AS r
;
-- Create spatial index
CREATE INDEX restaurants_with_geo_location_idx
ON restaurants_with_geo
USING GIST (location);
"""

# SQL: build a temporary table pairing each restaurant with every *other*
# restaurant within a distance of 1000 (presumably meters, since the columns are
# geography values -- NOTE(review): confirm), together with the pair's distance.
# The LEFT JOIN keeps restaurants that have no such neighbor; their neighbor_id
# and distance are NULL.
# The table name is misspelled ("restautants"); the ranking query that reads it
# uses the same spelling, so the two must be changed together.
spatial_join_statement = """
CREATE TEMPORARY TABLE restautants_joined_by_distance AS (
    SELECT 
        r1.sname_place_id,
        r2.sname_place_id AS neighbor_id,
        ST_Distance(r1.location, r2.location) AS distance
    FROM
        restaurants_with_geo AS r1
    LEFT JOIN
        restaurants_with_geo AS r2
    ON
        ST_DWithin(r1.location, r2.location, 1000) AND
        r1.sname_place_id != r2.sname_place_id
);
"""

# SQL: keep, for every restaurant, the rows ranked 1 to 5 by distance within the
# joined table. The variable name says "2", but the filter is rank <= 5.
# neighbor_id is a secondary sort key: ROW_NUMBER() assigns tied distances an
# arbitrary order, so without it the set of selected neighbors could differ
# between runs.
select_2_closest_neightbours = """
SELECT 
    *
FROM (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY sname_place_id ORDER BY distance, neighbor_id) AS rank
    FROM
        restautants_joined_by_distance
    ) AS rank_filtered
WHERE
    rank <= 5
ORDER BY 
    sname_place_id;
"""

# SQL: pull restaurant characteristics from the restaurants table.
# `phone` is a boolean flag (yelp_phone IS NOT NULL); cbsa and cbg are returned
# under the names r_cbsa and r_cbg.
restaurants_table_statement = """
SELECT
    r.sname_place_id,
    r.brands,
    r.naics_code,
    r.categories,
    r.price,
    r.rating,
    r.review_count,
    r.area_m2,
    r.total_minutes_open,
    r.yelp_phone IS NOT NULL AS phone,
    r.state,
    r.zip_code,
    r.cbsa AS r_cbsa,
    r.cbg AS r_cbg,
    r.latitude,
    r.longitude
FROM
    restaurants as r
;
"""

################################################################################

In [5]:
########################## Run SQL, export data ################################

engine = db.create_engine(database)

# The statements below create TEMPORARY tables, which exist only inside the
# database session that created them. Everything therefore runs on one explicit
# connection instead of engine-level calls that borrow pooled connections.
# The context manager closes the connection and the finally clause releases the
# pool even if a statement fails.
try:
    with engine.connect() as connection:
        print('Creating restaurants_with_geo table.')
        # text() makes the raw SQL executable on a Connection in both
        # SQLAlchemy 1.x and 2.x.
        connection.execute(db.text(create_restaurants_with_geo_statement))
        print('Doing the spatial join.')
        connection.execute(db.text(spatial_join_statement))

        print('Exporting.')
        # Read on the same connection so the temporary tables are visible.
        restaurant_neighbors = pd.read_sql(select_2_closest_neightbours, connection)
        restaurants = pd.read_sql(restaurants_table_statement, connection)
finally:
    engine.dispose()

################################################################################

Creating restaurants_with_geo table.
Doing the spatial join.
Exporting.


In [6]:
# Preview the first rows of the ranked neighbor pairs.
restaurant_neighbors.head()

Unnamed: 0,sname_place_id,neighbor_id,distance,rank
0,sg:0000151bf1ed4fe3a39f1b27327821bf,sg:6e6d5a396c584e87bcdb5b987eae22e3,113.654197,1
1,sg:0000151bf1ed4fe3a39f1b27327821bf,sg:7f0565bb533c449b8172206b2d5d21f2,129.211783,2
2,sg:0000151bf1ed4fe3a39f1b27327821bf,sg:3c665ffb8fd14fa29430247bf33ca2b5,142.778952,3
3,sg:0000151bf1ed4fe3a39f1b27327821bf,sg:9af7a6a41b884562a27ccbc5b6bd09bd,253.470636,4
4,sg:0000151bf1ed4fe3a39f1b27327821bf,sg:0eae9a876f7a41f09f653f9dd358d3d1,274.326769,5


In [7]:
# (rows, columns) of the neighbor-pair table returned by the ranking query.
restaurant_neighbors.shape

(2691442, 4)

In [8]:
# Preview the first rows of the restaurant characteristics table.
restaurants.head()

Unnamed: 0,sname_place_id,brands,naics_code,categories,price,rating,review_count,area_m2,total_minutes_open,phone,state,zip_code,r_cbsa,r_cbg,latitude,longitude
0,sg:3d7150bd87cb441f9c52ac4a24e6f69e,,722511,"[{'alias': 'bbq', 'title': 'Barbeque'}]",1.0,4.0,34.0,126.892555,,True,ok,74464,,400219781004,35.886263,-94.975156
1,sg:1e484aaa553746b1887e60c96296934f,Krispy Krunchy Chicken,722513,,,,,310.641279,,False,ca,93230,25260.0,60310011001,36.303961,-119.636296
2,sg:615e0eb2bfa9405889d6bdb6f590d9f2,,722511,"[{'alias': 'newamerican', 'title': 'American (...",-1.0,4.0,11.0,353.904797,,True,ny,14571,40380.0,360734013003,43.328487,-78.191166
3,sg:ed001eec28e546bb8a3fdce63b69d751,Little Caesars,722511,,,,,1236.128049,,False,pr,680,32420.0,720970806001,18.199873,-67.137186
4,sg:4a5b60fe17064abd9832a52566c3c3f2,Dairy Queen,722513,,,,,211.244876,,False,wv,24874,,541090031002,37.606191,-81.536343


In [9]:
# First listed category of each restaurant, stored as its alias
def _first_category_alias(categories):
    """Return the 'alias' of the first category, or None when the value is falsy."""
    if not categories:
        return None
    return categories[0]['alias']

restaurants['category1'] = restaurants['categories'].apply(_first_category_alias)
restaurants.drop(columns = 'categories', inplace = True)

In [10]:
# Store price as a nullable integer so missing values survive the cast
restaurants = restaurants.astype({'price': 'Int64'})

In [11]:
# Keep only restaurants whose price is 1 or 2; rows with a missing price go too
restaurants.dropna(subset = ['price'], inplace = True)
outside_1_2 = ~restaurants['price'].isin([1, 2])
restaurants.drop(index = restaurants[outside_1_2].index, inplace = True)
# Recode to a binary indicator: 1 -> 0, 2 -> 1
restaurants['price'] = restaurants['price'].sub(1)

In [12]:
# Preview the restaurant table after the category and price steps.
restaurants.head()

Unnamed: 0,sname_place_id,brands,naics_code,price,rating,review_count,area_m2,total_minutes_open,phone,state,zip_code,r_cbsa,r_cbg,latitude,longitude,category1
0,sg:3d7150bd87cb441f9c52ac4a24e6f69e,,722511,0,4.0,34.0,126.892555,,True,ok,74464,,400219781004,35.886263,-94.975156,bbq
9,sg:09b25a8733b94885a0b9f4143ca1d8b0,,722511,1,4.0,103.0,403.029696,,True,tx,77429,26420.0,482015544012,29.959005,-95.649567,italian
11,sg:1f5c7c0840854e1992ee73438d4047be,,722511,1,3.0,15.0,546.740314,,True,de,19973,41540.0,100050504071,38.655084,-75.593844,mexican
12,sg:c3190c316e5049b788b7d06e2bd5ca5c,,722511,1,3.0,57.0,1092.123158,,True,az,85284,38060.0,40133200013,33.349662,-111.947598,indpak
13,sg:2480f6045c15418b8cd18121f656742c,,722511,1,4.0,59.0,36871.383306,,True,ny,12550,39100.0,360710104001,41.517748,-74.071288,mexican


In [13]:
# Two views of the restaurant table: one describing the restaurant itself and one
# describing it in the role of a neighbor (keyed by neighbor_id)
own = restaurants[['sname_place_id', 'price', 'category1']].rename(
    columns = {'price': 'own_price',
               'category1': 'own_category1'})
neighbors = restaurants[['sname_place_id', 'price', 'rating', 'category1']].rename(
    columns = {'sname_place_id': 'neighbor_id',
               'price': 'neighbor_price',
               'rating': 'neighbor_rating',
               'category1': 'neighbor_category1'})

In [14]:
# Preview the own-characteristics table.
own.head()

Unnamed: 0,sname_place_id,own_price,own_category1
0,sg:3d7150bd87cb441f9c52ac4a24e6f69e,0,bbq
9,sg:09b25a8733b94885a0b9f4143ca1d8b0,1,italian
11,sg:1f5c7c0840854e1992ee73438d4047be,1,mexican
12,sg:c3190c316e5049b788b7d06e2bd5ca5c,1,indpak
13,sg:2480f6045c15418b8cd18121f656742c,1,mexican


In [15]:
# Preview the neighbor-characteristics table.
neighbors.head()

Unnamed: 0,neighbor_id,neighbor_price,neighbor_rating,neighbor_category1
0,sg:3d7150bd87cb441f9c52ac4a24e6f69e,0,4.0,bbq
9,sg:09b25a8733b94885a0b9f4143ca1d8b0,1,4.0,italian
11,sg:1f5c7c0840854e1992ee73438d4047be,1,3.0,mexican
12,sg:c3190c316e5049b788b7d06e2bd5ca5c,1,3.0,indpak
13,sg:2480f6045c15418b8cd18121f656742c,1,4.0,mexican


In [16]:
# Attach own characteristics (by sname_place_id) and neighbor characteristics
# (by neighbor_id) to every pair; each key must be unique on the right-hand side
restaurant_neighbors = (
    restaurant_neighbors
    .merge(own, how = 'left', on = 'sname_place_id', validate = 'many_to_one')
    .merge(neighbors, how = 'left', on = 'neighbor_id', validate = 'many_to_one')
)

In [17]:
# Preview the neighbor pairs with own and neighbor characteristics attached.
restaurant_neighbors.head()

Unnamed: 0,sname_place_id,neighbor_id,distance,rank,own_price,own_category1,neighbor_price,neighbor_rating,neighbor_category1
0,sg:0000151bf1ed4fe3a39f1b27327821bf,sg:6e6d5a396c584e87bcdb5b987eae22e3,113.654197,1,0,chinese,0,4.0,mexican
1,sg:0000151bf1ed4fe3a39f1b27327821bf,sg:7f0565bb533c449b8172206b2d5d21f2,129.211783,2,0,chinese,1,4.0,japanese
2,sg:0000151bf1ed4fe3a39f1b27327821bf,sg:3c665ffb8fd14fa29430247bf33ca2b5,142.778952,3,0,chinese,1,4.0,vietnamese
3,sg:0000151bf1ed4fe3a39f1b27327821bf,sg:9af7a6a41b884562a27ccbc5b6bd09bd,253.470636,4,0,chinese,0,5.0,mexican
4,sg:0000151bf1ed4fe3a39f1b27327821bf,sg:0eae9a876f7a41f09f653f9dd358d3d1,274.326769,5,0,chinese,1,3.5,mexican


In [18]:
# Keep only pairs where both the restaurant and its neighbor have a price,
# i.e. both belong to the $ and $$ categories
priced_columns = ['own_price', 'neighbor_price']
restaurant_neighbors = restaurant_neighbors.dropna(subset = priced_columns)

In [19]:
# (rows, columns) of the neighbor-pair table at this point.
restaurant_neighbors.shape

(1686567, 9)

In [20]:
# Flag pairs in which the restaurant and its neighbor share the first category
own_category = restaurant_neighbors['own_category1']
neighbor_category = restaurant_neighbors['neighbor_category1']
restaurant_neighbors['category1_equal'] = own_category.eq(neighbor_category)

In [21]:
# Collapse to one row per restaurant: does any neighbor share its category,
# and what is the mean neighbor price
aggregations = {'category1_equal': 'any',
                'neighbor_price': 'mean'}
pairs_by_restaurant = restaurant_neighbors.groupby('sname_place_id')
restaurant_neighbors = pairs_by_restaurant.agg(aggregations)

In [22]:
# Turn the sname_place_id group index back into an ordinary column
restaurant_neighbors = restaurant_neighbors.reset_index()

In [23]:
# Store the shared-category flag as an integer
restaurant_neighbors = restaurant_neighbors.astype({'category1_equal': 'int'})

In [24]:
# Display the per-restaurant aggregates.
restaurant_neighbors

Unnamed: 0,sname_place_id,category1_equal,neighbor_price
0,sg:0000151bf1ed4fe3a39f1b27327821bf,0,0.60
1,sg:00001b8625b64052888b8c2f2e3736bb,0,0.80
2,sg:0000342b96c3453fbcd3c80308517f75,0,0.25
3,sg:00003430f2e24d3d9579db600cdcd4f0,1,1.00
4,sg:00003e1625674e2f812a9d8d6e992058,0,0.20
...,...,...,...
429950,sg:ffff90e4832f4c8f8a710c3ff22dc6be,0,0.75
429951,sg:ffffc1b51e754dd7a2e69a9eb923dbd9,0,0.50
429952,sg:ffffd62e3ca9416d94a7025b5342968e,0,0.60
429953,sg:ffffd8f4da3c4ae881984d1abf8b16d6,0,1.00


In [25]:
# Export
# Disabled alternative: write a Stata file instead of a CSV.
# output_file_path = os.path.join(output_folder_path,
#                                'restaurants_neighbors.dta')
# restaurant_neighbors.to_stata(path = output_file_path,
#                              write_index = False,
#                              version = 119)

# to_csv does not create missing directories, so make sure the target folder
# exists before writing.
os.makedirs(output_folder_path, exist_ok = True)
output_file_path = os.path.join(output_folder_path,
                                'restaurants_neighbors.csv')
restaurant_neighbors.to_csv(output_file_path, index = False)