In [10]:
################################################################################                                                                                                                                                              
################################################################################
#
# FILE: price-instrument-construction.ipynb
#
# BY: Dmitry Sedov 
#
# CREATED: Tue Apr 21 2020
#
# DESC: This code constructs an instrument for a restaurant's price from the
#       prices and other characteristics of its neighboring restaurants.
#
# EXEC:
#      
################################################################################
################################################################################

In [11]:
############################### Libraries ######################################

import sqlalchemy as db
import pandas as pd

import numpy as np

import os
import json

################################################################################

In [12]:
# Select the CBSAs
#
# NOTE: the CBSA pre-selection is currently disabled. Only the two SQL
# statements below are defined; the code that runs them and filters the
# CBSAs by size is kept, commented out, for reference.

# Number of restaurants per CBSA.
restaurants_by_cbsa_table_statement = """
SELECT
    cbsa,
    COUNT(sname_place_id) AS restaurant_count
FROM
    restaurants
GROUP BY
    cbsa
;
"""

# Number of census block groups per CBSA.
cbgs_by_cbsa_table_statement = """
SELECT
    cbsa,
    COUNT(censusblockgroup) AS cbg_count
FROM
    cbgs
GROUP BY
    cbsa
;
"""

# The engine and connection that used to be opened here were never used (all
# queries below are disabled) and never closed (the dispose() call is disabled
# as well), so every run of this cell left an idle database session behind.
# The engine is therefore created only when the selection is re-enabled;
# pd.read_sql accepts the engine directly, so no explicit connection is needed.
#engine = db.create_engine('postgresql://{user}:{user_pass}@{host}/{dataname2}')

#restaurants_by_cbsa_table = pd.read_sql(restaurants_by_cbsa_table_statement, 
#                                        engine)

#cbgs_by_cbsa_table = pd.read_sql(cbgs_by_cbsa_table_statement, 
#                                 engine)

#engine.dispose()

#cbsa_table = pd.merge(cbgs_by_cbsa_table, 
#                      restaurants_by_cbsa_table,
#                      how = 'outer', 
#                      on = 'cbsa', 
#                      validate = 'one_to_one')

#medium_restaurants_filter = ((cbsa_table['restaurant_count'] >= 500) &
#                             (cbsa_table['restaurant_count'] <= 1500)
#                            )
#medium_cbgs_filter = ((cbsa_table['cbg_count'] >= 150) & 
#                      (cbsa_table['cbg_count'] <= 250)
#                     )

#cbsa_medium_table = cbsa_table[medium_restaurants_filter & medium_cbgs_filter]

#cbsa_medium_table.reset_index(drop = True, inplace = True)

#selected_cbsa_list = cbsa_medium_table['cbsa'].to_list()

In [13]:
######################### Constants and settings ###############################

# Where to read from and where to write to.
database = 'postgresql://{user}:{user_pass}@{host}/{dataname2}'
output_folder_path = '/home/user/projects/urban/data/output/spatial-demand/main_demand'

# Visits period used to decide which restaurants enter the sample.
year = 2019
month = 7

# Disabled CBSA restriction (depends on the commented-out CBSA selection):
#selected_cbsas =  "('" + "','".join(selected_cbsa_list) + "')"
#print(selected_cbsas)

# Step 1: restaurants that have a visits record for the chosen year / month,
# with a geography point built from longitude / latitude plus the features
# aggregated later; a GIST index on the point is created for the spatial join.
# To restrict to the selected CBSAs, add
#     WHERE r.cbsa IN {selected_cbsas}
# to the first statement and supply selected_cbsas when filling the template.
create_restaurants_with_geo_statement = f"""
CREATE TEMPORARY TABLE restaurants_with_geo AS
    SELECT
        r.sname_place_id,
        ST_SetSRID(ST_Point(r.longitude, r.latitude),4326)::geography AS location,
        r.price,
        r.rating,
        r.area_m2,
        (r.brands IS NOT NULL)::int AS branded,
        jsonb_array_length(r.categories) AS n_categories,
        r.categories->0->'alias' AS category1 
    FROM
        restaurants AS r
    INNER JOIN
        visits AS v
    ON
        r.sname_place_id = v.sname_place_id 
    AND
        v.year = {year} 
    AND
        v.month = {month}
;
-- Create spatial index
CREATE INDEX restaurants_with_geo_location_idx
ON restaurants_with_geo
USING GIST (location);
"""

# Step 2: pair every restaurant with each *other* restaurant that satisfies
# ST_DWithin(..., 1000). The LEFT JOIN keeps restaurants without any such
# neighbor (their neighbor_* columns are NULL).
spatial_join_statement = """
CREATE TEMPORARY TABLE restautants_joined_by_distance AS (
    SELECT 
        r1.sname_place_id,
        r2.sname_place_id AS neighbor_id,
        r2.price as neighbor_price,
        r2.rating AS neighbor_rating,
        r2.area_m2 AS neighbor_area_m2,
        r2.branded AS neighbor_branded,
        r2.n_categories AS neighbor_n_categories,
        (r1.category1 = r2.category1)::int AS neighbor_category1_equal,
        ST_Distance(r1.location, r2.location) AS distance
    FROM
        restaurants_with_geo AS r1
    LEFT JOIN
        restaurants_with_geo AS r2
    ON
        ST_DWithin(r1.location, r2.location, 1000) AND
        r1.sname_place_id != r2.sname_place_id
);
"""

# Step 3: one row per restaurant -- neighbor counts (overall and by price
# bucket), distance to the closest neighbor and neighbor feature averages.
count_neightbours_statement = """
SELECT 
    sname_place_id,
    COUNT(neighbor_id) AS neighbor_count,
    COUNT(neighbor_id) FILTER (WHERE neighbor_price IN (-1, 0) OR neighbor_price IS NULL) AS neighbor_price_0,
    COUNT(neighbor_id) FILTER (WHERE neighbor_price = 1) AS neighbor_price_1,
    COUNT(neighbor_id) FILTER (WHERE neighbor_price = 2) AS neighbor_price_2,
    COUNT(neighbor_id) FILTER (WHERE neighbor_price IN (3, 4)) AS neighbor_price_3,
    MIN(distance) AS neighbor_min_distance,
    AVG(neighbor_rating) AS neighbor_rating,
    AVG(neighbor_area_m2) AS neighbor_area,
    AVG(neighbor_branded) AS neighbor_branded,
    AVG(neighbor_n_categories) AS neighbor_n_categories,
    AVG(neighbor_category1_equal) AS neighbor_category1_equal
FROM 
    restautants_joined_by_distance
GROUP BY
    sname_place_id
;
"""

################################################################################

In [14]:
########################## Run SQL, export data ################################

# Both intermediate tables are created as TEMPORARY tables, which PostgreSQL
# keeps private to the database session that created them. All three
# statements therefore have to run on ONE connection. The previous version
# issued them through engine.execute(), which borrows a pooled connection for
# every call and only worked as long as the pool happened to hand back the
# same one; the connection it opened explicitly was never used nor closed.
engine = db.create_engine(database)
try:
    # The context manager closes the connection (dropping the temporary
    # tables with the session) once the result has been read.
    with engine.connect() as connection:
        print('Creating restaurants_with_geo table.')
        # db.text() keeps raw SQL strings executable on current SQLAlchemy
        # releases; the '::' casts are not treated as bind parameters.
        connection.execute(db.text(create_restaurants_with_geo_statement))
        print('Doing the spatial join.')
        connection.execute(db.text(spatial_join_statement))

        print('Exporting.')
        # Read on the same connection so the temporary table is visible.
        restaurant_neighbors = pd.read_sql(count_neightbours_statement,
                                           connection)
finally:
    # Release the pooled connections even if one of the statements fails.
    engine.dispose()

################################################################################

Creating restaurants_with_geo table.
Doing the spatial join.
Exporting.


In [15]:
# Sanity check: display the (rows, columns) dimensions of the query result.
restaurant_neighbors.shape

(537271, 12)

In [16]:
# Save the per-restaurant neighbor features as CSV in the output folder;
# the DataFrame index is not written to the file.
output_file_path = os.path.join(
    output_folder_path,
    'restaurants_neighbors_features.csv',
)
restaurant_neighbors.to_csv(output_file_path, index=False)