In [1]:
################################################################################                                                                                                                                                              
################################################################################
#
# FILE: export-month-restaurants-direct.ipynb
#
# BY: Dmitry Sedov 
#
# CREATED: Fri Mar 13 2020
#
# DESC: This code produces tables with CBG-level restaurant choices.
#
# EXEC:
#      
################################################################################
################################################################################

In [2]:
############################### Libraries ######################################

import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import sqlalchemy as db

################################################################################

In [3]:
######################### Constants and settings ##############################

# Sample period and destination folder of the export
year, month = 2018, 10
output_folder_path = '/home/user/projects/urban/data/output/spatial-demand/restaurants-direct'

# Pandas display options: cap the printed rows / columns and widen the console
pd.set_option("display.width", 1000)
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)

################################################################################

In [4]:
######################### PostgreSQL connection ################################

# The connection URL is filled from the standard libpq environment variables
# (PGUSER, PGPASSWORD, PGHOST, PGDATABASE) so that no credential is stored in
# the notebook. A missing variable raises a KeyError that names it.
# User name and password are URL-quoted: characters such as '@' or '/' would
# otherwise be parsed as URL delimiters.
engine = db.create_engine(
    'postgresql://{user}:{user_pass}@{host}/{dataname2}'.format(
        user = quote_plus(os.environ['PGUSER']),
        user_pass = quote_plus(os.environ['PGPASSWORD']),
        host = os.environ['PGHOST'],
        dataname2 = os.environ['PGDATABASE']
    )
)
# Open one database session on the engine
connection = engine.connect()

################################################################################

In [5]:
########### CBG-to-restaurants dataset creation SQL statements #################

# Select the restaurant-level characteristics from the `restaurants` table.
# `phone` is a boolean that is true when `yelp_phone` is not NULL, and the
# CBSA / CBG columns are aliased with an `r_` prefix.
restaurants_table_statement = """
SELECT
    r.sname_place_id,
    r.brands,
    r.naics_code,
    r.categories,
    r.price,
    r.rating,
    r.review_count,
    r.area_m2,
    r.total_minutes_open,
    r.yelp_phone IS NOT NULL AS phone,
    r.state,
    r.zip_code,
    r.cbsa AS r_cbsa,
    r.cbg AS r_cbg,
    r.latitude,
    r.longitude
FROM
    restaurants as r
;
"""

# Create a restaurants table with visits breakdown by home CBG.
# The statement
#   1. left-joins `restaurants` to the `visits` rows of the configured
#      year and month,
#   2. expands the `visitor_home_cbgs` JSON object into one row per key/value
#      pair with `json_each`,
#   3. renames the resulting `key` / `value` columns to `home_cbg` /
#      `visits_from_home_cbg`, and
#   4. indexes the temporary table on `sname_place_id` and on `home_cbg`.
# NOTE(review): the table is TEMPORARY, so it presumably has to be created and
# queried through the same database session - confirm how it is executed.
restaurants_visits_breakdown_table_statement = """
CREATE TEMPORARY TABLE restaurant_visits_long AS (
    WITH restaurant_visits AS (
        SELECT
            r.sname_place_id,
            r.longitude,
            r.latitude,
            v.raw_visit_counts,
            v.raw_visitor_counts,
            v.visitor_home_cbgs
        FROM
            restaurants AS r
        LEFT JOIN
            visits AS v
        ON  
            r.sname_place_id = v.sname_place_id
        AND
            v.year = {year} 
        AND
            v.month = {month}
    )
    SELECT 
        sname_place_id,
        longitude,
        latitude,
        raw_visit_counts,
        raw_visitor_counts,
        (json_each(visitor_home_cbgs)).*
    FROM
        restaurant_visits
);
ALTER TABLE restaurant_visits_long
RENAME COLUMN key TO home_cbg;
ALTER TABLE restaurant_visits_long
RENAME COLUMN value TO visits_from_home_cbg;

CREATE INDEX restaurant_visits_long_sg_idx
ON restaurant_visits_long (sname_place_id);
CREATE INDEX restaurant_visits_long_cbg_idx
ON restaurant_visits_long (home_cbg);
""".format(year = year, month = month)

# Create a table with distances between CBG and restaurant:
# shortest polygon-to-point distance and centroid-to-point distance.
# Every row of `restaurant_visits_long` is left-joined to its home CBG in
# `cbgs`; the restaurant point (longitude, latitude, SRID 4326), the CBG
# geometry and the CBG centroid are cast to geography, and the two NUMERIC
# distance columns are then filled with ST_Distance on those values.
# NOTE(review): ST_Distance on geography presumably returns metres - confirm.
restaurants_visits_breakdown_distances_table_statement = """
CREATE TEMPORARY TABLE restaurant_visits_long_distances AS (
    SELECT
        r.*,
        ST_SetSRID(ST_Point(r.longitude, r.latitude),4326)::geography AS r_location,
        c.wkb_geometry::geography AS cbg_location,
        ST_Centroid(c.wkb_geometry)::geography as cbg_centroid
    FROM 
        restaurant_visits_long AS r
    LEFT JOIN
        cbgs AS c
    ON
        r.home_cbg = c.censusblockgroup
);

ALTER TABLE restaurant_visits_long_distances
ADD COLUMN distance_closest_point NUMERIC,
ADD COLUMN distance_centroid_point NUMERIC;

UPDATE restaurant_visits_long_distances
SET distance_closest_point = ST_Distance(r_location, cbg_location),
    distance_centroid_point = ST_Distance(r_location, cbg_centroid);
"""

# Select the columns of the temporary distances table that are loaded into
# pandas: identifiers, visit / visitor counts and the two distances (the
# geography helper columns are left behind).
restaurants_visits_breakdown_distances_table_export_statement = """
SELECT 
    sname_place_id,
    home_cbg,
    raw_visit_counts,
    raw_visitor_counts,
    visits_from_home_cbg,
    distance_closest_point,
    distance_centroid_point
FROM
    restaurant_visits_long_distances
;
"""

################################################################################

In [None]:
################## CBG-to-restaurants dataset creation #########################

# Every statement is issued on the one open `connection`. PostgreSQL TEMPORARY
# tables exist only in the session that created them, whereas `engine.execute`
# and `pd.read_sql(..., engine)` borrow a pooled connection per call, which
# does not guarantee that later statements can see the temporary tables.

# Get restaurant characteristics
restaurants_characteristics = pd.read_sql(restaurants_table_statement,
                                          connection)

# Build a table expanding visits-from-cbgs
result = connection.execute(restaurants_visits_breakdown_table_statement)

# Build a table with restaurant-cbg distances
result = connection.execute(restaurants_visits_breakdown_distances_table_statement)

# Get the expanded visits from CBGs (and corresponding distances) table
restaurants_visits_breakdown = pd.read_sql(
    restaurants_visits_breakdown_distances_table_export_statement,
    connection
)

# Rename column: downstream cells treat the per-home-CBG counts as visitor
# counts. Assignment is used instead of `inplace` so the cell can be re-run.
restaurants_visits_breakdown = restaurants_visits_breakdown.rename(
    columns = {'visits_from_home_cbg': 'visitors_from_home_cbg'}
)

################################################################################

In [None]:
########################### Close connection ###################################

# Close the checked-out session first: `engine.dispose()` only closes the
# connections that are idle in the pool, so the open `connection` would
# otherwise be left open on the server.
connection.close()
engine.dispose()

################################################################################

In [None]:
# Check the results: first rows of the restaurant-by-home-CBG visits table
restaurants_visits_breakdown.head()

In [None]:
# Check the results: column data types of the restaurant-by-home-CBG visits table
restaurants_visits_breakdown.dtypes

In [None]:
# Check the results: first rows of the restaurant characteristics table
restaurants_characteristics.head()

In [None]:
# Check the results: column data types of the restaurant characteristics table
restaurants_characteristics.dtypes

In [None]:
# Integer-valued columns that contain missing values are cast to the nullable
# 'Int64' dtype
restaurants_characteristics = restaurants_characteristics.astype(
    {'price': 'Int64', 'review_count': 'Int64'}
)

In [None]:
# Attach the restaurant characteristics to every (restaurant, home CBG) row;
# `validate` asserts that each restaurant id is unique in the characteristics
restaurants_visits_breakdown = restaurants_visits_breakdown.merge(
    restaurants_characteristics,
    on = 'sname_place_id',
    how = 'left',
    validate = 'many_to_one'
)

In [None]:
# First rows of the visits table after the restaurant characteristics are merged in
restaurants_visits_breakdown.head()

In [None]:
################## Get device and establishments data ##########################

# CBGs with their census tract id, CBSA affiliation and the number of devices
# residing there in the configured month (left join: CBGs without a matching
# `home` row are kept with a missing device count)
cbg_table_statement = """
SELECT
    CONCAT(c.statefips, c.countyfips, c.tractcode) AS ct,
    c.censusblockgroup AS cbg,
    c.cbsa AS home_cbsa,
    h.number_devices_residing
FROM
    cbgs AS c
LEFT JOIN
    home AS h
ON
    c.censusblockgroup = h.census_block_group 
AND
    h.year = {year} 
AND
    h.month = {month}
;
""".format(year = year, month = month)

# Ids of the places that have a row in `visits` for the configured month
visits_table_statement = """
SELECT 
    sname_place_id
FROM 
    visits
WHERE
    year = {year} AND
    month = {month}
;
""".format(year = year, month = month)

# All establishments with their `naics_first2` code and CBG affiliation
establishments_table_statement = """
SELECT 
    sname_place_id,
    naics_first2,
    cbg
FROM 
    establishments
;
"""

# PostgreSQL connection: the URL is filled from the standard libpq environment
# variables (PGUSER, PGPASSWORD, PGHOST, PGDATABASE) so that no credential is
# stored in the notebook; user name and password are URL-quoted because
# characters such as '@' or '/' would otherwise be parsed as URL delimiters.
engine = db.create_engine(
    'postgresql://{user}:{user_pass}@{host}/{dataname2}'.format(
        user = quote_plus(os.environ['PGUSER']),
        user_pass = quote_plus(os.environ['PGPASSWORD']),
        host = os.environ['PGHOST'],
        dataname2 = os.environ['PGDATABASE']
    )
)

try:
    # One session serves all three reads and is closed when the block exits,
    # instead of being opened, left unused and never closed
    with engine.connect() as connection:
        cbg_home = pd.read_sql(cbg_table_statement, connection)
        visits_month = pd.read_sql(visits_table_statement, connection)
        establishments = pd.read_sql(establishments_table_statement, connection)
finally:
    # Close the SQL connection-engine, also when a query fails
    engine.dispose()

# Remove duplicate CBGs: after the descending sort the first row of every CBG
# is the one with the largest device count (missing counts sort last)
cbg_home = cbg_home.sort_values(['number_devices_residing'],
                                ascending = False).groupby('cbg').head(1)

################################################################################

In [None]:
############### Aggregate the establishments on the CBG level ##################

# Count open establishments by cbg: an establishment counts as open when it
# has a row in this month's visits. The join key is spelled out rather than
# inferred from the columns the two frames happen to share.
est_open_this_month = pd.merge(establishments,
                               visits_month,
                               how = 'inner',
                               on = 'sname_place_id',
                               validate = 'one_to_one')

del establishments, visits_month

# Aggregate at the cbg level: number of open establishments per (cbg, NAICS)
cbg_establishments = (
    est_open_this_month
    .groupby(['cbg', 'naics_first2'])
    .size()
    .reset_index(name = 'cbg_naics')
)

del est_open_this_month

# Merge to get the CT of the CBG
cbg_establishments = pd.merge(cbg_establishments,
                              cbg_home[['cbg', 'ct']],
                              how = 'left',
                              on = 'cbg',
                              validate = 'many_to_one')
# Count open establishments by ct
cbg_establishments['ct_naics'] = (
    cbg_establishments
    .groupby(['ct', 'naics_first2'])['cbg_naics']
    .transform('sum')
)

# Pivot into wider form: one row per cbg, one column per (measure, NAICS code)
cbg_establishments = cbg_establishments.pivot(index = 'cbg',
                                              columns = 'naics_first2',
                                              values = ['cbg_naics', 'ct_naics'])

# Replace missing values with 0s
cbg_establishments = cbg_establishments.fillna(0)
# Flatten hierarchical index. The columns are assigned directly because
# `set_axis(..., inplace = True)` is no longer accepted by pandas >= 2.0.
cbg_establishments.columns = [f'{x}{y}' for x, y in cbg_establishments.columns]
# Reset index
cbg_establishments = cbg_establishments.reset_index()

# Merge to get the CT of the destination CBG
cbg_establishments = pd.merge(cbg_establishments,
                              cbg_home[['cbg', 'ct']],
                              how = 'left',
                              on = 'cbg',
                              validate = 'one_to_one')
# Rename columns: `cbg_home` describes the visitors' home CBG, while
# `cbg_establishments` describes the restaurant's (destination) CBG
cbg_home = cbg_home.rename(columns = {'cbg': 'home_cbg', 'ct': 'home_ct'})
cbg_establishments = cbg_establishments.rename(columns = {'cbg': 'r_cbg',
                                                          'ct': 'r_ct'})

################################################################################

In [None]:
# First rows of the home-CBG table (tract, CBSA and residing-device count)
cbg_home.head()

In [None]:
# Column data types of the home-CBG table
cbg_home.dtypes

In [None]:
# First rows of the per-CBG establishment counts
cbg_establishments.head()

In [None]:
# Column data types of the per-CBG establishment counts
cbg_establishments.dtypes

In [None]:
# Cast the count columns to the nullable integer dtype
cbg_home['number_devices_residing'] = (
    cbg_home['number_devices_residing'].astype('Int64')
)
# Every establishment-count column carries 'naics' in its name
naics_col = [name for name in cbg_establishments.columns if 'naics' in name]
cbg_establishments[naics_col] = (
    cbg_establishments[naics_col].astype('Int64')
)

In [None]:
# Order the rows by home CBG, then attach the home-CBG attributes (keyed on
# `home_cbg`) and the establishment counts of the restaurant's CBG (keyed on
# `r_cbg`); both lookups must be unique per key
restaurants_visits_breakdown = (
    restaurants_visits_breakdown
    .sort_values(by = 'home_cbg')
    .merge(cbg_home,
           on = 'home_cbg',
           how = 'left',
           validate = 'many_to_one')
    .merge(cbg_establishments,
           on = 'r_cbg',
           how = 'left',
           validate = 'many_to_one')
)

# The lookup tables are no longer needed
del cbg_home, cbg_establishments

In [33]:
# First rows of the visits table after the CBG-level data are merged in.
# NOTE(review): the recorded output below already lists `category1`,
# `visits_from_home_cbg` and `outside_good_count`, which are only created in
# later cells - it appears to stem from an out-of-order run.
restaurants_visits_breakdown.head()

Unnamed: 0,sname_place_id,home_cbg,raw_visit_counts,raw_visitor_counts,visitors_from_home_cbg,distance_closest_point,distance_centroid_point,brands,naics_code,categories,price,rating,review_count,area_m2,total_minutes_open,phone,state,zip_code,r_cbsa,r_cbg,latitude,longitude,home_ct,home_cbsa,number_devices_residing,...,ct_naics23,ct_naics31,ct_naics32,ct_naics33,ct_naics42,ct_naics44,ct_naics45,ct_naics48,ct_naics49,ct_naics51,ct_naics52,ct_naics53,ct_naics54,ct_naics55,ct_naics56,ct_naics61,ct_naics62,ct_naics71,ct_naics72,ct_naics81,ct_naics92,r_ct,category1,visits_from_home_cbg,outside_good_count
0,sg:e1a1221e3a0f43cf9b92adedbaa1bc45,10010201001,612,441,5,2067.53839,3571.699696,Wendy's,722513,"[{'alias': 'hotdogs', 'title': 'Fast Food'}, {...",1,1.0,3,300.925755,5580.0,True,al,36067,33860,10010207002,32.459887,-86.453256,1001020100,33860,212,...,0,0,0,0,0,8,3,0,0,0,0,0,1,0,0,4,2,2,0,10,0,1001020700,hotdogs,6.938776,6517.39887
1,sg:4b8ea3994d9d4345aae23d5f9d2c2b82,10010201001,1323,805,8,2295.988576,3811.078454,McDonald's,722513,"[{'alias': 'burgers', 'title': 'Burgers'}, {'a...",1,1.5,6,480.980496,32100.0,True,al,36067,33860,10010207002,32.45912,-86.450839,1001020100,33860,212,...,0,0,0,0,0,8,3,0,0,0,0,0,1,0,0,4,2,2,0,10,0,1001020700,burgers,13.147826,6517.39887
2,sg:aa20dfafc669471c9125bfc64fbb9caf,10010201001,4416,2534,5,5891.58548,7359.427729,Chick-fil-A,722513,"[{'alias': 'hotdogs', 'title': 'Fast Food'}]",1,4.0,24,332.517284,24780.0,True,al,36066,33860,10010205002,32.460242,-86.412584,1001020100,33860,212,...,0,0,1,0,0,18,14,0,2,0,16,1,3,0,0,0,37,2,5,12,0,1001020500,hotdogs,8.713496,6517.39887
3,sg:a8101adf3b5547469536a68ff01e4434,10010201001,2145,1428,7,5152.406576,6633.465403,,722511,"[{'alias': 'mexican', 'title': 'Mexican'}]",2,3.5,27,220.593252,20880.0,True,al,36066,33860,10010205001,32.459325,-86.420449,1001020100,33860,212,...,0,0,0,0,0,18,14,0,0,0,16,0,3,0,0,1,37,2,5,12,0,1001020500,mexican,10.514706,6517.39887
4,sg:9bc86afedafd4762a80ad5ba1f1a7e18,10010201001,1048,788,5,2359.904166,3812.526979,Taco Bell,722513,"[{'alias': 'hotdogs', 'title': 'Fast Food'}, {...",1,2.5,11,226.623737,29520.0,True,al,36067,33860,10010204003,32.462065,-86.450279,1001020100,33860,212,...,0,0,0,0,0,6,5,0,0,0,0,0,0,1,0,0,3,0,0,6,0,1001020400,hotdogs,6.649746,6517.39887


In [34]:
# Column data types of the merged visits table.
# NOTE(review): the recorded output below already lists columns that are only
# created in later cells - it appears to stem from an out-of-order run.
restaurants_visits_breakdown.dtypes

sname_place_id         object
home_cbg                   object
raw_visit_counts            int64
raw_visitor_counts          int64
visitors_from_home_cbg      int64
                           ...   
ct_naics92                  Int64
r_ct                       object
category1                  object
visits_from_home_cbg      float64
outside_good_count        float64
Length: 77, dtype: object

In [None]:
# Get first category for each restaurant
def first_category_alias(categories):
    """Return the 'alias' of the first category entry, or None if there is none.

    Parameters
    ----------
    categories : sequence of dict, None or float
        The restaurant's category entries; None or NaN when missing.

    Returns
    -------
    The value stored under 'alias' in the first entry, or None for a missing
    or empty value.
    """
    # A float NaN is truthy, so a plain truthiness test would let it through
    # and fail on `categories[0]`
    if isinstance(categories, float) or not categories:
        return None
    return categories[0]['alias']

restaurants_visits_breakdown['category1'] = (
    restaurants_visits_breakdown['categories'].apply(first_category_alias)
)

In [None]:
# Within-cbg restaurant choice counts: the visitors from each home CBG are
# scaled by the restaurant's overall visits-per-visitor ratio
restaurants_visits_breakdown['visits_from_home_cbg'] = (
    restaurants_visits_breakdown['visitors_from_home_cbg']
    .mul(restaurants_visits_breakdown['raw_visit_counts'])
    .div(restaurants_visits_breakdown['raw_visitor_counts'])
)

In [None]:
# Construct the outside-good choice count

# Restaurant choices observed for each home CBG, repeated on each of its rows
restaurant_choices_from_home_cbg = (
    restaurants_visits_breakdown[
        'visits_from_home_cbg'
    ].groupby(restaurants_visits_breakdown['home_cbg']
             ).transform('sum')
)

# Total choices made: days in the month * number_devices_residing.
# The day count is derived from the configured `year` / `month` (31 for
# October 2018) rather than hard-coded, so that changing the sample period
# keeps the total consistent.
days_in_month = pd.Timestamp(year = year, month = month, day = 1).days_in_month
restaurants_visits_breakdown['outside_good_count'] = (
    days_in_month * restaurants_visits_breakdown['number_devices_residing'] -
    restaurant_choices_from_home_cbg
)

In [36]:
# Total number of establishments in the restaurant's CBG and census tract:
# row-wise sums over the per-NAICS count columns
cbg_naics_col = [name for name in restaurants_visits_breakdown.columns
                 if 'cbg_naics' in name]
ct_naics_col = [name for name in restaurants_visits_breakdown.columns
                if 'ct_naics' in name]
restaurants_visits_breakdown['cbg_est_number'] = (
    restaurants_visits_breakdown.loc[:, cbg_naics_col].sum(axis = 'columns')
)
restaurants_visits_breakdown['ct_est_number'] = (
    restaurants_visits_breakdown.loc[:, ct_naics_col].sum(axis = 'columns')
)

In [39]:
################### Export data to process on local computer ###################

# Export path. The file name is derived from the configured period
# (month = 10, year = 2018 gives 'data_restaurants_oct18.csv') instead of being
# hard-coded, so an export for another month cannot silently overwrite it.
month_abbreviations = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
                       'jul', 'aug', 'sep', 'oct', 'nov', 'dec')
logit_file_name = 'data_restaurants_{month}{year:02d}.csv'.format(
    month = month_abbreviations[month - 1],
    year = year % 100
)
# Make sure the destination exists: to_csv does not create directories
os.makedirs(output_folder_path, exist_ok = True)
logit_file_path = os.path.join(output_folder_path,
                               logit_file_name)
# Columns to export
columns_to_export = [
    'home_cbg',
    'home_ct',
    'home_cbsa',
    'sname_place_id',
    'visits_from_home_cbg',
    'visitors_from_home_cbg',
    'raw_visit_counts',
    'raw_visitor_counts',
    'number_devices_residing',
    'outside_good_count',
    'distance_closest_point',
    'distance_centroid_point',
    'brands',
    'category1',
    'naics_code',
    'price',
    'rating',
    'area_m2',
    'total_minutes_open',
    'phone',
    'r_cbg',
    'r_ct',
    'r_cbsa',
    'cbg_est_number',
    'ct_est_number'
]
# The per-NAICS establishment counts are appended after the fixed columns
columns_to_export = columns_to_export + cbg_naics_col + ct_naics_col

# Perform export
restaurants_visits_breakdown.to_csv(logit_file_path,
                                    columns = columns_to_export,
                                    index = False)

################################################################################

In [30]:
# First rows of the visits table with all constructed columns
restaurants_visits_breakdown.head()

Unnamed: 0,sname_place_id,home_cbg,raw_visit_counts,raw_visitor_counts,visitors_from_home_cbg,distance_closest_point,distance_centroid_point,brands,naics_code,categories,price,rating,review_count,area_m2,total_minutes_open,phone,state,zip_code,r_cbsa,r_cbg,latitude,longitude,home_ct,home_cbsa,number_devices_residing,...,ct_naics23,ct_naics31,ct_naics32,ct_naics33,ct_naics42,ct_naics44,ct_naics45,ct_naics48,ct_naics49,ct_naics51,ct_naics52,ct_naics53,ct_naics54,ct_naics55,ct_naics56,ct_naics61,ct_naics62,ct_naics71,ct_naics72,ct_naics81,ct_naics92,r_ct,category1,visits_from_home_cbg,outside_good_count
0,sg:e1a1221e3a0f43cf9b92adedbaa1bc45,10010201001,612,441,5,2067.53839,3571.699696,Wendy's,722513,"[{'alias': 'hotdogs', 'title': 'Fast Food'}, {...",1,1.0,3,300.925755,5580.0,True,al,36067,33860,10010207002,32.459887,-86.453256,1001020100,33860,212,...,0,0,0,0,0,8,3,0,0,0,0,0,1,0,0,4,2,2,0,10,0,1001020700,hotdogs,6.938776,6517.39887
1,sg:4b8ea3994d9d4345aae23d5f9d2c2b82,10010201001,1323,805,8,2295.988576,3811.078454,McDonald's,722513,"[{'alias': 'burgers', 'title': 'Burgers'}, {'a...",1,1.5,6,480.980496,32100.0,True,al,36067,33860,10010207002,32.45912,-86.450839,1001020100,33860,212,...,0,0,0,0,0,8,3,0,0,0,0,0,1,0,0,4,2,2,0,10,0,1001020700,burgers,13.147826,6517.39887
2,sg:aa20dfafc669471c9125bfc64fbb9caf,10010201001,4416,2534,5,5891.58548,7359.427729,Chick-fil-A,722513,"[{'alias': 'hotdogs', 'title': 'Fast Food'}]",1,4.0,24,332.517284,24780.0,True,al,36066,33860,10010205002,32.460242,-86.412584,1001020100,33860,212,...,0,0,1,0,0,18,14,0,2,0,16,1,3,0,0,0,37,2,5,12,0,1001020500,hotdogs,8.713496,6517.39887
3,sg:a8101adf3b5547469536a68ff01e4434,10010201001,2145,1428,7,5152.406576,6633.465403,,722511,"[{'alias': 'mexican', 'title': 'Mexican'}]",2,3.5,27,220.593252,20880.0,True,al,36066,33860,10010205001,32.459325,-86.420449,1001020100,33860,212,...,0,0,0,0,0,18,14,0,0,0,16,0,3,0,0,1,37,2,5,12,0,1001020500,mexican,10.514706,6517.39887
4,sg:9bc86afedafd4762a80ad5ba1f1a7e18,10010201001,1048,788,5,2359.904166,3812.526979,Taco Bell,722513,"[{'alias': 'hotdogs', 'title': 'Fast Food'}, {...",1,2.5,11,226.623737,29520.0,True,al,36067,33860,10010204003,32.462065,-86.450279,1001020100,33860,212,...,0,0,0,0,0,6,5,0,0,0,0,0,0,1,0,0,3,0,0,6,0,1001020400,hotdogs,6.649746,6517.39887
