In [1]:
################################################################################                                                                                                                                                              
################################################################################
#
# FILE: export-one-cbsa-combinations.ipynb
#
# BY: Dmitry Sedov 
#
# CREATED: Fri Mar 13 2020
#
# DESC: This code exports a table of CBG-restaurant pairs in one CBSA.
#
# EXEC:
#      
################################################################################
################################################################################

In [2]:
############################### Libraries ######################################

import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import sqlalchemy as db

################################################################################

In [3]:
######################### Constants and settings ##############################

# Pandas display options: limit the printed rows / columns and widen the output
for option_name, option_value in (('display.max_rows', 50),
                                  ('display.max_columns', 50),
                                  ('display.width', 1000)):
    pd.set_option(option_name, option_value)

# Period (year, month) substituted into the device and visit count queries
year, month = 2018, 10

# Folder that receives the exported CSV file
output_folder_path = '/home/user/projects/urban/data/output/spatial-demand/restaurants-direct'

################################################################################

In [4]:
######################### PostgreSQL connection ################################

# Connection settings are read from the standard libpq environment variables
# so that no credentials are stored in the notebook. A missing variable raises
# KeyError here, before any connection attempt is made.
# User name and password are URL-escaped because characters such as '@' or '/'
# would otherwise be parsed as part of the URL structure.
engine = db.create_engine(
    'postgresql://{user}:{user_pass}@{host}/{dataname2}'.format(
        user = quote_plus(os.environ['PGUSER']),
        user_pass = quote_plus(os.environ['PGPASSWORD']),
        host = os.environ['PGHOST'],
        dataname2 = os.environ['PGDATABASE']))

# One explicit connection (a single database session) kept open for the run
connection = engine.connect()

################################################################################

In [5]:
# Count the restaurants and the CBGs in every CBSA

restaurants_by_cbsa_table_statement = """
SELECT
    cbsa,
    COUNT(sname_place_id) AS restaurant_count
FROM
    restaurants
GROUP BY
    cbsa
;
"""

cbgs_by_cbsa_table_statement = """
SELECT
    cbsa,
    COUNT(censusblockgroup) AS cbg_count
FROM
    cbgs
GROUP BY
    cbsa
;
"""

restaurants_by_cbsa_table = pd.read_sql(restaurants_by_cbsa_table_statement, engine)
cbgs_by_cbsa_table = pd.read_sql(cbgs_by_cbsa_table_statement, engine)

# One row per CBSA; the outer join keeps CBSAs that appear in only one of the
# two tables, and validate raises if a CBSA is repeated on either side
cbsa_table = cbgs_by_cbsa_table.merge(restaurants_by_cbsa_table,
                                      on = 'cbsa',
                                      how = 'outer',
                                      validate = 'one_to_one')

# Medium CBSAs: between 500 and 1000 restaurants, both bounds included
# (a missing restaurant count compares as False)
medium = cbsa_table['restaurant_count'].between(500, 1000)

cbsa_medium_table = cbsa_table[medium].reset_index(drop = True)

In [6]:
# Rockford, IL (CBSA 40420) is the CBSA selected for the export
selected_cbsa = 40420

# Values substituted into the CBSA-filtered statements below
cbsa_filter = {'cbsa': selected_cbsa}

# Statement creating a temporary table with the CBGs of CBSA 40420 only
create_cbgs_40420_table_statement = """
CREATE TEMPORARY TABLE cbgs_40420 AS (
    SELECT 
        censusblockgroup AS cbg,
        wkb_geometry::geography AS cbg_location
    FROM 
        cbgs
    WHERE
        cbsa = '{cbsa}'
)
""".format(**cbsa_filter)

# Statement creating a temporary table with the restaurants of CBSA 40420 only
create_restaurants_40420_table_statement = """
CREATE TEMPORARY TABLE restaurants_40420 AS (
    SELECT 
        sname_place_id,
        price,
        rating,
        naics_code,
        area_m2,
        brands,
        categories,
        ST_SetSRID(ST_Point(longitude, latitude), 4326)::geography AS r_location
    FROM 
        restaurants
    WHERE
        cbsa = '{cbsa}'
)
""".format(**cbsa_filter)

# Statement pairing every CBG with every restaurant of the two temporary
# tables and computing the distance between their locations
distances_40420_table_statement = """
SELECT 
    c.cbg,
    r.sname_place_id,
    ST_Distance(c.cbg_location, r.r_location) as distance
FROM
    cbgs_40420 AS c,
    restaurants_40420 AS r
;
"""

In [7]:
################## CBG-restaurants pairs dataset creation ######################

# Build a table of cbgs in selected cbsa
result = engine.execute(create_cbgs_40420_table_statement)

# Build a table with restaurants in selected cbsa
result = engine.execute(create_restaurants_40420_table_statement)

# Get restaurant characteristics
cbgs_restaurants_pairs = pd.read_sql(distances_40420_table_statement,
                                     engine)

################################################################################

In [8]:
# Create a table with CBGs' device count and CBSA affiliation
devices_table_statement = """
SELECT
    c.cbg AS cbg,
    h.number_devices_residing
FROM
    cbgs_40420 AS c
LEFT JOIN
    home AS h
ON
    c.cbg = h.census_block_group 
AND
    h.year = {year} 
AND
    h.month = {month}
;
""".format(year = year, month = month)

devices_table = pd.read_sql(devices_table_statement, engine)

In [9]:
cbgs_restaurants_pairs = pd.merge(cbgs_restaurants_pairs,
                                  devices_table, 
                                  how = 'left', 
                                  on = 'cbg', 
                                  validate = 'many_to_one')

In [10]:
# Create a table with CBGs' device count and CBSA affiliation
restaurants_table_statement = """
SELECT
    r.sname_place_id AS sname_place_id,
    r.price,
    r.rating,
    r.naics_code,
    r.area_m2,
    r.brands,
    r.categories,
    v.raw_visit_counts AS raw_visit_counts
FROM
    restaurants_40420 AS r
LEFT JOIN
    visits AS v
ON
    r.sname_place_id = v.sname_place_id 
AND
    v.year = {year} 
AND
    v.month = {month}
;
""".format(year = year, month = month)

restaurants_table = pd.read_sql(restaurants_table_statement, engine)

In [11]:
# Attach the restaurant characteristics and visit counts to every pair;
# validate raises if restaurants_table repeats a sname_place_id
cbgs_restaurants_pairs = cbgs_restaurants_pairs.merge(restaurants_table,
                                                      on = 'sname_place_id',
                                                      how = 'left',
                                                      validate = 'many_to_one')

In [12]:
########################### Close connection ###################################

# Close the explicitly opened connection first: engine.dispose() only closes
# connections that are checked in to the pool, so a connection that is still
# checked out would otherwise stay open on the database server.
connection.close()
engine.dispose()

################################################################################

In [13]:
def _first_category_alias(categories):
    """Return the 'alias' of the first entry of `categories`, or None.

    None, an empty collection and NaN all count as "no categories". NaN needs
    the explicit check because it is truthy but cannot be indexed, so a plain
    truthiness test would let it through and raise TypeError.
    """
    if isinstance(categories, float) and np.isnan(categories):
        return None
    return categories[0]['alias'] if categories else None

# Keep only the alias of the first listed category of each restaurant
cbgs_restaurants_pairs['category1'] = cbgs_restaurants_pairs['categories'].apply(_first_category_alias)

In [14]:
# Remove the raw categories column from the pairs table
cbgs_restaurants_pairs = cbgs_restaurants_pairs.drop(columns = ['categories'])

In [15]:
# Write the CBG-restaurant pairs to a CSV file in the output folder,
# leaving out the DataFrame index
output_file_path = os.path.join(output_folder_path, 'fit_test_40420.csv')
cbgs_restaurants_pairs.to_csv(output_file_path, index = False)