In [1]:
################################################################################                                                                                                                                                              
################################################################################
#
# FILE: export-all-restaurants-direct.ipynb
#
# BY: Dmitry Sedov 
#
# CREATED: Thu Mar 19 2020
#
# DESC: This code produces tables with CBG-level restaurant choices for all 
#       months of the data.
#
# EXEC:
#      
################################################################################
################################################################################

In [2]:
############################### Libraries ######################################

import sqlalchemy as db
import pandas as pd
import numpy as np
import os
from calendar import monthrange

################################################################################

In [3]:
####################### Settings and constants #################################

# How much of a frame pandas renders in cell output
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50
pd.options.display.width = 1000

# Folder that receives one Stata file per exported month
output_folder_path = (
    '/home/user/projects/urban/data/output/'
    + 'spatial-demand/restaurants-direct/months'
)

################################################################################

In [4]:
############################# SQL statements ###################################

# Create a restaurants table with visits breakdown by home CBG.
# Template with {year} and {month} placeholders (filled via str.format).
# It inner-joins restaurants to visits on sname_place_id for the requested
# year/month, expands visitor_home_cbgs with json_each into one row per
# key/value pair, and renames key -> home_cbg, value -> visitors_from_home_cbg.
# The table is TEMPORARY, so in PostgreSQL it is only visible to the database
# session that created it; the export and drop statements below have to run on
# that same session.
restaurants_visits_breakdown_table_statement = """
CREATE TEMPORARY TABLE restaurant_visits_long AS (
    WITH restaurant_visits AS (
        SELECT
            r.sname_place_id,
            r.cbg AS r_cbg,
            v.raw_visit_counts,
            v.raw_visitor_counts,
            v.visitor_home_cbgs
        FROM
            restaurants AS r
        INNER JOIN
            visits AS v
        ON
            r.sname_place_id = v.sname_place_id
        AND
            v.year = {year} 
        AND
            v.month = {month}
    )
    SELECT 
        sname_place_id,
        r_cbg,
        raw_visit_counts,
        raw_visitor_counts,
        (json_each(visitor_home_cbgs)).*
    FROM
        restaurant_visits
);
ALTER TABLE restaurant_visits_long
RENAME COLUMN key TO home_cbg;
ALTER TABLE restaurant_visits_long
RENAME COLUMN value TO visitors_from_home_cbg;
"""

# Read back every row of the temporary long-format table
restaurants_visits_breakdown_export_statement = """
SELECT * FROM restaurant_visits_long;
"""

# Remove the temporary table so the next month can recreate it
restaurants_visits_breakdown_drop_statement = """
DROP TABLE restaurant_visits_long;
"""

# CBG lookup: census tract id (state + county + tract codes concatenated),
# the CBG id itself and its CBSA affiliation. No year/month filter.
cbg_table_statement = """
SELECT
    CONCAT(statefips, countyfips, tractcode) AS ct,
    censusblockgroup AS cbg,
    cbsa AS home_cbsa
FROM
    cbgs
;
"""

# Device count per CBG for one month.
# Template with {year} and {month} placeholders (filled via str.format).
cbg_devices_statement = """
SELECT 
    census_block_group AS cbg,
    number_devices_residing
FROM
    home
WHERE
    year = {year} 
AND
    month = {month}
"""

################################################################################

In [5]:
# Preload the CBG data (census tract id and CBSA affiliation per CBG).
# The query has no year/month filter, so the table is read once here and
# reused by every monthly export.
engine = db.create_engine('postgresql://{user}:{user_pass}@{host}/{dataname2}')
try:
    # Run the query on an explicit connection that the context manager closes;
    # engine.dispose() alone does not close connections that are checked out.
    with engine.connect() as connection:
        cbg_table = pd.read_sql(cbg_table_statement, connection)
finally:
    # Release the pool even if the read fails
    engine.dispose()

In [6]:
############### Function to export a month of visits breakdown #################
def get_month_visits_breakdown(i, vintage, engine):
    """Build and export one month of restaurant visits by visitor home CBG.

    For the given vintage the function materialises the long-format visits
    table in the database, reads it together with the month's device counts,
    attaches census-tract / CBSA information from the module-level
    ``cbg_table``, derives the visit and outside-good counts and writes the
    result to ``restaurants_shares_{i}.dta`` inside ``output_folder_path``.

    Parameters
    ----------
    i : int
        Position of the vintage in the export sequence; only used to name the
        output file and returned unchanged.
    vintage : tuple
        ``(year, month)`` pair. Both items are substituted into the SQL
        templates and must be convertible with ``int()``.
    engine : sqlalchemy.engine.Engine
        Engine for the database holding the ``restaurants``, ``visits`` and
        ``home`` tables.

    Returns
    -------
    int
        The ``i`` that was passed in.
    """
    # Local import: ``text`` makes the raw SQL executable on a Connection in
    # both SQLAlchemy 1.x and 2.x (sqlalchemy is already a notebook dependency).
    from sqlalchemy import text

    # Parse the vintage
    year, month = vintage

    print(f'Working on {year}-{month}.')

    # Read the month of visits breakdown.
    # The table is TEMPORARY, i.e. visible only to the database session that
    # created it, so create / read / drop must share one connection instead of
    # relying on the pool to hand back the same session for each call.
    print('Getting visits breakdown...')
    rvbts_formatted = restaurants_visits_breakdown_table_statement.format(
        year=year, month=month)
    with engine.connect() as conn:
        conn.execute(text(rvbts_formatted))
        breakdown = pd.read_sql(restaurants_visits_breakdown_export_statement,
                                conn)
        conn.execute(text(restaurants_visits_breakdown_drop_statement))

        # Get the devices count on the home-cbg level
        print('Getting device counts...')
        cds_formatted = cbg_devices_statement.format(year=year, month=month)
        cbg_home = pd.read_sql(cds_formatted, conn)

    cbg_home = pd.merge(cbg_table, cbg_home, how='left', on='cbg')
    # Remove duplicate cbgs: after the descending sort (missing counts last)
    # the first row per cbg is the one with the largest device count.
    cbg_home = (
        cbg_home
        .sort_values(['number_devices_residing'], ascending=False)
        .groupby('cbg')
        .head(1)
        # Nullable integer dtype keeps cbgs without a device count as missing
        .astype({'number_devices_residing': 'Int64'})
    )

    # Merge with cbg to get the restaurant census tract
    print('Getting the restaurants CTs.')
    restaurant_tracts = cbg_home[['cbg', 'ct']].rename(
        columns={'cbg': 'r_cbg', 'ct': 'r_ct'})
    breakdown = pd.merge(breakdown,
                         restaurant_tracts,
                         how='left',
                         on='r_cbg',
                         validate='many_to_one')

    # Merge again on the visitor side: tract, CBSA and device count of the
    # home cbg
    print('Getting the home CTs and device counts.')
    home_info = cbg_home.rename(columns={'cbg': 'home_cbg', 'ct': 'home_ct'})
    breakdown = pd.merge(breakdown,
                         home_info,
                         how='left',
                         on='home_cbg',
                         validate='many_to_one')

    # Construct the within-cbg restaurant choice counts: visitors from the
    # home cbg scaled by the restaurant's visits-per-visitor ratio
    print('Constructing the outside good choice count.')
    breakdown['visits_from_home_cbg'] = (breakdown['visitors_from_home_cbg'] *
                                         breakdown['raw_visit_counts'] /
                                         breakdown['raw_visitor_counts'])
    # Construct the outside-good choice count: total choices of a home cbg
    # (n_days * number_devices_residing) minus all its restaurant visits
    visits_by_home_cbg = (
        breakdown['visits_from_home_cbg']
        .groupby(breakdown['home_cbg'])
        .transform('sum')
    )
    n_days = monthrange(int(year), int(month))[1]
    breakdown['outside_good_count'] = (
        n_days * breakdown['number_devices_residing'] - visits_by_home_cbg
    )

    # Adding year and month columns
    print('Adding month and year.')
    breakdown['year'] = int(year)
    breakdown['month'] = int(month)

    # Drop unnecessary columns
    breakdown = breakdown.drop(columns=['raw_visit_counts',
                                        'raw_visitor_counts',
                                        'visitors_from_home_cbg',
                                        'number_devices_residing'])

    # Exporting to Stata
    print('Exporting.')
    output_file_path = os.path.join(output_folder_path,
                                    f'restaurants_shares_{i}.dta')
    breakdown.to_stata(path=output_file_path,
                       write_index=False,
                       version=119)
    print('Done.')
    return i
     
################################################################################

In [7]:
############# Export within-CBG restaurant shares by month #####################

# The export function receives the engine directly, so no separate
# module-level connection is opened here (it would never be used or closed).
engine = db.create_engine('postgresql://{user}:{user_pass}@{host}/{dataname2}')

# Construct all of the data vintages: 2017-06 through 2019-07, as
# (year, zero-padded month) string pairs
vintages_2017 = [('2017', f'{x:02d}') for x in range(6, 13)]
vintages_2018 = [('2018', f'{x:02d}') for x in range(1, 13)]
vintages_2019 = [('2019', f'{x:02d}') for x in range(1, 8)]
vintages_all = vintages_2017 + vintages_2018 + vintages_2019

# Export month by month; export_all collects the index of every finished month
export_all = []
try:
    for i, vintage in enumerate(vintages_all):
        export_all.append(get_month_visits_breakdown(i, vintage, engine))
finally:
    # Release the connection pool even when one of the months fails
    engine.dispose()

################################################################################

Working on 2017-06.
Getting visits breakdown...
Getting device counts...
Getting the restaurants CTs.
Getting the home CTs and device counts.
Constructing the outside good choice count.
Adding month and year.
Exporting.
Done.
Working on 2017-07.
Getting visits breakdown...
Getting device counts...
Getting the restaurants CTs.
Getting the home CTs and device counts.
Constructing the outside good choice count.
Adding month and year.
Exporting.
Done.
Working on 2017-08.
Getting visits breakdown...
Getting device counts...
Getting the restaurants CTs.
Getting the home CTs and device counts.
Constructing the outside good choice count.
Adding month and year.
Exporting.
Done.
Working on 2017-09.
Getting visits breakdown...
Getting device counts...
Getting the restaurants CTs.
Getting the home CTs and device counts.
Constructing the outside good choice count.
Adding month and year.
Exporting.
Done.
Working on 2017-10.
Getting visits breakdown...
Getting device counts...
Getting the restaurants

In [None]:
################################################################################
################################################################################
################################# Tests ########################################
################################################################################
################################################################################

In [None]:
# Scratch setup for the step-by-step cells that follow: pick one vintage to
# walk through and open a database handle for it.
vintage_test = ('2018', '07')
year, month = vintage_test

engine = db.create_engine('postgresql://{user}:{user_pass}@{host}/{dataname2}')
connection = engine.connect()

In [None]:
# Read the month of visits breakdown.
# The table is TEMPORARY, i.e. visible only to the database session that
# created it, so create / read / drop are issued on one explicitly held
# connection rather than through three separate engine-level calls.
print(f'Getting visits breakdown for {year}-{month}...')
rvbts_formatted = restaurants_visits_breakdown_table_statement.format(year=year,
                                                                      month=month)
with engine.connect() as conn:
    conn.execute(db.text(rvbts_formatted))
    restaurants_visits_breakdown = pd.read_sql(restaurants_visits_breakdown_export_statement,
                                               conn)
    conn.execute(db.text(restaurants_visits_breakdown_drop_statement))

In [None]:
# Preview the long-format table: one row per restaurant and home-CBG key
restaurants_visits_breakdown.head()

In [None]:
# Device counts per home cbg for the test vintage, attached to the CBG lookup
print(f'Getting device counts for {year}-{month}...')
cds_formatted = cbg_devices_statement.format(year=year, month=month)
cbg_home = (
    cbg_table
    .merge(pd.read_sql(cds_formatted, engine), how='left', on='cbg')
    # De-duplicate cbgs: descending sort, then keep the first row per cbg
    .sort_values(['number_devices_residing'], ascending=False)
    .groupby('cbg')
    .head(1)
    # Nullable integers so that cbgs without a count stay missing
    .astype({'number_devices_residing': 'Int64'})
)

In [None]:
# Preview the de-duplicated CBG table with this month's device counts
cbg_home.head()

In [None]:
# Attach the census tract of each restaurant's own cbg
print('Getting the restaurants CTs.')
# cbg_home keeps the renamed key afterwards; the home-cbg merge cell renames
# it again from 'r_cbg'
cbg_home = cbg_home.rename(columns={'cbg': 'r_cbg'})
restaurants_visits_breakdown = (
    restaurants_visits_breakdown
    .merge(cbg_home[['r_cbg', 'ct']],
           how='left',
           on='r_cbg',
           validate='many_to_one')
    .rename(columns={'ct': 'r_ct'})
)

In [None]:
# Check that the restaurant census tract column (r_ct) was attached
restaurants_visits_breakdown.head()

In [None]:
# Attach tract, CBSA and device count of the visitors' home cbg
print('Getting the home CTs and device counts.')
cbg_home = cbg_home.rename(columns={'r_cbg': 'home_cbg'})
restaurants_visits_breakdown = (
    restaurants_visits_breakdown
    .merge(cbg_home,
           how='left',
           on='home_cbg',
           validate='many_to_one')
    .rename(columns={'ct': 'home_ct'})
)

In [None]:
# Check the home-side columns (home_ct, home_cbsa, number_devices_residing)
restaurants_visits_breakdown.head()

In [None]:
print('Constructing the outside good choice count.')
# Number of days in the test month; n_days * number_devices_residing is the
# total number of choices made by a home cbg
n_days = monthrange(int(year), int(month))[1]
restaurants_visits_breakdown = restaurants_visits_breakdown.assign(
    # Within-cbg restaurant choice counts: visitors from the home cbg scaled
    # by the restaurant's visits-per-visitor ratio
    visits_from_home_cbg=lambda df: (df['visitors_from_home_cbg'] *
                                     df['raw_visit_counts'] /
                                     df['raw_visitor_counts']),
    # Outside-good choice count: total choices minus all restaurant visits
    # made from the same home cbg
    outside_good_count=lambda df: (
        n_days * df['number_devices_residing'] -
        df['visits_from_home_cbg'].groupby(df['home_cbg']).transform('sum')
    ),
)

In [None]:
# Inspect the new visits_from_home_cbg and outside_good_count columns
restaurants_visits_breakdown.head()

In [None]:
# Table size, then the number of rows with a strictly positive outside-good
# count. Series.sum() is vectorised and skips missing comparisons, whereas the
# builtin sum() loops row by row and turns into <NA> on nullable data.
print(restaurants_visits_breakdown.shape)
print((restaurants_visits_breakdown['outside_good_count'] > 0).sum())

In [None]:
# Tag every row with the vintage as integer year / month columns
print('Adding month and year.')
restaurants_visits_breakdown = restaurants_visits_breakdown.assign(
    year=int(year),
    month=int(month),
)
restaurants_visits_breakdown.head()

In [None]:
# Drop the inputs that were only needed to build the counts
restaurants_visits_breakdown = restaurants_visits_breakdown.drop(
    columns=['raw_visit_counts',
             'raw_visitor_counts',
             'visitors_from_home_cbg',
             'number_devices_residing']
)

# Write the test month to a Stata file named after its vintage
print('Exporting.')
output_file_path = os.path.join(
    output_folder_path,
    f'restaurants_shares_{year}_{month}.dta',
)
restaurants_visits_breakdown.to_stata(
    path=output_file_path,
    write_index=False,
    version=119,
)
print('Done.')

In [None]:
# Column dtypes of the frame that was written to Stata
restaurants_visits_breakdown.dtypes