In [1]:
################################################################################                                                                                                                                                              
################################################################################
#
# FILE: cbg_open_close_by_categories.ipynb
#
# BY: Dmitry Sedov 
#
# CREATED: Sun Feb 23 2020
#
# DESC: This code produces tables with categories open in CBGs by month,
#       considering only units that are more likely to have changed the 
#       open / close status.
#
# EXEC:
#      
################################################################################
################################################################################

In [2]:
################################ Libraries ######################################

import os
import sqlalchemy as db
import pandas as pd

################################################################################

In [3]:
############################### Constants ######################################

# Folder (absolute path) into which get_month_categories writes its CSV export.
output_folder_path = '/home/user/projects/urban/data/output/reduced-form' 

# SQL template selecting the place ids present in the visits table for a single
# month; the {year} and {month} fields are filled in with str.format() by
# get_month_categories before the query is executed.
visits_table_statement = """
SELECT 
    sname_place_id
FROM 
    visits
WHERE
    year = {year} AND
    month = {month}
;
"""

################################################################################

In [4]:
################## Get permanent part of data from SQL #########################

# NOTE(review): the URL below still carries literal {user} / {user_pass} /
# {host} / {dataname2} placeholders that are not substituted anywhere in the
# visible code; fill them in (preferably from environment variables rather
# than from source) before running this cell.
engine = db.create_engine('postgresql://{user}:{user_pass}@{host}/{dataname2}')

# One connection is kept open for the whole cell: a PostgreSQL temporary table
# is visible only inside the session that created it, so the CREATE statement
# and every query that joins against it must travel over this same connection
# instead of whichever pooled connection the engine happens to hand out.
connection = engine.connect()

# Units flagged as special in entry_info, restricted to groups 0 and 1; the
# index speeds up the joins on sname_place_id below.
open_close_table_statement = """
CREATE TEMPORARY TABLE open_close_units AS (
    SELECT 
        sname_place_id
    FROM
        entry_info
    WHERE
        is_special = TRUE and "group" IN (0, 1)
);
CREATE INDEX open_close_sname_idx
ON open_close_units(sname_place_id);
"""
# db.text() keeps the call valid on SQLAlchemy versions that no longer accept
# plain strings (and where Engine.execute has been removed).
result = connection.execute(db.text(open_close_table_statement))

# Establishments restricted to the open/close units selected above.
establishments_table_statement = """
SELECT 
    e.sname_place_id,
    e.naics_first2,
    e.cbg
FROM 
    establishments AS e
INNER JOIN
    open_close_units AS o
ON
    e.sname_place_id = o.sname_place_id
;
"""
establishments = pd.read_sql(establishments_table_statement, connection)

# Restaurants restricted to the open/close units selected above.
restaurants_table_statement = """
SELECT 
    r.sname_place_id,
    r.cbg
FROM 
    restaurants AS r
INNER JOIN
    open_close_units AS o
ON
    r.sname_place_id = o.sname_place_id
;
"""
restaurants = pd.read_sql(restaurants_table_statement, connection)

# Identifiers of all CBGs, used as the base of every monthly output table.
cbg_table_statement = """
SELECT
    censusblockgroup AS cbg
FROM
    cbgs 
;
"""

cbg_table = pd.read_sql(cbg_table_statement, connection)
# Remove duplicate CBGs (keep the first row of each) and index by CBG id
cbg_table = cbg_table.groupby('cbg').head(1).set_index('cbg')

################################################################################

In [6]:
############## Function to get month of visits to establishments ###############

def get_month_categories(vintage):
    """Count open establishments and restaurants per CBG for one month and
    write the result to the shared CSV file.

    Parameters
    ----------
    vintage : tuple
        ``(year, month)`` pair. Both values are substituted into the visits
        query template and converted with ``int()`` for the output columns.

    Returns
    -------
    str
        ``'appended to csv'`` when the output file already existed,
        ``'created csv'`` when this call created it.

    Notes
    -----
    Reads the module-level objects ``engine``, ``visits_table_statement``,
    ``establishments``, ``restaurants``, ``cbg_table`` and
    ``output_folder_path``. The output has one row per CBG in ``cbg_table``
    with columns ``cbg``, ``est_open``, ``rest_open``, ``year``, ``month``;
    CBGs without any matched unit get a count of 0.
    """
    year, month = vintage

    # Fetch the place ids that the visits table holds for this vintage month
    print(f'Getting visits for {year}-{month}...')
    month_query = visits_table_statement.format(year=year, month=month)
    visits_month = pd.read_sql(month_query, engine)

    # Inner merges on the shared columns keep only the units that appear in
    # this month's visits; validate raises if a key repeats on either side.
    print('Merging...')
    matched = [
        pd.merge(units, visits_month, how='inner', validate='one_to_one')
        for units in (establishments, restaurants)
    ]

    # Number of matched units per CBG, one single-column frame per category
    print('Transforming...')
    est_counts, rest_counts = (
        frame.groupby('cbg').size().to_frame(column)
        for frame, column in zip(matched, ('est_open', 'rest_open'))
    )
    cbg_categs = pd.merge(est_counts,
                          rest_counts,
                          how='outer',
                          left_index=True,
                          right_index=True,
                          validate='one_to_one')

    # Attach the counts to the full CBG list so that every CBG gets a row,
    # turn missing counts into zeros, then tag the rows with the vintage.
    for_output = (
        pd.merge(cbg_table,
                 cbg_categs,
                 how='left',
                 left_index=True,
                 right_index=True,
                 validate='one_to_one')
        .fillna(0)
        .astype('int64')
        .reset_index()
        .assign(year=int(year), month=int(month))
    )

    # Append to the CSV when it is already there (no header row); otherwise
    # create it and write the header.
    output_file_path = os.path.join(output_folder_path,
                                    'cbg_open_close_categs.csv')
    file_exists = os.path.exists(output_file_path)
    for_output.to_csv(path_or_buf=output_file_path,
                      na_rep='',
                      index=False,
                      header=not file_exists,
                      mode='a' if file_exists else 'w')
    return 'appended to csv' if file_exists else 'created csv'

################################################################################

In [7]:
# Build every (year, month) data vintage as zero-padded strings,
# covering 2017-06 through 2019-07.
vintages_2017 = [('2017', f'{m:02d}') for m in range(6, 13)]
vintages_2018 = [('2018', f'{m:02d}') for m in range(1, 13)]
vintages_2019 = [('2019', f'{m:02d}') for m in range(1, 8)]
vintages_all = [*vintages_2017, *vintages_2018, *vintages_2019]

In [8]:
# Export every vintage in chronological order, collecting the status string
# returned by each call.
# NOTE: get_month_categories appends to the CSV when it already exists, so
# running this cell a second time adds the same rows to the file again.
data_all = list(map(get_month_categories, vintages_all))

Getting visits for 2017-06...
Merging...
Transforming...
Getting visits for 2017-07...
Merging...
Transforming...
Getting visits for 2017-08...
Merging...
Transforming...
Getting visits for 2017-09...
Merging...
Transforming...
Getting visits for 2017-10...
Merging...
Transforming...
Getting visits for 2017-11...
Merging...
Transforming...
Getting visits for 2017-12...
Merging...
Transforming...
Getting visits for 2018-01...
Merging...
Transforming...
Getting visits for 2018-02...
Merging...
Transforming...
Getting visits for 2018-03...
Merging...
Transforming...
Getting visits for 2018-04...
Merging...
Transforming...
Getting visits for 2018-05...
Merging...
Transforming...
Getting visits for 2018-06...
Merging...
Transforming...
Getting visits for 2018-07...
Merging...
Transforming...
Getting visits for 2018-08...
Merging...
Transforming...
Getting visits for 2018-09...
Merging...
Transforming...
Getting visits for 2018-10...
Merging...
Transforming...
Getting visits for 2018-11...
M

In [None]:
# Display the per-vintage status strings ('created csv' / 'appended to csv')
# returned by get_month_categories.
data_all