In [1]:
################################################################################                                                                                                                                                              
################################################################################
#
# FILE: cbg_establishments_over_time_conservative.ipynb
#
# BY: Dmitry Sedov 
#
# CREATED: Sun Feb 23 2020
#
# DESC: This code produces tables with categories open in CBGs by month,
#       considering only units that are more likely to have changed the 
#       open / close status in the MIDDLE of the observational period. 
#       Breakdown by NAICS code is done. 
#
# EXEC:
#      
################################################################################
################################################################################

In [2]:
################################ Libraries ######################################

import os
import sqlalchemy as db
import pandas as pd

################################################################################

In [5]:
############################### Constants ######################################

output_folder_path = '/home/user/projects/urban/data/output/reduced-form' 

visits_table_statement = """
SELECT 
    sname_place_id
FROM 
    visits
WHERE
    year = {year} AND
    month = {month}
;
"""

################################################################################

In [93]:
################## Get permanent part of data from SQL #########################

engine = db.create_engine('postgresql://{user}:{user_pass}@{host}/{dataname2}')
connection = engine.connect()

open_close_table_statement = """
CREATE TEMPORARY TABLE open_close_units AS (
    SELECT 
        sname_place_id
    FROM
        entry_info
    WHERE
        is_special = TRUE AND "group" IN (0, 1) AND kind >= 3 AND kind <= 20
);
CREATE INDEX open_close_sname_idx
ON open_close_units(sname_place_id);
"""
result = engine.execute(open_close_table_statement)

establishments_table_statement = """
SELECT 
    e.sname_place_id,
    e.naics_first2,
    e.cbg
FROM 
    establishments AS e
INNER JOIN
    open_close_units AS o
ON
    e.sname_place_id = o.sname_place_id
;
"""
establishments = pd.read_sql(establishments_table_statement, engine)

restaurants_table_statement = """
SELECT 
    r.sname_place_id,
    r.cbg
FROM 
    restaurants AS r
INNER JOIN
    open_close_units AS o
ON
    r.sname_place_id = o.sname_place_id
;
"""
restaurants = pd.read_sql(restaurants_table_statement, engine)

cbg_table_statement = """
SELECT
    censusblockgroup AS cbg
FROM
    cbgs 
;
"""

cbg_table = pd.read_sql(cbg_table_statement, engine)
# Remove duplicate CBGs
cbg_table = cbg_table.groupby('cbg').head(1).set_index('cbg')

################################################################################

In [94]:
############ Function count establishments in each CBG by category ############

def get_month_categories(vintage):
    
    # Parse the vintage
    year, month = vintage
    
    # Get the visits for that vinatage month
    print(f'Getting visits for {year}-{month}...')
    visits_table_statement_month = visits_table_statement.format(year = year,
                                                                 month = month)
    visits_month = pd.read_sql(visits_table_statement_month, engine)

    print('Merging...')
    est_open_this_month = pd.merge(establishments,
                                   visits_month,
                                   how = 'inner',
                                   validate = 'one_to_one')

    rest_open_this_month = pd.merge(restaurants,
                                    visits_month,
                                    how = 'inner',
                                    validate = 'one_to_one')
    
    print('Transforming...')
    # Total open establishments count
    cbg_est_total = est_open_this_month.groupby(['cbg']
                                               ).size().to_frame(name = 'open')
    cbg_est_total.insert(loc = 0, column = 'category', value = 'total_est')
    cbg_est_total.reset_index(inplace = True)
    
    # Count establishments in each cbg by naics category
    cbg_est_open = est_open_this_month.groupby(['cbg',
                                            'naics_first2']
                                          ).size().reset_index(name = 'open')
    cbg_est_open.rename(columns = {'naics_first2': 'category'}, 
                        inplace = True)
    cbg_est_open['category'] = cbg_est_open['category'].apply(lambda x: 'naics' + str(x))
    
    # Count restaurants in each cbg 
    cbg_rest_open = rest_open_this_month.groupby(['cbg']
                                            ).size().to_frame('open')
    cbg_rest_open.insert(loc = 0, column = 'category', value = 'rest')
    cbg_rest_open.reset_index(inplace = True)
    
    # Get all establishments and restaurants in cbgs into one dataframe
    cbg_categs = pd.concat([cbg_est_total, cbg_est_open, cbg_rest_open])
    
    # Pivot to put categories into columns 
    cbg_categs = cbg_categs.pivot(index = 'cbg',
                                  columns = 'category',
                                  values = 'open')
    cbg_categs.rename_axis(columns = None, inplace = True)
    
    # Reshape and join with the entire cbg table
    cbg_categs = pd.merge(cbg_table, 
                          cbg_categs, 
                          how = 'left', 
                          left_index = True, 
                          right_index = True, 
                          validate = 'one_to_one')
    cbg_categs.fillna(0, inplace = True)
    cbg_categs = cbg_categs.astype('int64')
    cbg_categs['year'] = int(year)
    cbg_categs['month'] = int(month)
    cbg_categs.reset_index(inplace = True)
    
    return cbg_categs
    
################################################################################

In [3]:
# Construct all of the data vinatages
vintages_2017 = [('2017', '{0:0=2d}'.format(x)) for x in range(6, 13)] 
vintages_2018 = [('2018', '{0:0=2d}'.format(x)) for x in range(1, 13)] 
vintages_2019 = [('2019', '{0:0=2d}'.format(x)) for x in range(1, 8)]                                                                                                                                                                         
vintages_all = vintages_2017 + vintages_2018 + vintages_2019

In [105]:
# Export all of the data
data_all = [get_month_categories(v) for v in vintages_all]

Getting visits for 2017-06...
Merging...
Getting visits for 2017-07...
Merging...
Getting visits for 2017-08...
Merging...
Getting visits for 2017-09...
Merging...
Getting visits for 2017-10...
Merging...
Getting visits for 2017-11...
Merging...
Getting visits for 2017-12...
Merging...
Getting visits for 2018-01...
Merging...
Getting visits for 2018-02...
Merging...
Getting visits for 2018-03...
Merging...
Getting visits for 2018-04...
Merging...
Getting visits for 2018-05...
Merging...
Getting visits for 2018-06...
Merging...
Getting visits for 2018-07...
Merging...
Getting visits for 2018-08...
Merging...
Getting visits for 2018-09...
Merging...
Getting visits for 2018-10...
Merging...
Getting visits for 2018-11...
Merging...
Getting visits for 2018-12...
Merging...
Getting visits for 2019-01...
Merging...
Getting visits for 2019-02...
Merging...
Getting visits for 2019-03...
Merging...
Getting visits for 2019-04...
Merging...
Getting visits for 2019-05...
Merging...
Getting visits f

In [112]:
for i, d in enumerate(data_all):
    file_path = os.path.join(output_folder_path,
                             f'part{i}.csv')
    d.to_csv(file_path, 
             na_rep = '',
             index = False,
             header = True)

In [17]:
data_all = []
for i in range(1, len(vintages_all)):
    print(i)
    file_path = os.path.join(output_folder_path,
                             f'part{i}.csv')
    d = pd.read_csv(file_path)
    cols_to_add = [x for x in ['naics11', 'naics23', 'naics22'] if x not in d.columns]
    for c in cols_to_add:
        print(f'Adding column {c}.')
        d[c] = int(0)
    data_all.append(d)

1
Adding column naics11.
Adding column naics23.
Adding column naics22.
2
Adding column naics11.
Adding column naics23.
Adding column naics22.
3
Adding column naics11.
Adding column naics23.
Adding column naics22.
4
Adding column naics11.
Adding column naics23.
Adding column naics22.
5
Adding column naics23.
Adding column naics22.
6
Adding column naics22.
7
Adding column naics22.
8
Adding column naics22.
9
Adding column naics22.
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25


In [22]:
data_all_combined = pd.concat(data_all, ignore_index = True, sort = True)

In [23]:
data_all_combined

Unnamed: 0,cbg,month,naics11,naics22,naics23,naics31,naics32,naics33,naics42,naics44,...,naics56,naics61,naics62,naics71,naics72,naics81,naics92,rest,total_est,year
0,340110403001,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2017
1,340110409024,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2017
2,360110415001,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2017
3,360650245001,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2017
4,181410029001,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5508320,60710001161,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2019
5508321,60710119004,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2019
5508322,60710034033,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2019
5508323,60710053003,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2019


In [25]:
output_file_path = os.path.join(output_folder_path, 
                                'cbg_establishments_over_time_conservative.csv')
data_all_combined.to_csv(path_or_buf = output_file_path, 
                          na_rep = '', 
                          index = False, 
                          header = True)