In [1]:
################################################################################                                                                                                                                                              
################################################################################
#
# FILE: datasets-for-description.ipynb
#
# BY: Dmitry Sedov 
#
# CREATED: Wed Feb 12 2020
#
# DESC: Code that exports the datasets used to construct descriptive tables.
#       October 2018 is used for the device and visit counts.
#
# COMMENT: 
#
################################################################################
################################################################################

In [2]:
############################# Libraries ########################################

import sqlalchemy as db
import pandas as pd
import os
import json

################################################################################

In [3]:
############################# Parameters #######################################

# Reference period used to filter the monthly tables (October 2018).
year, month = 2018, 10

# Folder that receives the exported CSV files.
output_folder_path = '/home/user/projects/urban/data/output/descriptive/'

################################################################################

In [4]:
######################### Database connections #################################

# Build the SQLAlchemy engine for the PostgreSQL database.
# NOTE(review): the {user}/{user_pass}/{host}/{dataname2} placeholders in the
# URL are not substituted anywhere in this cell - presumably they are filled
# in by hand before running; confirm.
engine = db.create_engine(
    'postgresql://{user}:{user_pass}@{host}/{dataname2}'
)

# Open a connection on that engine.
connection = engine.connect()

################################################################################

In [8]:
####################### CBG-dataset construction ###############################

# Query returning, for every row of `cbgs`, the concatenated state/county/tract
# codes (ct), the block-group id, CBSA, area and the number of devices residing
# there in the selected year/month. The LEFT JOIN keeps CBGs that have no
# matching `home` row for that period (their device count comes back NULL).
cbg_table_statement = """
SELECT
    CONCAT(c.statefips, c.countyfips, c.tractcode) AS ct,
    c.censusblockgroup AS cbg,
    c.cbsa,
    c.area_m2,
    h.number_devices_residing
FROM
    cbgs AS c
LEFT JOIN
    home AS h
ON
    c.censusblockgroup = h.census_block_group 
AND
    h.year = {year} 
AND
    h.month = {month}
;
""".format(year=year, month=month)

def _cbg_visits_statement(table, prefix, year, month):
    """Build the per-CBG aggregation query for a table of places.

    The query joins `table` to `visits` for the given year and month, drops
    places whose raw_visit_counts is NULL for that period, and aggregates the
    remaining places by CBG.

    Parameters
    ----------
    table : str
        Name of the places table (e.g. 'restaurants'). It is interpolated
        directly into the SQL text, so it must be a trusted identifier.
    prefix : str
        Prefix of the output columns: `<prefix>_number`, `<prefix>_visits`
        and `<prefix>_area`. Also interpolated directly into the SQL text.
    year, month :
        Period used to filter the `visits` table.

    Returns
    -------
    str
        The SQL statement.
    """
    return """
WITH month_places AS (
    SELECT
        p.sname_place_id,
        p.cbg,
        p.area_m2,
        v.raw_visit_counts
    FROM
        {table} AS p
    LEFT JOIN
        visits AS v
    ON
        p.sname_place_id = v.sname_place_id
    AND
        v.year = {year}
    AND
        v.month = {month}
)
SELECT
    cbg,
    COUNT(sname_place_id) AS {prefix}_number,
    SUM(raw_visit_counts) AS {prefix}_visits,
    SUM(area_m2) AS {prefix}_area
FROM
    month_places
WHERE
    raw_visit_counts IS NOT NULL
GROUP BY
    cbg;
""".format(table=table, prefix=prefix, year=year, month=month)


# Per-CBG restaurant aggregates: rest_number, rest_visits, rest_area
cbg_restaurants_table_statement = _cbg_visits_statement(
    'restaurants', 'rest', year, month)

# Per-CBG establishment aggregates: est_number, est_visits, est_area
cbg_establishments_table_statement = _cbg_visits_statement(
    'establishments', 'est', year, month)

# Run the CBG query, then keep a single row per CBG: after sorting by device
# count in descending order, the first row of each group is the one with the
# largest number_devices_residing.
cbg_table = pd.read_sql(cbg_table_statement, engine)
cbg_table = (
    cbg_table
    .sort_values(by='number_devices_residing', ascending=False)
    .groupby('cbg')
    .head(1)
)

# Per-CBG restaurant and establishment aggregates
cbg_restaurants_table = pd.read_sql(cbg_restaurants_table_statement, engine)
cbg_establishments_table = pd.read_sql(cbg_establishments_table_statement, engine)

# Left-merge both aggregates onto the CBG table, so every CBG row is kept
data_cbg = pd.merge(
    pd.merge(cbg_table, cbg_restaurants_table, how='left', on='cbg'),
    cbg_establishments_table,
    how='left',
    on='cbg',
)

# Drop the intermediate frames
del cbg_table, cbg_restaurants_table, cbg_establishments_table

# Write the merged CBG dataset to CSV (without the index) for local processing
data_cbg_file_path = os.path.join(output_folder_path, 'data_cbg.csv')
data_cbg.to_csv(data_cbg_file_path, index=False)

################################################################################

In [5]:
#################### Restaurants-dataset construction ##########################

# Query returning one row per restaurant/visits match: restaurant attributes,
# location fields (state, zip code, CBSA, CBG, coordinates) and the raw visit
# count for the selected year/month. The LEFT JOIN keeps restaurants without a
# `visits` row for that period (raw_visit_counts is then NULL); `phone` is a
# boolean flag telling whether yelp_phone is present.
restaurants_table_statement = """
SELECT
    r.brands,
    r.naics_code,
    r.categories,
    r.price,
    r.rating,
    r.review_count,
    r.area_m2,
    r.total_minutes_open,
    r.yelp_phone IS NOT NULL AS phone,
    v.raw_visit_counts,
    r.state,
    r.zip_code,
    r.cbsa,
    r.cbg,
    r.latitude,
    r.longitude
FROM
    restaurants AS r
LEFT JOIN
    visits AS v
ON  
    r.sname_place_id = v.sname_place_id
AND
    v.year = {year} 
AND
    v.month = {month}
;
""".format(year=year, month=month)

data_restaurants = pd.read_sql(restaurants_table_statement, engine)

# Convert the values of the categories column (dicts) to JSON strings.
# Mapping over the single column yields the same values as a row-wise
# DataFrame.apply(..., axis=1) without building a Series for every row.
data_restaurants['categories'] = data_restaurants['categories'].map(json.dumps)

# Write the restaurants dataset to CSV (without the index) for local processing
data_restaurants_file_path = os.path.join(output_folder_path, 'data_restaurants.csv')
data_restaurants.to_csv(data_restaurants_file_path, index=False)

################################################################################

In [6]:
#################### Establishments-dataset construction #######################

# Query returning one row per establishment/visits match: NAICS code, area,
# opening minutes, location fields (state, zip code, CBSA, coordinates) and the
# raw visit count for the selected year/month. The LEFT JOIN keeps
# establishments without a `visits` row for that period.
# NOTE(review): unlike the restaurants export, no cbg column is selected here -
# confirm that this is intended.
establishments_table_statement = """
SELECT
    e.naics_code,
    e.area_m2,
    e.total_minutes_open,
    v.raw_visit_counts,
    e.state,
    e.zip_code,
    e.cbsa,
    e.latitude,
    e.longitude
FROM
    establishments AS e
LEFT JOIN
    visits AS v
ON  
    e.sname_place_id = v.sname_place_id
AND
    v.year = {year} 
AND
    v.month = {month}
;
""".format(year=year, month=month)

# Run the establishments query
data_establishments = pd.read_sql(establishments_table_statement, engine)

# Write the establishments dataset to CSV (without the index) for local
# processing
data_establishments_file_path = os.path.join(output_folder_path,
                                             'data_establishments.csv')
data_establishments.to_csv(data_establishments_file_path, index=False)

################################################################################

In [6]:
####################### Close database connections #############################

# Close the connection opened with engine.connect() first: Engine.dispose()
# only closes connections that are checked in to the pool, so a connection
# that is still checked out would otherwise stay open.
connection.close()
engine.dispose()

################################################################################