# Spatial join parcel IDs to census block IDs

In [1]:
import pandas as pd
import psycopg2
from keys import pg_user, pg_pass, pg_host, pg_port, pg_db

# CSV file that the joined parcel_id/block_geoid pairs are written to at the end of the notebook
output_path: str = 'data/parcels_joined_blocks.csv'

In [2]:
# open a connection to the postgres database with the settings imported from keys
pg_settings = {'database': pg_db,
               'user': pg_user,
               'password': pg_pass,
               'host': pg_host,
               'port': pg_port}
connection = psycopg2.connect(**pg_settings)

# a single cursor is reused by every query cell below
cursor = connection.cursor()

In [3]:
# look up the SRID of each table's geom column so they can be compared by eye
# (nothing is asserted here: the two values in the displayed row should be equal)
srid_query = """SELECT
                      Find_SRID('public', 'blocks', 'geom') as blocks_srid,
                      Find_SRID('public', 'parcels', 'geom') as parcels_srid"""
cursor.execute(srid_query)
cursor.fetchall()

[(4326, 4326)]

In [4]:
# how many rows are on each side of the join?
# each count(*) query returns exactly one single-column row, so unpack it directly
cursor.execute("SELECT count(*) AS row_count FROM blocks")
(count_blocks_table,) = cursor.fetchone()

cursor.execute("SELECT count(*) AS row_count FROM parcels")
(count_parcels_table,) = cursor.fetchone()

# display both counts as the cell output
count_blocks_table, count_parcels_table

(710145, 1956207)

## Do the spatial join

In [5]:
%%time
# drop the table if it already exists and recreate it
# select distinct to keep only 1 parcel/block pair if parcel centroid lies on border of multiple blocks
query = """
        DROP TABLE IF EXISTS parcels_blocks;
        CREATE TABLE parcels_blocks AS
        SELECT DISTINCT ON (id)
            parcels.id AS id,
            parcels.parcel_id AS parcel_id,
            blocks.geoid AS block_geoid
        FROM parcels LEFT JOIN blocks
        ON ST_Intersects(parcels.geom, blocks.geom)
        """

cursor.execute(query)
connection.commit()

Wall time: 2min 37s


In [6]:
%%time
# vacuum and analyze the database to optimize it after creating new table
old_isolation_level = connection.isolation_level
connection.set_isolation_level(0)
cursor.execute("VACUUM ANALYZE")
connection.commit()
connection.set_isolation_level(old_isolation_level)

Wall time: 6.04 s


## Retrieve the joined data

In [7]:
# do all parcels have a block geoid?
# count(*) counts every row while count(block_geoid) skips NULLs, so the two
# numbers in the result are equal only when no parcel is missing a block
count_query = "SELECT count(*) AS row_count, count(block_geoid) AS geoid_count FROM parcels_blocks"
cursor.execute(count_query)

# print the size of the parcels table for comparison, then display the query result
print(count_parcels_table)
cursor.fetchall()

1956207


[(1956207, 1956207)]

In [8]:
%%time
# select the results from the new table, ignoring any parcels with null block_geoid
cursor.execute("""SELECT parcel_id, block_geoid
                  FROM parcels_blocks
                  WHERE block_geoid IS NOT NULL""")
rows = cursor.fetchall()

Wall time: 1.22 s


In [9]:
# finished with the database: release the cursor first, then the connection
for db_handle in (cursor, connection):
    db_handle.close()

## Work with the results

In [10]:
# build a dataframe from the fetched (parcel_id, block_geoid) tuples
column_names = ['parcel_id', 'block_geoid']
df = pd.DataFrame(data=rows, columns=column_names)

# sanity check: the select skipped null block_geoids, so the row count only
# equals the parcels table count if every parcel was matched to a block
assert count_parcels_table == len(df)
len(df)

1956207

In [11]:
# preview the first five rows of the joined data
df.head(n=5)

Unnamed: 0,parcel_id,block_geoid
0,229116,60014511022061
1,244166,60014511011025
2,202378,60014351031029
3,2004420,60971541003180
4,340332,60014419272014


In [12]:
# save the parcel/block pairs to disk as UTF-8 CSV, leaving out the dataframe index
df.to_csv(output_path, encoding='utf-8', index=False)