In [1]:
import geopandas as gpd
import os
import pandas as pd
import psycopg2
import time
from shapely.geometry import Polygon

from config import split_files_folder
from keys import pg_user, pg_pass, pg_host, pg_port, pg_db, pg_table

srid = 4326 # SRID of the MS building footprints CRS; used for the PostGIS geometry column and for ST_GeomFromText on insert

In [2]:
def connect_pg(pg_user=pg_user, pg_pass=pg_pass, pg_host=pg_host,
               pg_port=pg_port, pg_db=None, autocommit=True):
    """Open a psycopg2 connection and hand back a cursor on it.

    Parameters
    ----------
    pg_user, pg_pass, pg_host, pg_port
        Credentials and server address; default to the values imported
        from `keys`.
    pg_db : str or None
        Passed to psycopg2.connect as `database`.
    autocommit : bool
        Forwarded to the connection's set_session call.

    Returns
    -------
    tuple
        (cursor, connection), in that order.
    """
    conn_params = dict(database=pg_db,
                       user=pg_user,
                       password=pg_pass,
                       host=pg_host,
                       port=pg_port)
    conn = psycopg2.connect(**conn_params)
    cur = conn.cursor()

    # the session mode is set after the cursor exists, same order as before
    conn.set_session(autocommit=autocommit)

    return cur, conn

In [3]:
def check_integrity(gdf):
    """Summarize basic geometry problems in a GeoDataFrame.

    Returns a 4-tuple:
    (row count, invalid geometries, geometries that are not Polygon
    instances, rows whose area is null or zero).
    """
    geometries = gdf['geometry']
    areas = gdf.area

    invalid_mask = ~geometries.is_valid
    nonpolygon_mask = geometries.map(lambda geom: not isinstance(geom, Polygon))
    bad_area_mask = (pd.isnull(areas)) | (areas == 0)

    return len(gdf), invalid_mask.sum(), nonpolygon_mask.sum(), bad_area_mask.sum()

## Load the data

In [4]:
# connect to the server (no specific db) and recreate the target db from scratch
cursor, connection = connect_pg(pg_db=None)
try:
    # connect_pg defaults to autocommit=True, so each statement runs on its own
    # rather than inside an open transaction
    cursor.execute("DROP DATABASE IF EXISTS {};".format(pg_db))
    cursor.execute("CREATE DATABASE {};".format(pg_db))
finally:
    # the next cell rebinds `cursor` and `connection` to the new db, so this
    # server-level connection must be closed here or it is leaked
    cursor.close()
    connection.close()

In [5]:
# connect to the newly created db
cursor, connection = connect_pg(pg_db=pg_db)
# IF NOT EXISTS keeps this cell re-runnable, in line with the IF EXISTS guards
# used when dropping the database and the table
cursor.execute("CREATE EXTENSION IF NOT EXISTS postgis;")

In [6]:
# (re)create the table: drop any old copy, define the columns, attach the
# geometry column, then index the geometry and state columns
table_setup_statements = (
    "DROP TABLE IF EXISTS {};".format(pg_table),
    "CREATE TABLE {} (id SERIAL PRIMARY KEY, state VARCHAR NOT NULL);".format(pg_table),
    "SELECT AddGeometryColumn ('{}', 'geom', {}, 'POLYGON', 2);".format(pg_table, srid),
    "CREATE INDEX fp_spatial_idx ON {} USING GIST(geom);".format(pg_table),
    "CREATE INDEX fp_state_idx ON {} (state);".format(pg_table),
)
for statement in table_setup_statements:
    cursor.execute(statement)

In [7]:
# templates for a multi-row INSERT: every row shares the same VALUES fragment,
# with one positional slot for the WKT and named slots for state and srid
row_template = "('{state}', ST_GeomFromText('{}', {srid}))"
query_template = "INSERT INTO {table} (state, geom) VALUES " + row_template
additional_row = ", " + row_template

In [8]:
# sorted() gives a deterministic load order; os.listdir returns entries in arbitrary order
for filename in sorted(os.listdir(split_files_folder)):

    filepath = os.path.join(split_files_folder, filename)

    # skip hidden entries and sub-directories (e.g. notebook checkpoint folders)
    # that would otherwise be handed to the geojson reader
    if filename.startswith('.') or not os.path.isfile(filepath):
        continue

    print(filename, end=' ')
    start_time = time.time()

    # the state code is the text between the first '_' and the first '-' of the file name
    state = filename[filename.find('_') + 1 : filename.find('-')]

    # load the geojson file and extract its geometries as wkt
    gdf = gpd.read_file(filepath)
    geoms_wkt = [geom.wkt for geom in gdf['geometry'].values]

    # nothing to insert: the template always holds one row placeholder, so
    # formatting it with zero geometries would raise IndexError
    if not geoms_wkt:
        print('empty, skipped')
        continue

    # prep query to insert all the rows with a single command
    additional_rows = additional_row * (len(geoms_wkt) - 1)
    query_template_full = query_template + additional_rows + ';'
    query = query_template_full.format(*geoms_wkt, srid=srid, state=state, table=pg_table)

    cursor.execute(query)

    print('{:.1f} secs'.format(time.time() - start_time))

01_AL-000.json 114.8 secs
01_AL-001.json 140.2 secs
01_AL-002.json 54.6 secs
02_AK-000.json 44.0 secs
04_AZ-000.json 137.0 secs
04_AZ-001.json 162.5 secs
04_AZ-002.json 77.6 secs
05_AR-000.json 147.9 secs
05_AR-001.json 72.5 secs
06_CA-000.json 155.0 secs
06_CA-001.json 174.9 secs
06_CA-002.json 175.6 secs
06_CA-003.json 179.7 secs
06_CA-004.json 182.6 secs
06_CA-005.json 199.9 secs
06_CA-006.json 218.8 secs
06_CA-007.json 227.9 secs
06_CA-008.json 249.5 secs
06_CA-009.json 255.8 secs
06_CA-010.json 146.9 secs
08_CO-000.json 180.7 secs
08_CO-001.json 225.5 secs
08_CO-002.json 15.5 secs
09_CT-000.json 181.6 secs
09_CT-001.json 32.7 secs
10_DE-000.json 61.5 secs
11_DC-000.json 11.4 secs
12_FL-000.json 218.2 secs
12_FL-001.json 212.9 secs
12_FL-002.json 199.5 secs
12_FL-003.json 198.7 secs
12_FL-004.json 217.5 secs
12_FL-005.json 229.5 secs
12_FL-006.json 115.1 secs
13_GA-000.json 179.8 secs
13_GA-001.json 223.0 secs
13_GA-002.json 265.3 secs
13_GA-003.json 275.7 secs
15_HI-000.json 51.3 

In [9]:
%%time
# optimize the db after all loaded and rebuild indexes
cursor.execute('VACUUM(FULL, ANALYZE);')

Wall time: 1h 7min 35s


## Test results

In [10]:
# total number of rows in the table. expecting 122,608,100 rows.
count_sql = "SELECT count(*) AS exact_count FROM {};".format(pg_table)
cursor.execute(count_sql)
rows = cursor.fetchall()
exact_count = rows[0][0]  # single row, single column
print('{:,.0f} rows'.format(exact_count))

122,608,100 rows


In [11]:
# fetch every AL footprint as WKT. expecting 2,392,171 rows.
state = 'AL'
query = """
        SELECT ST_AsText({table}.geom)
        FROM {table}
        WHERE {table}.state='{state}';
        """
state_sql = query.format(table=pg_table, state=state)
cursor.execute(state_sql)
rows = cursor.fetchall()
len(rows)

2392171

## All done

In [12]:
# release the cursor first, then the connection
for db_handle in (cursor, connection):
    db_handle.close()