In [1]:
import os
import psycopg2
import subprocess
import time

from config import split_files_folder
from keys import pg_user, pg_pass, pg_host, pg_port, pg_db, pg_table

# SRID handed to AddGeometryColumn when the geometry column is created.
# 4326 is the EPSG code for WGS 84 longitude/latitude, the CRS of the
# Microsoft building footprints files.
srid = 4326 #ms building footprints crs

In [2]:
def connect_pg(pg_user=pg_user, pg_pass=pg_pass, pg_host=pg_host,
               pg_port=pg_port, pg_db=None, autocommit=True):
    """
    Open a psycopg2 connection and hand back a cursor together with it.

    Parameters
    ----------
    pg_user, pg_pass, pg_host, pg_port
        Credentials and server address; default to the values imported
        from the keys module.
    pg_db
        Name of the database to connect to. None is passed through to
        psycopg2 unchanged, i.e. no database name is given.
    autocommit
        Autocommit mode applied to the session before it is returned.

    Returns
    -------
    tuple
        (cursor, connection), in that order.
    """
    conn_params = {'database': pg_db,
                   'user': pg_user,
                   'password': pg_pass,
                   'host': pg_host,
                   'port': pg_port}
    conn = psycopg2.connect(**conn_params)

    # configure the session first, then open the cursor on it
    conn.set_session(autocommit=autocommit)
    cur = conn.cursor()

    return cur, conn

In [3]:
# connect to the server (no database name given) and recreate pg_db from scratch.
# connect_pg() defaults to autocommit=True, which is needed here because
# PostgreSQL refuses DROP/CREATE DATABASE inside a transaction block.
admin_cursor, admin_connection = connect_pg(pg_db=None)
try:
    admin_cursor.execute("DROP DATABASE IF EXISTS {};".format(pg_db))
    admin_cursor.execute("CREATE DATABASE {};".format(pg_db))
finally:
    # this server-level connection is only needed for the two statements
    # above; close it explicitly so it is not left open for the rest of
    # the session once a connection to pg_db itself is made
    admin_cursor.close()
    admin_connection.close()

In [4]:
# connect to the newly created db; every later cell works through this
# cursor/connection pair
cursor, connection = connect_pg(pg_db=pg_db)

# IF NOT EXISTS keeps this cell re-runnable and also tolerates a database
# that was created from a template which already has PostGIS installed
cursor.execute("CREATE EXTENSION IF NOT EXISTS postgis;")

In [5]:
# create the table: spell out the DDL first, then run it in dependency order
# (table -> geometry column -> spatial index -> refresh statistics)
create_table_sql = "CREATE TABLE {} (id SERIAL PRIMARY KEY);".format(pg_table)
add_geom_sql = "SELECT AddGeometryColumn ('{}', 'geom', %s, 'POLYGON', 2);".format(pg_table)
create_index_sql = "CREATE INDEX my_index ON {} USING GIST(geom);".format(pg_table)

cursor.execute(create_table_sql)
# the SRID is the only value bound as a query parameter
cursor.execute(add_geom_sql, [srid])
# NOTE(review): the GiST index exists before any rows are loaded, so it is
# maintained on every insert during the bulk load -- confirm this is intended
cursor.execute(create_index_sql)
cursor.execute("VACUUM ANALYZE;")

In [6]:
# ogr2ogr command line used to append one geojson file to a postgres table.
# positional placeholders, in order: db name, user, password, input file path,
# destination table (layer) name
cmd_template = (
    'ogr2ogr -f "PostgreSQL" '
    'PG:"dbname={} user={} password={}" '
    '"{}" -nln {} -append'
)

In [7]:
# for each geojson file, load it into the db

# the connection string is identical for every file, so build it once.
# host/port are added when set so that ogr2ogr targets the same server
# that connect_pg() talks to instead of falling back to its own default.
conn_parts = ['dbname={}'.format(pg_db),
              'user={}'.format(pg_user),
              'password={}'.format(pg_pass)]
if pg_host:
    conn_parts.append('host={}'.format(pg_host))
if pg_port:
    conn_parts.append('port={}'.format(pg_port))
pg_conn_str = 'PG:' + ' '.join(conn_parts)

# sorted() makes the load order deterministic; os.listdir gives no ordering guarantee
for filename in sorted(os.listdir(split_files_folder)):

    print(filename, end=' ')
    start_time = time.time()

    filepath = os.path.join(split_files_folder, filename)

    # with shell=False the command must be a list of arguments: a single
    # command string is only accepted on Windows, on POSIX it is taken as
    # the name of the executable. No shell-style quoting is needed here,
    # each list item reaches ogr2ogr as exactly one argument.
    cmd = ['ogr2ogr', '-f', 'PostgreSQL', pg_conn_str,
           filepath, '-nln', pg_table, '-append']
    result = subprocess.run(cmd, shell=False)

    elapsed = time.time() - start_time
    if result.returncode == 0:
        print('{:.2f}'.format(elapsed))
    else:
        # report the failure but keep going, so one bad file does not
        # abort the remaining loads
        print('{:.2f} FAILED (ogr2ogr exit code {})'.format(elapsed, result.returncode))

01_AL-000.json 162.57
01_AL-001.json 153.35
01_AL-002.json 60.16
02_AK-000.json 41.47
04_AZ-000.json 160.40
04_AZ-001.json 160.20
04_AZ-002.json 79.26
05_AR-000.json 162.32
05_AR-001.json 76.83
06_CA-000.json 162.58
06_CA-001.json 159.36
06_CA-002.json 167.89
06_CA-003.json 166.12
06_CA-004.json 180.42
06_CA-005.json 175.13
06_CA-006.json 189.20
06_CA-007.json 188.11
06_CA-008.json 247.72
06_CA-009.json 193.90
06_CA-010.json 105.57
08_CO-000.json 163.43
08_CO-001.json 158.44
08_CO-002.json 6.95
09_CT-000.json 152.42
09_CT-001.json 24.26
10_DE-000.json 51.51
11_DC-000.json 9.09
12_FL-000.json 156.25
12_FL-001.json 155.63
12_FL-002.json 161.63
12_FL-003.json 169.00
12_FL-004.json 172.27
12_FL-005.json 187.93
12_FL-006.json 105.50
13_GA-000.json 154.75
13_GA-001.json 153.46
13_GA-002.json 156.34
13_GA-003.json 127.50
15_HI-000.json 38.93
16_ID-000.json 134.81
17_IL-000.json 153.65
17_IL-001.json 154.52
17_IL-002.json 159.77
17_IL-003.json 166.19
17_IL-004.json 127.53
18_IN-000.json 155.93

In [8]:
%%time
# every file is loaded: optimize the db, rebuild indexes and refresh the
# statistics in a single maintenance pass
optimize_sql = 'VACUUM(FULL, ANALYZE);'
cursor.execute(optimize_sql)

Wall time: 57min 31s


In [9]:
# how many rows are in the table now?
count_sql = "SELECT count(*) AS exact_count FROM {};".format(pg_table)
cursor.execute(count_sql)

# count(*) yields exactly one row holding one column
(row_count,) = cursor.fetchone()
print('{:,.0f} rows'.format(row_count))

122,608,100 rows


In [10]:
# done with the database: release the cursor first, then the connection it belongs to
for handle in (cursor, connection):
    handle.close()