# Store tracts and points in PostGIS

...for a fast spatial-join of points to tracts.

First, install PostgreSQL, PostGIS, and psycopg2. Then create the database from the command prompt if it doesn't already exist:

```
createdb -U postgres points_tracts
psql -U postgres -d points_tracts -c "CREATE EXTENSION postgis;"
```

More info in the psycopg2 docs: https://www.psycopg.org/docs/usage.html

In [None]:
import geopandas as gpd
import pandas as pd
import psycopg2
from shapely.geometry import Point
from keys import pg_user, pg_pass, pg_host, pg_port, pg_db

# coordinate reference system shared by the tracts and the points: EPSG:4326
crs = dict(init='epsg:4326')

In [None]:
%%time
# read the census tracts file and reproject it to the shared crs (epsg:4326)
# in a single chained expression, then report how many tracts were loaded
tracts = gpd.read_file('data/us_census_tracts_2014').to_crs(crs)
print(len(tracts))

In [None]:
%%time
# load points and set initial crs to 4326
points = pd.read_csv('data/points-dataset.csv')

# build one shapely Point per row from the (lng, lat) column pair; zipping the
# two columns avoids constructing a Series for every row, as a row-wise
# apply(axis=1) does, and it also works when the CSV contains no rows
geometry = [Point(lng_lat) for lng_lat in zip(points['lng'], points['lat'])]

points = gpd.GeoDataFrame(points, geometry=geometry, crs=crs)
print(len(points))

In [None]:
# both layers must be in the same crs before they are loaded into PostGIS
assert tracts.crs == points.crs, 'tracts and points must share the same crs'

# srid is the numeric spatial reference ID PostGIS uses
# take the part after the authority prefix: str.strip('epsg:') removes any of
# the characters e, p, s, g and : from both ends rather than the literal
# prefix, so it only worked by accident and would leave 'EPSG:4326' untouched
srid = tracts.crs['init'].split(':')[-1]

In [None]:
# optional: uncomment the two lines below to work with a random sample of
# 1,000 points and 1,000 tracts (keeping only the GEOID, ALAND, and geometry
# tract columns) instead of the full datasets
#points = points.sample(1000)
#tracts = tracts[['GEOID', 'ALAND', 'geometry']].sample(1000)

## Upload tracts and points to PostGIS

In [None]:
# open a connection to the database using the settings imported from keys,
# then get a cursor for executing SQL statements on it
connection = psycopg2.connect(
    host=pg_host,
    port=pg_port,
    database=pg_db,
    user=pg_user,
    password=pg_pass,
)
cursor = connection.cursor()

In [None]:
# show the name of every ordinary table (relkind 'r') whose name does not
# start with pg_ or sql_
list_tables_sql = "select relname from pg_class where relkind='r' and relname !~ '^(pg_|sql_)'"
cursor.execute(list_tables_sql)
cursor.fetchall()

#### add tracts table

In [None]:
# rebuild the tracts table from scratch: drop any existing copy, create the
# attribute columns, add the geometry column with the shared srid, and put a
# GiST index on the geometry; each entry is (statement, parameters or None)
tracts_ddl = [
    ("DROP TABLE IF EXISTS tracts", None),
    ("CREATE TABLE tracts (id SERIAL PRIMARY KEY, geoid VARCHAR NOT NULL, aland BIGINT NOT NULL)", None),
    ("SELECT AddGeometryColumn ('tracts', 'geom', %s, 'MULTIPOLYGON', 2)", [srid]),
    ("CREATE INDEX tract_index ON tracts USING GIST(geom)", None),
]
for statement, params in tracts_ddl:
    cursor.execute(statement, params)

# make the new table definition permanent
connection.commit()

In [None]:
%%time
# the INSERT statement is identical for every row, so define it once outside
# the loop; ST_Multi wraps each geometry as a multi-geometry so that it fits
# the MULTIPOLYGON geom column
insert_tract_sql = """INSERT INTO tracts (geoid, aland, geom)
                      VALUES (%s, %s, ST_Multi(ST_GeomFromText(%s, %s)))"""

# run the whole reload as one transaction: the connection context manager
# commits when the block finishes and rolls back if any statement raises, so
# a bad row cannot leave the connection stuck in an aborted transaction
with connection:
    # empty the table first so re-running this cell does not duplicate rows
    cursor.execute("DELETE FROM tracts")

    # insert each tract into the tracts table one at a time
    for label, row in tracts.iterrows():
        data = (row['GEOID'], row['ALAND'], row['geometry'].wkt, srid)
        cursor.execute(insert_tract_sql, data)

#### add points table

In [None]:
# rebuild the points table from scratch: drop any existing copy, create the
# attribute columns, add the geometry column with the shared srid, and put a
# GiST index on the geometry
create_points_sql = """CREATE TABLE points (id SERIAL PRIMARY KEY,
                                         date VARCHAR NOT NULL,
                                         region VARCHAR NOT NULL,
                                         bedrooms INTEGER,
                                         rent REAL,
                                         sqft REAL)"""

# each entry is (statement, parameters or None), executed in order
points_ddl = [
    ("DROP TABLE IF EXISTS points", None),
    (create_points_sql, None),
    ("SELECT AddGeometryColumn ('points', 'geom', %s, 'POINT', 2)", [srid]),
    ("CREATE INDEX point_index ON points USING GIST(geom)", None),
]
for statement, params in points_ddl:
    cursor.execute(statement, params)

# make the new table definition permanent
connection.commit()

In [None]:
%%time
# the INSERT statement is identical for every row, so define it once outside
# the loop instead of rebuilding it on each iteration
insert_point_sql = """
    INSERT INTO points (date, region, bedrooms, rent, sqft, geom)
    VALUES (%s, %s, %s, %s, %s, ST_GeomFromText(%s, %s))
    """

# run the whole reload as one transaction: the connection context manager
# commits when the block finishes and rolls back if any statement raises, so
# a bad row cannot leave the connection stuck in an aborted transaction
with connection:
    # empty the table first so re-running this cell does not duplicate rows
    cursor.execute("DELETE FROM points")

    # insert each point into the points table one at a time
    for label, row in points.iterrows():
        # bedrooms can be null, but must be None for psycopg2 to insert it as
        # a null value, not a 'NaN' string
        bedrooms = row['bedrooms']
        if pd.isnull(bedrooms):
            bedrooms = None

        # NOTE(review): rent and sqft are nullable columns as well but are
        # passed through unchanged -- confirm the data never has NaN there,
        # or that storing NaN rather than NULL is intended
        data = (row['date'], row['region'], bedrooms, row['rent'], row['sqft'],
                row['geometry'].wkt, srid)
        cursor.execute(insert_point_sql, data)

#### optimize the database

In [None]:
%%time
# vacuum and analyze the database to optimize it after building indices and inserting rows
# VACUUM cannot run inside a transaction block, so switch the connection to
# autocommit for this one statement (statements then take effect immediately,
# which is why no commit is needed afterwards)
original_isolation_level = connection.isolation_level
connection.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
try:
    cursor.execute("VACUUM ANALYZE")
finally:
    # restore the previous isolation level even if the vacuum fails, so later
    # cells do not silently keep running in autocommit mode
    connection.set_isolation_level(original_isolation_level)

#### verify SRIDs, row counts, and data

In [None]:
# ask PostGIS which SRID is registered for the geom column of each table;
# the result is displayed as the cell output
srid_lookup_sql = """SELECT
                   Find_SRID('public', 'tracts', 'geom') as tracts_srid,
                   Find_SRID('public', 'points', 'geom') as points_srid"""
cursor.execute(srid_lookup_sql)
cursor.fetchall()

In [None]:
# count the rows now stored in the tracts table
tract_count_sql = "SELECT count(*) AS exact_count FROM tracts"
cursor.execute(tract_count_sql)
rows = cursor.fetchall()

# the query returns one row with one value: display that count
first_row = rows[0]
first_row[0]

In [None]:
# preview three stored tracts; ST_AsText returns each geometry as WKT text,
# so the geometry column displayed here holds strings
tract_columns = ['GEOID', 'ALAND', 'geometry']
cursor.execute("SELECT geoid, aland, ST_AsText(geom) FROM tracts LIMIT 3")
rows = cursor.fetchall()
gpd.GeoDataFrame(data=rows, columns=tract_columns)

In [None]:
# count the rows now stored in the points table
point_count_sql = "SELECT count(*) AS exact_count FROM points"
cursor.execute(point_count_sql)
rows = cursor.fetchall()

# the query returns one row with one value: display that count
first_row = rows[0]
first_row[0]

In [None]:
# preview three stored points; ST_AsText returns each geometry as WKT text,
# so the geometry column displayed here holds strings
point_columns = ['date', 'region', 'bedrooms', 'rent', 'sqft', 'geometry']
preview_points_sql = """SELECT date, region, bedrooms, rent, sqft, ST_AsText(geom)
                  FROM points LIMIT 3"""
cursor.execute(preview_points_sql)
rows = cursor.fetchall()
gpd.GeoDataFrame(data=rows, columns=point_columns)

## all done

In [None]:
# release the database resources: the cursor first, then its connection
for resource in (cursor, connection):
    resource.close()