# Store tracts and rental listings in PostGIS

...for a fast spatial-join of listings to tracts.

First, create the database from the command prompt if it doesn't already exist:

```
createdb -U postgres craigslist_tracts
psql -U postgres -d craigslist_tracts -c "CREATE EXTENSION postgis;"
```

More info in the psycopg2 docs: http://initd.org/psycopg/docs/usage.html

In [1]:
import geopandas as gpd
import pandas as pd
import psycopg2
from shapely.geometry import Point
from keys import pg_user, pg_pass, pg_host, pg_port, pg_db

# coordinate reference system (EPSG:4326) used for both the tracts and the listings below
crs = {'init' : 'epsg:4326'}

In [2]:
%%time
# read the census tracts from disk and reproject them to the shared crs (EPSG:4326) in one step
tracts = gpd.read_file('data/us_census_tracts_2014').to_crs(crs)

# report how many tracts were loaded
print(len(tracts))

74133
Wall time: 36.7 s


In [3]:
%%time
# load listings and set initial crs to 4326
listings = pd.read_csv('data/craigslist_listings_cleaned.csv')

# build one Point per listing (x=lng, y=lat) straight from the two coordinate columns;
# zipping the columns avoids the per-row Series that DataFrame.apply(axis=1) constructs
geometry = [Point(lng, lat) for lng, lat in zip(listings['lng'], listings['lat'])]

listings = gpd.GeoDataFrame(listings, geometry=geometry, crs=crs)
print(len(listings))

1393426
Wall time: 51.5 s


In [4]:
# both datasets must share a crs before they are loaded into the same database
assert tracts.crs == listings.crs

# srid is the numeric spatial reference ID PostGIS uses
# take the code after the 'epsg:' prefix by splitting on the colon: str.strip('epsg:') would
# instead remove any of the characters e, p, s, g, : from both ends of the string
# int() also fails loudly if the crs string does not end in a numeric code
srid = int(tracts.crs['init'].split(':')[-1])

In [5]:
# optional: uncomment the two lines below to continue with a 1,000-row random sample of each
# dataset (tracts are also reduced to the GEOID, ALAND and geometry columns) instead of the full data
#listings = listings.sample(1000)
#tracts = tracts[['GEOID', 'ALAND', 'geometry']].sample(1000)

## Upload tracts and listings to PostGIS

In [6]:
# open a connection to the database using the settings imported from keys
connection = psycopg2.connect(
    database=pg_db, user=pg_user, password=pg_pass, host=pg_host, port=pg_port
)

# every SQL statement below is issued through this cursor
cursor = connection.cursor()

In [7]:
# list all tables, leaving out those whose names start with pg_ or sql_
list_tables_query = "select relname from pg_class where relkind='r' and relname !~ '^(pg_|sql_)'"
cursor.execute(list_tables_query)
cursor.fetchall()

[('listings_tracts',), ('spatial_ref_sys',), ('listings',), ('tracts',)]

#### add tracts table

In [8]:
# rebuild the tracts table from scratch; each entry is (statement, parameters) and they run in order:
# drop any existing table, create the attribute columns, add the geometry column, index the geometry
tracts_setup = [
    ("DROP TABLE IF EXISTS tracts", None),
    ("CREATE TABLE tracts (id SERIAL PRIMARY KEY, geoid VARCHAR NOT NULL, aland BIGINT NOT NULL)", None),
    ("SELECT AddGeometryColumn ('tracts', 'geom', %s, 'MULTIPOLYGON', 2)", [srid]),
    ("CREATE INDEX tract_index ON tracts USING GIST(geom)", None),
]
for statement, parameters in tracts_setup:
    cursor.execute(statement, parameters)

# make the new table permanent
connection.commit()

In [9]:
%%time
# Reload the tracts table as a single transaction: clear any existing rows, insert every tract,
# and commit only if all of it succeeded.
insert_tract_query = """
    INSERT INTO tracts (geoid, aland, geom)
    VALUES (%s, %s, ST_Multi(ST_GeomFromText(%s, %s)))
    """

# One parameter tuple per tract, built lazily from the columns themselves. Zipping the columns
# avoids the Series that DataFrame.iterrows() constructs for every row, and .tolist() hands
# psycopg2 plain Python values rather than numpy scalars.
tract_params = (
    (geoid, aland, tract_geometry.wkt, srid)
    for geoid, aland, tract_geometry
    in zip(tracts['GEOID'].tolist(), tracts['ALAND'].tolist(), tracts['geometry'])
)

try:
    cursor.execute("DELETE FROM tracts")
    cursor.executemany(insert_tract_query, tract_params)
    connection.commit()
except Exception:
    # undo the partial load so the connection is not left in an aborted transaction,
    # then let the error surface in the notebook
    connection.rollback()
    raise

Wall time: 2min 54s


#### add listings table

In [10]:
# recreate the listings table from scratch (any existing copy is dropped first)
create_listings_table = """CREATE TABLE listings (id SERIAL PRIMARY KEY,
                                         date VARCHAR NOT NULL,
                                         region VARCHAR NOT NULL,
                                         bedrooms INTEGER,
                                         rent REAL,
                                         sqft REAL)"""
cursor.execute("DROP TABLE IF EXISTS listings")
cursor.execute(create_listings_table)

# add the point geometry column with the shared srid, then build a GiST index on it
add_geometry_column = "SELECT AddGeometryColumn ('listings', 'geom', %s, 'POINT', 2)"
cursor.execute(add_geometry_column, [srid])
cursor.execute("CREATE INDEX listing_index ON listings USING GIST(geom)")

# make the new table permanent
connection.commit()

In [11]:
%%time
def null_if_missing(value):
    """Return None when value is NaN/None so psycopg2 writes a SQL NULL; otherwise return value unchanged."""
    return None if pd.isnull(value) else value


# Reload the listings table as a single transaction: clear any existing rows, insert every
# listing, and commit only if all of it succeeded.
insert_listing_query = """
    INSERT INTO listings (date, region, bedrooms, rent, sqft, geom)
    VALUES (%s, %s, %s, %s, %s, ST_GeomFromText(%s, %s))
    """

# Zipping the columns avoids the Series that DataFrame.iterrows() constructs for every row,
# and .tolist() hands psycopg2 plain Python values rather than numpy scalars.
listing_columns = zip(listings['date'].tolist(),
                      listings['region'].tolist(),
                      listings['bedrooms'].tolist(),
                      listings['rent'].tolist(),
                      listings['sqft'].tolist(),
                      listings['geometry'])

# One parameter tuple per listing, built lazily. The nullable numeric columns (bedrooms, rent,
# sqft) must be None, not NaN, for psycopg2 to insert a NULL: a NaN is rejected by the INTEGER
# column and would be stored as a literal NaN value in the REAL columns.
listing_params = (
    (date, region, null_if_missing(bedrooms), null_if_missing(rent), null_if_missing(sqft), point.wkt, srid)
    for date, region, bedrooms, rent, sqft, point in listing_columns
)

try:
    cursor.execute("DELETE FROM listings")
    cursor.executemany(insert_listing_query, listing_params)
    connection.commit()
except Exception:
    # undo the partial load so the connection is not left in an aborted transaction,
    # then let the error surface in the notebook
    connection.rollback()
    raise

Wall time: 7min 22s


#### optimize the database

In [12]:
%%time
# vacuum and analyze the database to optimize it after building indices and inserting rows
# VACUUM cannot run inside a transaction block, so switch the connection to autocommit for this
# one statement; in autocommit mode the statement takes effect immediately, so no commit is needed
original_isolation_level = connection.isolation_level
connection.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
try:
    cursor.execute("VACUUM ANALYZE")
finally:
    # always restore the previous isolation level, even if VACUUM fails
    connection.set_isolation_level(original_isolation_level)

Wall time: 4.32 s


#### verify SRIDs, row counts, and data

In [13]:
# ask PostGIS which SRID is registered for the geom column of each table
srid_query = """SELECT
                   Find_SRID('public', 'tracts', 'geom') as tracts_srid,
                   Find_SRID('public', 'listings', 'geom') as listings_srid"""
cursor.execute(srid_query)
cursor.fetchall()

[(4326, 4326)]

In [14]:
# count the rows now stored in the tracts table
tracts_count_query = "SELECT count(*) AS exact_count FROM tracts"
cursor.execute(tracts_count_query)
rows = cursor.fetchall()

# the result is a single row with a single column: the count
first_row = rows[0]
first_row[0]

74133

In [15]:
# preview three stored tracts, with each geometry returned as WKT text
tracts_preview_columns = ['GEOID', 'ALAND', 'geometry']
cursor.execute("SELECT geoid, aland, ST_AsText(geom) FROM tracts LIMIT 3")
rows = cursor.fetchall()
gpd.GeoDataFrame(rows, columns=tracts_preview_columns)

Unnamed: 0,GEOID,ALAND,geometry
0,1089001902,12784312,"MULTIPOLYGON(((-86.556074 34.674245,-86.555981..."
1,1089010800,62434260,"MULTIPOLYGON(((-86.554613 34.786468,-86.554611..."
2,1089010701,88171204,"MULTIPOLYGON(((-86.642866 34.876408,-86.642823..."


In [16]:
# count the rows now stored in the listings table
listings_count_query = "SELECT count(*) AS exact_count FROM listings"
cursor.execute(listings_count_query)
rows = cursor.fetchall()

# the result is a single row with a single column: the count
first_row = rows[0]
first_row[0]

1393426

In [17]:
# preview three stored listings, with each point returned as WKT text
listings_preview_query = """SELECT date, region, bedrooms, rent, sqft, ST_AsText(geom)
                  FROM listings LIMIT 3"""
listings_preview_columns = ['date', 'region', 'bedrooms', 'rent', 'sqft', 'geometry']
cursor.execute(listings_preview_query)
rows = cursor.fetchall()
gpd.GeoDataFrame(rows, columns=listings_preview_columns)

Unnamed: 0,date,region,bedrooms,rent,sqft,geometry
0,2014-05-11,santabarbara,3,3500.0,1200.0,POINT(-119.726987 34.399757)
1,2014-05-11,santabarbara,2,850.0,882.0,POINT(-119.855845 34.411019)
2,2014-05-11,santabarbara,1,1290.0,,POINT(-119.85433 34.410415)


## all done

In [18]:
# release the database resources: close the cursor first, then the connection it belongs to
cursor.close()
connection.close()