In [7]:
from pathlib import Path

import psycopg2
import numpy

import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
from geoalchemy2 import Geometry, WKTElement
from shapely.geometry import Point

In [8]:
# Name of the PostgreSQL database used by this notebook, and the local
# postgres role to connect as.
dbname = 'map_the_vote'
username = 'codyschank' # change this to your username

# Use the canonical "postgresql://" scheme: the legacy "postgres://" alias is
# deprecated and is rejected by SQLAlchemy 1.4 and later.
engine = create_engine('postgresql://%s@localhost/%s'%(username,dbname))
print(engine.url)

## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))

postgres://codyschank@localhost/map_the_vote
True


In [9]:
# Raw psycopg2 connection; the pandas read_sql_query cells in this notebook
# read through `con`, while DDL/DML statements go through `engine`.
con = None
con = psycopg2.connect(user=username, database=dbname)

In [42]:
# intersect with districts where I have voter files
# Materializes select_all_addresses: all columns of addresses_table_tx_no_dupes
# for rows whose geometry intersects a congressional district whose geoid is
# one of the three values listed in the IN clause.
# NOTE(review): plain CREATE TABLE errors if the table already exists, so this
# cell cannot be re-run without dropping select_all_addresses first.
sql_query = """
CREATE TABLE select_all_addresses AS
SELECT b.* FROM us_congressional_districts a, addresses_table_tx_no_dupes b 
    WHERE ST_Intersects(a.geom,b.geom) AND a.geoid IN ('4821','4825','4810');
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x11cf7cef0>

In [43]:
# Join open addresses to voter files
# The LEFT JOIN keeps every row of select_all_addresses; an address with no
# voter whose vf_street_address equals its oa_street_address gets NULL in all
# voter-file (b.*) columns.
sql_query = """
CREATE TABLE voters_join AS
SELECT a.geom, a.oa_lon, a.oa_lat, a.oa_postcode, a.oa_street_address, a.oa_street, b.* FROM select_all_addresses a LEFT JOIN voter_file_all b ON a.oa_street_address = b.vf_street_address;
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x114ddd390>

In [4]:
# create table of open addresses that were NOT joined to any voter:
# rows of voters_join whose vf_voter_file_vanid is NULL, keeping only the
# address columns (geom and the oa_* fields).
sql_query = """
CREATE TABLE addresses_not_joined AS
SELECT geom, oa_lon, oa_lat, oa_postcode, oa_street_address, oa_street FROM voters_join WHERE vf_voter_file_vanid IS NULL;
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x11382e400>

In [5]:
# Load the unmatched addresses into a DataFrame and show its (rows, columns).
sql_query = """
SELECT * FROM addresses_not_joined;
"""
addresses_not_joined = pd.read_sql_query(sql=sql_query, con=con)
addresses_not_joined.shape

(545051, 6)

In [6]:
# After this delete, voters_join holds only rows where an address was matched
# to a voter (vf_voter_file_vanid is not NULL).
# The removed rows are the same ones the addresses_not_joined query selects, so
# that table has to be created before this cell is run.
sql_query = """
DELETE FROM voters_join
WHERE vf_voter_file_vanid IS NULL;
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x113817860>

In [44]:
# Start building voters_not_joined: every voter_file_all row plus the geom of
# any select_all_addresses row with the same street address. geom is NULL for
# voters with no matching address (LEFT JOIN).
sql_query = """
CREATE TABLE voters_not_joined AS
SELECT a.*, b.geom FROM voter_file_all a LEFT JOIN select_all_addresses b ON a.vf_street_address = b.oa_street_address;
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x114ddd5c0>

In [45]:
# delete voters who were joined, left over is voters not joined
# (a non-NULL geom means the LEFT JOIN that built this table found an address)
sql_query = """
DELETE FROM voters_not_joined
WHERE geom IS NOT NULL;
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x114ddd748>

In [46]:
# Drop the geom column from voters_not_joined; once the rows with a non-NULL
# geom have been deleted, it is NULL for every remaining row.
sql_query = """
ALTER TABLE voters_not_joined
DROP geom;
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x114ddd8d0>

In [47]:
# Load the unmatched voters into a DataFrame and show its (rows, columns).
sql_query = """
SELECT * FROM voters_not_joined;
"""
voters_not_joined = pd.read_sql_query(sql=sql_query, con=con)
voters_not_joined.shape

(290508, 24)

In [36]:
#voters_not_joined.groupby('vf_zip5').nunique()
#voters_not_joined_select = voters_not_joined.loc[voters_not_joined.vf_zip5=='76442']
# TODO: maybe write a loop over ZIP codes to do the fuzzy matching

In [48]:
# Preview the first five unmatched voters.
# NOTE(review): the rendered output contains individual voter records (ids,
# age, street address); clear it before sharing the notebook.
voters_not_joined.head(n=5)

Unnamed: 0,index,vf_voter_file_vanid,vf_sex,vf_age,vf_streetprefix,vf_streetno,vf_streetname,vf_streettype,vf_apttype,vf_aptno,...,vf_countyname,vf_street_address,vf_street_address_no_prefix,vf_street_address_no_type,vf_residential,vf_multi_unit,vf_PrecinctNamePad,vf_CountyCode,vf_cntyvtd,vf_join_field
0,1443,1993200,F,64,,40,Creek Side,Dr,,,...,Hays,40 CREEK SIDE DR 78676,40 CREEK SIDE DR 78676,40 Creek Side 78676,y,n,337,209,2090337,1993200
1,1561,1988372,M,63,,430,Jeffery,Ln,,,...,Coryell,430 JEFFERY LN 76522,430 JEFFERY LN 76522,430 Jeffery 76522,y,n,207,99,990207,1988372
2,2895,2005795,M,32,,1615,Fraser,Dr,,,...,Johnson,1615 FRASER DR 76028,1615 FRASER DR 76028,1615 Fraser 76028,y,n,3,251,2510003,2005795
3,9163,2358282,M,49,S,1606,Lake,Dr,,,...,Burnet,1606 S LAKE DR 78654,1606 LAKE DR 78654,1606 Lake 78654,y,n,18,53,530018,2358282
4,11801,2275607,F,67,,1255,County Road 197,,,,...,Coryell,1255 CR 197 76538,1255 COUNTY ROAD 197 76538,1255 County Road 197 76538,y,n,415,99,990415,2275607


In [37]:
# STOP TO DO GEOCODING OF VOTERS NOT JOINED AND RDI CHECK OF ADDRESSES NOT JOINED

In [11]:
# create table voters_not_joined_geocoded by joining the geocoding results
# (smarty_streets_geocode) back onto voters_not_joined, the table that
# identified the unmatched voters, using vf_street_address as the key.
# LEFT JOIN: a voter with no row in smarty_streets_geocode keeps geom = NULL.
sql_query = """
CREATE TABLE voters_not_joined_geocoded AS
SELECT a.*, b.geom FROM voters_not_joined a LEFT JOIN smarty_streets_geocode b ON a.vf_street_address = b.vf_street_address;
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x122f6cc88>

In [13]:
# now append voters_not_joined_geocoded to voters_join
# Only the seven listed columns are populated; every other voters_join column
# (e.g. the oa_* address fields) is left at its default for the inserted rows.
# NOTE(review): there is no WHERE clause, so voters whose geom is still NULL
# are appended as well - confirm that is intended.
sql_query = """
INSERT INTO voters_join (geom, vf_voter_file_vanid, vf_sex, vf_age, vf_street_address, vf_multi_unit, vf_cntyvtd)
SELECT geom, vf_voter_file_vanid, vf_sex, vf_age, vf_street_address, vf_multi_unit, vf_cntyvtd
FROM voters_not_joined_geocoded;
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x122f6c400>

In [16]:
# Copy voters_join into voters_join_3081; the copy's geom column is
# reprojected to SRID 3081 by the ALTER TABLE statement that follows.
sql_query = """
CREATE TABLE voters_join_3081 AS
    SELECT * FROM voters_join;
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x122fb6a90>

In [17]:
# Convert geom in place to Point geometry in SRID 3081 via ST_Transform.
# NOTE(review): presumably EPSG:3081 (a Texas-wide projected CRS in metres),
# chosen so distance tests work in linear units - confirm.
sql_query = """
ALTER TABLE voters_join_3081 
   ALTER COLUMN geom 
   TYPE Geometry(Point, 3081) 
   USING ST_Transform(geom, 3081);
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x122fb6748>

In [18]:
# Build a GiST index on the reprojected geometry column of voters_join_3081.
sql_query = """
CREATE INDEX voters_join_3081_gix ON voters_join_3081 USING GIST (geom);
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x122fb6f28>

In [None]:
## START HERE WHEN RDI CHECK DONE

In [None]:
# Copy and project tables that I need for DWithin, calculate indices
# I could have used this projection from the beginning
# NOTE(review): tx25_addresses_not_joined2 is not created anywhere in the
# visible part of this notebook (the earlier cells build addresses_not_joined);
# confirm which source table this copy should read from.
sql_query = """
CREATE TABLE tx25_addresses_not_joined2_3081 AS
    SELECT * FROM tx25_addresses_not_joined2
"""
engine.execute(sql_query)

In [None]:
# Convert geom of the copied address table in place to Point geometry in
# SRID 3081 via ST_Transform.
sql_query = """
ALTER TABLE tx25_addresses_not_joined2_3081 
   ALTER COLUMN geom 
   TYPE Geometry(Point, 3081) 
   USING ST_Transform(geom, 3081);
"""
engine.execute(sql_query)

In [None]:
# Build a GiST index on the reprojected geometry column of the address table.
sql_query = """
CREATE INDEX tx25_addresses_not_joined2_3081_gix ON tx25_addresses_not_joined2_3081 USING GIST (geom);
"""
engine.execute(sql_query)

In [16]:
# Add an integer flag column, tx25_join_mask, to the address table. It is
# initialised to 0 and then set to 1 by the two UPDATE statements that follow.
sql_query = """
ALTER TABLE tx25_addresses_not_joined2_3081
ADD COLUMN tx25_join_mask integer;
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x10f3ce080>

In [24]:
# Initialise the flag to 0 for every address row.
sql_query = """
UPDATE tx25_addresses_not_joined2_3081
SET "tx25_join_mask" = 0;
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x15e79b2b0>

In [25]:
# Set the flag to 1 for every address whose geometry lies within a distance of
# 10 of at least one tx25_join_3081 geometry.
# NOTE(review): the distance is in the units of the geometries' SRID -
# presumably metres if both tables are in SRID 3081; confirm.
sql_query = """
UPDATE tx25_addresses_not_joined2_3081 dst
SET "tx25_join_mask" = 1
FROM tx25_join_3081 src
WHERE ST_DWITHIN(src.geom,dst.geom,10);
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x117d3d240>

In [28]:
# Attach the residential and vacant columns of smarty_streets_rdi_check to the
# flagged addresses, matching on oa_street_address. LEFT JOIN: addresses with
# no row in smarty_streets_rdi_check get NULL in both columns.
sql_query = """
CREATE TABLE final_addresses_not_joined AS 
SELECT a.*, b.residential, b.vacant FROM tx25_addresses_not_joined2_3081 a LEFT JOIN smarty_streets_rdi_check b ON a.oa_street_address = b.oa_street_address;
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x15e79bb38>

In [34]:
# Keep only addresses that are confirmed 'Residential' and are not flagged
# with tx25_join_mask = 1.
# `residential` is filled by a LEFT JOIN, so it is NULL for addresses without
# an RDI result. `residential != 'Residential'` evaluates to NULL (not true)
# for those rows, which would silently survive the delete. IS DISTINCT FROM
# is NULL-safe, so rows with an unknown RDI status are removed as well.
sql_query = """
DELETE FROM final_addresses_not_joined
WHERE residential IS DISTINCT FROM 'Residential' OR tx25_join_mask = 1;
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x15e79b898>

In [None]:
# TODO: check the last two queries below to figure out how they need to change.

In [5]:
# this is a bit slow
# TODO: do the counting in SQL instead (GROUP BY cntyvtd with COUNT)
# As written, the table gets one cntyvtd value per (vtds_tx, tx25_join) pair
# whose geometries intersect and whose tx25_join row has a voter id.
# NOTE(review): this filters on b.voter_file_vanid, while the earlier cells use
# vf_voter_file_vanid - confirm the column name on tx25_join.
sql_query = """
CREATE TABLE registered_addresses_by_vtd AS
SELECT a.cntyvtd FROM vtds_tx a, tx25_join b 
    WHERE b.voter_file_vanid IS NOT NULL AND ST_Intersects(a.geom,b.geom);
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x1172a1a58>

In [4]:
# Export voters not matched to an address: rows of tx25_join_check that have
# no geometry.
sql_query = """
SELECT * FROM tx25_join_check WHERE geom IS NULL;
"""
voters_not_joined = pd.read_sql_query(sql_query,con)
# Build the output path from the current user's home directory instead of a
# hard-coded /Users/<name> prefix, so the cell also runs on other machines.
output_csv = Path.home() / "Dropbox" / "Insight" / "voters_not_joined.csv"
voters_not_joined.to_csv(output_csv)