In [1]:
import os
import glob
import psycopg2
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
from geoalchemy2 import Geometry, WKTElement
from shapely.geometry import Point

In [2]:
# Name of the PostgreSQL database that holds the address tables, and the
# local postgres role used to connect to it.
dbname = 'map_the_vote'
username = 'codyschank' # change this to your username

# Use the canonical "postgresql" dialect name: the legacy "postgres" alias is
# deprecated in SQLAlchemy and no longer accepted by 1.4+.
engine = create_engine('postgresql://%s@localhost/%s'%(username,dbname))
print(engine.url)

## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))

postgres://codyschank@localhost/map_the_vote
True


In [3]:
# Open a raw psycopg2 connection to the same database for direct queries.
con = None  # placeholder; only replaced if the connect call below succeeds
con = psycopg2.connect(user=username, database=dbname)

In [4]:
# Directory containing the CSV files to load, and the file extension to match.
path = '/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/'
extension = 'csv'

# The glob pattern is relative, so switch into the data directory first and
# then rebuild full paths by prefixing every match with `path`.
os.chdir(path)
files = sorted(path + name for name in glob.glob('*.{}'.format(extension)))

In [5]:
# FIRST do the first file
# Every column gets an explicit dtype; keep_default_na=False stops pandas from
# converting empty fields (and strings such as "NA") into NaN.
address_data = pd.read_csv(
    files[0],
    dtype={
        "LON": float, "LAT": float, "NUMBER": str, "STREET": str, "UNIT": str,
        "CITY": str, "DISTRICT": str, "REGION": str, "POSTCODE": str, "ID": str,
        "HASH": str,
    },
    keep_default_na=False,
)

# build one point geometry per row from its LON/LAT pair
geometry = [Point(xy) for xy in zip(address_data.LON, address_data.LAT)]

# add oa prefix to all columns
address_data.columns = ['oa_' + str(col) for col in address_data.columns]
address_data_gd = gpd.GeoDataFrame(address_data, crs=4326, geometry=geometry)

# lower-case column names; this helps later with database queries
# (PostgreSQL folds unquoted identifiers to lower case)
address_data_gd.columns = [col.lower() for col in address_data_gd.columns]

address_data_gd["oa_street_address"] = address_data_gd["oa_number"].map(str) + ' ' + address_data_gd["oa_street"]

# this is a bit slow, takes about 30-60s
# Convert each shapely geometry into a GeoAlchemy2 WKTElement: the Geometry
# column type passed to to_sql works with WKT/WKB elements, not shapely objects.
address_data_gd['geom'] = address_data_gd['geometry'].apply(lambda x: WKTElement(x.wkt, srid=4326))

# Use `columns=` rather than a positional axis: pandas 2.0 made every drop()
# argument after `labels` keyword-only, so drop('geometry', 1) raises TypeError.
address_data_gd.drop(columns='geometry', inplace=True)

In [6]:
table_name = "addresses_table_tx"
# Seed the table with the first 1000 rows; if_exists='replace' drops any
# previous copy of the table before writing.
address_data_gd.head(1000).to_sql(
    name=table_name,
    con=engine,
    if_exists='replace',
    index=False,
    dtype={'geom': Geometry('POINT', srid=4326)},
)

In [7]:
chunk_size = 1000
# Append the rest of the first file (rows 0-999 were already written when the
# table was created) in slices of chunk_size rows.
# The range stops at the row count itself: the previous upper bound of
# shape[0] + chunk_size produced a trailing empty slice and a wasted to_sql call.
for i in range(1000, address_data_gd.shape[0], chunk_size):
    address_data_gd[i:(i+chunk_size)].to_sql(table_name, engine, if_exists='append', index=False,
                                             dtype={'geom': Geometry('POINT', srid=4326)})

In [8]:
# NOW do the rest of the files
# Each file goes through the same steps as the first one and is appended to
# table_name in slices of chunk_size rows (both names are set in earlier cells).
for file in files[1:]: # skip the first one since I read it in already
    print(file)
    # Explicit dtypes; keep_default_na=False stops pandas from converting empty
    # fields (and strings such as "NA") into NaN.
    address_data = pd.read_csv(
        file,
        dtype={
            "LON": float, "LAT": float, "NUMBER": str, "STREET": str, "UNIT": str,
            "CITY": str, "DISTRICT": str, "REGION": str, "POSTCODE": str, "ID": str,
            "HASH": str,
        },
        keep_default_na=False,
    )

    # build one point geometry per row from its LON/LAT pair
    geometry = [Point(xy) for xy in zip(address_data.LON, address_data.LAT)]

    # add oa prefix to all columns
    address_data.columns = ['oa_' + str(col) for col in address_data.columns]
    address_data_gd = gpd.GeoDataFrame(address_data, crs=4326, geometry=geometry)

    # lower-case column names; this helps later with database queries
    # (PostgreSQL folds unquoted identifiers to lower case)
    address_data_gd.columns = [col.lower() for col in address_data_gd.columns]

    address_data_gd["oa_street_address"] = address_data_gd["oa_number"].map(str) + ' ' + address_data_gd["oa_street"]

    # this is a bit slow, takes about 30-60s
    # Convert each shapely geometry into a GeoAlchemy2 WKTElement: the Geometry
    # column type passed to to_sql works with WKT/WKB elements, not shapely objects.
    address_data_gd['geom'] = address_data_gd['geometry'].apply(lambda x: WKTElement(x.wkt, srid=4326))

    # Use `columns=` rather than a positional axis: pandas 2.0 made every drop()
    # argument after `labels` keyword-only, so drop('geometry', 1) raises TypeError.
    address_data_gd.drop(columns='geometry', inplace=True)

    # The range stops at the row count itself: the previous upper bound of
    # shape[0] + chunk_size produced a trailing empty slice and a wasted to_sql call.
    for i in range(0, address_data_gd.shape[0], chunk_size):
        address_data_gd[i:(i+chunk_size)].to_sql(table_name, engine, if_exists='append', index=False,
                                                 dtype={'geom': Geometry('POINT', srid=4326)})
    

/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/austin.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/bandera.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/bastrop.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/bexar.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/brazoria.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/brazos.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/burleson.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/camp.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/capcog.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/carson.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/city_of_abilene.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/city_of_amarillo.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/city_of_austin.csv
/Users/codyschank/Dow

/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/shackelford.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/statewide-partial.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/tarrant-county.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/texoma-counties.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/titus.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/trinity.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/uvalde.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/vanzandt.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/waco.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/walker.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/washington.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/wharton.csv
/Users/codyschank/Downloads/openaddr-collected-us_south/us/tx/williamson.csv
/Users/co

In [None]:
# Overwrite oa_postcode of every address with the zcta5ce10 value of the
# zip5_us row whose geom intersects the address geom.
# `dst` and `src` are table aliases; in a PostgreSQL UPDATE the AS keyword is
# optional, so `addresses_table_tx dst` and `addresses_table_tx AS dst` are equivalent.
# NOTE(review): zip5_us is not created anywhere in the visible notebook —
# presumably it was loaded into the database beforehand; confirm.
sql_query = """
UPDATE addresses_table_tx dst
SET oa_postcode = src.zcta5ce10
FROM zip5_us src
WHERE ST_Intersects(src.geom, dst.geom);
"""

engine.execute(sql_query)

In [None]:
# Update street_address column in addresses_table_tx: append the postcode,
# separated by a single space, to oa_street_address on every row.
# NOTE: not idempotent — each re-run appends the postcode again.
sql_query = """
UPDATE addresses_table_tx
SET oa_street_address = oa_street_address || ' ' || oa_postcode;
"""
engine.execute(sql_query)

In [9]:
# Delete rows where street number is empty.
# The CSVs are read with keep_default_na=False, so a missing number is stored
# as the empty string '' rather than NULL.
sql_query = """
DELETE FROM addresses_table_tx
WHERE oa_number = '';
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x11f0e7908>

In [10]:
# Delete rows whose street number is the literal string '0'.
sql_query = """
DELETE FROM addresses_table_tx
WHERE oa_number = '0';
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x11f0e74a8>

In [11]:
# Delete rows whose street name is the empty string.
sql_query = """
DELETE FROM addresses_table_tx
WHERE oa_street = '';
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x11f0e7160>

In [None]:
# start here

In [4]:
# COUNTY ROAD to CR in oa_street_address (every occurrence of the substring is replaced)
sql_query = """
UPDATE addresses_table_tx SET oa_street_address = replace(oa_street_address, 'COUNTY ROAD', 'CR');
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x11d27c550>

In [5]:
# STATE ROAD to SR in oa_street_address (every occurrence of the substring is replaced)
sql_query = """
UPDATE addresses_table_tx SET oa_street_address = replace(oa_street_address, 'STATE ROAD', 'SR');
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x11d2688d0>

In [6]:
# RANCH ROAD to RR in oa_street_address (every occurrence of the substring is replaced)
sql_query = """
UPDATE addresses_table_tx SET oa_street_address = replace(oa_street_address, 'RANCH ROAD', 'RR');
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x11d2687b8>

In [7]:
# HIGHWAY to HWY in oa_street_address (every occurrence of the substring is replaced)
sql_query = """
UPDATE addresses_table_tx SET oa_street_address = replace(oa_street_address, 'HIGHWAY', 'HWY');
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x11d27cf28>

In [13]:
# Join a detached "MC" prefix to the rest of the name in oa_street_address,
# e.g. "MC DONALD" -> "MCDONALD".
# The regex escape \m (written \\m in this Python string) only matches at the
# start of a word, so words that merely END in "MC" are left alone; the plain
# replace(..., 'MC ', 'MC') also turned e.g. "DYNAMIC DR" into "DYNAMICDR".
# The 'g' flag replaces every occurrence in the string, as replace() did.
sql_query = """
UPDATE addresses_table_tx SET oa_street_address = regexp_replace(oa_street_address, '\\mMC ', 'MC', 'g');
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x117259a20>

In [4]:
# Collapse every run of two or more spaces in oa_street_address to one space.
# A plain replace('  ', ' ') only halves each run per pass, so runs of five or
# more spaces survived even two passes; the ' {2,}' regex with the 'g' flag
# handles runs of any length in a single pass.
sql_query = """
UPDATE addresses_table_tx SET oa_street_address = regexp_replace(oa_street_address, ' {2,}', ' ', 'g');
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x11c1841d0>

In [None]:
# Whitespace clean-up pass: collapse any remaining run of two or more spaces
# in oa_street_address to a single space.
# The ' {2,}' regex with the 'g' flag removes runs of any length in one pass,
# whereas replace('  ', ' ') only halves a run each time it is executed.
sql_query = """
UPDATE addresses_table_tx SET oa_street_address = regexp_replace(oa_street_address, ' {2,}', ' ', 'g');
"""
engine.execute(sql_query)

In [27]:
# remove duplicates: copy one row per distinct oa_street_address into the new
# table addresses_table_tx_no_dupes.
# NOTE(review): there is no ORDER BY, so which of the duplicate rows
# DISTINCT ON keeps is not specified by PostgreSQL ("first one" is arbitrary).
# NOTE: plain CREATE TABLE fails if addresses_table_tx_no_dupes already
# exists, so this cell cannot be re-run without dropping the table first.
sql_query = """
CREATE TABLE addresses_table_tx_no_dupes AS
SELECT DISTINCT ON (oa_street_address) * FROM addresses_table_tx;
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x11c555f98>

In [9]:
# Delete the shorter street name (i.e. no prefix)
# Self-join on addresses_table_tx_no_dupes: row `a` is deleted when some row
# `b` has the same oa_postcode and oa_number, a strictly longer oa_street, and
# one of the two street names matches the other used as a LIKE suffix pattern.
# NOTE(review): `%%` is presumably the driver-escaped form of a single LIKE
# wildcard `%` — confirm; either way it matches any leading text.
# NOTE(review): since a.oa_street is strictly shorter than b.oa_street, the
# first LIKE branch (a ends with b) looks like it can only match when
# b.oa_street itself contains LIKE wildcard characters (% or _).
sql_query = """
DELETE FROM addresses_table_tx_no_dupes a USING addresses_table_tx_no_dupes b
    WHERE LENGTH(a.oa_street) < LENGTH(b.oa_street) 
    AND a.oa_postcode = b.oa_postcode 
    AND a.oa_number = b.oa_number
    AND (a.oa_street LIKE '%%' || b.oa_street OR b.oa_street LIKE '%%' || a.oa_street); 
"""
engine.execute(sql_query)

<sqlalchemy.engine.result.ResultProxy at 0x117293668>