# permits-data / Clean Data

ETL pipeline for construction permits data in Los Angeles, California, USA.

For more information:
https://data.lacity.org/A-Prosperous-City/Building-and-Safety-Permit-Information/yv23-pmwf

## Setup

In [21]:
import os
import sys

# Set path for modules
sys.path[0] = '../'

from dotenv import load_dotenv, find_dotenv
import numpy as np
import pandas as pd

# SQL libraries
import psycopg2
print(psycopg2.__version__)

# Import custom eda and sql functions
from src.toolkits.eda import get_snapshot
from src.toolkits.sql import connect_db, get_table_names

# Import dependencies for geocoding
from geopy.geocoders import Nominatim
from geopy.geocoders import GoogleV3
from geopy.extra.rate_limiter import RateLimiter

2.8.5 (dt dec pq3 ext lo64)


In [2]:
# Set notebook display options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [3]:
# Get project root directory
root_dir = os.path.dirname(os.getcwd())

# Set environment variables
load_dotenv(find_dotenv());
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_DB = os.getenv("POSTGRES_DB")
DB_PORT = os.getenv("DB_PORT")
DB_HOST = os.getenv("DB_HOST")
DATA_URL = os.getenv("DATA_URL")

# Google Maps environment variables
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Environment variables specific to notebook
DATA_DIR = os.path.dirname(root_dir) + '/data'
DB_TABLE = "permits_raw"

## 1. Clean Data

In [5]:
# Connect to db
conn = connect_db()

# Extract partial dataset
sql = 'SELECT * FROM {} LIMIT 500;'.format(DB_TABLE)

# Columns to parse as dates
date_columns = ['status_date', 'issue_date', 'license_expiration_date']

# Fetch fresh data
data = pd.read_sql_query(sql, conn, parse_dates=date_columns, coerce_float=False)

# Replace None with np.nan
data.fillna(np.nan, inplace=True)

Connected as user "postgres" to database "permits" on http://localhost:5432.



### 1.1 Missing Data

#### Overview of Unique Values in Qualitative Data

Before making decisions about how to address missing values, it is important to be familiar with the content of each column. In some cases data can be left alone, imputed, recollected, or dropped from the dataset. Since the permits data has mostly qualitative data and unstructured text, most of it will be left alone.

In the case of geographic data such as addresses and lat/long coordinates, it will be necessary to accurately geocode the missing values. Since this information is split across several columns they will be concatenated into one column.

In [7]:
# Get an overview of data types, # unique values, # missing values and sample value
# for each column
get_snapshot(data)

Unnamed: 0_level_0,DATA TYPE,# UNIQUE VALUES,# MISSING VALUES,SAMPLE VALUE
COLUMN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
assessor_book,object,365,0,2375
assessor_page,object,44,0,016
assessor_parcel,object,75,0,029
tract,object,446,3,V. DOL TRACT
block,object,52,384,25
lot,object,155,4,80
reference_no_old_permit_no,object,165,304,13VN58697
pcis_permit_no,object,500,0,13010-20001-02755
status,object,8,0,Permit Finaled
status_date,datetime64[ns],423,0,2019-04-25 00:00:00


At the moment the only missing data of interest are *zip_code* and *latitude_longitude* coordinates, since these are necesary for mapping. 

### 1.2 Processing Missing Data

***Overview:***
* 1.2.1 - Combine address columns into one columns: *full_address*<br>
    - Correct *suffix_direction*
    - Convert *zip_code* to string
    - Concatenate to form *full_address*
* 1.2.2 - Geocode missing *latitude_longitude* with *full_address*<br>
* 1.2.3 - Split *latitude_longitude* into separate columns and convert to float values: *latitude*, *longitude*<br>
<br>
* Geocode missing *zip_code* with complete *latitude_longitude*<br>
* Geocode any missing *full_address* with *latitude_longitude*<br>

#### 1.2.1 Concatenate *full_address*

1) Correct values *suffix_direction*.<br>
2) Convert *zip_code* to string.<br>
3) Concatenate to form a complete street address string.

In [8]:
# Truncate suffix_direction to first letter (N, S, E, W)
data['suffix_direction'] = data['suffix_direction'].str[0].fillna('')

# Convert zip_code to string
data['zip_code'] = data['zip_code'].fillna('').astype(str)

# Combine address columns to concatenate
address_columns = ["address_start", "street_direction", "street_name", "street_suffix", "suffix_direction",
                  "zip_code"]

# Concatenate address values
data['full_address'] = data[address_columns].fillna('').astype(str).apply(' '.join, axis=1).str.replace('  ', ' ')

# Replace empty strings with NaN values
data[address_columns] = data[address_columns].replace('', np.nan)

In [9]:
# Display
data[address_columns + ['full_address']].head()

Unnamed: 0,address_start,street_direction,street_name,street_suffix,suffix_direction,zip_code,full_address
0,1823,S,THAYER,AVE,,90025,1823 S THAYER AVE 90025
1,2122,W,54TH,ST,,90062,2122 W 54TH ST 90062
2,415,S,BURLINGTON,AVE,,90057,415 S BURLINGTON AVE 90057
3,315,S,OCEANO,DR,,90049,315 S OCEANO DR 90049
4,13640,W,PIERCE,ST,,91331,13640 W PIERCE ST 91331


#### 1.2.2 Geocode missing *latitude_longitude*

In [10]:
# Extract rows missing in latitude_longitude
data_missing = data[data['latitude_longitude'].isnull()==1]

# Size
data_missing.shape

(19, 60)

In [11]:
# Display
data_missing[['full_address', 'latitude_longitude']].head()

Unnamed: 0,full_address,latitude_longitude
5,7111 N MARISA RD 91405,
112,12453 W BROMWICH ST 91331,
147,9842 N LASSEN ROAD 91345,
160,101 S THE GROVE DR 90036,
170,1956 N CARMEN AVE 90068,


In [12]:
# Create helper function to geocode missing latitude_longitude values
def geocode(address, key, agent, timeout=None):
    
    """
    Uses GoogleMaps API to batch geocode address strings to lat/long coordinates. RateLimiter is to 
    avoid timeout errors. If an address cannot be geocoded it is left as NaN. Use of GoogleMaps 
    API incurs a charge at $0.005 per request.
    
    
    """
    
    if address:
        # Initializes GoogleMaps geocoder
        geolocator = GoogleV3(api_key=key, 
                              user_agent=agent, 
                              timeout=timeout)

        # Adds Rate Limiter to space out requests
        geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

        # Geocode address input and format for dataframe
        location = geolocator.geocode(address)
        #print(address, location.latitude)
        
        latitude, longitude = location.latitude, location.longitude
        
        return latitude, longitude
    else:
        return np.nan

In [13]:
# Calculate cost
cost = len(data_missing) * 0.005
print("Cost for geocoding {} addresses is ${:.2f}.".format(len(data_missing), cost))

# Geocode missing coordinates using full addresses
data_missing['latitude_longitude'] = data_missing['full_address'].apply(geocode, args=(GOOGLE_API_KEY, 
                                                                                       "permits-data"))

# Update dataframe
data.update(data_missing)

Cost for geocoding 19 addresses is $0.10.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [14]:
# Display
data_missing[['full_address', 'latitude_longitude']].head()

Unnamed: 0,full_address,latitude_longitude
5,7111 N MARISA RD 91405,"(34.2003503, -118.4533963)"
112,12453 W BROMWICH ST 91331,"(34.2538783, -118.40469)"
147,9842 N LASSEN ROAD 91345,"(34.2498959, -118.4665838)"
160,101 S THE GROVE DR 90036,"(34.072878, -118.357463)"
170,1956 N CARMEN AVE 90068,"(34.1068231, -118.3226816)"


In [15]:
# Check that there are no more missing coordinates before proceeding
assert data['latitude_longitude'].notnull().any(), "Missing coordinates must be geocoded."

#### 1.2.3 Split *latitude_longitude* 

Split coordinates into separate columns and convert to float values.

In [16]:
# Split latitude_longitude into separate columns and convert to float values: latitude, longitude
lat_long_series = data['latitude_longitude'].astype(str).str[1:-1].str.split(',', expand=True) \
                        .astype(float).rename(columns={0: "latitude", 1: "longitude"})

# Add to original data
data = pd.concat([data, lat_long_series], axis=1)

In [17]:
# Display
data[['latitude_longitude', 'latitude', 'longitude']].head(1)

Unnamed: 0,latitude_longitude,latitude,longitude
0,"(34.05474, -118.42628)",34.05474,-118.42628


In [18]:
# Check for null values
assert data['latitude'].any(), 'Column "latitude" has missing values.'
assert data['longitude'].any(), 'Column "longitude" has missing values.'

# Check for erroneous coordinates. All coordinates should fall within Los Angeles county.
assert (data['latitude'] > 33.2).all() and (data['latitude'] < 34.9).all(), "Incorrect latitude detected"
assert (data['longitude'] > -118.9).all() and (data['longitude'] < -118).all(), "Incorrect longitude detected"

In [167]:
# Path to csv
sql_path = root_dir + '/data/interim/permits_geocoded.csv'

# Write to csv
data.to_csv(sql_path, index=False)

## 2. Update PostgreSQL Database

In [142]:
conn = connect_db()

Connected as user "postgres" to database "permits" on http://localhost:5432.



### 2.1 Add New Columns

A list of current columns is retrieved and compared to the new columns. Current columns will be updated with new columns if they are not already present.

In [159]:
def add_columns(db_table, con):

    # Get names of current columns in PostgreSQL table
    current_names = get_table_names(db_table, con)

    # Get names of updated table not in current table
    updated_names = data.columns.tolist()
    new_names = list(set(updated_names) - set(current_names))
    
    # Check names list is not empty
    if not new_names:
        print("Table is up to date.")
        return

    # Format strings for query
    alter_table_sql = "ALTER TABLE {db_table}\n"
    add_column_sql = "\tADD COLUMN {column} TEXT,\n"

    # Create a list and append ADD column statements
    sql_query = [alter_table_sql.format(db_table=db_table)]
    for name in new_names:
        sql_query.append(add_column_sql.format(column=name))

    # Join into one string
    sql_query = ''.join(sql_query)[:-2] + ";"
    
    ### ADD TRY/EXCEPT TO RUN QUERY AGAINST DB
    try:
        print("Connecting...")
        cur = con.cursor()
        print("Executing query...")
        cur.execute(sql_query)
        print("Committing changes...")
        con.commit()
        #con.close()
        print("Database updated successfully.")
    except Exception as e:
        conn.rollback()
        print('Error:\n', e)
    
    return

In [160]:
add_columns(DB_TABLE, conn)

Table is up to date.


In [156]:
names = get_table_names(DB_TABLE, conn)
names = ', '.join(names.tolist())

### 2.2 Update Table Values

In [201]:
# CREATE TABLE and COPY
tmp_table = "tmp_" + DB_TABLE

create_tmp_table_sql = "CREATE TEMP TABLE {tmp_table} ({names});\n\n".format(tmp_table=tmp_table, names=names)
copy_from_tablel_sql = "COPY {tmp_table} FROM \'{path}\' (FORMAT csv HEADER TRUE);\n\nUPDATE {db_table}\n".format(tmp_table=tmp_table, 
                                                                                               path=sql_path,db_table=DB_TABLE)
update_table_sql = "UPDATE {db_table}\n"

###
# SET statements
column_names = get_table_names(DB_TABLE, conn)

updates_sql = ["SET "]

for name in column_names:
    set_sql = "{name} = {tmp_name},\n\t".format(name=name, 
                                               tmp_name=tmp_table + '.' + name)
    updates_sql.append(set_sql)

updates_sql = ''.join(updates_sql)

updates_sql = updates_sql[:-3] + "\n"
####

# FROM and WHERE clause
tail_sql = "FROM {tmp_table}\nWHERE {db_table}.pcis_permit_no = {tmp_table}.pcis_permit_no;\n\nDROP TABLE {tmp_table};" \
    .format(tmp_table=tmp_table, db_table=DB_TABLE)

print(create_tmp_table_sql + copy_from_tablel_sql + updates_sql + tail_sql)


CREATE TEMP TABLE tmp_permits_raw (assessor_book, assessor_page, assessor_parcel, tract, block, lot, reference_no_old_permit_no, pcis_permit_no, status, status_date, permit_type, permit_sub_type, permit_category, project_number, event_code, initiating_office, issue_date, address_start, address_fraction_start, address_end, address_fraction_end, street_direction, street_name, street_suffix, suffix_direction, unit_range_start, unit_range_end, zip_code, work_description, valuation, floor_area_la_zoning_code_definition, no_of_residential_dwelling_units, no_of_accessory_dwelling_units, no_of_stories, contractors_business_name, contractor_address, contractor_city, contractor_state, license_type, license_no, principal_first_name, principal_middle_name, principal_last_name, license_expiration_date, applicant_first_name, applicant_last_name, applicant_business_name, applicant_address_1, applicant_address_2, applicant_address_3, zone, occupancy, floor_area_la_building_code_definition, census_trac