# permits-data / Load Data

ETL pipeline for construction permits data in Los Angeles, California, USA.

For more information:
https://data.lacity.org/A-Prosperous-City/Building-and-Safety-Permit-Information/yv23-pmwf

In [23]:
import os
import sys
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv());
import numpy as np
import pandas as pd
import psycopg2

In [24]:
# Set notebook display options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [25]:
# Get project root directory
root_dir = os.path.dirname(os.getcwd())

# Set path for modules
sys.path[0] = '../'

# Set environment variables
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_DB = os.getenv("POSTGRES_DB")
DB_PORT = os.getenv("DB_PORT")
DB_HOST = os.getenv("DB_HOST")
DATA_URL = os.getenv("DATA_URL")

# Environment variables specific to notebook
DATA_DIR = os.path.dirname(root_dir) + '/data'
DB_TABLE = "permits_raw"

## 1. Import Data

In [61]:
# Connect to PostgreSQL, useful only for notebook
def connect_db():
    try:
        con = psycopg2.connect(dbname=POSTGRES_DB,
                               user=POSTGRES_USER,
                               password=POSTGRES_PASSWORD,
                                host=DB_HOST, 
                                port=DB_PORT)
    except Exception as e:
        print('Error:\n', e)
    
    return con

In [62]:
conn = connect_db()

### 1.1 Update Table Columns in PostgreSQL Database

In [63]:
# Get raw data column names
def get_table_names(table, con):
    sql = "SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = N'{}'".format(table)
    etl = pd.read_sql_query(sql, con)
    columns = etl['column_name']
    
    return columns

In [64]:
# Check table names
get_table_names("permits_raw", conn).head(10)

0                 assessor_book
1                 assessor_page
2               assessor_parcel
3                         tract
4                         block
5                           lot
6    reference_no_old_permit_no
7                pcis_permit_no
8                        status
9                   status_date
Name: column_name, dtype: object

In [30]:
# Retrieve table column names
old_columns = get_table_names("permits_raw", conn)

In [31]:
# Rename columns, will update table later
def format_names(series):
    
    replace_map = {' ': '_', '-': '_', '#': 'No', '/': '_', 
                   '.': '', '(': '', ')': '', "'": ''}

    def replace_chars(text):
        for oldchar, newchar in replace_map.items():
            text = text.replace(oldchar, newchar).lower()
        return text

    return series.apply(replace_chars)

In [32]:
# Transform table column names for permits_raw
new_columns = format_names(old_columns)

In [33]:
new_columns.head()

0      assessor_book
1      assessor_page
2    assessor_parcel
3              tract
4              block
Name: column_name, dtype: object

In [35]:
# Creates a SQL query to update table columns and writes to text file
### pass conn context
def create_query(old_columns, new_columns, db_table, con, run=False):
    
    sql = 'ALTER TABLE {} '.format(db_table) + 'RENAME "{old_name}" to {new_name};'
    
    
    sql_query = []

    for idx, name in old_columns.iteritems():
        sql_query.append(sql.format(old_name=name, new_name=new_columns[idx]))
        
    update_names = '\n'.join(sql_query)
    
    # update later: sql_file = os.path.join(os.path.dirname(__file__), "../postgres/scripts/update_names.sql")
    with open('../postgres/sql/update_names.sql', 'w') as text:
        text.write(update_names)
        
    # Update db is desired
    if run:
        cur = con.cursor()
        sql_file = open('../postgres/sql/update_names.sql', 'r')
        cur.execute(sql_file.read())
        con.commit()
        #conn.close()

In [36]:
# Create SQL query for permits_raw
try:
    create_query(old_columns, new_columns, run=True, con=conn, db_table=DB_TABLE)
except Exception as e: 
    conn.rollback()
    print('Error:\n', e)

In [38]:
# Check table names are updated
get_table_names("permits_raw", conn).head()

0      assessor_book
1      assessor_page
2    assessor_parcel
3              tract
4              block
Name: column_name, dtype: object

In [53]:
# TEST: 
assert (get_table_names("permits_raw", 
                        conn) == new_columns).mean() == 1, "Database table names do not match new table names"

In [55]:
# Extract full dataset
sql_all = 'SELECT * FROM {};'.format(DB_TABLE)

# Extract full dataset
data = pd.read_sql_query(sql_all, conn)
data.head()

Unnamed: 0,assessor_book,assessor_page,assessor_parcel,tract,block,lot,reference_no_old_permit_no,pcis_permit_no,status,status_date,permit_type,permit_sub_type,permit_category,project_number,event_code,initiating_office,issue_date,address_start,address_fraction_start,address_end,address_fraction_end,street_direction,street_name,street_suffix,suffix_direction,unit_range_start,unit_range_end,zip_code,work_description,valuation,floor_area_la_zoning_code_definition,no_of_residential_dwelling_units,no_of_accessory_dwelling_units,no_of_stories,contractors_business_name,contractor_address,contractor_city,contractor_state,license_type,license_no,principal_first_name,principal_middle_name,principal_last_name,license_expiration_date,applicant_first_name,applicant_last_name,applicant_business_name,applicant_address_1,applicant_address_2,applicant_address_3,zone,occupancy,floor_area_la_building_code_definition,census_tract,council_district,latitude_longitude,applicant_relationship,existing_code,proposed_code
0,4317,3,***,TR 30210-C,,LT 1,,15044-90000-08405,Permit Finaled,09/10/2015,HVAC,1 or 2 Family Dwelling,No Plan Check,,,INTERNET,2015-08-18,1823,1/2,1823,1/2,S,THAYER,AVE,,,,90025.0,,,,,,,CONDITIONED AIRE MECHANICAL & ENGINEERING INC,18650 PARTHENIA STREET,NORTHRIDGE,CA,C20,532440,BRETT,MOORE,HOFFER,2016-06-30,BRETT,HOFFER,,18650 PARTHENIA ST,,"NORTHRIDGE, CA",R3-1-O,,0.0,2671.0,5,"(34.05474, -118.42628)",Net Applicant,,
1,5005,10,017,CHESTERFIELD SQUARE,,465,16SL57806,16016-70000-02464,Permit Finaled,08/01/2017,Bldg-Alter/Repair,1 or 2 Family Dwelling,No Plan Check,,,SOUTH LA,2016-02-04,2122,,2122,,W,54TH,ST,,,,90062.0,General rehabilitation for single family dwell...,40000.0,,,,,OWNER-BUILDER,,,,,0,JAVIER,,TALAMANTES,,JAVIER,TALAMANTES,OWNER-BUILDER,,,,C2-1VL,,,2325.0,8,"(33.99307, -118.31668)",Owner-Bldr,1.0,
2,5154,23,022,SUN-SET TRACT,D,13,14VN81535,14016-20000-13092,Issued,08/13/2014,Bldg-Alter/Repair,Apartment,Plan Check,,,VAN NUYS,2014-08-13,415,,415,,S,BURLINGTON,AVE,,1-30,1-30,90057.0,PHOTOVOLTAIC SOLAR PANELS ON ROOF OF (E) APT BLDG,37000.0,,,,,PERMACITY CONSTRUCTION CORP,5570 W WASHINGTON BLVD,LOS ANGELES,CA,B,827864,JONATHAN,SAUL,PORT,2015-11-30,LINDA,MARTON,,710 WILSHIRE BLVD,,"SANTA MONICA, CA",R4-1,,,2089.04,1,"(34.06012, -118.26997)",Agent for Owner,5.0,
3,4404,30,010,TR 12086,,2,,16044-30000-09658,Permit Finaled,08/29/2016,HVAC,1 or 2 Family Dwelling,No Plan Check,,,WEST LA,2016-08-22,315,,315,,S,OCEANO,DR,,,,90049.0,,,,,,,E/C HEATING AND AIR CONDITION,26888 CUATRO MILPAS ST,VALENCIA,CA,C20,651051,EDY,RUDOLFO,CORDON,2018-07-31,,,,,,,RS-1,,0.0,2640.0,11,"(34.05707, -118.4732)",Contractor,,
4,2646,19,011,TR 7158,,11,,17042-90000-31792,Permit Finaled,12/28/2017,Plumbing,1 or 2 Family Dwelling,No Plan Check,,,INTERNET,2017-12-26,13640,,13640,,W,PIERCE,ST,,,,91331.0,,,,,,,TITANIUM POWER INC,1545 S LA CIENEGA BLVD,LOS ANGELES,CA,B,989217,DENNIS,HARUO,MIYAHIRA,2017-12-31,YONI,GHERMEZI,,1545 S LA CIENEGA BLVD,,"LOS ANGELES, CA",R1-1-O,,0.0,1044.03,7,"(34.25487, -118.43002)",Net Applicant,,


In [56]:
data.shape
conn.close()

## 2. Clean Data

In [1090]:
# Connect to db
conn = connect_db()

# Extract partial dataset
sql_all = 'SELECT * FROM {} LIMIT 1500;'.format(DB_TABLE)

# Columns to parse as dates
date_columns = ['status_date', 'issue_date', 'license_expiration_date']

# Fetch fresh data
data = pd.read_sql_query(sql_all, conn, parse_dates=date_columns, 
                         coerce_float=False)

In [1091]:
data.head()

Unnamed: 0,assessor_book,assessor_page,assessor_parcel,tract,block,lot,reference_no_old_permit_no,pcis_permit_no,status,status_date,permit_type,permit_sub_type,permit_category,project_number,event_code,initiating_office,issue_date,address_start,address_fraction_start,address_end,address_fraction_end,street_direction,street_name,street_suffix,suffix_direction,unit_range_start,unit_range_end,zip_code,work_description,valuation,floor_area_la_zoning_code_definition,no_of_residential_dwelling_units,no_of_accessory_dwelling_units,no_of_stories,contractors_business_name,contractor_address,contractor_city,contractor_state,license_type,license_no,principal_first_name,principal_middle_name,principal_last_name,license_expiration_date,applicant_first_name,applicant_last_name,applicant_business_name,applicant_address_1,applicant_address_2,applicant_address_3,zone,occupancy,floor_area_la_building_code_definition,census_tract,council_district,latitude_longitude,applicant_relationship,existing_code,proposed_code
0,5017,18,18,VERMONT AVENUE SQUARE,7.0,18,,18044-40000-12028,Issued,2018-09-27,HVAC,1 or 2 Family Dwelling,No Plan Check,,,SANPEDRO,2018-09-27,1110,,1110,,W,45TH,ST,,,,90037,,,,,,,WE CARE,41085 GOLDEN GATE CIR,MURRIETA,CA,C20,779604,RUSSELL,JAY,COCHRAN,2019-06-30,RAY,THOMAS,,,,,R1-1,,0.0,2322.0,9,"(34.00254, -118.29429)",Agent for Contractor,,
1,5092,30,9,WESTERN WILSHIRE HEIGHTS,,10,,17041-90000-35394,Refund in Progress,2017-10-20,Electrical,Commercial,No Plan Check,,,INTERNET,2017-10-03,3932,,3932,,W,WILSHIRE,BLVD,,,,90010,,,,,,,HERZOG M B ELECTRIC INC,15709 ILLINOIS STREET,PARAMOUNT,CA,C10,383811,RYAN,MICHAEL,HERZOG,2017-11-30,LINDA,BUSH,,15709 ILLINOIS AVE,,"PARAMOUNT, CA",C4-2,,0.0,2126.1,10,"(34.06153, -118.31244)",Net Applicant,,
2,2014,35,7,TR 26109,,27,,15041-90000-24929,Permit Finaled,2015-08-24,Electrical,1 or 2 Family Dwelling,No Plan Check,,,INTERNET,2015-08-03,7707,,7707,,N,SEDAN,AVE,,,,91304,,,,,,,ZERO ENERGY CONTRACTING INC,1010 ESPLANADE APT 2,REDONDO BEACH,CA,B,953417,KENNETH,GARY,GIETZ,2016-10-31,MICHAEL,MURPHY,,1060 SHOEMAKER AVE,C,"SANTA FE SPRINGS, CA",RS-1,,0.0,1344.23,12,"(34.21049, -118.62979)",Net Applicant,,
3,5561,5,7,TR 11794,,13,13LA17342,12016-20002-14411,Permit Finaled,2014-08-13,Bldg-Alter/Repair,1 or 2 Family Dwelling,Plan Check,,,VAN NUYS,2013-06-03,1544,,1544,,N,DOHENY,DR,,,,90069,SUPPLEMENTAL PERMIT TO VOLUNTARY ADD...,15000.0,,,,,MC BUILDERS GROUP,7801 VICKY AVENUE,WEST HILLS,CA,B,880257,JOSEPH,NATHANAEL,COOK,2014-06-30,JESUS,BERMUDEL,,4225 E IMPERIAL HWY,B,LYNWOOD,RE11-1,,,1943.0,5,"(34.09993, -118.38902)",Agent for Owner,1.0,
4,4404,23,4,WESTGATE ACRES,,73,16LA69558,16016-10000-21023,Permit Finaled,2017-05-05,Bldg-Alter/Repair,1 or 2 Family Dwelling,Plan Check,,,METRO,2016-09-13,445,,445,,S,SALTAIR,AVE,,,,90049,REPLACE GARAGE DOORS AT (E) GARAGE W...,5000.0,,,,,VALLE/REINIS BUILDERS INC,10537 TENNESSEE AVENUE,LOS ANGELES,CA,B,839178,BRIAN,SCOTT,VALLE,2017-07-31,TIM,BARBER,,8455 BEVERLY BL,,"LOS ANGELES, CA",RS-1,,,2640.0,11,"(34.05463, -118.47387)",Agent for Owner,7.0,


### 2.1 Missing Data & Data Types

#### Overview of Unique Values in Qualitative Data

Before making decisions regarding missing values or data types, it is important to be familiar with the content of each column especially considering the permits dataset mostly contains qualitative data.

#### Summary
* *zip_code* and *latitude_longitude* need their missing values to be inferred through geocoding.
* *issue_date* and *license_expiration_date* should be parsed as datetime objects on import
* *license_no*, *existing_code* and *proposed_code* converted to int
*  and 

#### Steps
1) Split *latitude_longitude* into separate columns and convert to float values: *latitude*, *longitude*<br>
2) Combine address columns into one columns: *full_address*<br>
3) Geocode missing *latitude_longitude* with *full_address*<br>
4) Geocode missing *zip_code* with complete *latitude_longitude*<br>
5) Geocode any missing *full_address* with *latitude_longitude*<br>
6) Convert *zip_code* to integer type<br>

In [1264]:
### Returns overview with column, dtype, # unique values, # missing values and sample value
def get_overview(dataframe):
    
    """
    Returns a new pandas DataFrame with the column name, number
    of unique values, number of null values, and a sample value.
    
    Example
    -------
    
    overview = get_overview(my_dataframe)
    
    """
    unique = dataframe.nunique(axis=0)
    is_null = dataframe.isnull().sum()
    data_types = dataframe.dtypes
    
    samples = pd.DataFrame()
    column_names = pd.DataFrame()
    
    for column, row in df.iteritems():
        try:
            sample = df[column].dropna(axis=0).sample()
            column_name = pd.Series(column)
        except:
            pass

        samples = pd.concat([samples, sample], axis=0).reset_index(drop=True)
        column_names = pd.concat([column_names, column_name], 
                                 axis=0, ).reset_index(drop=True)

        examples = pd.concat([column_names, samples], axis=1, ignore_index=True)
        examples.columns = ['COLUMN', 'SAMPLE VALUE']
    
    overview = pd.concat([data_types, unique, is_null], axis=1)
    overview.reset_index(inplace=True)
    overview.columns = columns=['COLUMN', 'DATA TYPE', '# UNIQUE VALUES', '# MISSING VALUES']
    overview = overview.merge(right=examples, on='COLUMN').drop_duplicates(subset=['COLUMN']).set_index('COLUMN')
    
    return overview

In [1110]:
get_overview(data)

Unnamed: 0_level_0,DATA TYPE,# UNIQUE VALUES,# MISSING VALUES,SAMPLE VALUE
COLUMN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
assessor_book,object,735,4,5053
assessor_page,object,46,4,003
assessor_parcel,object,121,4,002
tract,object,1202,8,TR 16981
block,object,93,1231,6
lot,object,287,7,10
reference_no_old_permit_no,object,527,878,19LA27894
pcis_permit_no,object,1500,0,14016-10000-18696
status,object,11,0,Issued
status_date,datetime64[ns],985,0,10/09/2018


In [1263]:
# Creates a report to show value counts for columns with less than n unique values
def explore_value_counts(dataframe, n=None, all_values=False, max_n=1500, columns=None, printed=True):
    
    """
    This function is helpful for quickly determining which values
    should be converted to integer or category types in a dataframe.
    
    Prints a series of custom text summaries with n value counts 
    for each column. Can work if neither n nor columns are specified.
    
    Also can returns a generator yielding a text summary with n
    value counts for each column.
    
    Example:
    --------
    ## Iterates through individual tables
    gen = explore_value_counts(data, printed=False)
    print(next(gen)) 
    
    ## Prints all tables to STDOUT
    explore_value_counts(data, printed=True)
    
    Params
    --------
    dataframe : pandas DataFrame
        DataFrame with columns to be summarized
    n : integer
        Max number of unique categories in column
    all_values : bool
        Set to True to return all unique values
    max_n : integer
        Ceiling safeguard to avoid extremely large values of n
    columns : list 
        Columns to include in output
    printed : bool
        If true prints to console; if false returns generator object
        which can be printed as text.
    
    Returns
    --------
    if printed=True: prints all formatted text of all tables
    
    if printed=False: generator object that outputs one table
    
    """
    
    # Parsing arguments
    if columns:
        dataframe = dataframe[columns]
    
    if n:
        n = n if n <= max_n else max_n
    else:
        n = len(dataframe) if len(dataframe) <= max_n else max_n
    
    if all_values:            
        n = len(dataframe) if len(dataframe) <= max_n else max_n
    else:
        n = 30
    
    def make_tables():
        dataframe_n = pd.DataFrame()

        # Data selection
        for column, row in dataframe.iteritems():
            
            n_unique = dataframe[column].nunique()
            
            if (dataframe[column].dtype not in ['float64', 'int', '<M8[ns]']):
                dataframe_n = pd.concat([dataframe_n, dataframe[column]], axis=1)

        summary_list = []

        # Text generation
        for column, row in dataframe_n.iteritems(): 
            series = dataframe_n[column]
            name = series.name
            
            # Create dataframe of value counts
            counted = series.value_counts(sort=True)[:n]
            percent = series.value_counts(sort=True, normalize=True)[:n]
            summary = pd.concat([counted, percent], axis=1)
            summary.columns = ['COUNT', 'PERCENTAGE']
            summary.index = summary.index.rename('UNIQUE VALUES:')
            
            # Create a custom table with n unique, missing values to print to console as text
            summary_text = 'COLUMN:   "{}"\nTOTAL UNIQUE:  {}'.format(name, series.nunique())
            summary_text = summary_text + '\nTOTAL MISSING:  {}'.format(series.isnull().sum())
            summary_text = summary_text + '\n' + summary.to_string() + '\n\n'

            summary_list.append(summary_text)
        
        if not printed:
            summary_gen = iter(summary_list)
            return summary_gen
        else:
            return summary_list
        
    if printed:
        print('\n'.join(make_tables()))
    else:
        return make_tables()

In [1260]:
# Examine variables to determine appropriate data types
explore_value_counts(data, n=30, printed=True)

COLUMN:   "assessor_book"
TOTAL UNIQUE:  735
TOTAL MISSING:  4
                COUNT  PERCENTAGE
UNIQUE VALUES:                   
5144               30    0.020053
4211               19    0.012701
4129               15    0.010027
4319               14    0.009358
5151               12    0.008021
4117                9    0.006016
2146                9    0.006016
4334                8    0.005348
5532                7    0.004679
2761                7    0.004679
5085                7    0.004679
4303                7    0.004679
5139                6    0.004011
4423                6    0.004011
2350                6    0.004011
5039                6    0.004011
5546                6    0.004011
4360                6    0.004011
5086                5    0.003342
5153                5    0.003342
2664                5    0.003342
4326                5    0.003342
4112                5    0.003342
2260                5    0.003342
4405                5    0.003342
5202               

### Summary of Missing Values & Data Types
The following columns will have there data types updated:

#### Category columns
* status
* permit_type
* permit_sub_type
* permit_category
* initiating_office
* license_type
* zone
* census_tract
* council_district
* applicant_relationship

#### Integer columns:
* project_number
* address_start
* address_end
* no_of_residential_dwelling_units
* no_of_accessory_dwelling_units
* no_of_stories
* license_no

#### Float columns:
* valuation
