# permits-data / Load Data

ETL pipeline for construction permits data in Los Angeles, California, USA.

For more information:
https://data.lacity.org/A-Prosperous-City/Building-and-Safety-Permit-Information/yv23-pmwf

In [23]:
import os
import sys
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv());
import numpy as np
import pandas as pd
import psycopg2

In [24]:
# Set notebook display options so wide/long permit tables are not truncated
pd.set_option('display.max_rows', 500)      # show up to 500 rows
pd.set_option('display.max_columns', None)  # None: no cap on the number of columns shown
pd.set_option('display.width', None)        # None: presumably lets pandas choose the width -- see pandas options docs

In [25]:
# Get project root directory (the parent of the current working directory)
root_dir = os.path.dirname(os.getcwd())

# Set path for modules: put the parent directory first on the import path.
# Insert rather than overwrite sys.path[0] so the entry that was already there
# stays importable, and skip the insert when this cell is re-run.
if sys.path[:1] != ['../']:
    sys.path.insert(0, '../')

# Read connection settings from the environment (populated by load_dotenv);
# os.getenv returns None for any variable that is not set.
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_DB = os.getenv("POSTGRES_DB")
DB_PORT = os.getenv("DB_PORT")
DB_HOST = os.getenv("DB_HOST")
DATA_URL = os.getenv("DATA_URL")

# Settings specific to this notebook
# NOTE(review): root_dir is already one level above the working directory, so
# this resolves to "<two levels up>/data" -- confirm that is the intended folder.
DATA_DIR = os.path.dirname(root_dir) + '/data'
DB_TABLE = "permits_raw"

## 1. Import Data

In [61]:
# Connect to PostgreSQL, useful only for notebook
def connect_db():
    """Open a PostgreSQL connection from the notebook's connection settings.

    Uses POSTGRES_DB, POSTGRES_USER, POSTGRES_PASSWORD, DB_HOST and DB_PORT.

    Returns:
        The psycopg2 connection, or None when connecting failed (the error is
        printed rather than raised so the notebook keeps running).
    """
    # Pre-bind the result so a failed connect returns None instead of raising
    # UnboundLocalError at the return statement.
    con = None
    try:
        con = psycopg2.connect(dbname=POSTGRES_DB,
                               user=POSTGRES_USER,
                               password=POSTGRES_PASSWORD,
                               host=DB_HOST,
                               port=DB_PORT)
    except Exception as e:
        print('Error:\n', e)

    return con

In [62]:
conn = connect_db()

### 1.1 Update Table Columns in PostgreSQL Database

In [63]:
# Get raw data column names
def get_table_names(table, con):
    """Return the column names of a database table.

    Args:
        table: Name of the table to look up in information_schema.columns.
        con: Open database connection handed to pandas.

    Returns:
        A pandas Series named 'column_name', ordered by column position in
        the table.
    """
    # The table name is sent as a bound parameter (psycopg2 "%s" style) instead
    # of being formatted into the SQL text. Ordering by ordinal_position makes
    # the result order deterministic for later comparisons.
    sql = ("SELECT column_name FROM information_schema.columns "
           "WHERE table_name = %s ORDER BY ordinal_position")
    etl = pd.read_sql_query(sql, con, params=(table,))

    return etl['column_name']

In [64]:
# Check table names
get_table_names("permits_raw", conn).head(10)

0                 assessor_book
1                 assessor_page
2               assessor_parcel
3                         tract
4                         block
5                           lot
6    reference_no_old_permit_no
7                pcis_permit_no
8                        status
9                   status_date
Name: column_name, dtype: object

In [30]:
# Retrieve table column names
old_columns = get_table_names("permits_raw", conn)

In [31]:
# Rename columns, will update table later
def format_names(series):
    """Normalise column labels into lowercase, underscore-separated names.

    Spaces, hyphens and slashes become underscores, '#' becomes 'no', and
    periods, parentheses and apostrophes are dropped.

    Args:
        series: pandas Series of string labels.

    Returns:
        A new Series with the cleaned labels.
    """
    # One translation table handles every single-character substitution.
    substitutions = str.maketrans({
        ' ': '_', '-': '_', '#': 'No', '/': '_',
        '.': '', '(': '', ')': '', "'": '',
    })

    def clean(label):
        return label.translate(substitutions).lower()

    return series.apply(clean)

In [32]:
# Transform table column names for permits_raw
new_columns = format_names(old_columns)

In [33]:
new_columns.head()

0      assessor_book
1      assessor_page
2    assessor_parcel
3              tract
4              block
Name: column_name, dtype: object

In [35]:
# Creates a SQL query to update table columns and writes to text file
def create_query(old_columns, new_columns, db_table, con, run=False,
                 sql_path='../postgres/sql/update_names.sql'):
    """Write (and optionally run) the SQL that renames a table's columns.

    One ALTER TABLE ... RENAME statement is produced per column whose name
    actually changes. Columns whose old and new names are equal are skipped,
    because renaming a column to its existing name is rejected by PostgreSQL.

    Args:
        old_columns: pandas Series of current column names.
        new_columns: pandas Series of target names, looked up by the same
            index labels as old_columns.
        db_table: Name of the table to alter.
        con: Open database connection; only used when run is True.
        run: When True, execute the generated script and commit.
        sql_path: File the generated script is written to.

    Returns:
        None. The script is written to sql_path as a side effect.
    """
    sql = 'ALTER TABLE {} '.format(db_table) + 'RENAME "{old_name}" to {new_name};'

    sql_query = []
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for idx, name in old_columns.items():
        new_name = new_columns[idx]
        if name == new_name:
            continue
        sql_query.append(sql.format(old_name=name, new_name=new_name))

    update_names = '\n'.join(sql_query)

    with open(sql_path, 'w') as text:
        text.write(update_names)

    # Update db if desired; with nothing to rename there is no statement to
    # execute, so the database is left untouched.
    if run and sql_query:
        with con.cursor() as cur:
            cur.execute(update_names)
        con.commit()

In [36]:
# Generate the rename script for permits_raw and apply it to the database.
# If anything fails, undo the open transaction and report the error.
try:
    create_query(old_columns, new_columns, DB_TABLE, conn, run=True)
except Exception as err:
    conn.rollback()
    print('Error:\n', err)

In [38]:
# Check table names are updated
get_table_names("permits_raw", conn).head()

0      assessor_book
1      assessor_page
2    assessor_parcel
3              tract
4              block
Name: column_name, dtype: object

In [53]:
# TEST: the column names now in the database must equal the formatted names,
# in the same order. Plain lists are compared so that a length mismatch fails
# this assertion instead of raising ValueError from an element-wise Series
# comparison.
assert (get_table_names("permits_raw", conn).tolist()
        == new_columns.tolist()), "Database table names do not match new table names"

In [55]:
# Build the query that selects every row of the raw permits table
sql_all = 'SELECT * FROM {};'.format(DB_TABLE)

# Load the full table into a DataFrame and preview the first rows
data = pd.read_sql_query(sql_all, conn)
data.head()

Unnamed: 0,assessor_book,assessor_page,assessor_parcel,tract,block,lot,reference_no_old_permit_no,pcis_permit_no,status,status_date,permit_type,permit_sub_type,permit_category,project_number,event_code,initiating_office,issue_date,address_start,address_fraction_start,address_end,address_fraction_end,street_direction,street_name,street_suffix,suffix_direction,unit_range_start,unit_range_end,zip_code,work_description,valuation,floor_area_la_zoning_code_definition,no_of_residential_dwelling_units,no_of_accessory_dwelling_units,no_of_stories,contractors_business_name,contractor_address,contractor_city,contractor_state,license_type,license_no,principal_first_name,principal_middle_name,principal_last_name,license_expiration_date,applicant_first_name,applicant_last_name,applicant_business_name,applicant_address_1,applicant_address_2,applicant_address_3,zone,occupancy,floor_area_la_building_code_definition,census_tract,council_district,latitude_longitude,applicant_relationship,existing_code,proposed_code
0,4317,3,***,TR 30210-C,,LT 1,,15044-90000-08405,Permit Finaled,09/10/2015,HVAC,1 or 2 Family Dwelling,No Plan Check,,,INTERNET,2015-08-18,1823,1/2,1823,1/2,S,THAYER,AVE,,,,90025.0,,,,,,,CONDITIONED AIRE MECHANICAL & ENGINEERING INC,18650 PARTHENIA STREET,NORTHRIDGE,CA,C20,532440,BRETT,MOORE,HOFFER,2016-06-30,BRETT,HOFFER,,18650 PARTHENIA ST,,"NORTHRIDGE, CA",R3-1-O,,0.0,2671.0,5,"(34.05474, -118.42628)",Net Applicant,,
1,5005,10,017,CHESTERFIELD SQUARE,,465,16SL57806,16016-70000-02464,Permit Finaled,08/01/2017,Bldg-Alter/Repair,1 or 2 Family Dwelling,No Plan Check,,,SOUTH LA,2016-02-04,2122,,2122,,W,54TH,ST,,,,90062.0,General rehabilitation for single family dwell...,40000.0,,,,,OWNER-BUILDER,,,,,0,JAVIER,,TALAMANTES,,JAVIER,TALAMANTES,OWNER-BUILDER,,,,C2-1VL,,,2325.0,8,"(33.99307, -118.31668)",Owner-Bldr,1.0,
2,5154,23,022,SUN-SET TRACT,D,13,14VN81535,14016-20000-13092,Issued,08/13/2014,Bldg-Alter/Repair,Apartment,Plan Check,,,VAN NUYS,2014-08-13,415,,415,,S,BURLINGTON,AVE,,1-30,1-30,90057.0,PHOTOVOLTAIC SOLAR PANELS ON ROOF OF (E) APT BLDG,37000.0,,,,,PERMACITY CONSTRUCTION CORP,5570 W WASHINGTON BLVD,LOS ANGELES,CA,B,827864,JONATHAN,SAUL,PORT,2015-11-30,LINDA,MARTON,,710 WILSHIRE BLVD,,"SANTA MONICA, CA",R4-1,,,2089.04,1,"(34.06012, -118.26997)",Agent for Owner,5.0,
3,4404,30,010,TR 12086,,2,,16044-30000-09658,Permit Finaled,08/29/2016,HVAC,1 or 2 Family Dwelling,No Plan Check,,,WEST LA,2016-08-22,315,,315,,S,OCEANO,DR,,,,90049.0,,,,,,,E/C HEATING AND AIR CONDITION,26888 CUATRO MILPAS ST,VALENCIA,CA,C20,651051,EDY,RUDOLFO,CORDON,2018-07-31,,,,,,,RS-1,,0.0,2640.0,11,"(34.05707, -118.4732)",Contractor,,
4,2646,19,011,TR 7158,,11,,17042-90000-31792,Permit Finaled,12/28/2017,Plumbing,1 or 2 Family Dwelling,No Plan Check,,,INTERNET,2017-12-26,13640,,13640,,W,PIERCE,ST,,,,91331.0,,,,,,,TITANIUM POWER INC,1545 S LA CIENEGA BLVD,LOS ANGELES,CA,B,989217,DENNIS,HARUO,MIYAHIRA,2017-12-31,YONI,GHERMEZI,,1545 S LA CIENEGA BLVD,,"LOS ANGELES, CA",R1-1-O,,0.0,1044.03,7,"(34.25487, -118.43002)",Net Applicant,,


In [56]:
# Release the database connection; the data is already loaded into memory
conn.close()

# Show (rows, columns). Kept as the cell's last expression so the notebook
# actually displays it.
data.shape

## 2. Clean Data

In [73]:
# Connect to db (the previous connection was closed at the end of section 1)
conn = connect_db()

# Query that selects every row of DB_TABLE
sql_all = 'SELECT * FROM {};'.format(DB_TABLE)

# Fetch fresh data -- currently disabled, so `data` loaded earlier is reused
#data = pd.read_sql_query(sql_all, conn)

In [75]:
data.head()

Unnamed: 0,assessor_book,assessor_page,assessor_parcel,tract,block,lot,reference_no_old_permit_no,pcis_permit_no,status,status_date,permit_type,permit_sub_type,permit_category,project_number,event_code,initiating_office,issue_date,address_start,address_fraction_start,address_end,address_fraction_end,street_direction,street_name,street_suffix,suffix_direction,unit_range_start,unit_range_end,zip_code,work_description,valuation,floor_area_la_zoning_code_definition,no_of_residential_dwelling_units,no_of_accessory_dwelling_units,no_of_stories,contractors_business_name,contractor_address,contractor_city,contractor_state,license_type,license_no,principal_first_name,principal_middle_name,principal_last_name,license_expiration_date,applicant_first_name,applicant_last_name,applicant_business_name,applicant_address_1,applicant_address_2,applicant_address_3,zone,occupancy,floor_area_la_building_code_definition,census_tract,council_district,latitude_longitude,applicant_relationship,existing_code,proposed_code
0,4317,3,***,TR 30210-C,,LT 1,,15044-90000-08405,Permit Finaled,09/10/2015,HVAC,1 or 2 Family Dwelling,No Plan Check,,,INTERNET,2015-08-18,1823,1/2,1823,1/2,S,THAYER,AVE,,,,90025.0,,,,,,,CONDITIONED AIRE MECHANICAL & ENGINEERING INC,18650 PARTHENIA STREET,NORTHRIDGE,CA,C20,532440,BRETT,MOORE,HOFFER,2016-06-30,BRETT,HOFFER,,18650 PARTHENIA ST,,"NORTHRIDGE, CA",R3-1-O,,0.0,2671.0,5,"(34.05474, -118.42628)",Net Applicant,,
1,5005,10,017,CHESTERFIELD SQUARE,,465,16SL57806,16016-70000-02464,Permit Finaled,08/01/2017,Bldg-Alter/Repair,1 or 2 Family Dwelling,No Plan Check,,,SOUTH LA,2016-02-04,2122,,2122,,W,54TH,ST,,,,90062.0,General rehabilitation for single family dwell...,40000.0,,,,,OWNER-BUILDER,,,,,0,JAVIER,,TALAMANTES,,JAVIER,TALAMANTES,OWNER-BUILDER,,,,C2-1VL,,,2325.0,8,"(33.99307, -118.31668)",Owner-Bldr,1.0,
2,5154,23,022,SUN-SET TRACT,D,13,14VN81535,14016-20000-13092,Issued,08/13/2014,Bldg-Alter/Repair,Apartment,Plan Check,,,VAN NUYS,2014-08-13,415,,415,,S,BURLINGTON,AVE,,1-30,1-30,90057.0,PHOTOVOLTAIC SOLAR PANELS ON ROOF OF (E) APT BLDG,37000.0,,,,,PERMACITY CONSTRUCTION CORP,5570 W WASHINGTON BLVD,LOS ANGELES,CA,B,827864,JONATHAN,SAUL,PORT,2015-11-30,LINDA,MARTON,,710 WILSHIRE BLVD,,"SANTA MONICA, CA",R4-1,,,2089.04,1,"(34.06012, -118.26997)",Agent for Owner,5.0,
3,4404,30,010,TR 12086,,2,,16044-30000-09658,Permit Finaled,08/29/2016,HVAC,1 or 2 Family Dwelling,No Plan Check,,,WEST LA,2016-08-22,315,,315,,S,OCEANO,DR,,,,90049.0,,,,,,,E/C HEATING AND AIR CONDITION,26888 CUATRO MILPAS ST,VALENCIA,CA,C20,651051,EDY,RUDOLFO,CORDON,2018-07-31,,,,,,,RS-1,,0.0,2640.0,11,"(34.05707, -118.4732)",Contractor,,
4,2646,19,011,TR 7158,,11,,17042-90000-31792,Permit Finaled,12/28/2017,Plumbing,1 or 2 Family Dwelling,No Plan Check,,,INTERNET,2017-12-26,13640,,13640,,W,PIERCE,ST,,,,91331.0,,,,,,,TITANIUM POWER INC,1545 S LA CIENEGA BLVD,LOS ANGELES,CA,B,989217,DENNIS,HARUO,MIYAHIRA,2017-12-31,YONI,GHERMEZI,,1545 S LA CIENEGA BLVD,,"LOS ANGELES, CA",R1-1-O,,0.0,1044.03,7,"(34.25487, -118.43002)",Net Applicant,,


In [None]:
# Create a smaller version of dataset
df = data[:1000]

### 2.1 Null Values, Missing Data & Data Types

#### Null Values & Missing Data

In [99]:
#data.isnull().sum()

assessor_book                                1324
assessor_page                                1324
assessor_parcel                              1324
tract                                        4735
block                                      925241
lot                                          8111
reference_no_old_permit_no                 633772
pcis_permit_no                                  0
status                                          0
status_date                                     0
permit_type                                     0
permit_sub_type                                 0
permit_category                                 0
project_number                            1136762
event_code                                1148177
initiating_office                               0
issue_date                                      0
address_start                                  10
address_fraction_start                    1132625
address_end                                    14


#### Data Types

In [74]:
data.dtypes

assessor_book                              object
assessor_page                              object
assessor_parcel                            object
tract                                      object
block                                      object
lot                                        object
reference_no_old_permit_no                 object
pcis_permit_no                             object
status                                     object
status_date                                object
permit_type                                object
permit_sub_type                            object
permit_category                            object
project_number                             object
event_code                                 object
initiating_office                          object
issue_date                                 object
address_start                              object
address_fraction_start                     object
address_end                                object


#### Summary
The columns *zip_code* and *latitude_longitude* need their missing values to be inferred through geocoding. Other missing data can remain as is.

#### Steps
1) Split *latitude_longitude* into separate columns and convert to float values: *latitude*, *longitude*<br>
2) Combine address columns into one column: *full_address*<br>
3) Geocode missing *latitude_longitude* with *full_address*<br>
4) Geocode missing *zip_code* with complete *latitude_longitude*<br>
5) Geocode any missing *full_address* with *latitude_longitude*<br>
6) Convert *zip_code* to integer type<br>

In [242]:
# Prints a summary table of value counts for columns with less than n unique values
def explore_value_counts(dataframe, n):
    """Print value counts and proportions for low-cardinality columns.

    For every column with fewer than n unique values, prints the column name,
    its number of unique values, and a table with a COUNT and a PERCENTAGE
    column (PERCENTAGE holds the normalized counts).

    Args:
        dataframe: pandas DataFrame to summarise.
        n: Columns with n or more unique values are skipped.

    Returns:
        None. The summaries are printed.
    """
    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
    for name, series in dataframe.items():
        n_unique = series.nunique()
        if n_unique >= n:
            continue

        counted = series.value_counts(sort=True)
        percent = series.value_counts(sort=True, normalize=True)
        summary = pd.concat([counted, percent], axis=1)
        summary.columns = ['COUNT', 'PERCENTAGE']
        summary.index = summary.index.rename('CATEGORIES:')

        print('\n', '\n"{}:"\n\nUnique values:  {}'.format(name, n_unique), '\n', summary)

In [243]:
explore_value_counts(df, 30)


 
"status:"

Unique values:  10 
                     COUNT  PERCENTAGE
CATEGORIES:                          
Permit Finaled        724       0.724
Issued                150       0.150
Permit Expired         51       0.051
CofO Issued            38       0.038
Permit Closed          20       0.020
Refund Completed        9       0.009
Re-Activate Permit      3       0.003
CofO in Progress        2       0.002
CofC Issued             2       0.002
Refund in Progress      1       0.001

 
"permit_type:"

Unique values:  15 
                       COUNT  PERCENTAGE
CATEGORIES:                            
Electrical              301       0.301
Bldg-Alter/Repair       220       0.220
Plumbing                203       0.203
HVAC                     90       0.090
Fire Sprinkler           55       0.055
Bldg-Addition            23       0.023
Bldg-Demolition          19       0.019
Bldg-New                 19       0.019
Grading                  16       0.016
Elevator                 13  

In [209]:
df.nunique(axis=0)

assessor_book                              584
assessor_page                               45
assessor_parcel                             97
tract                                      835
block                                       81
lot                                        237
reference_no_old_permit_no                 350
pcis_permit_no                            1000
status                                      10
status_date                                731
permit_type                                 15
permit_sub_type                              5
permit_category                              2
project_number                               3
event_code                                   0
initiating_office                            6
issue_date                                 751
address_start                              908
address_fraction_start                       2
address_end                                909
address_fraction_end                         2
street_direct

In [None]:
# Display number unique values for all columns
# TODO: make this a generator
def explore_n_unique_gen(dataframe):
    """Print the number of unique values in each column of a DataFrame.

    Args:
        dataframe: pandas DataFrame to inspect.

    Returns:
        None. One line per column is printed, in column order.
    """
    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
    for name, series in dataframe.items():
        print('"{}" unique items: {}'.format(name, series.nunique()))

In [None]:
# Prints a summary table of value counts for columns with less than n unique values
def explore_value_counts(dataframe, n):
    """Print a count/percentage table for each low-cardinality column.

    For every column with fewer than n unique values, prints a table with a
    Count and a Percentage column (Percentage holds the normalized counts).
    The table's index is labelled '"<column>" categories:'.

    Args:
        dataframe: pandas DataFrame to summarise.
        n: Columns with n or more unique values are skipped.

    Returns:
        None. The tables are printed.
    """
    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
    for name, series in dataframe.items():
        if series.nunique() >= n:
            continue

        counted = series.value_counts(sort=True)
        percent = series.value_counts(sort=True, normalize=True)
        summary = pd.concat([counted, percent], axis=1)
        summary.columns = ['Count', 'Percentage']
        summary.index = summary.index.rename('"{}" categories:'.format(name))

        print('\n', summary, '\n')

In [116]:
column = 'permit_type'
series = df[column]

In [166]:
series.name

'permit_type'

In [178]:
# Scratch version of the value-count summary for the single `series` selected above
name = series.name
# Absolute counts and normalized counts, both sorted by frequency
counted = series.value_counts(sort=True)
percent = series.value_counts(sort=True, normalize=True)
# Place the two results side by side as columns of one table
summary = pd.concat([counted, percent], axis=1)
#summary[name] = name
#summary.reset_index(inplace=True)
summary.columns = ['Count', 'Percentage']
# Label the index with the column being summarised
summary.index = summary.index.rename('"{}" categories:'.format(name))
print(summary)

                           Count  Percentage
"permit_type" categories:                   
Electrical                   301       0.301
Bldg-Alter/Repair            220       0.220
Plumbing                     203       0.203
HVAC                          90       0.090
Fire Sprinkler                55       0.055
Bldg-Addition                 23       0.023
Bldg-Demolition               19       0.019
Bldg-New                      19       0.019
Grading                       16       0.016
Elevator                      13       0.013
Swimming-Pool/Spa             11       0.011
Nonbldg-New                   11       0.011
Sign                           9       0.009
Nonbldg-Alter/Repair           7       0.007
Pressure Vessel                3       0.003


In [140]:
# Display Pandas tables side-by-side
# NOTE(review): the injected rule sets flex-direction to "column"; a side-by-side
# layout would normally use "row" -- confirm which is intended.
from IPython.display import display, HTML
HTML('<style>.output {flex-direction: column;}</style>')