# permits-data

ETL pipeline for construction permits data in Los Angeles, California, USA.

For more information:
https://data.lacity.org/A-Prosperous-City/Building-and-Safety-Permit-Information/yv23-pmwf

In [373]:
import os
import sys
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv());
import numpy as np
import pandas as pd
import psycopg2

In [374]:
# Set notebook display options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [375]:
# Set path for modules
sys.path[0] = '../'

# Set environment variables
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_DB = os.getenv("POSTGRES_DB")
DB_PORT = os.getenv("DB_PORT")
DB_HOST = os.getenv("DB_HOST")
DATA_URL = os.getenv("DB_DATA_URLPORT")

## 1. Import Data

In [376]:
raw_data = 'permits_raw.csv'
DATA_PATH = sys.path[0] + 'data/raw/' + raw_data

In [377]:
conn = psycopg2.connect(dbname=POSTGRES_DB,
                       user=POSTGRES_USER,
                       password=POSTGRES_PASSWORD,
                        host=DB_HOST, 
                        port=DB_PORT)

# Extract full dataset
#data = pd.read_sql_query(sql, conn)

### 1.1 Update Table Columns in PostgreSQL Database

In [378]:
# Get raw data column names
def get_table_names(table):
    sql = "SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = N'{}'".format(table)
    etl = pd.read_sql_query(sql, conn)
    old_columns = etl['column_name']
    
    return old_columns

In [379]:
# Check table names
get_table_names("permits_raw").head()

0      Assessor Book
1      Assessor Page
2    Assessor Parcel
3              Tract
4              Block
Name: column_name, dtype: object

In [380]:
# Retrieve table column names
old_columns = get_table_names("permits_raw")

In [381]:
# Rename columns, will update table later
def format_names(series):
    # Replace whitespace with underscore
    series = series.str.replace(' ', '_')

    # Replace hyphen with underscore
    series = series.str.replace('-', '_')

    # Replace hashtag with No (short for number)
    series = series.str.replace('#', 'No')

    # Replace forward slash with underscore
    series = series.str.replace('/', '_')

    # Remove period
    series = series.str.replace('.', '')

    # Remove open parenthesis
    series = series.str.replace('(', '')

    # Remove closed parenthesis
    series = series.str.replace(')', '')

    # Remove apostrophe
    series = series.str.replace("'", '')
    
    return series.str.lower()

In [382]:
# Transform table column names for permits_raw
new_columns = format_names(old_columns);

In [383]:
# Creates a SQL query to update table columns and writes to text file
def create_query(old_columns, new_columns, update_db=False):
    
    sql = 'ALTER TABLE permits_raw RENAME "{old_name}" to {new_name};'
    
    sql_query = []

    for idx, name in old_columns.iteritems():
        #print(idx, name)
        sql_query.append(sql.format(old_name=name, new_name=new_columns[idx]))
        
    update_names = '\n'.join(sql_query)
    # update later: sql_file = os.path.join(os.path.dirname(__file__), "../postgres/scripts/update_names.sql")
    with open('../postgres/scripts/update_names.sql', 'w') as text:
        text.write(update_names)
        
    if update_db==True:
        cur = conn.cursor()
        sql_file = open('../postgres/scripts/update_names.sql', 'r')
        cur.execute(sql_file.read())

In [384]:
# Create SQL query for permits_raw
create_query(old_columns, new_columns, update_db=True)

In [385]:
# Check table names are updated
get_table_names("permits_raw").head()

0      assessor_book
1      assessor_page
2    assessor_parcel
3              tract
4              block
Name: column_name, dtype: object

In [386]:
# Extract full dataset
data = pd.read_sql_query(sql_all, conn)
data.head()

Unnamed: 0,assessor_book,assessor_page,assessor_parcel,tract,block,lot,reference_no_old_permit_no,pcis_permit_no,status,status_date,permit_type,permit_sub_type,permit_category,project_number,event_code,initiating_office,issue_date,address_start,address_fraction_start,address_end,address_fraction_end,street_direction,street_name,street_suffix,suffix_direction,unit_range_start,unit_range_end,zip_code,work_description,valuation,floor_area_la_zoning_code_definition,no_of_residential_dwelling_units,no_of_accessory_dwelling_units,no_of_stories,contractors_business_name,contractor_address,contractor_city,contractor_state,license_type,license_no,principal_first_name,principal_middle_name,principal_last_name,license_expiration_date,applicant_first_name,applicant_last_name,applicant_business_name,applicant_address_1,applicant_address_2,applicant_address_3,zone,occupancy,floor_area_la_building_code_definition,census_tract,council_district,latitude_longitude,applicant_relationship,existing_code,proposed_code
0,4317,3,***,TR 30210-C,,LT 1,,15044-90000-08405,Permit Finaled,09/10/2015,HVAC,1 or 2 Family Dwelling,No Plan Check,,,INTERNET,2015-08-18,1823,1/2,1823,1/2,S,THAYER,AVE,,,,90025.0,,,,,,,CONDITIONED AIRE MECHANICAL & ENGINEERING INC,18650 PARTHENIA STREET,NORTHRIDGE,CA,C20,532440,BRETT,MOORE,HOFFER,2016-06-30,BRETT,HOFFER,,18650 PARTHENIA ST,,"NORTHRIDGE, CA",R3-1-O,,0.0,2671.0,5,"(34.05474, -118.42628)",Net Applicant,,
1,5005,10,017,CHESTERFIELD SQUARE,,465,16SL57806,16016-70000-02464,Permit Finaled,08/01/2017,Bldg-Alter/Repair,1 or 2 Family Dwelling,No Plan Check,,,SOUTH LA,2016-02-04,2122,,2122,,W,54TH,ST,,,,90062.0,General rehabilitation for single family dwell...,40000.0,,,,,OWNER-BUILDER,,,,,0,JAVIER,,TALAMANTES,,JAVIER,TALAMANTES,OWNER-BUILDER,,,,C2-1VL,,,2325.0,8,"(33.99307, -118.31668)",Owner-Bldr,1.0,
2,5154,23,022,SUN-SET TRACT,D,13,14VN81535,14016-20000-13092,Issued,08/13/2014,Bldg-Alter/Repair,Apartment,Plan Check,,,VAN NUYS,2014-08-13,415,,415,,S,BURLINGTON,AVE,,1-30,1-30,90057.0,PHOTOVOLTAIC SOLAR PANELS ON ROOF OF (E) APT BLDG,37000.0,,,,,PERMACITY CONSTRUCTION CORP,5570 W WASHINGTON BLVD,LOS ANGELES,CA,B,827864,JONATHAN,SAUL,PORT,2015-11-30,LINDA,MARTON,,710 WILSHIRE BLVD,,"SANTA MONICA, CA",R4-1,,,2089.04,1,"(34.06012, -118.26997)",Agent for Owner,5.0,
3,4404,30,010,TR 12086,,2,,16044-30000-09658,Permit Finaled,08/29/2016,HVAC,1 or 2 Family Dwelling,No Plan Check,,,WEST LA,2016-08-22,315,,315,,S,OCEANO,DR,,,,90049.0,,,,,,,E/C HEATING AND AIR CONDITION,26888 CUATRO MILPAS ST,VALENCIA,CA,C20,651051,EDY,RUDOLFO,CORDON,2018-07-31,,,,,,,RS-1,,0.0,2640.0,11,"(34.05707, -118.4732)",Contractor,,
4,2646,19,011,TR 7158,,11,,17042-90000-31792,Permit Finaled,12/28/2017,Plumbing,1 or 2 Family Dwelling,No Plan Check,,,INTERNET,2017-12-26,13640,,13640,,W,PIERCE,ST,,,,91331.0,,,,,,,TITANIUM POWER INC,1545 S LA CIENEGA BLVD,LOS ANGELES,CA,B,989217,DENNIS,HARUO,MIYAHIRA,2017-12-31,YONI,GHERMEZI,,1545 S LA CIENEGA BLVD,,"LOS ANGELES, CA",R1-1-O,,0.0,1044.03,7,"(34.25487, -118.43002)",Net Applicant,,


In [387]:
data.shape

(10000, 59)

## 2. Clean Data

In [316]:
# Fetch fresh data
data = pd.read_sql_query(sql_all, conn)

### 2.1 Data Types

In [317]:
df = data[data['assessor_parcel'].isnull()==1]
df

Unnamed: 0,assessor_book,assessor_page,assessor_parcel,tract,block,lot,reference_no_old_permit_no,pcis_permit_no,status,status_date,permit_type,permit_sub_type,permit_category,project_number,event_code,initiating_office,issue_date,address_start,address_fraction_start,address_end,address_fraction_end,street_direction,street_name,street_suffix,suffix_direction,unit_range_start,unit_range_end,zip_code,work_description,valuation,floor_area_la_zoning_code_definition,no_of_residential_dwelling_units,no_of_accessory_dwelling_units,no_of_stories,contractors_business_name,contractor_address,contractor_city,contractor_state,license_type,license_no,principal_first_name,principal_middle_name,principal_last_name,license_expiration_date,applicant_first_name,applicant_last_name,applicant_business_name,applicant_address_1,applicant_address_2,applicant_address_3,zone,occupancy,floor_area_la_building_code_definition,census_tract,council_district,latitude_longitude,applicant_relationship,existing_code,proposed_code
609,,,,TR 72558,,1,,17041-20000-21404,Permit Closed,04/18/2019,Electrical,Commercial,No Plan Check,,,VAN NUYS,2017-06-26,1077,,1077,,W,38TH,ST,,TMP 1,,90037.0,,,,,,,POWER PLUS,1210 N RED GUM STREET,ANAHEIM,CA,C10,980589,STEVEN,RONALD,BRAY,2019-01-31,JOSE,,,,,,(T)(Q)C2-1,,0,2312.2,8.0,"(34.0174, -118.29395)",Agent for Contractor,,
1463,,,,,,,,17041-90000-46288,Re-Activate Permit,06/05/2018,Electrical,1 or 2 Family Dwelling,No Plan Check,,,INTERNET,2017-12-26,849,,849,,S,SHENANDOAH,ST,,1-16,,90035.0,,,,,,,SOUTH WEST BUILDERS,12517 SHERMAN WAY UNIT B,NORTH HOLLYWOOD,CA,C10,1012298,RONALD,FREDERICK,KROFTA,2018-03-31,ASHER,AMAR,,12517 SHERMAN WAY,,"NORTH HOLLYWOOD, CA",,,0,,,,Net Applicant,,
4547,,,,TR 67429-01,,1,14VN72906,13047-20000-01168,Permit Finaled,09/09/2014,Swimming-Pool/Spa,Apartment,Plan Check,,,VAN NUYS,2014-03-14,6201,,6201,,W,HOLLYWOOD,BLVD,,,,90028.0,"(N) SPA (8'-4"" X 8'-4"") PER LA CITY STD PLAN #268",19000.0,,,,,ADDISON POOLS INC,4363 WOODMAN AVENUE,SHERMAN OAKS,CA,C53,753417,DRAKE,ADDISON,WOODS,2014-08-31,IAN,MORGAN,,4363 WOODMAN AVE,,"SHERMAN OAKS, CA",[Q]R3-1XL,Misc. Occ. Group,0,1910.0,13.0,"(34.1018, -118.32461)",Agent for Contractor,,20.0
5710,,,,TR 68142,,16,15LA,15043-10000-00231,Permit Finaled,02/23/2015,Fire Sprinkler,Commercial,Plan Check,,,METRO,2015-01-16,14729,,14729,,W,SHERMAN,WAY,,,,91405.0,REVISIONS TO EXISTING NFPA 13D SPRINKLER SYSTE...,,,,,,FIRE SPRINKLER SYSTEMS INC,705 E HARRISON STE 200,CORONA,CA,C16,684600,HAROLD,JEROME,RODGERS,2015-02-28,,,"FIRE SPRINKLER SYSTEMS, INC.",705 E. HARRISON,STE 200,CORONA CA,[Q]RD1.5-1,,0,1272.2,6.0,"(34.20141, -118.45388)",Contractor,,
5941,,,,TR 73320,,24,,17041-20000-09974,Permit Finaled,01/26/2019,Electrical,1 or 2 Family Dwelling,No Plan Check,,,VAN NUYS,2017-03-29,127,,127,,S,BEECH,LANE,,,,90042.0,,,,,,,WILLIAMS HOMES INC,21080 CENTRE POINTE PARKWAY,SANTA CLARITA,CA,B,734097,LANCE,KARL,WILLIAMS,2017-03-31,CHERYL,,,,,,(Q)C4-2D-HPOZ,,0,1838.1,1.0,"(34.10479, -118.20032)",Agent for Contractor,,
6204,,,,TR 72691,,11,,15041-10000-36040,Permit Finaled,03/15/2016,Electrical,Commercial,Plan Check,,,METRO,2015-12-11,111,,111,,E,UNIVERSAL HOLLYWOOD,DR,,30TH FLR,,91608.0,,,,,,,O'BRYANT ELECTRIC INC,20417 NORDHOFF STREET,CHATSWORTH,CA,C10,386619,CATHERINE,TERUKO,O'BRYANT,2016-01-31,WHITNEY,ROBINSON,AMA CONSULTING ENGINEERINGS,2101 EL SEGUNDO BLVD,#303,"EL SEGUNDO, CA",C2-1,,0,1437.0,4.0,"(34.13855, -118.36112)",Other,,
7843,,,,TR 68945-C,,LT 1,,13044-10000-08813,Permit Finaled,06/10/2014,HVAC,Apartment,No Plan Check,,,METRO,2013-08-21,5200,,5200,,W,WILSHIRE,BLVD,,,,90036.0,,,,,,,BERNARDS BROS INC,555 FIRST STREET,SAN FERNANDO,CA,B,302007,DOUGLAS,DEAN,BERNARDS,2015-03-31,,,,,,,[Q]C4-2D,,0,2110.0,4.0,"(34.06194, -118.3435)",Contractor,,
8442,,,,,,,,19042-90000-13476,Permit Finaled,06/17/2019,Plumbing,Apartment,No Plan Check,,,INTERNET,2019-06-07,6321,1/2,6321,1/2,N,FULTON,AVE,,,,91401.0,,,,,,,METRO RETROFITTING INC,7631 ALABAMA AVE STE A,CANOGA PARK,CA,C36,505706,FABIAN,,FRIEDMAN,2020-05-31,RAE,ROBLES,,7631 ALABAMA AVE,A,"CANOGA PARK, CA",,,0,,,,Net Applicant,,
9627,,,,RANCHO LA BREA,,,,14042-90000-05761,Refund Completed,05/23/2014,Plumbing,Commercial,No Plan Check,,,INTERNET,2014-03-28,6333,,6333,,W,3RD,ST,,e,,90036.0,,,,,,,PLUMBING SYSTEMS WEST INC,31407 OUTER HIGHWAY 10 STE B,REDLANDS,CA,C36,933542,ROBERT,JAMES,GRABLE,2015-06-30,BOB,GRABLE,,31407 OUTER HIGHWAY 10,11,"REDLANDS, CA",,,0,,,,Net Applicant,,
9647,,,,TR 72510,,15,,16042-20000-26768,Issued,12/12/2016,Plumbing,1 or 2 Family Dwelling,No Plan Check,,,VAN NUYS,2016-12-12,17322,,17322,,W,PARQUE VANOWEN,WAY,,,,91406.0,,,,,,,LANDSCAPE DEVELOPMENT INC,28447 WITHERSPOON PARKWAY,VALENCIA,CA,C27,450067,GARY,RICHARD,HORTON,2017-12-31,STEPHANIE,TERRAZAS,,,,,(Q)RD3-1-RIO,,0,1320.01,6.0,"(34.19338, -118.51076)",Agent for Contractor,,


In [318]:
data[['assessor_book', 'assessor_page', 'assessor_parcel']].fillna(np.nan, inplace=True)

In [321]:
data[['assessor_book', 'assessor_page']].astype(int)

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

In [66]:
#data = format_names(data)

# Convert Valuation column to float values
data['Valuation'].fillna(0, inplace=True)
data['Valuation'] = data['Valuation'].astype(float);

In [67]:
data.head()

Unnamed: 0,Assessor Book,Assessor Page,Assessor Parcel,Tract,Block,Lot,Reference # (Old Permit #),PCIS Permit #,Status,Status Date,Permit Type,Permit Sub-Type,Permit Category,Project Number,Event Code,Initiating Office,Issue Date,Address Start,Address Fraction Start,Address End,Address Fraction End,Street Direction,Street Name,Street Suffix,Suffix Direction,Unit Range Start,Unit Range End,Zip Code,Work Description,Valuation,Floor Area-L.A. Zoning Code Definition,# of Residential Dwelling Units,# of Accessory Dwelling Units,# of Stories,Contractor's Business Name,Contractor Address,Contractor City,Contractor State,License Type,License #,Principal First Name,Principal Middle Name,Principal Last Name,License Expiration Date,Applicant First Name,Applicant Last Name,Applicant Business Name,Applicant Address 1,Applicant Address 2,Applicant Address 3,Zone,Occupancy,Floor Area-L.A. Building Code Definition,Census Tract,Council District,Latitude/Longitude,Applicant Relationship,Existing Code,Proposed Code
0,5408,11,907,TR 10418,,LT 2,,15041-10000-21893,Permit Finaled,09/20/2016,Electrical,Commercial,Plan Check,,,METRO,11/06/2015,650,,650,,N,MAIN,ST,,,,90012,,0.0,,,,,M A ELECTRIC,P O BOX 1783,COVINA,CA,C10,569475,MAURICIO,ALONSO,AVELAR,05/31/2017,ABEL,MARIN,ECCO ENGINEERING FIRM,726 W BROADWAY,A,GLENDALE CA,C4-1VL,,0.0,2071.0,14,,Engineer,,
1,5151,1,24,TR 21949,,LT 1,,17041-90000-32378,Issued,09/11/2017,Electrical,Commercial,No Plan Check,,,INTERNET,09/11/2017,261,,261,,S,FIGUEROA,ST,,260,,90012,,0.0,,,,,CANDUIT ELECTRIC INC,8218 HILLROSE ST,SUNLAND,CA,C10,1013159,CHRISTOPHER,MICHAEL,TRUELOVE,09/30/2018,CHRIS,TRUELOVE,,8218 HILLROSE,,"SUNLAND, CA",C4-4D,,0.0,2075.01,14,"(34.05546, -118.25446)",Net Applicant,,
2,5151,14,31,TR 21409,,LT 1,,16042-10000-28033,Permit Finaled,02/15/2017,Plumbing,Commercial,No Plan Check,,,METRO,12/28/2016,333,,333,,S,HOPE,ST,,23RD FL,,90071,,0.0,,,,,MUIR-CHASE PLUMBING CO INC,4530 BRAZIL STREET,LOS ANGELES,CA,C36,539835,GRANT,DRAKE,MUIR,08/31/2018,,,,,,,C4-4D,,0.0,2075.02,14,"(34.05326, -118.25292)",Contractor,,
3,5144,21,31,H. F. SPENCER SUBDIVISION OF NORTH 1/2 BLOCK 5...,,LT NO 3,,19041-10000-02555,Issued,01/22/2019,Electrical,Commercial,No Plan Check,,,METRO,01/22/2019,813,,813,,S,FLOWER,ST,,,,90017,,0.0,,,,,VELRIO CONSTRUCTION INC,31805 TEMECULA PARKWAY STE 132,TEMECULA,CA,C-7,1017086,FRANCISCO,,DE JESUS VELEZ,08/31/2020,FRANCISCO,VELEZ,VELRIO CONSTRUCTION INC,,,,C2-4D,,0.0,2077.1,14,"(34.04705, -118.26051)",Contractor,,
4,5163,18,6,MILLS AND WICKS EXTENSION OF SECOND ST. AND AD...,,233,18LA03217,16016-10004-17710,Permit Finaled,06/13/2018,Bldg-Alter/Repair,Commercial,Plan Check,,,METRO,05/17/2018,1019,,1019,,E,4TH,PL,,,,90013,SUPPLEMENTAL TO 16016-10000-17710 WINDOW WASHI...,0.0,,,,,SHANGRI - LA CONSTRUCTION L P,550 S HOPE ST STE 700,LOS ANGELES,CA,B,926436,BENJAMIN,SALEM,WAHAB,12/31/2018,SHANGRI-LA,CONSTR.,,550 S. HOPE ST.,700,"LOS ANGELES, CA",M3-1-RIO,,,2060.31,14,,Agent for Contractor,13.0,
