# SQL Package

Provides simple functionality to interact with a PostgreSQL server using Python classes.

**Overview of functionality:**
* Database(self, user, password, host, dbname, port)
    * properties
        * user
        * password
        * host
        * dbname
        * port
    * methods
        * create(name) x
        * connect()
        * drop(name)
* Table(self, dbname, table, schema)
    * accepts db properties
    * properties
        * connect() --> inherited
        * fetch_data(sql, con, parse_dates)
        * get_names()
        * format_names(char_dict)
        * update_names(names_dict)
        * add_columns(columns_list, type=None)
        * compare_column_order(dataframe)
        * match_columns(dataframe)
        * save_csv(data, local_path, match_column_order=True)
        * update_values(local_path, container_path)
        * update_types(types_dict)
        * close()

## Setup

In [1]:
import os
import sys
from pathlib import Path
#sys.path[0] = str(Path(__file__).resolve().parents[2]) # Set path for custom modules
import warnings
from io import StringIO

# Set path for modules
sys.path[0] = '../'

from dotenv import load_dotenv, find_dotenv
import numpy as np
import pandas as pd

# SQL libraries
import psycopg2
from src.pipeline.dictionaries import types_dict, replace_map
from src.pipeline.transform_data import create_full_address, split_lat_long
from src.toolkits.geospatial import geocode_from_address
#from src.toolkits.postgresql import Database, Table
from src.toolkits.eda import explore_value_counts

# Set notebook display options
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Get project root directory
#root_dir = os.path.dirname(os.getcwd())

# if modulename not in sys.modules: print...
load_dotenv(find_dotenv());

In [2]:
#### Database class ####

class Database():
    
    def __init__(self, user="postgres", password="postgres",
                 dbname=None, host="localhost", port=5432):

        # Loaded from .env if not explicit
        self.user = os.getenv("POSTGRES_USER") or user
        self.password = os.getenv("POSTGRES_PASSWORD") or password
        self.dbname = os.getenv("POSTGRES_DB") or dbname
        self.host = os.getenv("DB_HOST") or host
        self.port = os.getenv("DB_PORT") or port
        
        
    def _connect(self):

        """
        Connects to PostgreSQL database using psycopg2 driver. Same
        arguments as psycopg2.connect().

        Params
        --------
        dbname
        user
        password
        host
        port
        connect_timeout
        """

        try:
            con = psycopg2.connect(dbname=self.dbname,
                                   user=self.user,
                                   password=self.password,
                                    host=self.host, 
                                    port=self.port,
                                  connect_timeout=3)            
        except Exception as e:
            print('Error:', e)
            return None

        return con
    
    @property
    def _con(self):
        try:
            con = self._connect()
            print('Connected as user "{}" to database "{}" on http://{}:{}.'.format(self.user,self.dbname,
                                                               self.host,self.port))
            con.close()
        except Exception as e:
            con.rollback()
            print('Error:', e)
        finally:
            if con is not None:
                con.close()
                
                
    def _run_query(self, sql):
        
        try:
            con = self._connect()
        except Exception as e:
            print("Error:", e)            
            
        try:
            cur = con.cursor()
            cur.execute(sql)
            con.commit()
            cur.close()
            print('Query successful on database "{}".'.format(self.dbname))
        except Exception as e:
            con.rollback()
            print("Error:", e)
        finally:
            if con is not None:
                con.close()
        
        return
    
    def create_table(self, table_name, types_dict, id_col, columns=None):
        
        # Append id_col to selected columns
        columns = None if not columns else set([id_col] + columns)
        
        # Subsets types_dict by columns argument and formats into string if no columns are specified
        types_dict = types_dict if not columns else {key:value for key, value in types_dict.items() if key in set(columns)}
        names = ',\n\t'.join(['{key} {val}'.format(key=key, val=val) for key, val in types_dict.items()])
        
        # Build queries
        sql = 'CREATE TABLE {table_name} (\n\t{names}\n);\n\n' \
                            .format(table_name=table_name, names=names) # + sql
        
        # Execute query
        self._run_query(sql)
        
        return self
    
    
    def drop_table(self, table_name):
        
        # Build queries
        sql = 'DROP TABLE IF EXISTS {table_name};\n\n'.format(table_name=table_name)
        
        # Execute query
        self._run_query(sql)
        
        return self
    
    
    def _subset_types_dict(self, types_dict, columns):
        
        #print("\n\n\n\n_subset_types_dict INPUT:\n", types_dict, "\n\n")

        types_dict = types_dict if not columns else {key:value for key, value in types_dict.items() if key in set(columns)}
        columns = self.get_names().tolist() if not columns else columns

        #print("\n\n_subset_types_dict RETURN:\n", types_dict, "\n\n", columns, "\n\n\n\n")
        
        return types_dict, columns
    

    def _create_temp_table(self, types_dict, id_col, columns=None):
        
        #print("\n\n_create_temp_table INPUT to _subset_types_dict:\n\n",types_dict, "\n\n", columns, "\n\n")
        
        # Append id_col to selected columns
        columns = None if not columns else [id_col] + columns

        # CREATE TABLE query
        tmp_table = "tmp_" + self.table

        #types_dict, _ = self._subset_types_dict(types_dict, columns)

        #print("\n\n_create_temp_table columns arg INPUT:\n\n", columns, "\n\n")
        
        # Subsets types_dict by columns argument and formats into string if no columns are specified
        #names = ',\n\t'.join(['{key} {val}'.format(key=key, val=val) for key, val in types_dict.items()])

        #print("TEMP TABLE:\n", names)
        
        # Build queries
        #sql = 'DROP TABLE IF EXISTS {tmp_table};\n\n'.format(tmp_table=tmp_table)
        #sql = sql + 'CREATE TABLE {tmp_table} (\n\t{names}\n);\n\n' \
                                #.format(tmp_table=tmp_table, names=names)

        sql = """
        DROP TABLE IF EXISTS {tmp_table};
        CREATE TABLE {tmp_table} AS (SELECT * FROM {table}) WITH NO DATA;
        """.format(tmp_table=tmp_table, table=self.table)


        # Execute query
        self._run_query(sql)

        print("TEMP TABLE:\n", sql)
        
        return self
    
    
    ## List tables
    def list_tables(self):
        
        sql = """
        SELECT tablename FROM pg_catalog.pg_tables
        WHERE schemaname NOT IN ('pg_catalog', 'information_schema');
        """
        
        try:
            con = self._connect()
            cur = con.cursor()
            cur.execute(sql)
        except Exception as e:
            con.rollback()
            print("Error:", e)
            
        results = cur.fetchall()
        
        tables = []
        
        for result in results:
            tables.append(*result)
            
        return tables

In [3]:
#### Table class ####

class Table(Database):
    def __init__(self, name, user="postgres", password="postgres",
                 dbname=None, host="localhost", port=5432):
        
        super().__init__(user, password, dbname, host, port)
        
        self.table = name
        
        # Loaded from .env if not explicit
        self.user = os.getenv("POSTGRES_USER") or user
        self.password = os.getenv("POSTGRES_PASSWORD") or password
        self.dbname = os.getenv("POSTGRES_DB") or dbname
        self.host = os.getenv("DB_HOST") or host
        self.port = os.getenv("DB_PORT") or port
        self.columns = self.get_names().tolist

    
    # Connect to database
    def __connect(self):
        return super(Table, self)._connect()
    
    # Check info on connection
    def __con(self):
        return super(Table, self)._con
    
    # Run query
    def __run_query(self, sql):
        return super(Table, self)._run_query(sql)
    
    def __subset_types_dict(self, types_dict, columns):
        return super(Table, self)._subset_types_dict(types_dict, columns)

    def __create_temp_table(self, types_dict, id_col, columns):
        return super(Table, self)._create_temp_table(types_dict, id_col, columns)
    
    
    # Fetch data from sql query
    def fetch_data(self, sql=None, coerce_float=False, parse_dates=None):
        
        sql = sql or "SELECT * FROM {};".format(self.table)
        
        con = self.__connect()
        
        # Fetch fresh data
        data = pd.read_sql_query(sql=sql, con=con, coerce_float=coerce_float, parse_dates=parse_dates)
        
        # Recast integers to preserve original types
        update_dict = permits_raw.get_types(pandas_integers=True)
        update_dict = {k: v for k, v in update_dict.items() if v}
        data = data.astype(update_dict)
        
        #print(data.dtypes)
        
        # Types dict
        #types_dict = self.get_types()
        #print(types_dict)

        # Replace None with np.nan
        data.fillna(np.nan, inplace=True)
        
        # Close db connection
        con.close()

        return data
    
    
    # Get names of column
    def get_names(self):
        
        # Specific query to retrieve table names
        sql = "SELECT * FROM information_schema.columns WHERE table_name = N'{}'".format(self.table)
        
        # Run query and extract
        try:
            con = self.__connect()
            data = pd.read_sql_query(sql, con)
            column_series = data['column_name']
            con.close()
        except Exception as e:
            print("Error:", e)
    
        return column_series

    
    # Get types of columns, returns dict
    def get_types(self, as_dataframe=False, pandas_integers=False):
        
        # Specific query to retrieve table names
        sql_to_sql = """
        SELECT column_name, 
        CASE 
            WHEN domain_name is not null then domain_name
            WHEN data_type='character varying' THEN 'varchar('||character_maximum_length||')'
            WHEN data_type='character' THEN 'char('||character_maximum_length||')'
            WHEN data_type='numeric' THEN 'numeric'
            ELSE data_type
        END AS type
        FROM information_schema.columns WHERE table_name = 'permits_raw';
        """
        
        sql_to_pandas = """
        SELECT column_name, 
        CASE 
            WHEN domain_name is not null then domain_name
            WHEN data_type='smallint' OR data_type='integer' THEN 'Int64'
        END AS type        
        FROM information_schema.columns WHERE table_name = 'permits_raw';
        """
        
        sql = sql_to_sql if not pandas_integers else sql_to_pandas
        
        # Run query and extract
        try:
            con = self.__connect()
            data = pd.read_sql_query(sql, con)
            con.close()
        except Exception as e:
            print("Error:", e)
        
        data['type'] = data['type'].str.upper() if not pandas_integers else data['type']
        
        if as_dataframe:
            return data
        
        types_dict = dict(zip(data['column_name'], data['type']))
        
        return types_dict
    
    
    # Update column names in db table
    def _update_table_names(self, series):

        # Extract current columns in table
        old_columns = self.get_names()

        # Create list of reformatted columns to replace old columns 
        new_columns = series

        # SQL query string to change column names
        sql = 'ALTER TABLE {} '.format(self.table) + 'RENAME "{old_name}" to {new_name};'

        sql_query = []

        # Iterate through old column names and replace each with reformatted name 
        for idx, name in old_columns.iteritems():
            sql_query.append(sql.format(old_name=name, new_name=new_columns[idx]))

        # Join list to string
        sql_query = '\n'.join(sql_query)

        return sql_query
    

    # Standardize column names using dictionary of character replacements
    def format_table_names(self, replace_map, update=False):
        
        series = self.get_names()
        
        def replace_chars(text):
            for oldchar, newchar in replace_map.items():
                text = text.replace(oldchar, newchar).lower()
            return text
        
        series = series.apply(replace_chars)  
        
        if not update:
            warnings.warn('No changes made. Set "update=True" to run query on database.')
            return series.apply(replace_chars)
        
        else:
            sql_query = self._update_table_names(series=series)
            
            # Execute query
            self.__run_query(sql_query)
            
            return self
                    

    # Add new columns to database
    def add_columns_from_data(self, data):
        
        # Get names of current columns in PostgreSQL table
        current_names = self.get_names().tolist()

        # Get names of updated table not in current table
        updated_names = data.columns.tolist()
        new_names = list(set(updated_names) - set(current_names))

        # Check names list is not empty
        if not new_names:
            print("Table columns are already up to date.")
            return

        # Format strings for query
        alter_table_sql = "ALTER TABLE {db_table}\n"
        add_column_sql = "\tADD COLUMN {column} TEXT,\n"

        # Create a list and append ADD column statements
        sql_query = [alter_table_sql.format(db_table=self.table)]
        for name in new_names:
            sql_query.append(add_column_sql.format(column=name))

        # Join into one string
        sql = ''.join(sql_query)[:-2] + ";"

        # Execute query
        self.__run_query(sql)
        
        return self
    
    
    # Check whether dataframe columns match database table columns before running queries
    def _match_column_order(self, data):
        
        # Get columns from database as list
        db_columns = self.get_names().tolist()

        # Select columns from dataframe as list
        data_columns = data.columns.tolist()
        
        if set(data_columns) == set(db_columns):
            if data_columns != db_columns:
                print('Rearranged dataframe columns to match table "{}".'.format(self.table))
                data = data[db_columns]
                return True
            else:
                print('Dataframe columns already match table "{}".'.format(self.table))
                return True
        else:
            if len(data_columns) > len(db_columns):
                print('Dataframe has columns not in table "{}":'.format(self.table))
                print(list(set(data_columns) - set(db_columns)))
                return False
            else:
                print('Dataframe missing columns that are in table "{}":'.format(self.table))
                print(list(set(db_columns) - set(data_columns)))
                return False
        
    
    def _copy_from_dataframe(self, data, id_col, columns=None):
        
        tmp_table = "tmp_" + self.table

        match = self._match_column_order(data)

        print("\n\nNEEDS MATCH? ", match)

        if self._match_column_order(data):
            # Get columns from database as list
            db_columns = self.get_names().tolist()
            print("DB COLUMNS:\n", db_columns)
            # Select columns from dataframe as list
            data_columns = data.columns.tolist()
            print("DATA COLUMNS:\n", db_columns)
            data = data[db_columns]
            print("MATCHED!!\n")
            print(data.columns)

        
        try:
            con = self.__connect()
        except Exception as e:
            print("Connection Error:", e)

        columns = data.columns.tolist() if not columns else columns

        dataStream = StringIO()
        data.to_csv(dataStream, index=False, header=True, sep=',')
        dataStream.seek(0)
        
        sql = """
        COPY {tmp_table} FROM STDIN WITH (FORMAT CSV, HEADER TRUE);
        """.format(tmp_table=tmp_table)
        
        # try:
        cur = con.cursor()
        print("\n\n_copy_from_dataframe: --> dataStream:\n\n")
        #cur.copy_from(file=dataStream, table=temp_table, columns=columns, sep=',')
        cur.copy_expert(sql, dataStream)
        con.commit()
        cur.close()
        print('Copy successful on table "{}".'.format(self.table))
        # except Exception as e:
        #     con.rollback()
        #     print("Error:", e)
        # finally:
        #     if con is not None:
        #         con.close()
                
        return self
                
        
    def _update_from_temp(self, id_col, columns=None):
        
        temp_table = "tmp_" + self.table
        columns = self.get_names().tolist() if not columns else columns
        sql_update = 'UPDATE {table}\n'.format(table=self.table)
        sql_set = ["SET "]
        
        for name in columns:
            line = "{name} = {tmp_name},\n\t".format(name=name, tmp_name=temp_table + '.' + name)
            sql_set.append(line)

        sql_set = ''.join(sql_set)
        sql_set = sql_set[:-3] + "\n"

        sql_from = "FROM {temp_table}\nWHERE {this_table}.{id_col} = {temp_table}.{id_col};\n\n" \
                            .format(temp_table=temp_table, this_table=self.table, id_col=id_col)
        sql_drop = 'DROP TABLE {};\n'.format(temp_table)
                
        sql = sql_update + sql_set + sql_from + sql_drop
        
        print(sql)

        # Execute query
        self.__run_query(sql)
        
        return self
        
                
    # Builds a query to update postgres from a csv file
    def update_values(self, data, id_col, types_dict, columns=None, sep=','):
        
        #print(data.dtypes)

        if data.columns.tolist() != columns:
                self.add_columns_from_data(data)
                self.update_types(types_dict=types_dict, columns=columns)        
        
        columns = self.get_names().tolist() if not columns else [id_col] + columns
        
        column_params = {"id_col":id_col, "columns":columns}

        #print("\n\nupdate_values --> types_dict:\n\n",types_dict)
        #print("\n\nupdate_values --> columns:\n\n",columns)
        
        self.__create_temp_table(types_dict=types_dict, **column_params) \
                        ._copy_from_dataframe(data=data, **column_params) \
                        ._update_from_temp(**column_params)
        
    
    # Updates column types in PostgreSQL database
    def update_types(self, types_dict, columns=None):

        # Get types already in database and only update union with columns variable
        ##### Add types in data base to columns variable: self.get_columns +
        
        # Subset types based on columns input
        types_dict, columns = self.__subset_types_dict(types_dict, columns)
        
        # Define SQL update queries
        sql_alter_table = "ALTER TABLE public.{}\n\t".format(self.table)

        # Update types
        sql_update_types = []
        
        for column, col_type in types_dict.items():
            if "DATE" in col_type.upper():
                sql_string = "ALTER {column} TYPE {col_type} USING {column}::" + "{col_type},\n\t"
            elif "INT" in col_type.upper() or "NUM" in col_type.upper():
                sql_string = "ALTER {column} TYPE {col_type} USING {column}::text::numeric::{col_type},\n\t"
            elif "NUM" in col_type.upper():
                sql_string = "ALTER {column} TYPE {col_type} USING {column}::text::numeric::{col_type},\n\t"
            else:
                sql_string = "ALTER {column} TYPE {col_type},\n\t"

            sql_alter_column = sql_string.format(column=column, col_type=col_type)
            sql_update_types.append(sql_alter_column)

        # Join strings to create full sql query
        sql_update_types = sql_alter_table + ''.join(sql_update_types)

        # Replace very last character with ";"
        sql = sql_update_types[:-3] + ";"

        self.__run_query(sql)
            
        return 

In [4]:
root_dir = os.path.dirname(os.getcwd())

In [5]:
types_dict_abbrev = {'assessor_book': 'SMALLINT',
             'assessor_page': 'SMALLINT',
             'assessor_parcel': 'CHAR(3)',
             'tract': 'VARCHAR(200)',
             'block': 'VARCHAR(50)',
             'lot': 'VARCHAR(50)',
             'reference_no_old_permit_no': 'VARCHAR(50)',
             'pcis_permit_no': 'VARCHAR(50)',
             'status': 'VARCHAR(50)',
             'status_date': 'DATE',
             'permit_type': 'VARCHAR(50)',
             'permit_sub_type': 'VARCHAR(50)',
             'permit_category': 'VARCHAR(50)',
             'project_number': 'SMALLINT',
             'event_code': 'VARCHAR(50)',
             'initiating_office': 'VARCHAR(50)',
             'issue_date': 'DATE',
             'address_start': 'INTEGER',
             'address_fraction_start': 'CHAR(3)',
             'address_end': 'INTEGER',
             'address_fraction_end': 'CHAR(3)',
             'street_direction': 'CHAR(1)',
             'street_name': 'VARCHAR(50)',
             'street_suffix': 'VARCHAR(10)',
             'suffix_direction': 'VARCHAR(10)',
             'unit_range_start': 'VARCHAR(50)',
             'unit_range_end': 'VARCHAR(50)',
             'zip_code': 'INTEGER',
             'work_description': 'TEXT',
             'valuation': 'NUMERIC',
             'floor_area_la_zoning_code_definition': 'VARCHAR(10)',
             'no_of_residential_dwelling_units': 'SMALLINT',
             'no_of_accessory_dwelling_units': 'SMALLINT',
             'no_of_stories': 'SMALLINT',
             'contractors_business_name': 'VARCHAR(100)',
             'contractor_address': 'VARCHAR(100)',
             'contractor_city': 'VARCHAR(50)',
             'contractor_state': 'CHAR(2)',
             'license_type': 'VARCHAR(10)',
             'license_no': 'INTEGER',
             'principal_first_name': 'VARCHAR(50)',
             'principal_middle_name': 'VARCHAR(50)',
             'principal_last_name': 'VARCHAR(50)',
             'license_expiration_date': 'DATE',
             'applicant_first_name': 'VARCHAR(50)',
             'applicant_last_name': 'VARCHAR(50)',
             'applicant_business_name': 'VARCHAR(100)',
             'applicant_address_1': 'VARCHAR(50)',
             'applicant_address_2': 'VARCHAR(50)',
             'applicant_address_3': 'VARCHAR(50)',
             'zone': 'VARCHAR(50)',
             'occupancy': 'VARCHAR(50)',
             'floor_area_la_building_code_definition': 'VARCHAR(10)',
             'census_tract': 'VARCHAR(10)',
             'council_district': 'SMALLINT',
             'latitude_longitude': 'VARCHAR(50)',
             'applicant_relationship': 'VARCHAR(50)',
             'existing_code': 'SMALLINT',
             'proposed_code': 'SMALLINT'}

In [6]:
permits = Database()

In [7]:
params = {"table_name":"permits_raw", "types_dict":types_dict_abbrev, "id_col":"pcis_permit_no"}
permits.drop_table('permits_raw').drop_table('tmp_permits_raw').create_table(**params)
!cd ../ && bash scripts/load_db.sh

Query successful on database "permits".
Query successful on database "permits".
Query successful on database "permits".
Copying 500 rows into table...
COPY 500


In [8]:
permits.list_tables()

['permits_raw']

### Run Pipeline

1. Standardize table names.<br>
2. Fetch raw data and transform:
    - Concatenate address columns
    - Geocode missing coordinates using street address
    - Extract latitude and longitude from coordinates into their own columns
3. Update the table with the transformed data:
    - Add new columns to PostgreSQL table
    - Update values

In [9]:
permits_raw = Table(name="permits_raw")

In [10]:
data = permits_raw.fetch_data()

In [11]:
#update_dict = permits_raw.get_types(pandas_integers=True)
#update_dict = {k: v for k, v in update_dict.items() if v}
#data = data.astype(update_dict)

In [12]:
data.dtypes

assessor_book                               Int64
assessor_page                               Int64
assessor_parcel                            object
tract                                      object
block                                      object
lot                                        object
reference_no_old_permit_no                 object
pcis_permit_no                             object
status                                     object
status_date                                object
permit_type                                object
permit_sub_type                            object
permit_category                            object
project_number                              Int64
event_code                                float64
initiating_office                          object
issue_date                                 object
address_start                               Int64
address_fraction_start                     object
address_end                                 Int64


#### Standardize table names

In [13]:
permits_raw.format_table_names(replace_map=replace_map, update=True)

Error: column "assessor_book" of relation "permits_raw" already exists



<__main__.Table at 0x1163f8550>

#### Fetch and Transform

In [14]:
data = permits_raw.fetch_data(coerce_float=True)

In [15]:
data

Unnamed: 0,assessor_book,assessor_page,assessor_parcel,tract,block,lot,reference_no_old_permit_no,pcis_permit_no,status,status_date,permit_type,permit_sub_type,permit_category,project_number,event_code,initiating_office,issue_date,address_start,address_fraction_start,address_end,address_fraction_end,street_direction,street_name,street_suffix,suffix_direction,unit_range_start,unit_range_end,zip_code,work_description,valuation,floor_area_la_zoning_code_definition,no_of_residential_dwelling_units,no_of_accessory_dwelling_units,no_of_stories,contractors_business_name,contractor_address,contractor_city,contractor_state,license_type,license_no,principal_first_name,principal_middle_name,principal_last_name,license_expiration_date,applicant_first_name,applicant_last_name,applicant_business_name,applicant_address_1,applicant_address_2,applicant_address_3,zone,occupancy,floor_area_la_building_code_definition,census_tract,council_district,latitude_longitude,applicant_relationship,existing_code,proposed_code
0,4317,3,***,TR 30210-C,,LT 1,,15044-90000-08405,Permit Finaled,2015-09-10,HVAC,1 or 2 Family Dwelling,No Plan Check,,,INTERNET,2015-08-18,1823,1/2,1823,1/2,S,THAYER,AVE,,,,90025,,,,,,,CONDITIONED AIRE MECHANICAL & ENGINEERING INC,18650 PARTHENIA STREET,NORTHRIDGE,CA,C20,532440,BRETT,MOORE,HOFFER,2016-06-30,BRETT,HOFFER,,18650 PARTHENIA ST,,"NORTHRIDGE, CA",R3-1-O,,0.0,2671.0,5,"(34.05474, -118.42628)",Net Applicant,,
1,5005,10,017,CHESTERFIELD SQUARE,,465,16SL57806,16016-70000-02464,Permit Finaled,2017-08-01,Bldg-Alter/Repair,1 or 2 Family Dwelling,No Plan Check,,,SOUTH LA,2016-02-04,2122,,2122,,W,54TH,ST,,,,90062,General rehabilitation for single family dwell...,40000.0,,,,,OWNER-BUILDER,,,,,0,JAVIER,,TALAMANTES,,JAVIER,TALAMANTES,OWNER-BUILDER,,,,C2-1VL,,,2325.0,8,"(33.99307, -118.31668)",Owner-Bldr,1.0,
2,5154,23,022,SUN-SET TRACT,D,13,14VN81535,14016-20000-13092,Issued,2014-08-13,Bldg-Alter/Repair,Apartment,Plan Check,,,VAN NUYS,2014-08-13,415,,415,,S,BURLINGTON,AVE,,1-30,1-30,90057,PHOTOVOLTAIC SOLAR PANELS ON ROOF OF (E) APT BLDG,37000.0,,,,,PERMACITY CONSTRUCTION CORP,5570 W WASHINGTON BLVD,LOS ANGELES,CA,B,827864,JONATHAN,SAUL,PORT,2015-11-30,LINDA,MARTON,,710 WILSHIRE BLVD,,"SANTA MONICA, CA",R4-1,,,2089.04,1,"(34.06012, -118.26997)",Agent for Owner,5.0,
3,4404,30,010,TR 12086,,2,,16044-30000-09658,Permit Finaled,2016-08-29,HVAC,1 or 2 Family Dwelling,No Plan Check,,,WEST LA,2016-08-22,315,,315,,S,OCEANO,DR,,,,90049,,,,,,,E/C HEATING AND AIR CONDITION,26888 CUATRO MILPAS ST,VALENCIA,CA,C20,651051,EDY,RUDOLFO,CORDON,2018-07-31,,,,,,,RS-1,,0.0,2640.0,11,"(34.05707, -118.4732)",Contractor,,
4,2646,19,011,TR 7158,,11,,17042-90000-31792,Permit Finaled,2017-12-28,Plumbing,1 or 2 Family Dwelling,No Plan Check,,,INTERNET,2017-12-26,13640,,13640,,W,PIERCE,ST,,,,91331,,,,,,,TITANIUM POWER INC,1545 S LA CIENEGA BLVD,LOS ANGELES,CA,B,989217,DENNIS,HARUO,MIYAHIRA,2017-12-31,YONI,GHERMEZI,,1545 S LA CIENEGA BLVD,,"LOS ANGELES, CA",R1-1-O,,0.0,1044.03,7,"(34.25487, -118.43002)",Net Applicant,,
5,2219,27,052,TR 73820,,52,18VN77133,17010-20000-02747,CofO Issued,2019-04-05,Bldg-New,1 or 2 Family Dwelling,Plan Check,,,VAN NUYS,2018-09-21,7111,,7111,,N,MARISA,RD,,,,91405,"NEW SFD/GARAGE - PLAN 1A, LOT-52",196660.0,1560.0,1.0,,2.0,OWNER-BUILDER,,,,,0,,,,,DAVID,LELIE,,25152 SPRINGFIELD CT,#180,"VALENCIA, CA",(T)(Q)RD2-1,,1985.0,1278.03,6,,Agent for Owner,,1.0
6,2748,27,001,TR 22446,,298,14LA,14041-10000-02537,Permit Finaled,2014-10-16,Electrical,1 or 2 Family Dwelling,Plan Check,,,METRO,2014-04-09,9672,,9672,,N,LARAMIE,AVE,,PV1,,91311,"5.8 KW DC ROOF MOUNT PV, (24) MODULES, (1) TRA...",,,,,,SMART ENERGY SOLAR INC,1641 COMMERCE STREET,CORONA,CA,C46,990049,LEOBARDO,JOAQUIN,BAUTISTA,2016-01-31,,,,,,,RS-1,,0.0,1133.03,12,"(34.24591, -118.57345)",Contractor,,
7,4234,9,009,TR 19428,,19,16WL75914,16016-30000-26238,Permit Finaled,2018-06-27,Bldg-Alter/Repair,Apartment,No Plan Check,,,WEST LA,2016-11-01,3755,,3755,,S,BUTLER,AVE,,1 & 7,,90066,KITCHEN AND BATHROOM REMODEL FOR RESIDENTIAL B...,12000.0,,,,,AMARO MARK,3230 MERRILL DRIVE UNIT 77,TORRANCE,CA,B,954226,MARK,,AMARO,2016-11-30,MARK,AMARO,,,,,R3-1,,,2719.01,11,"(34.01023, -118.42294)",Contractor,5.0,
8,5154,13,011,NORTH KNOB HILL TRACT,,27,,18042-20000-12284,Permit Finaled,2018-05-30,Plumbing,Apartment,No Plan Check,,,VAN NUYS,2018-05-21,229,,233,,S,CRANDALL,ST,,,,90057,,,,,,,L G S COMPLIANCE ALLIANCE RETROFITTING,675 S GLENWOOD PLACE,BURBANK,CA,C36,900919,GLEN,ROY,CHRISTENSEN,2019-07-31,RAMON,,,,,,R3-1,,0.0,2085.02,13,"(34.06634, -118.27401)",Agent for Contractor,,
9,2668,20,030,TR 22555,,10,,15042-90000-22993,Permit Finaled,2015-11-19,Plumbing,1 or 2 Family Dwelling,No Plan Check,,,INTERNET,2015-11-17,10435,,10435,,N,GAVIOTA,AVE,,,,91344,,,,,,,BRUININGA INC,9626 LURLINE AVENUE UNIT C,CHATSWORTH,CA,C36,769008,RONALD,WALTER,BRUININGA,2016-01-31,RONALD,BRUININGA,,9626 LURLINE AVE,C,"CHATSWORTH, CA",RS-1,,0.0,1097.0,12,"(34.26031, -118.48278)",Net Applicant,,


In [16]:
data = create_full_address(data)

In [17]:
data.head(1)

Unnamed: 0,assessor_book,assessor_page,assessor_parcel,tract,block,lot,reference_no_old_permit_no,pcis_permit_no,status,status_date,permit_type,permit_sub_type,permit_category,project_number,event_code,initiating_office,issue_date,address_start,address_fraction_start,address_end,address_fraction_end,street_direction,street_name,street_suffix,suffix_direction,unit_range_start,unit_range_end,zip_code,work_description,valuation,floor_area_la_zoning_code_definition,no_of_residential_dwelling_units,no_of_accessory_dwelling_units,no_of_stories,contractors_business_name,contractor_address,contractor_city,contractor_state,license_type,license_no,principal_first_name,principal_middle_name,principal_last_name,license_expiration_date,applicant_first_name,applicant_last_name,applicant_business_name,applicant_address_1,applicant_address_2,applicant_address_3,zone,occupancy,floor_area_la_building_code_definition,census_tract,council_district,latitude_longitude,applicant_relationship,existing_code,proposed_code,full_address
0,4317,3,***,TR 30210-C,,LT 1,,15044-90000-08405,Permit Finaled,2015-09-10,HVAC,1 or 2 Family Dwelling,No Plan Check,,,INTERNET,2015-08-18,1823,1/2,1823,1/2,S,THAYER,AVE,,,,90025,,,,,,,CONDITIONED AIRE MECHANICAL & ENGINEERING INC,18650 PARTHENIA STREET,NORTHRIDGE,CA,C20,532440,BRETT,MOORE,HOFFER,2016-06-30,BRETT,HOFFER,,18650 PARTHENIA ST,,"NORTHRIDGE, CA",R3-1-O,,0,2671.0,5,"(34.05474, -118.42628)",Net Applicant,,,1823 S THAYER AVE 90025


In [18]:
geocode_from_address(data);

Cost for geocoding 19 addresses is $0.10.
Geocoding...
19 locations were assigned coordinates.
BEFORE UPDATE:
 assessor_book                               Int64
assessor_page                               Int64
assessor_parcel                            object
tract                                      object
block                                      object
lot                                        object
reference_no_old_permit_no                 object
pcis_permit_no                             object
status                                     object
status_date                                object
permit_type                                object
permit_sub_type                            object
permit_category                            object
project_number                              Int64
event_code                                float64
initiating_office                          object
issue_date                                 object
address_start                          

In [19]:
data = split_lat_long(data)

In [20]:
data.head(10)

Unnamed: 0,assessor_book,assessor_page,assessor_parcel,tract,block,lot,reference_no_old_permit_no,pcis_permit_no,status,status_date,permit_type,permit_sub_type,permit_category,project_number,event_code,initiating_office,issue_date,address_start,address_fraction_start,address_end,address_fraction_end,street_direction,street_name,street_suffix,suffix_direction,unit_range_start,unit_range_end,zip_code,work_description,valuation,floor_area_la_zoning_code_definition,no_of_residential_dwelling_units,no_of_accessory_dwelling_units,no_of_stories,contractors_business_name,contractor_address,contractor_city,contractor_state,license_type,license_no,principal_first_name,principal_middle_name,principal_last_name,license_expiration_date,applicant_first_name,applicant_last_name,applicant_business_name,applicant_address_1,applicant_address_2,applicant_address_3,zone,occupancy,floor_area_la_building_code_definition,census_tract,council_district,latitude_longitude,applicant_relationship,existing_code,proposed_code,full_address,latitude,longitude
0,4317,3,***,TR 30210-C,,LT 1,,15044-90000-08405,Permit Finaled,2015-09-10,HVAC,1 or 2 Family Dwelling,No Plan Check,,,INTERNET,2015-08-18,1823,1/2,1823,1/2,S,THAYER,AVE,,,,90025,,,,,,,CONDITIONED AIRE MECHANICAL & ENGINEERING INC,18650 PARTHENIA STREET,NORTHRIDGE,CA,C20,532440,BRETT,MOORE,HOFFER,2016-06-30,BRETT,HOFFER,,18650 PARTHENIA ST,,"NORTHRIDGE, CA",R3-1-O,,0.0,2671.0,5,"(34.05474, -118.42628)",Net Applicant,,,1823 S THAYER AVE 90025,34.05474,-118.42628
1,5005,10,017,CHESTERFIELD SQUARE,,465,16SL57806,16016-70000-02464,Permit Finaled,2017-08-01,Bldg-Alter/Repair,1 or 2 Family Dwelling,No Plan Check,,,SOUTH LA,2016-02-04,2122,,2122,,W,54TH,ST,,,,90062,General rehabilitation for single family dwell...,40000.0,,,,,OWNER-BUILDER,,,,,0,JAVIER,,TALAMANTES,,JAVIER,TALAMANTES,OWNER-BUILDER,,,,C2-1VL,,,2325.0,8,"(33.99307, -118.31668)",Owner-Bldr,1.0,,2122 W 54TH ST 90062,33.99307,-118.31668
2,5154,23,022,SUN-SET TRACT,D,13,14VN81535,14016-20000-13092,Issued,2014-08-13,Bldg-Alter/Repair,Apartment,Plan Check,,,VAN NUYS,2014-08-13,415,,415,,S,BURLINGTON,AVE,,1-30,1-30,90057,PHOTOVOLTAIC SOLAR PANELS ON ROOF OF (E) APT BLDG,37000.0,,,,,PERMACITY CONSTRUCTION CORP,5570 W WASHINGTON BLVD,LOS ANGELES,CA,B,827864,JONATHAN,SAUL,PORT,2015-11-30,LINDA,MARTON,,710 WILSHIRE BLVD,,"SANTA MONICA, CA",R4-1,,,2089.04,1,"(34.06012, -118.26997)",Agent for Owner,5.0,,415 S BURLINGTON AVE 90057,34.06012,-118.26997
3,4404,30,010,TR 12086,,2,,16044-30000-09658,Permit Finaled,2016-08-29,HVAC,1 or 2 Family Dwelling,No Plan Check,,,WEST LA,2016-08-22,315,,315,,S,OCEANO,DR,,,,90049,,,,,,,E/C HEATING AND AIR CONDITION,26888 CUATRO MILPAS ST,VALENCIA,CA,C20,651051,EDY,RUDOLFO,CORDON,2018-07-31,,,,,,,RS-1,,0.0,2640.0,11,"(34.05707, -118.4732)",Contractor,,,315 S OCEANO DR 90049,34.05707,-118.4732
4,2646,19,011,TR 7158,,11,,17042-90000-31792,Permit Finaled,2017-12-28,Plumbing,1 or 2 Family Dwelling,No Plan Check,,,INTERNET,2017-12-26,13640,,13640,,W,PIERCE,ST,,,,91331,,,,,,,TITANIUM POWER INC,1545 S LA CIENEGA BLVD,LOS ANGELES,CA,B,989217,DENNIS,HARUO,MIYAHIRA,2017-12-31,YONI,GHERMEZI,,1545 S LA CIENEGA BLVD,,"LOS ANGELES, CA",R1-1-O,,0.0,1044.03,7,"(34.25487, -118.43002)",Net Applicant,,,13640 W PIERCE ST 91331,34.25487,-118.43002
5,2219,27,052,TR 73820,,52,18VN77133,17010-20000-02747,CofO Issued,2019-04-05,Bldg-New,1 or 2 Family Dwelling,Plan Check,,,VAN NUYS,2018-09-21,7111,,7111,,N,MARISA,RD,,,,91405,"NEW SFD/GARAGE - PLAN 1A, LOT-52",196660.0,1560.0,1.0,,2.0,OWNER-BUILDER,,,,,0,,,,,DAVID,LELIE,,25152 SPRINGFIELD CT,#180,"VALENCIA, CA",(T)(Q)RD2-1,,1985.0,1278.03,6,"(34.2003503, -118.4533963)",Agent for Owner,,1.0,7111 N MARISA RD 91405,34.20035,-118.453396
6,2748,27,001,TR 22446,,298,14LA,14041-10000-02537,Permit Finaled,2014-10-16,Electrical,1 or 2 Family Dwelling,Plan Check,,,METRO,2014-04-09,9672,,9672,,N,LARAMIE,AVE,,PV1,,91311,"5.8 KW DC ROOF MOUNT PV, (24) MODULES, (1) TRA...",,,,,,SMART ENERGY SOLAR INC,1641 COMMERCE STREET,CORONA,CA,C46,990049,LEOBARDO,JOAQUIN,BAUTISTA,2016-01-31,,,,,,,RS-1,,0.0,1133.03,12,"(34.24591, -118.57345)",Contractor,,,9672 N LARAMIE AVE 91311,34.24591,-118.57345
7,4234,9,009,TR 19428,,19,16WL75914,16016-30000-26238,Permit Finaled,2018-06-27,Bldg-Alter/Repair,Apartment,No Plan Check,,,WEST LA,2016-11-01,3755,,3755,,S,BUTLER,AVE,,1 & 7,,90066,KITCHEN AND BATHROOM REMODEL FOR RESIDENTIAL B...,12000.0,,,,,AMARO MARK,3230 MERRILL DRIVE UNIT 77,TORRANCE,CA,B,954226,MARK,,AMARO,2016-11-30,MARK,AMARO,,,,,R3-1,,,2719.01,11,"(34.01023, -118.42294)",Contractor,5.0,,3755 S BUTLER AVE 90066,34.01023,-118.42294
8,5154,13,011,NORTH KNOB HILL TRACT,,27,,18042-20000-12284,Permit Finaled,2018-05-30,Plumbing,Apartment,No Plan Check,,,VAN NUYS,2018-05-21,229,,233,,S,CRANDALL,ST,,,,90057,,,,,,,L G S COMPLIANCE ALLIANCE RETROFITTING,675 S GLENWOOD PLACE,BURBANK,CA,C36,900919,GLEN,ROY,CHRISTENSEN,2019-07-31,RAMON,,,,,,R3-1,,0.0,2085.02,13,"(34.06634, -118.27401)",Agent for Contractor,,,229 S CRANDALL ST 90057,34.06634,-118.27401
9,2668,20,030,TR 22555,,10,,15042-90000-22993,Permit Finaled,2015-11-19,Plumbing,1 or 2 Family Dwelling,No Plan Check,,,INTERNET,2015-11-17,10435,,10435,,N,GAVIOTA,AVE,,,,91344,,,,,,,BRUININGA INC,9626 LURLINE AVENUE UNIT C,CHATSWORTH,CA,C36,769008,RONALD,WALTER,BRUININGA,2016-01-31,RONALD,BRUININGA,,9626 LURLINE AVE,C,"CHATSWORTH, CA",RS-1,,0.0,1097.0,12,"(34.26031, -118.48278)",Net Applicant,,,10435 N GAVIOTA AVE 91344,34.26031,-118.48278


In [21]:
#data['no_of_residential_dwelling_units'] = data['no_of_residential_dwelling_units'].astype('Int64')
#data['no_of_stories'] = data['no_of_stories'].astype('Int64')
#data['proposed_code'] = data['proposed_code'].astype('Int64')
#data['existing_code'] = data['existing_code'].astype('Int64')
#
#data['project_number'] = data['project_number'].astype('Int64')

#### Update Table

In [22]:
permits_raw.update_values(data=data, id_col="pcis_permit_no", types_dict=types_dict)

Query successful on database "permits".
Query successful on database "permits".
Query successful on database "permits".
TEMP TABLE:
 
        DROP TABLE IF EXISTS tmp_permits_raw;
        CREATE TABLE tmp_permits_raw AS (SELECT * FROM permits_raw) WITH NO DATA;
        
Rearranged dataframe columns to match table "permits_raw".


NEEDS MATCH?  True
Rearranged dataframe columns to match table "permits_raw".
DB COLUMNS:
 ['assessor_book', 'assessor_page', 'assessor_parcel', 'tract', 'block', 'lot', 'reference_no_old_permit_no', 'pcis_permit_no', 'status', 'status_date', 'permit_type', 'permit_sub_type', 'permit_category', 'project_number', 'event_code', 'initiating_office', 'issue_date', 'address_start', 'address_fraction_start', 'address_end', 'address_fraction_end', 'street_direction', 'street_name', 'street_suffix', 'suffix_direction', 'unit_range_start', 'unit_range_end', 'zip_code', 'work_description', 'valuation', 'floor_area_la_zoning_code_definition', 'no_of_residential_dwelling_

In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 62 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   assessor_book                           500 non-null    Int64  
 1   assessor_page                           500 non-null    Int64  
 2   assessor_parcel                         500 non-null    object 
 3   tract                                   497 non-null    object 
 4   block                                   116 non-null    object 
 5   lot                                     496 non-null    object 
 6   reference_no_old_permit_no              196 non-null    object 
 7   pcis_permit_no                          500 non-null    object 
 8   status                                  500 non-null    object 
 9   status_date                             500 non-null    object 
 10  permit_type                             500 non-null    object

In [24]:
data.to_csv(dataBuffer, index=False, header=False)

NameError: name 'dataBuffer' is not defined

In [None]:
dataBuffer.seek(0)

In [None]:
from dotenv import load_dotenv, find_dotenv
import csv
load_dotenv(find_dotenv());

def connect():

    try:
        con = psycopg2.connect(dbname=os.getenv("POSTGRES_DB"),
                               user=os.getenv("POSTGRES_USER"),
                               password=os.getenv("POSTGRES_PASSWORD"),
                                host=os.getenv("DB_HOST"), 
                                port=os.getenv("DB_PORT"),
                              connect_timeout=3)            
    except Exception as e:
        print('Error:', e)
        return None

    return con

In [None]:
data.dtypes

In [None]:
con = connect()
dataStream = StringIO()
data.to_csv(dataStream, index=False, header=True, sep=',')
dataStream.seek(0)
cur = con.cursor()
#cur.copy_from(file=dataStream, table="tmp_permits_raw", sep=',')
cur.copy_expert("""COPY tmp_permits_raw FROM STDIN WITH (FORMAT CSV, HEADER TRUE);""", dataStream)
con.commit()
cur.close()

In [None]:
permits_raw.get_names()

In [None]:
data.info()