In [37]:
import pandas as pd
import re
import glob
# import math
# import string

from math import nan
from string import digits

Loading data

In [None]:
# Input is a previously mined results file. The raw export was loaded with:
#   pd.read_csv("RESINTEL_ENERGYSOLUTIONS_2_BUILDINGPERMIT_0002.txt", delimiter='\t', encoding='latin-1')
df = pd.read_csv(filepath_or_buffer='Permit Mining Results Sept2024.csv')
# Column labels in file order; used later to split the frame by position.
columns = df.columns.tolist()
df.info()

In [105]:
# Remove amp columns left over from an earlier run so they can be rebuilt below.
# errors='ignore' keeps this cell runnable when some of them are already absent.
df = df.drop(columns=['combined_column', 'Existing Amp', 'Upgrade Amp'], errors='ignore')

In [106]:
# formatting description for mining
# Punctuation is removed, except a '.' that sits between two digits: without that
# exception "7.5 kW" would become "75 kW" and the number extraction further down
# (which accepts decimals) would read a value ten times too large.
df['Description'] = df['Description'].astype(str)
df['Description'] = df['Description'].apply(
    lambda x: re.sub(r'(?!(?<=\d)\.(?=\d))[^\w\s]', '', x)
)

The next cell is long but fairly straightforward. It contains the keyword lists used to mine each type of upgrade, along with all the functions that use them. It supports two versions of the mining process: the full process carried over from the older permit mining code (`classify_product`), and a lighter one that only mines the upgrades I added (`classify_product_new`).

In [84]:
# NOTE(review): the classifiers match these keywords by plain substring search, so short
# keywords such as 'ev', 'spa' or 'duct' also hit inside longer words ('level', 'space',
# 'product') — confirm this is acceptable.

# upgrade word pairs only for new upgrades
new_product_categories = {
    'Water Heater': ['water heater'],
    'Cool Roof': ['cool roof'],
    'Kitchen Remodel': ['kitchen remodel', 'remodel kitchen', 'kitchen bathroom remodel'],
    'Bathroom Remodel': ['bathroom remodel', 'remodel bathroom', 'restroom remodel', 'remodel restroom',
                         'bathroom kitchen remodel', 'ada restroom', 'ada bathroom'],
    'AC': ['hvac', 'air conditioning'],
    'Reroof': ['reroof', 'roof install', 'roof replacement'],
    'Spa/Pool': ['spa', 'pool'],
    'Addition': ['sqft addition', 'square foot addition', 'room addition', 'story addition', 'attach garage', 'new garage'],
    'Lighting Install': ['install light', 'light fixtures'],
    'Other Remodel': ['interior remodel', 'bedroom remodel', 'bedroom bathroom remodel', 'remodel interior',
                      'remodel bedroom', 'remodel bathroom bedroom'],
    # NOTE(review): 'block wall' looks unrelated to wall heaters — confirm it belongs here.
    'Wall Heater': ['wall heater', 'wall furnace', 'block wall'],
    'Tankless Water Heater': ['tankless'],
    'Fire Damage': ['fire damage', 'burn damage', 'fire repair'],
    'Windows': ['window size', 'windows size', 'window install', 'windows install', 'fenestration', 'retrofit windows'],
    'Smoke Detectors': ['smoke detectors', 'fire alarm'],
    'Carbon Monoxide Detector': ['carbon monoxide', 'co detector', 'monoxide detector'],
    'Ductwork': ['duct'],
    'Gas Furnace': ['gas furnace', 'gas line', 'install furnace'],
    'Dishwasher': ['dishwasher replacement', 'new dishwasher', 'dishwasher install', 'install dishwasher'],
    'Washing Machine': ['washer dryer', 'washer/dryer'],
    'Dryer': ['dryer replacement', 'new dryer', 'dryer install', 'washer dryer', 'washer/dryer'],
    'Elevator': ['elevator'],
    'Skylight': ['install skylight', 'skylight addition', 'roof skylight']
}

# all upgrade word pairs: the upgrades from the older permit mining code first, followed by
# every new upgrade. The new entries are copied from new_product_categories instead of being
# typed out a second time, so the two dictionaries cannot drift apart; key order is unchanged.
product_categories = {
    'Solar PV': ['solar', 'pv', 'photovoltaic', 'photo voltaic', 'solar panel', 'solar module','inverter','rooftop','ground mount','optimizer'],
    'Battery storage': ['battery', 'energy storage', 'storage system','powerwall','LG Chem','batt','enphase'],
    'Electric Vehicle Charger': ['ev charger', 'evse', 'electric vehicle charger', 'electric car charger',
                                'charger','evc','electric vehicle','level 2','charge','charging','station', 'ev'],
    'Electrical Panel Upgrade': ['panel upgrade', 'circuit breaker','load center', 'electrical service upgrade', 'service panel',
                                'breaker','subpanel','derate','mpu','msp','main panel'],
    'Transformer': ['transformer', 'power transformer', 'distribution transformer'],
    # list(...) gives each dictionary its own keyword lists
    **{category: list(keywords) for category, keywords in new_product_categories.items()},
}

# single-category dictionary for trying out the panel-upgrade keywords on their own
test_categories = {
    'Electrical Panel Upgrade': list(product_categories['Electrical Panel Upgrade'])
}

# OLD FUNCTIONS START HERE
def classify_product(description, categories=None):
    """Flag which upgrade categories a permit description mentions.

    Parameters
    ----------
    description : str or float
        Permit description. Floats (e.g. NaN) are converted with ``str``.
    categories : dict[str, list[str]], optional
        Mapping of category name -> keywords. Defaults to the module-level
        ``product_categories``; pass another mapping (e.g. ``test_categories``)
        to mine with a different keyword set.

    Returns
    -------
    dict
        One boolean per category in ``categories``, plus ``'derate'`` and
        ``'upgrade'``. Those two keys are always present, so every record has
        the same keys regardless of the text. The extra rules below can also
        set 'Electrical Panel Upgrade', 'Solar PV', 'Battery storage' and
        'Electric Vehicle Charger'.
    """
    if categories is None:
        categories = product_categories
    if isinstance(description, float):
        description = str(description)
    # lower-case once; all keyword checks are case-insensitive substring tests
    text = description.lower()

    # initialize a dictionary to store the boolean flags for each product category in the description
    category_flags = {category: False for category in categories}
    category_flags['derate'] = False
    category_flags['upgrade'] = False

    # check if any of the keywords for each product category is present in the description
    for category, keywords in categories.items():
        if any(keyword.lower() in text for keyword in keywords):
            category_flags[category] = True

    # a number with three or more digits directly followed by "A" (e.g. "200A")
    if re.search(r'\b(\d{3,}A)\b', description, re.IGNORECASE):
        category_flags['Electrical Panel Upgrade'] = True
    # a number followed by 'kW', but only when 'panel' or 'solar' is also mentioned
    if re.search(r'\b(\d+\.?\d*)\s*kW\b', description, re.IGNORECASE) and ('panel' in text or 'solar' in text):
        category_flags['Solar PV'] = True
    # 'ess' as a standalone word counts as battery storage
    if re.search(r'\bess\b', description, re.IGNORECASE):
        category_flags['Battery storage'] = True

    # 'panel' together with 'solar' -> solar; 'panel' together with 'amp' -> panel upgrade
    if 'panel' in text and 'solar' in text:
        category_flags['Solar PV'] = True
    if 'panel' in text and 'amp' in text:
        category_flags['Electrical Panel Upgrade'] = True
    # 'tesla' together with 'batt' -> battery storage
    if 'tesla' in text and 'batt' in text:
        category_flags['Battery storage'] = True

    # 'tesla' together with any charging-related term -> EV charger
    if 'tesla' in text and any(term in text for term in ('station', 'electric', 'ev', 'charge', 'vehicle')):
        category_flags['Electric Vehicle Charger'] = True

    # 'derate' (with or without a hyphen) and 'upgrade' are reported as their own flags
    if 'derate' in text or 'de-rate' in text:
        category_flags['derate'] = True
    if 'upgrade' in text:
        category_flags['upgrade'] = True

    return category_flags

def extract_number_before_word(text, word):
    """Return the first number written directly before ``word`` in ``text``.

    The number may have a decimal part and may be separated from ``word`` by
    whitespace; matching ignores case. ``word`` is inserted into the regular
    expression as-is (it is not escaped). Returns a float, or None when
    nothing matches.
    """
    number_then_word = r"(\d+(?:\.\d+)?)\s*" + f"{word}"
    found = re.search(number_then_word, text, flags=re.IGNORECASE)
    if found is None:
        return None
    return float(found.group(1))
# OLD FUNCTIONS END HERE

# this is a simple function to mine for certain new upgrades. It can use any set of upgrade-word pairs:
# by default the 'new_product_categories' dictionary, or another dictionary passed as 'categories'.
# Takes a single description, returns dictionary of upgrade flags.
def classify_product_new(description, categories=None):
    """Flag (1/0) which upgrade categories a permit description mentions.

    Parameters
    ----------
    description : str or float
        Permit description. Floats (e.g. NaN) are converted with ``str``.
    categories : dict[str, list[str]], optional
        Mapping of category name -> keywords. Defaults to the module-level
        ``new_product_categories``.

    Returns
    -------
    dict
        ``{category: 1 or 0}`` for every category; 1 when any keyword occurs
        in the description as a case-insensitive substring.
    """
    if categories is None:
        categories = new_product_categories
    if isinstance(description, float):
        description = str(description)
    text = description.lower()

    return {
        category: int(any(keyword.lower() in text for keyword in keywords))
        for category, keywords in categories.items()
    }

First, permits are mined for specific upgrades with the code below

In [None]:
# Classify every description, turn the per-permit flag dictionaries into a frame,
# and append those flag columns to the right of the permit data.
upgrade_flags = pd.DataFrame(df['Description'].apply(classify_product).tolist())
df = pd.concat([df, upgrade_flags], axis=1)
df.head()

In [86]:
# One new column per unit, holding the first number written directly before
# 'kwh' / 'kw' in the description (None when there is none).
for column_name, unit in (('kwH', 'kwh'), ('kw', 'kw')):
    df[column_name] = df['Description'].apply(extract_number_before_word, args=(unit,))

In [None]:
def _reading_or_blank(row, flag, reading):
    """Return '' when row[flag] is truthy and the kwH and kw readings are equal, else row[reading]."""
    same_number = row['kwH'] == row['kw']
    return '' if row[flag] and same_number else row[reading]

# The *_final columns must be computed before the raw readings are blanked below.
df['kwH_final'] = df.apply(_reading_or_blank, axis=1, args=('Solar PV', 'kwH'))
df['kw_final'] = df.apply(_reading_or_blank, axis=1, args=('Battery storage', 'kw'))

# Blank the raw kw reading on permits not flagged as solar ...
not_solar = df['Solar PV'] == False
df.loc[not_solar, 'kw'] = ''
# ... and the raw kwH reading on permits not flagged as battery storage.
not_battery = df['Battery storage'] == False
df.loc[not_battery, 'kwH'] = ''
df.head()

In [88]:
# Save the mined permits (upgrade flags plus kw / kwH columns) to disk.
df.to_csv('SDGEPermitDataRough.csv')

Code specifically for panel amp predictions starts here

In [107]:
# Keywords taken to introduce the existing panel size and the upgraded panel size.
Existing = ['from', 'existing']
Upgrade = ['to', 'new', 'upgrade', 'derate', 'downsized']

# Indexed by the `version` argument of amp_predictor: 0 -> Existing, 1 -> Upgrade.
select = [Existing, Upgrade]
# Per version, the subset of keywords tested with `curr in first[version]`. Each entry must
# be a flat list of strings: wrapping the slice in another list (as before) produced
# [['to', 'new']], which no single word can ever be a member of.
first = [Existing[:1], Upgrade[:2]]

def extract_numbers_AMP(text, word):
    """Return every number written directly before ``word`` in ``text``.

    Numbers may have a decimal part and may be separated from ``word`` by
    whitespace; matching ignores case and ``word`` is used as a raw regex
    fragment. Returns a list of floats in order of appearance, or None when
    there is no match.
    """
    number_then_word = re.compile(r"(\d+(?:\.\d+)?)\s*" + f"{word}", re.IGNORECASE)
    values = [float(number) for number in number_then_word.findall(text)]
    return values if values else None

def extract_all_numbers_before_A(text):
    """Return every number that is followed by a standalone "A" (any case) in ``text``.

    The "A" may be separated from the number by whitespace but must end at a
    word boundary, so "200A" and "200 a" match while "200amp" does not.
    Returns a list of floats in order of appearance, or None when nothing matches.
    """
    found = []
    for hit in re.finditer(r"(\d+(?:\.\d+)?)\s*A\b", text, re.IGNORECASE):
        found.append(float(hit.group(1)))
    if not found:
        return None
    return found

def get_number(value):
    """Return the leading run of digit characters (0-9) of ``value``.

    Everything from the first non-digit character onwards is cut off;
    a value made only of digits is returned whole.
    """
    end = 0
    while end < len(value) and value[end] in digits:
        end += 1
    return value[:end]

def amp_predictor(description, version):
    """Guess a panel amperage from the word next to a keyword in ``description``.

    Parameters
    ----------
    description : str
        Permit description; it is split on whitespace.
    version : int
        Index into the module-level ``select`` list, choosing which keyword
        list is searched.

    Returns
    -------
    float
        The leading digits of the word next to the last keyword that had a
        number beside it, or NaN when no keyword has one.

    Keywords are compared case-insensitively (the keyword lists are lower
    case, while the other classifiers in this notebook lower-case the text
    before matching).
    """
    result = nan
    words = description.split()
    last = len(words) - 1
    for i, word in enumerate(words):
        if word.lower() not in select[version]:
            continue
        # The following word is inspected; the preceding word is used only when the
        # keyword is the final word of the description.
        # NOTE(review): the original code also branched on the module-level `first` list,
        # but both branches looked at the same neighbour for every keyword, so `first`
        # never changed the result. That behaviour is kept as-is — confirm intent.
        if i < last:
            neighbour = words[i + 1]
        elif i > 0:
            neighbour = words[i - 1]
        else:
            continue
        if neighbour[0] in digits:
            # later keywords overwrite earlier ones
            result = get_number(neighbour)

    return float(result)

def fill_predictions(row):
    """Combine the keyword-based amp guesses with the amp numbers found in the text.

    Parameters
    ----------
    row : mapping
        Must provide 'combined_column', 'E' and 'U'; 'Description' is read
        only when 'combined_column' holds exactly two numbers.

    Returns
    -------
    dict
        ``{'Existing Amp': ..., 'Upgrade Amp': ...}``, starting from row['E']
        and row['U']. 'Upgrade Amp' is replaced by the single extracted number
        when there is exactly one, or by the larger of two when the
        description mentions 'upgrade' (any case).
    """
    result = {'Existing Amp': row['E'], 'Upgrade Amp': row['U']}
    options = row['combined_column']
    # '' / None / NaN (or anything else that is not a sequence of numbers) means
    # there is nothing to add; calling len() on a float NaN would raise TypeError.
    if not isinstance(options, (list, tuple)) or len(options) == 0:
        return result
    if len(options) == 1:
        result['Upgrade Amp'] = options[0]
    elif len(options) == 2 and 'upgrade' in str(row['Description']).lower():
        result['Upgrade Amp'] = max(options)
    return result

In [None]:
# Numbers written before "Amp" and before a standalone "A" each get their own column.
df['Amp_2'] = df['Description'].apply(extract_numbers_AMP, args=('Amp',))
df['A_2'] = df['Description'].apply(extract_all_numbers_before_A)
# Prefer the "Amp" numbers; fall back to the "A" numbers where there are none.
amp_numbers = df['Amp_2']
df['combined_column'] = amp_numbers.fillna(df['A_2'])
# Permits not flagged as panel upgrades get a blank instead.
not_panel_upgrade = df['Electrical Panel Upgrade'] == False
df.loc[not_panel_upgrade, 'combined_column'] = ''
df.head()

In [None]:
# 'E' uses keyword list 0 (existing), 'U' uses keyword list 1 (upgrade).
for column_name, version in (('E', 0), ('U', 1)):
    df[column_name] = df['Description'].apply(amp_predictor, args=(version,))
# Guesses are discarded on permits not flagged as panel upgrades.
not_panel_upgrade = df['Electrical Panel Upgrade'] == False
df.loc[not_panel_upgrade, ['E', 'U']] = nan
df.head()

In [None]:
# One {'Existing Amp', 'Upgrade Amp'} record per permit, appended as two new columns.
amp_records = df.apply(fill_predictions, axis=1).tolist()
amp_data = pd.DataFrame(amp_records)
df_result = pd.concat([df, amp_data], axis=1)

# Panel-upgrade permits only.
is_panel_upgrade = df['Electrical Panel Upgrade'] == True
epu_result = df_result[is_panel_upgrade]
epu_result.head()

In [81]:
# Save the panel-upgrade permits; the intermediate amp columns are still included here.
epu_result.to_csv('Panel upgrade data.csv')

In [None]:
# Intermediate columns used only to derive 'Existing Amp' / 'Upgrade Amp'.
helper_columns = ['Amp_2', 'A_2', 'combined_column', 'E', 'U']
clean_df = df_result.drop(columns=helper_columns)
clean_df.head()

In [112]:
# NOTE: this is the same file name that is read at the top of the notebook, so it overwrites the input.
clean_df.to_csv('Permit Mining Results Sept2024.csv')

Simple version if you only want to add new upgrades or upgrades with expanded word lists:

In [None]:
df['Description'] = df['Description'].astype(str)

# The new flag columns are inserted after the first 62 columns of the loaded file.
# NOTE(review): 62 is a hard-coded position in `columns` — confirm it matches the current file layout.
split_at = 62
front = df.loc[:, columns[:split_at]]
back = df.loc[:, columns[split_at:]]

# Mine every description and collect the per-permit flag dictionaries into a frame.
insert = pd.DataFrame(df['Description'].apply(classify_product_new).tolist())

# Reassemble: original leading columns, new flags, original trailing columns.
new_df = pd.concat([front, insert, back], axis=1)
new_df.head()

In [7]:
# NOTE(review): 'filename.csv' looks like a placeholder — presumably to be replaced with the real output name.
new_df.to_csv('filename.csv')

The next section adds property data and use codes to each permit. This makes splitting the data by property, and then by use code, significantly easier. It also simplifies calculating building age, years since the last sale, and similar values.

Functions

In [96]:
# Function to connect all address parts into a single string for comparison since
# permit data contains full addresses but use code data does not. Takes row (property) as input,
# returns address (in lowercase) as a string.
def addr_full(row):
    """Join house number, direction, street name and mode into one lowercase address.

    Missing parts (None, NaN, empty text) are skipped. The remaining parts are
    joined with single spaces, so the result never starts with a space when the
    house number is missing. Anything after a '.' in the house number is
    dropped (e.g. 123.0 -> "123").
    """
    parts = []
    for column in ('SITE_HOUSE_NUMBER', 'SITE_DIRECTION', 'SITE_STREET_NAME', 'SITE_MODE'):
        value = row[column]
        text = '' if value is None else str(value).strip()
        if column == 'SITE_HOUSE_NUMBER':
            # numbers read as floats carry a trailing ".0"
            text = text.split('.', 1)[0]
        # str(NaN) is 'nan' and str(pd.NA) is '<NA>'
        if text and text not in ('nan', '<NA>'):
            parts.append(text)
    return ' '.join(parts).lower()

# Function to handle addresses containing unit numbers, which aren't in the use code data (all units kept together)
# Takes permit and constructs partial address, returning it as a string
def addr_partial(row):
    """Build a lowercase address from the permit's separate address fields.

    Uses house number, street direction, street name, street suffix and street
    post-direction, in that order. Missing parts (None, NaN, empty text) are
    skipped and the rest are joined with single spaces, so the result never
    starts with a space when the house number is missing. Anything after a
    '.' in the house number is dropped (e.g. 123.0 -> "123").
    """
    number_column = 'PropertyAddressAddressHouseNumber'
    columns_in_order = (
        number_column,
        'PropertyAddressAddressStreetDirection',
        'PropertyAddressAddressStreetName',
        'PropertyAddressAddressStreetSuffix',
        'PropertyAddressAddressStreetPostDirection',
    )
    parts = []
    for column in columns_in_order:
        value = row[column]
        text = '' if value is None else str(value).strip()
        if column == number_column:
            # numbers read as floats carry a trailing ".0"
            text = text.split('.', 1)[0]
        # str(NaN) is 'nan' and str(pd.NA) is '<NA>'
        if text and text not in ('nan', '<NA>'):
            parts.append(text)
    return ' '.join(parts).lower()

# Adds one entry per property in a county file to a property -> property data dictionary
def build_dict(pathname, prop_dict, version):
    """Read the CSV at ``pathname`` and store each of its rows in ``prop_dict``.

    ``version`` picks the key: 0 keys a row by the address built with
    ``addr_full``, 1 keys it by its 'MASTER_PARCEL_APN' value. Any other
    version stores nothing. A later row with the same key replaces an earlier
    one. ``prop_dict`` is modified in place; nothing is returned.
    """
    county_df = pd.read_csv(pathname)
    for _, record in county_df.iterrows():
        if version == 0:
            key = addr_full(record)
        elif version == 1:
            key = record['MASTER_PARCEL_APN']
        else:
            continue
        prop_dict[key] = record

# Gets property data from the dictionary, with an additional lookup that gets rid of
# unit/apartment numbers when the full address is not found.
# 'Use Code' is part of the 'cat' list so that every record has the same keys,
# whether or not the property was found.
def get_prop_data(row, propdict):
    """Look up a permit's property record and return the fields of interest.

    Parameters
    ----------
    row : mapping
        Permit row. 'PropertyAddressFull' (a string) is tried first, lower-cased;
        if it is not a key of ``propdict``, the address built by
        ``addr_partial(row)`` is tried instead.
    propdict : dict
        Property records keyed by lowercase address, as filled by ``build_dict``.

    Returns
    -------
    dict
        Keys 'Last Sale Date', 'Year Built', 'APN' and 'Use Code'. All values
        are '' when neither key is found; 'APN' and 'Use Code' are converted
        to strings when found.
    """
    cat = ['Last Sale Date', 'Year Built', 'APN', 'Use Code']
    prop_data = {i: '' for i in cat}
    key = row['PropertyAddressFull'].lower()
    if key not in propdict:
        # fall back to the address without unit/apartment number
        key = addr_partial(row)
    if key in propdict:
        record = propdict[key]
        prop_data['Use Code'] = str(record['USE_CODE_STD_LPS'])
        prop_data['Last Sale Date'] = record['LAST_SALE_DATE_TRANSFER']
        prop_data['Year Built'] = record['YR_BLT']
        prop_data['APN'] = str(record['MASTER_PARCEL_APN'])
    return prop_data

Building dictionary of property -> property data (required since the property data files are all for individual counties)

In [None]:
# Old ATTOM data: one use-code CSV per county, all merged into a single dictionary.

# replace with own path, only works if all use code data is in one folder with no other .csv files
path = 'use_code_data/*.csv'

prop_dict = dict()

# 0 -> entries are keyed by address (see build_dict)
key_type = 0
for pathname in glob.iglob(path):
    build_dict(pathname, prop_dict, key_type)

In [None]:
# for SDGE data, all San Diego county
# Starts from an empty dictionary, so entries collected by an earlier cell are discarded.

prop_dict = {}

# version 0: entries are keyed by the lowercase address assembled by addr_full
build_dict('sandiego.csv', prop_dict, 0)

Getting property data

In [None]:
# `df` is bound to the same object as clean_df (no copy is made), so later
# column assignments on `df` are visible through clean_df as well.
df = clean_df # or load from rough.csv
df.head()

In [None]:
# get_prop_data lower-cases the full address, so it has to be a string on every row.
df['PropertyAddressFull'] = df['PropertyAddressFull'].astype(str)

# Look up each permit in the property dictionary; one record of property fields per permit.
prop_records = df.apply(get_prop_data, axis=1, args=(prop_dict,)).tolist()
df_prop_data = pd.DataFrame(prop_records)

# Permit columns followed by the property columns.
new_df = pd.concat([df, df_prop_data], axis=1)
new_df.head()

In [None]:
# Drop whatever the first column is.
# NOTE(review): presumably the index column written by an earlier to_csv — confirm.
leading_column = new_df.columns[:1].tolist()
clean_df = new_df.drop(columns=leading_column)

clean_df.head()

In [103]:
# NOTE(review): this saves new_df, not the clean_df built in the previous cell
# (which has the first column removed) — confirm which one is intended.
new_df.to_csv('Permit Mining Results SDGE.csv')

In [None]:
# Print a summary of the final frame (columns, non-null counts, dtypes).
new_df.info()