# St. Clair County Land Use and Parcel Data Pipeline

In [None]:
!pip3 install Selenium
!pip3 install pandas
!pip3 install lxml
!pip3 install html5lib

In [None]:
import json
import os
import pandas as pd
import pprint
import re
import requests
import time

In [None]:
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from datetime import timedelta
from io import StringIO

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select

## Extraction

### Download St. Clair Co. Property Tax Inquiry Selected Townships Parcel Listing 

In [None]:
# Base URL of the county property tax inquiry site; parcel page URLs are
# built by appending to it, so the trailing slash matters.
search_pg = "https://stclairil.devnetwedge.com/"
# Working directory of the notebook process; used as Chrome's download folder.
cwd = os.getcwd()

In [None]:
# Chrome launch configuration: maximized window, new headless mode, and
# downloads saved into the notebook's working directory.
options = Options()
for chrome_flag in ("--start-maximized", "--headless=new"):
    options.add_argument(chrome_flag)
prefs = {"download.default_directory": cwd}
options.add_experimental_option("prefs", prefs)

In [None]:
# Pass the options by keyword: the first positional parameter of
# webdriver.Chrome has not always been `options` across Selenium 4 releases,
# so a positional argument can be misread as the driver executable path.
driver = webdriver.Chrome(options=options)
# Element lookups that are not wrapped in an explicit wait retry for up to 3 s.
driver.implicitly_wait(3)
driver.get(search_pg)

In [None]:
# Click into Advanced Search Tab
# Wait until the tab link is actually clickable (visible and enabled), not
# merely present in the DOM, before clicking it; this matches the waits used
# for the other clicked elements in this notebook.
advance_search_tab = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//a[@href='#advanced-search']"))
)
advance_search_tab.click()

In [None]:
# Select Townships
# Wait for the township <select> to exist, then pick each township by its
# option value, in the same order as before.
township_dropdown = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "advanced-search-townships"))
)
township_select = Select(township_dropdown)
for township_code in ("02", "11", "01", "06"):
    township_select.select_by_value(township_code)

In [None]:
# Check All Years Box and Search
# Wait until the "include all years" checkbox is clickable; the element handle
# is reused at the end of the cell to submit the form.
all_years_chkbx = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.ID, "advanced-search-include-all-years"))
)
# Wait for a <form> element to be present in the page.
# NOTE(review): `form` is not referenced again in this cell; the submit below
# goes through the checkbox element instead.
form = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, "//form"))
)
# Tick the checkbox from JavaScript rather than with a WebDriver click.
# NOTE(review): presumably a workaround for the native click not landing; confirm.
driver.execute_script(f"document.getElementById('advanced-search-include-all-years').click()")
# Calling submit() on an element submits the form that contains it.
all_years_chkbx.submit()

In [None]:
# Export Results to CSV and Download
export_btn = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//a[@href='/Search/ExportClientsListToCSV']"))
)
# Snapshot the download directory so the newly exported file can be recognised.
files_before_export = set(os.listdir(cwd))
export_btn.click()

# Clicking only starts the download. Block until a new .csv file shows up in
# the download directory so the browser is not shut down mid-transfer; a
# TimeoutException is raised if nothing arrives within two minutes.
# NOTE(review): assumes the export is saved with a .csv extension — confirm.
WebDriverWait(driver, 120).until(
    lambda _driver: any(
        file_name.endswith(".csv")
        for file_name in set(os.listdir(cwd)) - files_before_export
    )
)

In [None]:
# End the WebDriver session and close the browser it started.
driver.quit()

### Scrape Parcel Information Tables

In [None]:
# Load the exported search results. The parcel number column is forced to text
# so pandas cannot infer a numeric dtype for it; the regex clean-up applied to
# that column further down only accepts strings.
parcel_list = pd.read_csv(
    "Exported_Search_Results.csv",
    dtype={"Property Account Number": str},
)

In [None]:
# Summary statistics for every column; include="all" also covers the
# non-numeric columns.
parcel_list.describe(include="all")

In [None]:
# Preview the first rows of the exported listing.
parcel_list.head()

In [None]:
# Format PropertyAccountNumber to be Solely Numeric + 'X'
def only_numeric(str):
    """Return the input with everything except digits and capital 'X' removed.

    The parameter keeps its original name (which shadows the builtin) so that
    existing callers are unaffected.
    """
    kept_chars = (match.group() for match in re.finditer(r"[\dX]", str))
    return "".join(kept_chars)

# Replace each parcel number with its digits-and-'X'-only form; this is the
# value later inserted into the parcel page URL.
parcel_list['Property Account Number'] = parcel_list['Property Account Number'].apply(only_numeric)
# Preview the first rows to check the reformatted column.
parcel_list.head()

In [None]:
def scrape_parcel_pg(listing_number, listing_year):
    """Scrape one parcel detail page and return its fields as a flat dict.

    Fetches ``{search_pg}parcel/view/{listing_number}/{listing_year}``, turns
    every ``panel panel-info`` block whose heading is followed by a table into
    a DataFrame keyed by the heading text, and then picks individual cells out
    of the "Property Information", "Assessments", "Billing" and
    "Parcel Owner Information" tables.

    Args:
        listing_number: Parcel number as it appears in the page URL.
        listing_year: Tax year of the listing; must be accepted by ``int()``.

    Returns:
        A dict with the keys, in this order: parcel_number, year,
        parcel_address, owner, owner_address, sale_status, property_class,
        tax_status, net_taxable, tax_rate, total_tax, township, acreage,
        homesite_val, dwelling_val, dept_rev_val, total_billed, total_unpaid.

    Raises:
        KeyError, IndexError, TypeError, ValueError: If an expected table,
            column, cell or label is missing from the page, or the year is
            not convertible to int.
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    parcel_url = f"{search_pg}parcel/view/{listing_number}/{listing_year}"
    # A timeout keeps one unresponsive request from stalling the whole crawl.
    response = requests.get(parcel_url, timeout=30)
    parcel_pg = BeautifulSoup(response.text, "html.parser")

    tables_dict = {}
    for panel in parcel_pg.find_all(class_="panel panel-info"):
        try:
            heading = panel.div.h3
            table_html = heading.parent.find_next_sibling().find("table").prettify()
            tables_dict[heading.text] = pd.read_html(StringIO(table_html))[0]
        except Exception:
            # Panels without a heading or a parsable table are skipped on
            # purpose. Catching Exception (not a bare except) lets
            # KeyboardInterrupt and SystemExit still stop the run.
            continue

    def text_after(cell, label):
        # Keep the part of the cell text that follows `label`.
        return cell.split(label)[1].strip()

    parcel_number = listing_number
    year = int(listing_year)

    # Property Information Table (cells are addressed as [column][row])
    info = tables_dict["Property Information"]
    parcel_address = text_after(info[1][0], "Site Address")
    sale_status = text_after(info[0][2], "Sale Status")
    # Only the part of the cell before the first "-" is kept for the class.
    property_class = text_after(info[0][3].split("-")[0], "Property Class")
    tax_status = text_after(info[2][3], "Tax Status")
    net_taxable = text_after(info[0][4], "Net Taxable Value")
    tax_rate = text_after(info[1][4], "Tax Rate")
    total_tax = text_after(info[2][4], "$")
    township = text_after(info[0][5], "Township")
    acreage = text_after(info[1][5], "Acres")

    # Assessments Table: .get() yields None for a missing column, so the [0]
    # lookup then raises TypeError.
    assessments = tables_dict["Assessments"]
    homesite_val = assessments.get("Homesite")[0]
    dwelling_val = assessments.get("Dwelling")[0]
    dept_rev_val = assessments.get("Total")[0]

    # Billing Table
    billing_totals = tables_dict["Billing"].get("Totals")
    total_billed = billing_totals[4].strip("$")
    total_unpaid = billing_totals[6].strip("$")

    # Owner Information Table
    owner_info = tables_dict["Parcel Owner Information"]
    owner_name = owner_info.get("Name")[0]
    owner_address = owner_info.get("Address")[0]

    return {
        "parcel_number": parcel_number,
        "year": year,
        "parcel_address": parcel_address,
        "owner": owner_name,
        "owner_address": owner_address,
        "sale_status": sale_status,
        "property_class": property_class,
        "tax_status": tax_status,
        "net_taxable": net_taxable,
        "tax_rate": tax_rate,
        "total_tax": total_tax,
        "township": township,
        "acreage": acreage,
        "homesite_val": homesite_val,
        "dwelling_val": dwelling_val,
        "dept_rev_val": dept_rev_val,
        "total_billed": total_billed,
        "total_unpaid": total_unpaid,
    }


In [None]:
def write_records():
    """Scrape every parcel in ``parcel_list`` and append the results to CSV.

    Each row's 'Property Account Number' and 'Year' are passed to
    ``scrape_parcel_pg``. Successful scrapes are appended to
    ``parcel_records.csv`` in batches of 20; parcels that could not be scraped
    are appended, together with the error, to ``missed_parcels.csv`` in
    batches of 100. Whatever is left over after the last full batch is written
    when the loop ends. A progress line is printed every 100 parcels and a
    summary line at the end.

    Returns:
        None. Output goes to the two CSV files and to stdout.
    """
    function_start = time.perf_counter()
    headers = [
        "parcel_number", "year", "parcel_address", "owner", "owner_address",
        "sale_status", "property_class", "tax_status", "net_taxable",
        "tax_rate", "total_tax", "township", "acreage", "homesite_val",
        "dwelling_val", "dept_rev_val", "total_billed", "total_unpaid"
    ]
    missed_headers = ["parcel", "year", "error"]
    parcel_records = []
    records_missed = []
    info_missing_ct = 0
    hundred_loop_start = time.perf_counter()

    def append_rows(rows, columns, path):
        # Append `rows` to the CSV at `path` and empty the buffer. The header
        # is written only when the file is new or empty, so it is neither
        # repeated on a re-run nor left out when the input is short.
        needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
        if not rows and not needs_header:
            return
        pd.DataFrame(rows, columns=columns).to_csv(
            path, index=False, header=needs_header, mode="a"
        )
        rows.clear()

    # Count rows explicitly instead of relying on the DataFrame index, and
    # process every row: read_csv has already consumed the file's header line,
    # so the first row is a real parcel.
    for count, (_, row) in enumerate(parcel_list.iterrows(), start=1):
        listing_year = row['Year']
        listing_number = row['Property Account Number']
        try:
            parcel_info = scrape_parcel_pg(listing_number, listing_year)
        except (
            AttributeError, TypeError, ValueError, IndexError, KeyError,
            requests.RequestException,
        ) as err:
            # A page with missing data (including empty cells, which have no
            # .split) or a failed request is logged as missed rather than
            # aborting the whole crawl.
            info_missing_ct += 1
            records_missed.append([listing_number, listing_year, err])
        else:
            parcel_records.append([parcel_info[column] for column in headers])

        if count % 20 == 0:
            append_rows(parcel_records, headers, "parcel_records.csv")
        if count % 100 == 0:
            hundred_loops_t = timedelta(seconds=time.perf_counter()-hundred_loop_start)
            print(f"At {count}: missed {info_missing_ct} over {hundred_loops_t}")
            append_rows(records_missed, missed_headers, "missed_parcels.csv")
            hundred_loop_start = time.perf_counter()

    # Flush whatever accumulated after the last full batch.
    append_rows(parcel_records, headers, "parcel_records.csv")
    append_rows(records_missed, missed_headers, "missed_parcels.csv")

    write_duration = timedelta(seconds=time.perf_counter()-function_start)
    print(f"{info_missing_ct} parcel records missing information. Duration: {write_duration}")


In [None]:
# Reload the scraped records from disk and inspect them: column dtypes,
# a random sample of 7 rows, and summary statistics for every column.
records_df = pd.read_csv("parcel_records.csv")
display(records_df.dtypes)
display(records_df.sample(7))
display(records_df.describe(include="all"))
