In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
from pathlib import Path

In [5]:
path = Path('/app/data/raw/2023.05.24 OGW database.xlsx')
assert path.exists()

In [6]:
# dbcp.extract.eip_infrastructure.extract(path)
# vendor the extract function so this notebook can be easily rerun in the future without maintenance
def _convert_object_to_string_dtypes(df: pd.DataFrame) -> None:
    strings = df.select_dtypes("object")
    df.loc[:, list(strings.columns)] = strings.astype(pd.StringDtype())


def _downcast_ints(df: pd.DataFrame) -> None:
    ints = df.select_dtypes(np.int64)
    for col in ints.columns:
        ser = df.loc[:, col]
        assert (
            ser.ge(0).fillna(True).all()
        )  # didn't implement this for negative numbers
        assert np.all((ser.values >> 32) == 0)  # check for high bits
        df.loc[:, col] = ser.astype(pd.Int32Dtype())


def extract(path: Path) -> dict[str, pd.DataFrame]:
    """Read EIP excel database.

    Args:
        path (Path): filepath

    Returns:
        Dict[str, pd.DataFrame]: output dictionary of dataframes
    """
    sheets_to_read = [
        "Facility",
        # 'Company',
        "Project",
        "Air Construction",  # permit status is key to identifying actionable projects
        # 'Pipelines',
        # 'NGA',
        # 'NAICS',
        # 'CWA-NPDES',
        # 'CWA Wetland',
        # 'Air Operating',
        # 'Glossary',  # useful for data dictionary
        # 'Data Sources',
        # 'Map Layers',
        # 'Other Permits',
        # 'Test Collection',
        # 'Featured Facility Descriptors',
        # 'MARAD',
        # 'TEST',
        # 'Pipeline Digitization',
    ]
    raw_dfs = pd.read_excel(path, sheet_name=sheets_to_read)
    rename_dict = {
        "Facility": "eip_facilities",
        "Project": "eip_projects",
        "Air Construction": "eip_air_constr_permits",
    }
    raw_dfs = {rename_dict[key]: df for key, df in raw_dfs.items()}
    for df in raw_dfs.values():
        _convert_object_to_string_dtypes(df)
        _downcast_ints(df)

    return raw_dfs

air = extract(path)['eip_air_constr_permits']

In [7]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [8]:
air.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 945 entries, 0 to 944
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              945 non-null    Int32  
 1   name                            943 non-null    string 
 2   created_on                      945 non-null    string 
 3   modified_on                     945 non-null    string 
 4   Date Last Checked               381 non-null    string 
 5   Facility-wide PTE: CO           46 non-null     float64
 6   Facility-wide PTE: CO2e         26 non-null     float64
 7   Facility-wide PTE: HAPs         33 non-null     string 
 8   Facility-wide PTE: NOx          46 non-null     float64
 9   Facility-wide PTE: PM2.5        42 non-null     string 
 10  Facility-wide PTE: SO2          42 non-null     float64
 11  Facility-wide PTE: VOCs         46 non-null     float64
 12  Project (ID)                    927 

# Cleaning
## Projects Cleaning
Columns I care about:
* id
* name
* project ID (1:m as arrays)
* permit type (1:m as arrays)
* permitting action (1:m as arrays)
* permit status
* description

Cleaning Checklist:
- [x] Accuracy
- [x] Atomicity
- [ ] Consistency
- [x] Completeness
- [x] Uniformity
- [x] Validity
    - [x] Range Validation
    - [x] Uniqueness Validation
    - [x] Set Membership Validation
    - [x] Type Validation
    - [x] Cross-Field Validation

### Accuracy
The most important item to spot check here is the permit status. "Final" permit statuses are of little interest and also presumably don't change over time, so I'll only check 1 of those.

Deferred.

In [9]:
filter_ = air['Permit Status'].isin({"Application Pending", "Draft Issued"})
air.loc[filter_,:].sample(3, random_state=42)

Unnamed: 0,id,name,created_on,modified_on,Date Last Checked,Facility-wide PTE: CO,Facility-wide PTE: CO2e,Facility-wide PTE: HAPs,Facility-wide PTE: NOx,Facility-wide PTE: PM2.5,Facility-wide PTE: SO2,Facility-wide PTE: VOCs,Project (ID),Project,Permit Status,Description or Purpose,Application Date,Draft Permit Issuance Date,Last Day to Comment,Final Permit Issuance Date,Deadline to Begin Construction,Detailed Permitting History,Document URL
855,5817,PSD-LA-851 and 2560-00295-V5,2022-11-14T21:18:52.696189,2023-04-13T14:40:07.109984,2023-04-13,183.08,1401099.0,,155.79,75.4,6.18,174.88,5816,KMe Optimization Project[5816],Application Pending,This permit would authorize construction of th...,2022-11-02,,,,,,
758,5190,3136-V3,2022-06-08T17:01:57.435111,2023-04-13T15:27:34.520766,2022-04-13,,,,,,,,5189,LACC - Package Boiler 2[5189],Draft Issued,This permit would authorize construction and o...,2022-05-31,2023-01-31,2023-03-07,,,,
890,6003,46-00069A,2023-01-10T19:58:42.806241,2023-01-10T21:48:16.500927,,,,,,,,,5475,Point Township Circular Manufacturing Facility...,Application Pending,Plan Approval application for Phase 1 of the E...,2022-11-16,,,,,,


In [10]:
filter_ = air['Permit Status'].eq("Final Issued")
air.loc[filter_,:].sample(1, random_state=42)

Unnamed: 0,id,name,created_on,modified_on,Date Last Checked,Facility-wide PTE: CO,Facility-wide PTE: CO2e,Facility-wide PTE: HAPs,Facility-wide PTE: NOx,Facility-wide PTE: PM2.5,Facility-wide PTE: SO2,Facility-wide PTE: VOCs,Project (ID),Project,Permit Status,Description or Purpose,Application Date,Draft Permit Issuance Date,Last Day to Comment,Final Permit Issuance Date,Deadline to Begin Construction,Detailed Permitting History,Document URL
774,5330,168854,2022-07-27T20:44:58.304291,2023-04-04T14:49:54.986680,2023-04-04,,,,,,,,4579,Project Stratos - Initial Construction[4579],Final Issued,This permit will authorize a facility to captu...,2022-04-27,2023-01-21,2023-02-21,2023-03-08,,,https://www.tceq.texas.gov/permitting/air/news...


### Atomicity
Most of the columns are 1:m values encoded as csv array strings, but most values are singletons. See Range Validation and Set Membership Validation for decisions on modeling as 1:1 vs 1:m.

### Completeness
For this purpose, I'll limit the scope of 'completeness' to only look at missing values within the data. Ensuring projects are in the dataset at all is out of scope for this notebook.

We are not interested in already-issued permits, so I'll remove those and assess completeness based on the remaining subset.

Notable missing values and lack of missing values:
* all projects have IDs

In [11]:
air['Permit Status'].value_counts()

Final Issued                                      795
Application Pending                                66
Draft Issued                                       24
Expired                                            18
Withdrawn                                          14
Withdrawn (UARG v. EPA 134 S. Ct. 2427 (2014))     12
Void                                                8
Denied                                              2
Revoked                                             1
Name: Permit Status, dtype: Int64

In [14]:
air.loc[air['Permit Status'].ne('Final Issued'),:].isna().agg(['sum', 'mean']).T

Unnamed: 0,sum,mean
id,0.0,0.0
name,0.0,0.0
created_on,0.0,0.0
modified_on,0.0,0.0
Date Last Checked,36.0,0.248276
Facility-wide PTE: CO,141.0,0.972414
Facility-wide PTE: CO2e,143.0,0.986207
Facility-wide PTE: HAPs,145.0,1.0
Facility-wide PTE: NOx,141.0,0.972414
Facility-wide PTE: PM2.5,142.0,0.97931


In [16]:
air['Description or Purpose'].isna().groupby(air['Permit Status']).agg(['sum', 'mean'])

Unnamed: 0_level_0,sum,mean
Permit Status,Unnamed: 1_level_1,Unnamed: 2_level_1
Application Pending,1,0.015152
Denied,0,0.0
Draft Issued,0,0.0
Expired,1,0.055556
Final Issued,50,0.062893
Revoked,0,0.0
Void,0,0.0
Withdrawn,1,0.071429
Withdrawn (UARG v. EPA 134 S. Ct. 2427 (2014)),3,0.25


### Consistency - defer
Defer until I've cleaned the related datasets
### Uniformity
Important columns to check consistent representation:
* all array fields -- check consistent delimiters
    * project ID (1:m as arrays)
    * statute (1:m as arrays)
    * permit type (1:m as arrays)
    * permitting action (1:m as arrays)
* modified on -- check consistent date format

#### Array Fields
Want to check for consistent array delimiters.

In [17]:
# exclude ID cols with numeric types (no arrays present)
id_cols = [
    'Project (ID)',
]

In [18]:
# mandatory opening pattern, optional delimiter, optional repeating pattern, optional closing pattern, mandatory end of line
array_pattern = r'(?:\d{3,5})(?:, ?)?(?:\d{3,5}, ?)*(?:\d{3,5})?$'

In [19]:
test_case = pd.Series([
    '1234',
    '1234,567',
    '1234, 567',
    '12345, 678, 9012',
    '1234\t5678', # tab is bad, no comma
    '12, 3456', # too short
    '1234    5678', # too many spaces, no comma
])
pd.concat([test_case, test_case.str.match(array_pattern)], axis=1)

Unnamed: 0,0,1
0,1234,True
1,1234567,True
2,"1234, 567",True
3,"12345, 678, 9012",True
4,1234\t5678,False
5,"12, 3456",False
6,1234 5678,False


In [20]:
# all pass the formatting test
for col in id_cols:
    assert air[col].str.match(array_pattern).all()

#### Date Modified

In [21]:
# to_datetime works on all values present
timestamps = pd.to_datetime(air['modified_on'], errors='raise')
timestamps.dtypes, timestamps.isna().sum()

(dtype('<M8[ns]'), 0)

### Range Validation
Check project ID and date modified

#### Project ID

In [22]:
proj_ids = air['Project (ID)'].str.split(',', expand=True)
for col in proj_ids.columns:
    proj_ids.loc[:, col] = pd.to_numeric(proj_ids.loc[:, col], errors='raise')

proj_ids.head()

Unnamed: 0,0,1,2
0,2723.0,,
1,2723.0,,
2,2728.0,,
3,2727.0,,
4,2875.0,,


In [23]:
# they all look in the same range
proj_ids.describe()

Unnamed: 0,0,1,2
count,927.0,11.0,1.0
mean,3628.423948,4281.454545,2733.0
std,1043.567402,1130.689556,
min,2723.0,2733.0,2733.0
25%,2883.5,3080.0,2733.0
50%,3060.0,4645.0,2733.0
75%,4217.0,5049.5,2733.0
max,6385.0,6006.0,2733.0


#### Date Modified
range looks fine

In [24]:
pd.to_datetime(air['modified_on']).describe()

  pd.to_datetime(air['modified_on']).describe()


count                            945
unique                           924
top       2021-05-21 15:13:50.395199
freq                              22
first     2021-05-21 15:13:50.395199
last      2023-05-24 20:42:41.252920
Name: modified_on, dtype: object

### Uniqueness Validation
Check the `id` field

In [25]:
air['id'].duplicated().sum()

0

### Set Membership Validation
* permit status

In [26]:
air.columns

Index(['id', 'name', 'created_on', 'modified_on', 'Date Last Checked',
       'Facility-wide PTE: CO', 'Facility-wide PTE: CO2e',
       'Facility-wide PTE: HAPs', 'Facility-wide PTE: NOx',
       'Facility-wide PTE: PM2.5', 'Facility-wide PTE: SO2',
       'Facility-wide PTE: VOCs', 'Project (ID)', 'Project', 'Permit Status',
       'Description or Purpose', 'Application Date',
       'Draft Permit Issuance Date', 'Last Day to Comment',
       'Final Permit Issuance Date', 'Deadline to Begin Construction',
       'Detailed Permitting History', 'Document URL'],
      dtype='object')

#### Permit Status
Will combine at least the two `withdrawn` categories, maybe even all of `expired`, `withdrawn`, `void`, `denied`, `revoked` into a single "Nope" category.

In [27]:
air['Permit Status'].value_counts()

Final Issued                                      795
Application Pending                                66
Draft Issued                                       24
Expired                                            18
Withdrawn                                          14
Withdrawn (UARG v. EPA 134 S. Ct. 2427 (2014))     12
Void                                                8
Denied                                              2
Revoked                                             1
Name: Permit Status, dtype: Int64

### Type Validation
Only the project ID and date modified fields will change type after transformation

### Cross-Field Validation
None really needed. I could check that the date columns are in a logical order (application < draft issued < last comment date < final issued < construction deadline) but I'm not planning to really use those columns. So I skipped it.

In [28]:
from dbcp.transform.eip_infrastructure import air_construction_transform

In [31]:
trans = air_construction_transform(extract(path)['eip_air_constr_permits'])

In [32]:
trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 945 entries, 0 to 944
Data columns (total 17 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   air_construction_id                 945 non-null    Int32         
 1   name                                943 non-null    string        
 2   raw_created_on                      945 non-null    string        
 3   raw_modified_on                     945 non-null    string        
 4   raw_date_last_checked               381 non-null    string        
 5   raw_project_id                      927 non-null    string        
 6   raw_permit_status                   940 non-null    string        
 7   description_or_purpose              884 non-null    string        
 8   raw_application_date                779 non-null    string        
 9   raw_draft_permit_issuance_date      125 non-null    string        
 10  raw_last_day_to_comment   