In [None]:
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
# Location of the raw Excel workbook examined below; stop immediately if the file is not there
path = Path('/app/data/raw/2022.03.22OGW.xlsx')
assert path.exists()

In [None]:
# Read the sheet directly with pandas rather than through the project's extract function
# (shown commented out below), so this notebook can be easily rerun in the future
# without maintenance.
# eip = dbcp.extract.eip_infrastructure.extract(path)
air = pd.read_excel(path, sheet_name='Air Construction')

In [None]:
# Let the notebook render up to 100 columns and 100 rows of a frame
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [None]:
# (number of rows, number of columns) of the loaded sheet
air.shape

# Cleaning
## Air Construction Permits Cleaning
Columns I care about:
* id
* name
* modified on
* project ID (1:m as arrays)
* statute (1:m as arrays)
* permit type (1:m as arrays)
* permitting action (1:m as arrays)
* permit status
* description
* research notes

Cleaning Checklist:
- [x] Accuracy
- [x] Atomicity
- [ ] Consistency
- [x] Completeness
- [x] Uniformity
- [x] Validity
    - [x] Range Validation
    - [x] Uniqueness Validation
    - [x] Set Membership Validation
    - [x] Type Validation
    - [x] Cross-Field Validation

### Accuracy
The most important item to spot check here is the permit status. "Final" permit statuses are of little interest and also presumably don't change over time, so I'll only check 1 of those.

Results: 4/4 match dates and status 👍🏼

In [None]:
# Draw three permits whose status is "Application Pending" or "Draft Issued";
# the fixed seed keeps the sample reproducible across reruns
statuses_to_check = {"Application Pending", "Draft Issued"}
filter_ = air['Permit Status'].isin(statuses_to_check)
air[filter_].sample(n=3, random_state=42)

Alaska LNG Liquefaction Plant: confirmed on [AK DEC website](https://dec.alaska.gov/Applications/Air/airtoolsweb/AirPermitsApprovalsAndPublicNotices). Dates match; the status is a little more ambiguous, but I think "draft" is right.

Gulf LNG: confirmed on [MS state website](https://opcgis.deq.state.ms.us/enonline/ai_info.aspx?ai=23844). Application date and status match.

Delfin LNG: confirmed on [LA DEQ website](https://deq.louisiana.gov/public-notices?keyword=delfin&startDate=&endDate=). Dates and status match.

In [None]:
# Draw a single permit whose status is "Final Issued" (fixed seed for a reproducible sample)
filter_ = air['Permit Status'] == "Final Issued"
air[filter_].sample(n=1, random_state=42)

Heim Gas Plant Expansion: confirmed on the [TCEQ website](https://www15.tceq.texas.gov/crpub/index.cfm?fuseaction=iwr.pgmdetail&addn_id=120534092019308&re_id=578462662019220&program_code=AIRNSR&lgcy_sys_cd=NSR&program=AIR%20NEW%20SOURCE%20PERMITS&IdType=REG). Dates and status match.

### Atomicity
Most of the columns are 1:m values encoded as csv array strings, but most values are singletons. See Range Validation and Set Membership Validation for decisions on modeling as 1:1 vs 1:m.

### Completeness
For this purpose, I'll limit the scope of 'completeness' to only look at missing values within the data. For better or worse, it is EIP's job to ensure projects are in the dataset at all.

We are not interested in already-issued permits, so I'll remove those and assess completeness based on the remaining subset.

Notable findings on missing (and notably non-missing) values:
* The only 3 records missing project ID + permit info are entirely NaN rows

In [None]:
def calc(num, denom=105):
    """Describe the part of ``denom`` that is not covered by ``num``.

    Args:
        num: The covered count (in this notebook, presumably a non-null count).
        denom: The total count; defaults to 105.

    Returns:
        A string of the form ``"<denom - num>/<denom> (<share>%)"``, where the
        share is ``1 - num/denom`` as a percentage with one decimal place,
        e.g. ``"3/105 (2.9%)"``.
    """
    remainder = denom - num
    remainder_share = 1 - num / denom
    return f"{remainder}/{denom} ({remainder_share * 100:.1f}%)"

In [None]:
# 102 out of the default total of 105 -> the remaining 3, shown as a count and a percentage
calc(102)

In [None]:
# total number of rows in the sheet
len(air)

In [None]:
# number of rows per permit status (value_counts leaves out missing values by default)
air['Permit Status'].value_counts()

In [None]:
# non-null count of every column, restricted to rows whose status is not 'Final Issued'
not_final = air['Permit Status'] != 'Final Issued'
air[not_final].count()

In [None]:
# rows that have no permit status at all
air.loc[air['Permit Status'].isna(),:]

### Consistency - defer
Defer until I've cleaned the related datasets.

### Uniformity
Important columns to check consistent representation:
* all array fields -- check consistent delimiters
    * project ID (1:m as arrays)
    * statute (1:m as arrays)
    * permit type (1:m as arrays)
    * permitting action (1:m as arrays)
* modified on -- check consistent date format

#### Array Fields
Want to check for consistent array delimiters.

In [None]:
# ID columns to check for array formatting; ID columns with numeric types
# are left out because they contain no arrays
id_cols = ['Project (ID)']

In [None]:
# One ID of 3-5 digits, then zero or more repetitions of (comma, optional single space, ID),
# then end of line. The delimiter is mandatory between IDs, so run-together IDs
# ('12345678') and dangling delimiters ('1234,') fail the check instead of slipping through.
array_pattern = r'\d{3,5}(?:, ?\d{3,5})*$'

In [None]:
# Hand-written examples to sanity-check the pattern; shown side by side with the match result
test_case = pd.Series([
    '1234',
    '1234,567',
    '1234, 567',
    '12345, 678, 9012',
    '1234\t5678', # tab is bad, no comma
    '12, 3456', # too short
    '1234    5678', # too many spaces, no comma
])
test_matches = test_case.str.match(array_pattern)
pd.concat([test_case, test_matches], axis=1)

In [None]:
# every ID column passes the formatting test
assert all(air[col].str.match(array_pattern).all() for col in id_cols)

In [None]:
# text columns that hold comma-separated arrays of values
array_cols = ['Permitting Action', 'Permit Type', 'Statute']

In [None]:
# Remove every word character, whitespace character and comma from the array columns.
# Whatever text is left over is a character that could be acting as an unexpected delimiter.
special_chars = air.loc[:, array_cols].copy()
for col in array_cols:
    # raw string: '\w' and '\s' are invalid escape sequences in a normal string literal
    special_chars.loc[:, col] = special_chars.loc[:, col].str.replace(r'\w|\s|,', '', regex=True)

In [None]:
# rows where any column still has leftover characters (none: no other delimiters present)
has_leftovers = special_chars.fillna('').ne('').any(axis=1)
special_chars[has_leftovers]

#### Date Modified

In [None]:
# parsing raises on any value it cannot convert, so success means every value present is parseable;
# then show the resulting dtype and how many values are missing
timestamps = pd.to_datetime(air['modified_on'], errors='raise')
(timestamps.dtype, timestamps.isna().sum())

### Range Validation
Check project ID and date modified

#### Project ID

In [None]:
# One column per position in the comma-separated ID string; rows with fewer IDs are padded
proj_ids = air['Project (ID)'].str.split(',', expand=True)
# Convert every column to a numeric dtype, raising on anything that cannot be parsed.
# The frame is reassigned (instead of writing through .loc, which on recent pandas sets
# values in place and can keep the object dtype) so describe() reports numeric ranges.
proj_ids = proj_ids.apply(pd.to_numeric, errors='raise')

proj_ids.head()

In [None]:
# summary statistics for each ID position; they all look in the same range
proj_ids.describe()

#### Date Modified
range looks fine

In [None]:
# summary of the parsed modification timestamps, used to eyeball the date range
pd.to_datetime(air['modified_on']).describe()

### Uniqueness Validation
Check the `id` field

In [None]:
# number of rows whose id repeats an earlier row; 0 means every id is unique
air['id'].duplicated().sum()

### Set Membership Validation
* statute (1:m as arrays)
* permit type (1:m as arrays)
* permitting action (1:m as arrays)
* permit status

#### Statute

In [None]:
# counts of the raw (unsplit) statute strings
air['Statute'].value_counts()

In [None]:
# break each comma-separated string into its individual values, trim whitespace,
# and count how often each value occurs overall
statute_values = air['Statute'].str.split(',', expand=True).stack().str.strip()
statute_values.value_counts()

#### Permit Type

In [None]:
# counts of the raw (unsplit) permit type strings
air['Permit Type'].value_counts()

In [None]:
# break each comma-separated string into its individual values, trim whitespace,
# and count how often each value occurs overall
permit_type_values = air['Permit Type'].str.split(',', expand=True).stack().str.strip()
permit_type_values.value_counts()

#### Permitting Action
A bunch of 1:m categories, but very few actual values

In [None]:
# counts of the raw (unsplit) permitting action strings
air['Permitting Action'].value_counts()

In [None]:
# break each comma-separated string into its individual values, trim whitespace,
# and count how often each value occurs overall
permitting_action_values = air['Permitting Action'].str.split(',', expand=True).stack().str.strip()
permitting_action_values.value_counts()

#### Permit Status
Will combine at least the two `withdrawn` categories, maybe even all of `expired`, `withdrawn`, `void`, `denied` into a single "Nope" category.

In [None]:
# distinct permit status values and how often each occurs
air['Permit Status'].value_counts()

### Type Validation
Only the project ID and date modified fields will change type after transformation

### Cross-Field Validation
None really needed. I could check that the date columns are in a logical order (application < draft issued < last comment date < final issued < construction deadline) but I'm not planning to really use those columns. So I skipped it.