In [3]:
import sqlalchemy as sa
import pandas as pd

# SQLite URLs use three slashes before the path; the fourth slash here is the
# leading "/" of the absolute path /app/data/output/data_warehouse.sqlite.
# NOTE(review): the location is hard-coded; adjust it if the database lives elsewhere.
sqlite_engine = sa.create_engine("sqlite:////app/data/output/data_warehouse.sqlite")

In [44]:
# Read the ISO queue tables (the `_2021`-suffixed ones and their unsuffixed
# counterparts) plus `iso_for_tableau` over a single connection.
with sqlite_engine.connect() as conn:
    iso_locations_2021 = pd.read_sql_table(table_name="iso_locations_2021", con=conn)
    iso_projects_2021 = pd.read_sql_table(table_name="iso_projects_2021", con=conn)
    iso_resource_capacity_2021 = pd.read_sql_table(
        table_name="iso_resource_capacity_2021", con=conn
    )
    iso_locations = pd.read_sql_table(table_name="iso_locations", con=conn)
    iso_projects = pd.read_sql_table(table_name="iso_projects", con=conn)
    iso_resource_capacity = pd.read_sql_table(table_name="iso_resource_capacity", con=conn)

    iso_for_tableau = pd.read_sql_table(table_name="iso_for_tableau", con=conn)

## ISO Queue Schema diffs

In [43]:
def _format_members(members, max_items):
    """Render a set for printing, optionally truncated to ``max_items`` entries."""
    if max_items is None or len(members) <= max_items:
        # Same text the f-string interpolation of the bare set would produce.
        return repr(members)
    try:
        ordered = sorted(members)
    except TypeError:
        # Mixed, non-comparable members (e.g. None next to str): order by repr instead.
        ordered = sorted(members, key=repr)
    preview = ", ".join([*(repr(member) for member in ordered[:max_items]), "..."])
    return f"{{{preview}}} ({len(members)} items, showing {max_items})"


def diff_lists(a, b, max_items=None):
    """Print how two collections differ when compared as sets.

    Three lines are printed, each followed by a blank line: members only in
    ``a``, members only in ``b``, and members present in both.

    Parameters
    ----------
    a, b : iterable of hashable
        Collections to compare; duplicates are collapsed by ``set``.
    max_items : int, optional
        Upper bound on how many members of each set are printed. Larger sets
        are shown as a sorted preview followed by the total count, so a huge
        first set cannot push the remaining lines out of the cell output.
        ``None`` (the default) prints every member.

    Raises
    ------
    ValueError
        If ``max_items`` is negative.
    """
    if max_items is not None and max_items < 0:
        raise ValueError("max_items must be None or a non-negative integer")

    a = set(a)
    b = set(b)
    print(f"In A not in B: {_format_members(a - b, max_items)}\n")
    print(f"In B not in A: {_format_members(b - a, max_items)}\n")
    print(f"In Both: {_format_members(b & a, max_items)}\n")
    
# diff_lists(iso_projects.columns, iso_projects_2021.columns)
diff_lists(iso_resource_capacity.columns, iso_resource_capacity_2021.columns)

In A not in B: {'resource_class', 'project_class'}

In B not in A: set()

In Both: {'capacity_mw', 'project_id', 'resource_clean', 'resource'}



In [41]:
iso_projects_2021.interconnection_service_type

0       Partial Capacity
1          Full Capacity
2       Partial Capacity
3       Partial Capacity
4       Partial Capacity
              ...       
7637                None
7638                None
7639                None
7640                None
7641                None
Name: interconnection_service_type, Length: 7642, dtype: object

In [38]:
iso_projects_2021.interconnection_date_raw.value_counts()

  2/9/2021    24
10/12/2021    18
 5/19/2021     8
25jan2010      6
 6/15/2021     6
              ..
 2/28/2008     1
 4/13/2018     1
10/16/2018     1
  4/1/2020     1
20feb2014      1
Name: interconnection_date_raw, Length: 378, dtype: int64

## ISO Queue duplicates

In [27]:
# keep=False flags every row of a repeated project_id, not just the later copies.
is_repeated_project = iso_locations_2021.duplicated(subset=["project_id"], keep=False)
duplicated_locations = iso_locations_2021.loc[is_repeated_project].sort_values(by="project_id")
duplicated_locations.head()

Unnamed: 0,project_id,raw_county_name,raw_state_name,state_id_fips,county_id_fips,geocoded_locality_name,geocoded_locality_type,geocoded_containing_county
174,179,nye,NV,32,32023,nye,county,nye
175,179,clark,NV,32,32003,clark,county,clark
609,631,androscoggin,ME,23,23001,androscoggin,county,androscoggin
610,631,franklin,ME,23,23007,franklin,county,franklin
643,665,kennebec,ME,23,23011,kennebec,county,kennebec


In [28]:
iso_projects_2021.query("project_id == 179")

Unnamed: 0,project_id,region,queue_id,queue_status,queue_date_raw,queue_year,interconnection_date_raw,entity,project_name,developer,...,interconnection_service_type,point_of_interconnection,date_proposed_raw,year_proposed,interconnection_status_raw,interconnection_status_lbnl,resource_type_lbnl,queue_date,interconnection_date,date_proposed
175,179,CAISO,1649,active,4/15/2019,2019.0,,CAISO,BONANZA SOLAR,,...,Full Capacity,Innovation 230kV Sub,12/1/2023,2023.0,Executed,IA Executed,Solar+Battery,2019-04-15,NaT,2023-12-01


In [29]:
iso_projects_2021.queue_id.value_counts()

2.0e+10    36
1.9e+11     8
2.1e+11     7
1045        4
100         4
           ..
AF2-019     1
AF2-016     1
AF2-014     1
AF2-013     1
480         1
Name: queue_id, Length: 7181, dtype: int64

In [30]:
iso_projects_2021.query("queue_id == '100'")

Unnamed: 0,project_id,region,queue_id,queue_status,queue_date_raw,queue_year,interconnection_date_raw,entity,project_name,developer,...,interconnection_service_type,point_of_interconnection,date_proposed_raw,year_proposed,interconnection_status_raw,interconnection_status_lbnl,resource_type_lbnl,queue_date,interconnection_date,date_proposed
6131,6410,West (non-ISO),100,active,27sep2019,2019.0,,Avista,,,...,NR/ER,Benewah 230 kV Station,31dec2021,2021.0,FS,In Progress,Solar+Battery,2019-09-27,NaT,NaT
6624,6961,West (non-ISO),100,active,4/15/2021,2021.0,,PSE,,,...,NRIS,White River 115 kV,1/6/2024,2024.0,System Impact Study,In Progress,Battery,2021-04-15,NaT,2024-01-06
6919,7318,Southeast (non-ISO),100,active,4/26/2018,2018.0,,S-C,,,...,NR/ER,,30nov2020,2020.0,In Progress,In Progress,Solar,2018-04-26,NaT,NaT
7118,7556,West (non-ISO),100,active,30oct2021,2021.0,,TEP,,,...,Network,Vail 138kV Substation,05jan2024,2024.0,Application Received,Not Started,Battery,2021-10-30,NaT,NaT


## iso_locations_2021

### Missing Counties

In [181]:
iso_locations_2021[["project_id", "county_id_fips"]].isna().value_counts()

project_id  county_id_fips
False       False             7468
            True                93
dtype: int64

In [187]:
# Rows with no county FIPS code, tallied by raw state (NaN states included).
has_no_county_fips = iso_locations_2021["county_id_fips"].isna()
missing_locations = iso_locations_2021.loc[has_no_county_fips]
missing_locations["raw_state_name"].value_counts(dropna=False)

NaN    70
MX      6
VA      6
CA      2
OK      2
NM      2
MN      1
OH      1
TX      1
AZ      1
OR      1
Name: raw_state_name, dtype: int64

In [196]:
missing_locations[missing_locations.raw_state_name.isna()].head(10)

Unnamed: 0,project_id,raw_county_name,raw_state_name,state_id_fips,county_id_fips,geocoded_locality_name,geocoded_locality_type,geocoded_containing_county
6837,7288,orangeburg,,,,Orangeburg,city,Orangeburg County
6838,7289,jasper,,,,Jasper,city,Jasper County
6839,7290,aiken,,,,Aiken,city,Aiken County
6840,7291,georgetown,,,,Georgetown,,Arlington County
6841,7293,horry,,,,Horry County,county,Horry County
6842,7294,georgetown,,,,Georgetown,,Arlington County
6843,7295,beaufort,,,,Beaufort,city,Beaufort County
6844,7295,hampton,,,,Hampton,city,hampton
6845,7296,lexington,,,,Lexington,city,Fayette County
6846,7298,marion,,,,Marion,city,Williamson County


### Primary key (PK) uniqueness

In [203]:
# Keep only rows that have a county FIPS code, then count each
# (project_id, county_id_fips) pair; any count above 1 is a repeated pair.
iso_locations_2021_pk = iso_locations_2021[iso_locations_2021["county_id_fips"].notna()]

iso_locations_2021_pk.value_counts(subset=["project_id", "county_id_fips"])

project_id  county_id_fips
5500        48487             2
0           06073             1
5258        46007             1
5271        29095             1
5270        30109             1
                             ..
2617        39093             1
2616        39093             1
2615        42111             1
2614        42111             1
8132        01033             1
Length: 7467, dtype: int64

In [205]:
iso_locations_2021_pk.query("project_id == 5500 & county_id_fips == '48487'")

Unnamed: 0,project_id,raw_county_name,raw_state_name,state_id_fips,county_id_fips,geocoded_locality_name,geocoded_locality_type,geocoded_containing_county
5216,5500,vernon,TX,48,48487,Vernon,city,Wilbarger County
5217,5500,wilbarger,TX,48,48487,wilbarger,county,wilbarger


## EIP Schemas

In [48]:
# Read the five EIP tables over a single connection.
with sqlite_engine.connect() as conn:
    eip_air_constr_permits = pd.read_sql_table(table_name="eip_air_constr_permits", con=conn)
    eip_facilities = pd.read_sql_table(table_name="eip_facilities", con=conn)
    eip_facility_project_association = pd.read_sql_table(
        table_name="eip_facility_project_association", con=conn
    )
    eip_project_permit_association = pd.read_sql_table(
        table_name="eip_project_permit_association", con=conn
    )
    eip_projects = pd.read_sql_table(table_name="eip_projects", con=conn)


In [113]:
eip_project_permit_association.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871 entries, 0 to 870
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   air_construction_id  871 non-null    int64
 1   project_id           871 non-null    int64
dtypes: int64(2)
memory usage: 13.7 KB


In [114]:
def dtypes_and_nulls(df):
    """Summarise each column's dtype and whether it contains any nulls.

    Returns a DataFrame indexed by the column names of ``df`` with two
    positionally labelled columns: ``0`` holds the column's dtype and ``1`` is
    True when the column has at least one missing value.
    """
    return pd.concat([df.dtypes, df.isna().any()], axis=1)


def _sqlalchemy_type_name(dtype):
    """Return the SQLAlchemy type name to emit for a pandas/numpy dtype.

    Classifies by dtype family rather than by exact dtype string, so variants
    such as ``int32``, nullable ``Int64`` or ``datetime64[us]`` are handled
    the same way as ``int64`` / ``datetime64[ns]``.

    Raises
    ------
    KeyError
        If the dtype belongs to none of the supported families.
    """
    types = pd.api.types
    if types.is_bool_dtype(dtype):
        return "Boolean"
    if types.is_integer_dtype(dtype):
        return "Integer"
    if types.is_float_dtype(dtype):
        return "Float"
    if types.is_datetime64_any_dtype(dtype):
        return "DateTime"
    if types.is_object_dtype(dtype) or types.is_string_dtype(dtype):
        return "String"
    raise KeyError(f"No SQLAlchemy type mapping for pandas dtype {str(dtype)!r}")


def convert_to_schema(df):
    """Print one ``Column(...)`` line per column of ``df``.

    Each line has the form ``Column("<name>", <Type>),`` and gains
    ``nullable=True`` when the column currently contains at least one null.
    The lines are printed together, followed by a blank line; nothing is
    returned.

    NOTE(review): SQLAlchemy's ``Column`` is nullable by default unless it is
    part of the primary key, so lines emitted without the flag are not NOT
    NULL; add ``nullable=False`` by hand where that constraint is intended.

    Raises
    ------
    KeyError
        If a column's dtype has no supported SQLAlchemy type.
    """
    lines = []
    # itertuples(name=None) yields plain (column_name, dtype, has_nulls) tuples.
    for column_name, dtype, has_nulls in dtypes_and_nulls(df).itertuples(name=None):
        nullable_flag = ", nullable=True" if has_nulls else ""
        type_name = _sqlalchemy_type_name(dtype)
        lines.append(f"Column(\"{column_name}\", {type_name}{nullable_flag}),\n")
    print("".join(lines))

convert_to_schema(eip_project_permit_association)

Column("air_construction_id", Integer),
Column("project_id", Integer),



## EIP projects foreign key (FK) errors

In [110]:
diff_lists(eip_projects.project_id, eip_facility_project_association.project_id)

In A not in B: {4549, 5478}

In B not in A: {3034, 4234, 2830, 4335, 3033, 4538}

In Both: {4103, 4104, 4117, 4118, 4119, 4122, 4124, 4143, 4146, 4147, 4148, 4156, 4161, 4170, 4175, 4176, 4181, 4193, 4206, 4212, 4217, 4232, 4235, 4237, 4240, 4258, 4263, 4264, 2883, 4270, 4274, 4284, 4287, 4303, 4308, 4309, 4311, 4315, 4318, 4319, 4325, 4326, 4330, 4331, 4341, 4344, 4352, 4360, 4366, 4371, 4373, 4388, 4402, 4403, 4470, 4472, 4476, 4477, 4482, 4492, 4494, 4495, 4496, 4497, 4498, 4504, 4512, 4529, 4535, 4537, 4553, 4557, 4560, 4573, 4579, 4593, 4594, 4616, 4621, 4624, 4631, 4634, 4635, 4638, 4645, 4646, 4649, 4650, 2960, 4654, 4657, 4660, 4662, 4672, 4683, 4684, 4690, 4693, 4700, 2970, 4704, 4713, 4716, 4718, 4721, 4722, 4749, 2723, 2724, 2725, 2726, 2727, 2728, 2729, 2730, 2731, 2732, 2733, 2734, 2735, 2736, 2737, 2738, 2739, 2740, 2741, 2742, 2743, 2744, 2745, 2746, 2747, 2748, 2749, 2750, 2751, 2752, 2753, 2754, 2755, 2756, 2757, 2758, 2759, 2760, 2761, 2762, 2763, 2764, 2765, 2766, 27

In [111]:
diff_lists(eip_facilities.facility_id, eip_facility_project_association.facility_id)

In A not in B: {5217, 5474, 5032, 4362, 3628}

In B not in A: set()

In Both: {4106, 4108, 4114, 4115, 4116, 4120, 5080, 4138, 4139, 4141, 4142, 4144, 4166, 4174, 4178, 4204, 4210, 4238, 4268, 4272, 4283, 4291, 4301, 4316, 4321, 4323, 4334, 4340, 4342, 4347, 4348, 865, 4357, 4364, 4369, 4386, 4397, 4398, 4399, 887, 4471, 4475, 4478, 4480, 4484, 4485, 4486, 4487, 4490, 4493, 4501, 4527, 901, 4536, 4552, 4556, 4559, 4577, 911, 915, 4615, 4623, 4643, 4647, 4653, 4655, 4659, 4661, 4667, 4682, 4689, 4692, 4698, 4705, 4710, 4711, 4771, 4788, 4827, 4832, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 785, 786, 787, 788, 789, 790, 4887, 791, 792, 793, 794, 795, 797, 798, 799, 800, 801, 802, 803, 796, 4901, 804, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 4921, 825, 826, 827

### eip_project_permit_association FK errors

In [115]:
diff_lists(eip_projects.project_id, eip_project_permit_association.project_id)

In A not in B: {5120, 3079, 5146, 3611, 3612, 5151, 3109, 4649, 4650, 5163, 4654, 3118, 3119, 4146, 3122, 4660, 5181, 3651, 4170, 3146, 5199, 5203, 5206, 5209, 3673, 3674, 4700, 4193, 5227, 5234, 5240, 5264, 5267, 5269, 5273, 3737, 5278, 4258, 4772, 4264, 4270, 5296, 5297, 4274, 5298, 2741, 5302, 2743, 5304, 5311, 5318, 5319, 2760, 5326, 4308, 4311, 5338, 4315, 4829, 4834, 5349, 5352, 4841, 2796, 5356, 5357, 4844, 4845, 5364, 4344, 5368, 5371, 5372, 5373, 5377, 5378, 4867, 2820, 5380, 5382, 5383, 2824, 5384, 5385, 2828, 2834, 5395, 5402, 5409, 4388, 5412, 5417, 5419, 5427, 2868, 5431, 5434, 3908, 3910, 3911, 3912, 3913, 3918, 3919, 5456, 3920, 5458, 3921, 2900, 5460, 3922, 3923, 3924, 3925, 3928, 3929, 3932, 3934, 2911, 5471, 3936, 4963, 5475, 3939, 5478, 3941, 4968, 5480, 5481, 5482, 3944, 3945, 2926, 5487, 3946, 3947, 5490, 4979, 5492, 3948, 4470, 5495, 3955, 3956, 5498, 3957, 4476, 3960, 3961, 3964, 3966, 3968, 3970, 3974, 3975, 3976, 3977, 3978, 3979, 4492, 5005, 4494, 4495, 4496, 

In [116]:
diff_lists(eip_air_constr_permits.air_construction_id, eip_project_permit_association.air_construction_id)

In A not in B: {3169, 3170, 2023, 4874, 5259, 2412, 4878, 1646, 1872, 4913, 1873, 1874, 1876, 1813, 4062, 3256, 1918, 2271}

In B not in A: {2184, 1901, 1903, 3631, 1905, 4376, 2045}

In Both: {2048, 2049, 2050, 2051, 2052, 2053, 2054, 2055, 4096, 2057, 2058, 2059, 2060, 2061, 2062, 2063, 2064, 2065, 4109, 4112, 2068, 2072, 2040, 4123, 2076, 2077, 2078, 2079, 2080, 2081, 2082, 2083, 2084, 2085, 2086, 4126, 2088, 4129, 2090, 2091, 4131, 2093, 2094, 2095, 4136, 4137, 2098, 2099, 2100, 4149, 2102, 2103, 2105, 4153, 2108, 4157, 2110, 2112, 2113, 4162, 2115, 2117, 2122, 2124, 2128, 2129, 2131, 2132, 2133, 2136, 2137, 2138, 2141, 4189, 4190, 2144, 4191, 4194, 2147, 2148, 2149, 2150, 2151, 2152, 2153, 2154, 4201, 2156, 2157, 2158, 4202, 2160, 4203, 4205, 2163, 2164, 2165, 2166, 2167, 2168, 2169, 2170, 2171, 2172, 4213, 2174, 2175, 4215, 4220, 4223, 2179, 2180, 2181, 2182, 2183, 4224, 2185, 2186, 2187, 2188, 2189, 2190, 2191, 2192, 2193, 2194, 2195, 2196, 2197, 2198, 2199, 2200, 4242, 2202, 22

## RELDI Local Opposition

In [148]:
# Read the three local-opposition tables over a single connection.
with sqlite_engine.connect() as conn:
    state_policy = pd.read_sql_table(table_name="state_policy", con=conn)
    local_ordinance = pd.read_sql_table(table_name="local_ordinance", con=conn)
    contested_project = pd.read_sql_table(table_name="contested_project", con=conn)

# display(state_policy)
# display(local_ordinance.head())
display(contested_project.head())

Unnamed: 0,raw_state_name,project_name,description,locality,year_enacted,energy_type,source,state_id_fips,earliest_year_mentioned,latest_year_mentioned,n_years_mentioned
0,Alabama,Shinbone Ridge Wind,32 local property owners sued Pioneer Green En...,,,,,1,2019.0,,1
1,Alabama,Noccalula Wind,A civil lawsuit filed by property owners in th...,,,,,1,2014.0,,1
2,Alabama,Turkey Heaven Mountain Wind,"In October 2015, Cleburne County commissioners...",,,,,1,2015.0,,1
3,Alaska,Eva Creek Wind,This 24-MW wind farm was successfully construc...,,,,,2,2012.0,,1
4,Arizona,Sterling Solar,The 1200-MW Sterling Solar project in Mohave C...,,,,,4,2020.0,,1


In [149]:
contested_project.project_name.value_counts()

Wind Catcher Project            3
                                2
Brady Wind Farm                 1
Eden Renewables, Duanesburg     1
South Fork Wind Farm            1
                               ..
Skipjack                        1
Dan's Mountain Wind Farm        1
Washington County Solar Farm    1
Bay State Wind                  1
Pioneer Wind Parks I and II     1
Name: project_name, Length: 159, dtype: int64

In [153]:
contested_project.project_name.isna().value_counts()

False    162
True      48
Name: project_name, dtype: int64

In [155]:
contested_project[contested_project.project_name.isna()].head()

Unnamed: 0,raw_state_name,project_name,description,locality,year_enacted,energy_type,source,state_id_fips,earliest_year_mentioned,latest_year_mentioned,n_years_mentioned
162,CA,,"Fountain Wind Project: In Shasta County, the p...",Shasta County,2021.0,Wind,"David Benda, Controversial wind farm rejected ...",6,,,0
163,CA,,"Jacumba Solar Project: In San Diego County, a ...",San Diego County,2021.0,Solar,"Camille Von Kaenel, Jacumba Residents Largely ...",6,2022.0,,1
164,CA,,Alameda County Wind Project: Brookfield Renewa...,Alameda County,2021.0,Wind,National Audubon Society Sues California Count...,6,2021.0,,1
165,CA,,Jawbone Wind Energy: A group of citizens filed...,Kern County,2014.0,Wind,Citizens Opposing a Dangerous Env’t v. Cnty. o...,6,,,0
166,CA,,Altamont Pass Wind Resource Area: A suit was f...,Alameda and Contra Costa Counties,2008.0,Wind,"Center for Biological Diversity, Inc. v. FPL G...",6,,,0


In [160]:
convert_to_schema(state_policy)

Column("raw_state_name", String),
Column("policy", String),
Column("year_enacted", Float, nullable=True),
Column("energy_type", String, nullable=True),
Column("source", String, nullable=True),
Column("state_id_fips", String),
Column("earliest_year_mentioned", Integer),
Column("latest_year_mentioned", Float, nullable=True),
Column("n_years_mentioned", Integer),



## ncsl_state_permitting

In [161]:
# Read the `ncsl_state_permitting` table.
with sqlite_engine.connect() as conn:
    ncsl_state_permitting = pd.read_sql_table(table_name="ncsl_state_permitting", con=conn)

In [163]:
ncsl_state_permitting.state_id_fips.is_unique

True

In [164]:
convert_to_schema(ncsl_state_permitting)

Column("raw_state_name", String),
Column("permitting_type", String, nullable=True),
Column("description", String),
Column("link", String, nullable=True),
Column("state_id_fips", String),



## MCOE

In [165]:
# Read the `mcoe` table.
with sqlite_engine.connect() as conn:
    mcoe = pd.read_sql_table(table_name="mcoe", con=conn)

In [169]:
mcoe

Unnamed: 0,plant_id_eia,generator_id,report_date,unit_id_pudl,plant_id_pudl,plant_name_eia,utility_id_eia,utility_id_pudl,utility_name_eia,associated_combined_heat_power,...,turbines_num,ultrasupercritical_tech,uprate_derate_completed_date,uprate_derate_during_year,winter_capacity_estimate,winter_capacity_mw,winter_estimated_capability_mw,zip_code,state_id_fips,county_id_fips
0,1,1,2020-01-01,,14614.0,Sand Point,63560.0,6409.0,"TDX Sand Point Generating, LLC",False,...,,,NaT,False,,0.4,,99661.0,2,2013
1,1,2,2020-01-01,,14614.0,Sand Point,63560.0,6409.0,"TDX Sand Point Generating, LLC",False,...,,,NaT,False,,0.3,,99661.0,2,2013
2,1,3,2020-01-01,,14614.0,Sand Point,63560.0,6409.0,"TDX Sand Point Generating, LLC",False,...,,,NaT,False,,0.3,,99661.0,2,2013
3,1,5,2020-01-01,,14614.0,Sand Point,63560.0,6409.0,"TDX Sand Point Generating, LLC",False,...,,,NaT,False,,0.3,,99661.0,2,2013
4,1,WT1,2020-01-01,,14614.0,Sand Point,63560.0,6409.0,"TDX Sand Point Generating, LLC",False,...,,,NaT,False,,0.1,,99661.0,2,2013


In [177]:
mcoe[["plant_id_eia", "generator_id", "report_date"]].value_counts().value_counts()

1    30053
dtype: int64

In [173]:
convert_to_schema(mcoe)

Column("plant_id_eia", Integer),
Column("generator_id", String),
Column("report_date", DateTime),
Column("unit_id_pudl", Float, nullable=True),
Column("plant_id_pudl", Float, nullable=True),
Column("plant_name_eia", String),
Column("utility_id_eia", Float, nullable=True),
Column("utility_id_pudl", Float, nullable=True),
Column("utility_name_eia", String, nullable=True),
Column("associated_combined_heat_power", String, nullable=True),
Column("balancing_authority_code_eia", String, nullable=True),
Column("balancing_authority_name_eia", String, nullable=True),
Column("bga_source", String, nullable=True),
Column("bypass_heat_recovery", String, nullable=True),
Column("capacity_factor", Float, nullable=True),
Column("capacity_mw", Float, nullable=True),
Column("carbon_capture", String, nullable=True),
Column("city", String, nullable=True),
Column("cofire_fuels", String, nullable=True),
Column("county", String, nullable=True),
Column("current_planned_operating_date", DateTime, nullable=True),

In [176]:
mcoe.sector_id_eia.value_counts()

2.0    12860
1.0    12329
7.0     1418
4.0     1000
5.0      759
3.0      520
6.0      497
Name: sector_id_eia, dtype: int64