In [2]:
import sqlalchemy as sa
import pandas as pd

from dbcp.helpers import get_sql_engine

# Database engine shared by the cells below; it is built by the project helper
# dbcp.helpers.get_sql_engine (connection details are not visible in this notebook).
engine = get_sql_engine()

In [6]:
# Read the data_mart tables examined in this notebook into DataFrames over a
# single connection, which is released when the `with` block exits.
with engine.connect() as con:
    # Emissions dashboard table.
    co2_dashboard = pd.read_sql_table(
        table_name="co2_dashboard", con=con, schema="data_mart"
    )

    # County-level tables.
    counties_long_format = pd.read_sql_table(
        table_name="counties_long_format", con=con, schema="data_mart"
    )
    counties_wide_format = pd.read_sql_table(
        table_name="counties_wide_format", con=con, schema="data_mart"
    )

    # Plant and infrastructure tables.
    existing_plants = pd.read_sql_table(
        table_name="existing_plants", con=con, schema="data_mart"
    )
    fossil_infrastructure_projects = pd.read_sql_table(
        table_name="fossil_infrastructure_projects", con=con, schema="data_mart"
    )

    # ISO project tables.
    iso_projects_long_format = pd.read_sql_table(
        table_name="iso_projects_long_format", con=con, schema="data_mart"
    )
    iso_projects_wide_format = pd.read_sql_table(
        table_name="iso_projects_wide_format", con=con, schema="data_mart"
    )

    # Proposed-power dashboard tables.
    proposed_power_dash_local_opp = pd.read_sql_table(
        table_name="proposed_power_dash_local_opp", con=con, schema="data_mart"
    )
    proposed_power_dash_existing_plants = pd.read_sql_table(
        table_name="proposed_power_dash_existing_plants", con=con, schema="data_mart"
    )
    proposed_power_dash_proposed_plants = pd.read_sql_table(
        table_name="proposed_power_dash_proposed_plants", con=con, schema="data_mart"
    )

In [16]:
def dtypes_and_nulls(df):
    """Summarize each column's dtype and whether it contains any nulls.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame indexed by the column names of ``df`` with two unnamed
        columns: ``0`` holds the column's dtype and ``1`` is True when the
        column has at least one missing value.
    """
    return pd.concat([df.dtypes, df.isna().any()], axis=1)


def _sqlalchemy_type_name(dtype):
    """Return the SQLAlchemy type name for the family a dtype belongs to.

    Classifying by family instead of by exact dtype string means spellings
    such as ``int32``, nullable ``Int64``, tz-aware datetimes or pandas
    string dtypes resolve to the same names as ``int64``, ``datetime64[ns]``
    and ``object`` do.

    Raises:
        KeyError: if the dtype is not boolean, integer, float, datetime or
            string/object (e.g. timedelta).
    """
    types = pd.api.types
    if types.is_bool_dtype(dtype):
        return "Boolean"
    if types.is_integer_dtype(dtype):
        return "Integer"
    if types.is_float_dtype(dtype):
        return "Float"
    if types.is_datetime64_any_dtype(dtype):
        return "DateTime"
    if types.is_object_dtype(dtype) or types.is_string_dtype(dtype):
        return "String"
    # Same exception type an unmapped dtype raised before, now with context.
    raise KeyError(f"No SQLAlchemy type mapping for dtype {dtype!s}")


def convert_to_schema(df):
    """Print one ``Column(...)`` declaration per column of ``df``.

    Each line has the form ``Column("<name>", <Type>),``; columns containing
    at least one missing value get ``nullable=True`` appended. The text is
    printed (followed by a blank line), not returned.

    Args:
        df: DataFrame whose columns should be described.

    Raises:
        KeyError: if a column's dtype has no SQLAlchemy equivalent here.
    """
    declarations = []
    # itertuples(name=None) yields plain (column_name, dtype, has_nulls) tuples.
    for column, dtype, has_nulls in dtypes_and_nulls(df).itertuples(name=None):
        nullable_suffix = ", nullable=True" if has_nulls else ""
        declarations.append(
            f'Column("{column}", {_sqlalchemy_type_name(dtype)}{nullable_suffix}),\n'
        )
    print("".join(declarations))

In [94]:
# Print the Column(...) declarations for the counties_long_format table.
convert_to_schema(counties_long_format)

Column("state_id_fips", String),
Column("county_id_fips", String),
Column("state", String),
Column("county", String),
Column("facility_type", String),
Column("resource_or_sector", String),
Column("status", String),
Column("facility_count", Integer),
Column("capacity_mw", Float, nullable=True),
Column("co2e_tonnes_per_year", Float, nullable=True),
Column("pm2_5_tonnes_per_year", Float, nullable=True),
Column("nox_tonnes_per_year", Float, nullable=True),
Column("has_ordinance", Boolean),
Column("ordinance_jurisdiction_name", String, nullable=True),
Column("ordinance_jurisdiction_type", String, nullable=True),
Column("ordinance", String, nullable=True),
Column("ordinance_earliest_year_mentioned", Float, nullable=True),
Column("state_permitting_type", String, nullable=True),
Column("state_permitting_text", String),



## `proposed_power_dash_existing_plants` issues

In [40]:
# Count rows with and without a county FIPS code.
proposed_power_dash_existing_plants["county_id_fips"].isna().value_counts()

False    4041
True        9
Name: county_id_fips, dtype: int64

In [42]:
# Show the rows whose county FIPS code is missing.
proposed_power_dash_existing_plants.loc[
    proposed_power_dash_existing_plants["county_id_fips"].isna()
]

Unnamed: 0,state,county,state_id_fips,county_id_fips,resource,capacity_mw,permitting_type,has_ordinance
4041,,,,,Onshore Wind,5361.1,,False
4042,,,,,Solar,1274.0,,False
4043,,,,,Other,1197.5,,False
4044,,,,,Oil,1340.7,,False
4045,,,,,Nuclear,9487.3,,False
4046,,,,,Hydro,3115.4,,False
4047,,,,,Natural Gas,22893.6,,False
4048,,,,,Coal,9224.9,,False
4049,,,,,Battery Storage,128.8,,False


In [43]:
# Count rows with and without a resource value.
proposed_power_dash_existing_plants["resource"].isna().value_counts()

False    4050
Name: resource, dtype: int64

## `proposed_power_dash_proposed_plants` issues

In [46]:
# Count rows with and without a county FIPS code.
proposed_power_dash_proposed_plants["county_id_fips"].isna().value_counts()

False    3098
True        9
Name: county_id_fips, dtype: int64

In [49]:
# Count rows with and without a resource value.
proposed_power_dash_proposed_plants["resource"].isna().value_counts()

False    3098
True        9
Name: resource, dtype: int64

In [50]:
# Show the rows whose resource is missing.
proposed_power_dash_proposed_plants.loc[
    proposed_power_dash_proposed_plants["resource"].isna()
]

Unnamed: 0,state,county,state_id_fips,county_id_fips,resource,capacity_mw,project_count,permitting_type,has_ordinance
553,Illinois,Livingston County,17.0,17105.0,,,1,Local,False
1918,Ohio,Madison County,39.0,39097.0,,,1,Hybrid,False
2152,Pennsylvania,Clearfield County,42.0,42033.0,,,1,Local,False
2238,Pennsylvania,Tioga County,42.0,42117.0,,,1,Local,False
2253,Pennsylvania,York County,42.0,42133.0,,,1,Local,False
2776,Virginia,Campbell County,51.0,51031.0,,,2,Hybrid,False
2994,West Virginia,Jefferson County,54.0,54037.0,,,1,State,True
3010,West Virginia,Preston County,54.0,54077.0,,,1,State,False
3098,,,,,,,3,,False


In [51]:
# Show the rows whose county FIPS code is missing.
proposed_power_dash_proposed_plants.loc[
    proposed_power_dash_proposed_plants["county_id_fips"].isna()
]

Unnamed: 0,state,county,state_id_fips,county_id_fips,resource,capacity_mw,project_count,permitting_type,has_ordinance
3098,,,,,,,3,,False
3099,,,,,Unknown,34.2,4,,False
3100,,,,,Solar,13565.13,133,,False
3101,,,,,Other,86.0,6,,False
3102,,,,,Onshore Wind,3859.0,11,,False
3103,,,,,Offshore Wind,16043.0,25,,False
3104,,,,,Natural Gas,5529.0,9,,False
3105,,,,,Hydro,53.0,2,,False
3106,,,,,Battery Storage,6629.5,38,,False


## `co2_dashboard` issues

In [65]:
# Is (id, county_id_fips) a unique key? Count duplicated vs. non-duplicated rows.
co2_dashboard.duplicated(subset=["id", "county_id_fips"]).value_counts()

False    3888
dtype: int64

In [66]:
# Is id unique on its own? Count duplicated vs. non-duplicated rows.
co2_dashboard.duplicated(subset=["id"]).value_counts()

False    3784
True      104
dtype: int64

In [71]:
# List every row that shares its id with another row, grouped by id.
co2_dashboard.loc[co2_dashboard["id"].duplicated(keep=False)].sort_values("id")

Unnamed: 0,state,county,state_id_fips,id,county_id_fips,co2e_tonnes_per_year,facility_type
0,California,Santa Clara County,06,7.0,06085,6.610619e+04,proposed_power
568,Alabama,Etowah County,01,7.0,01055,3.325334e+04,existing_power
1,California,Fresno County,06,130.0,06019,1.972065e+05,proposed_power
604,South Carolina,Berkeley County,45,130.0,45015,6.158802e+06,existing_power
4,Connecticut,Hartford County,09,624.0,09003,6.260524e+04,proposed_power
...,...,...,...,...,...,...,...
1828,Minnesota,Martin County,27,7925.0,27091,2.812435e+04,existing_power
1874,Wisconsin,Sawyer County,55,8013.0,55113,6.656400e+00,existing_power
275,,,,8013.0,,1.404757e+05,proposed_power
278,Tennessee,Shelby County,47,8056.0,47157,7.199602e+04,proposed_power


In [72]:
# Is (id, facility_type) a unique key? Count duplicated vs. non-duplicated rows.
co2_dashboard.duplicated(subset=["id", "facility_type"]).value_counts()

False    3888
dtype: int64

In [75]:
# Count each combination of missing / present values in the candidate key columns.
co2_dashboard.loc[:, ["id", "facility_type"]].isna().value_counts()

id     facility_type
False  False            3887
True   False               1
dtype: int64

In [78]:
# Show the rows that have no id.
co2_dashboard.loc[co2_dashboard["id"].isna()]

Unnamed: 0,state,county,state_id_fips,id,county_id_fips,co2e_tonnes_per_year,facility_type
566,,,,,,14190.686722,proposed_infrastructure


In [73]:
# Row count per facility type.
co2_dashboard["facility_type"].value_counts()

existing_power             3323
proposed_power              284
proposed_infrastructure     281
Name: facility_type, dtype: int64

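`co2_dashboard` merges proposed and existing fossil plants and facilities. Their ids come from different source datasets, which is why the same `id` can appear more than once; the combination (`id`, `facility_type`) is unique. One `proposed_infrastructure` row in `co2_dashboard` is missing its `id`.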

## `counties_long_format` issues

In [97]:
# Is (county, facility type, resource/sector, status) a unique key?
counties_long_format.duplicated(
    subset=["county_id_fips", "facility_type", "resource_or_sector", "status"]
).value_counts()

False    7088
dtype: int64

In [91]:
# Count each combination of missing / present values in the candidate key columns.
counties_long_format.loc[
    :, ["county_id_fips", "facility_type", "resource_or_sector", "status"]
].isna().value_counts()

county_id_fips  facility_type  resource_or_sector  status
False           False          False               False     7088
dtype: int64

In [84]:
# Display the frame itself (the notebook renders a truncated preview of its rows).
counties_long_format

Unnamed: 0,state_id_fips,county_id_fips,state,county,facility_type,resource_or_sector,status,facility_count,capacity_mw,co2e_tonnes_per_year,pm2_5_tonnes_per_year,nox_tonnes_per_year,has_ordinance,ordinance_jurisdiction_name,ordinance_jurisdiction_type,ordinance,ordinance_earliest_year_mentioned,state_permitting_type,state_permitting_text
0,01,01001,Alabama,Autauga County,power plant,Battery Storage,proposed,1,80.0,,,,False,,,,,Local,According to the Wind Energy Technology Office...
1,01,01001,Alabama,Autauga County,power plant,Solar,proposed,1,80.0,,,,False,,,,,Local,According to the Wind Energy Technology Office...
2,01,01003,Alabama,Baldwin County,power plant,Battery Storage,proposed,3,160.0,,,,True,Baldwin County,county,"Large wind energy conversion systems (WECS), U...",,Local,According to the Wind Energy Technology Office...
3,01,01003,Alabama,Baldwin County,power plant,Solar,proposed,3,240.0,,,,True,Baldwin County,county,"Large wind energy conversion systems (WECS), U...",,Local,According to the Wind Energy Technology Office...
4,01,01011,Alabama,Bullock County,power plant,Solar,proposed,1,79.0,,,,False,,,,,Local,According to the Wind Energy Technology Office...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7083,54,54051,West Virginia,Marshall County,fossil infrastructure,Petrochemicals and Plastics,proposed,2,,1.364022e+06,70.702824,162.032766,False,,,,,State,The West Virginia Public Service Commission ha...
7084,54,54073,West Virginia,Pleasants County,fossil infrastructure,Petrochemicals and Plastics,proposed,1,,,13.617754,71.697552,False,,,,,State,The West Virginia Public Service Commission ha...
7085,54,54095,West Virginia,Tyler County,fossil infrastructure,Natural Gas,proposed,1,,1.342706e+05,7.919271,124.610932,False,,,,,State,The West Virginia Public Service Commission ha...
7086,54,54103,West Virginia,Wetzel County,fossil infrastructure,Natural Gas,proposed,1,,1.211047e+05,0.000000,41.701480,False,,,,,State,The West Virginia Public Service Commission ha...
