In [88]:
from __future__ import annotations

import numpy as np
import pandas as pd
import pandera.pandas as pa

from power_plant_emissions.config import RAW_DATA_DIR, PROCESSED_DATA_DIR


def str_to_float(s: pd.Series) -> pd.Series:
    """Convert a column of formatted number strings to ``float64``.

    String values may use commas as thousands separators (``"1,234.5"``) and
    the exact placeholder ``"No Data"``, which becomes ``NaN``.

    Unlike a plain ``.str`` pipeline, this also copes with:

    * columns that were already parsed as numbers (no ``.str`` accessor),
    * columns consisting solely of ``"No Data"``,
    * object columns mixing strings with real numbers; the numbers are kept
      instead of being silently turned into ``NaN``.

    Parameters
    ----------
    s : pd.Series
        Column holding strings, numbers, missing values, or a mix of them.

    Returns
    -------
    pd.Series
        A ``float64`` series with the same index as ``s``.

    Raises
    ------
    ValueError
        If a string value is neither ``"No Data"`` nor a parseable number
        once commas are removed.
    """
    # Already-numeric input only needs its dtype normalised.
    if pd.api.types.is_numeric_dtype(s.dtype):
        return s.astype(np.float64)

    def _parse(value):
        # Convert one cell; anything missing maps to NaN.
        if isinstance(value, str):
            if value == "No Data":
                return np.nan
            # float() rejects malformed text, so bad cells fail loudly
            # instead of being coerced to NaN.
            return float(value.replace(",", ""))
        return np.nan if pd.isna(value) else float(value)

    return s.map(_parse).astype(np.float64)


# Read the raw CSV from the raw-data directory.
df = pd.read_csv(RAW_DATA_DIR / "rggi-emissions-annual-facility.csv")

# Columns that are passed through `str_to_float` before validation.
numeric_columns = [
    "Op Time",
    "Op Hours",
    "CO2 Mass (Tons)",
    "Heat Input (mmBtu)",
    "Eligible Biomass (Tons) (State Value)",
    "Eligible CHP Thermal Output (Tons) (State Value)",
]

# Re-type all of those columns in a single step.
df = df.assign(**{name: str_to_float(df[name]) for name in numeric_columns})

# Every converted column shares one rule: float64, non-negative, NaN allowed.
measurement_columns = {
    name: pa.Column(np.float64, checks=pa.Check.ge(0), nullable=True)
    for name in numeric_columns
}

# State codes accepted by the schema.
allowed_states = ["CT", "DE", "MA", "MD", "ME", "NH", "NJ", "NY", "PA", "RI", "VA", "VT"]

schema = pa.DataFrameSchema(
    columns={
        "Year": pa.Column(int, checks=pa.Check.between(2009, 2025), nullable=False),
        "Source Name": pa.Column(str, checks=pa.Check.ne(""), nullable=False),
        "ORIS Code": pa.Column(int, checks=pa.Check.gt(0), nullable=False),
        "State": pa.Column(str, checks=pa.Check.isin(allowed_states), nullable=False),
        **measurement_columns,
        "Reporting Status": pa.Column(str, checks=pa.Check.isin(["Complete", "Incomplete"])),
    },
    # Frame-level rule: no (Year, ORIS Code) pair may occur more than once.
    checks=pa.Check(
        lambda frame: not frame.duplicated(subset=["Year", "ORIS Code"]).any(),
        name="ORIS-year_is_unique_identifier",
    ),
)

# Validation raises if any rule is violated; otherwise the frame is returned.
df = schema.validate(df)

# Write the validated table to the processed-data directory, without the index.
df.to_csv(PROCESSED_DATA_DIR / "rggi-emissions-annual-facility.csv", index=False)

In [None]:
import numpy as np
import pandas as pd
import pandera.pandas as pa

import geopandas

from power_plant_emissions.config import RAW_DATA_DIR, PROCESSED_DATA_DIR

# The same name is used for the raw input and the processed output.
fname = "Power_Plants"

gdf = geopandas.read_file(RAW_DATA_DIR / fname)

# Columns validated below as non-negative float64 values.
mw_columns = [
    "Install_MW",
    "Total_MW",
    "Bat_MW",
    "Bio_MW",
    "Coal_MW",
    "Geo_MW",
    "Hydro_MW",
    "HydroPS_MW",
    "NG_MW",
    "Nuclear_MW",
    "Crude_MW",
    "Solar_MW",
    "Wind_MW",
    "Other_MW",
]

# Optional manual check: look for suspicious sentinel values such as 999.
# NOTE(review): the earlier snippet indexed `df`, which this cell never
# defines; `gdf` is presumably the intended frame -- confirm before use.
# for c in mw_columns:
#     print(gdf[c].value_counts())

# Assemble the column rules in order: identifiers, MW columns, geometry.
schema_columns = {
    name: pa.Column(np.int32, unique=True, nullable=False)
    for name in ("OBJECTID", "Plant_Code")
}
schema_columns.update(
    {
        name: pa.Column(np.float64, checks=pa.Check.ge(0), nullable=False)
        for name in mw_columns
    }
)
# Geometry only has to be present and non-null; no dtype is enforced.
schema_columns["geometry"] = pa.Column(nullable=False)

schema = pa.DataFrameSchema(schema_columns)
gdf = schema.validate(gdf)

# Write the validated layer to the processed-data directory.
gdf.to_file(PROCESSED_DATA_DIR / fname)

In [None]:
import numpy as np
import pandas as pd
import pandera.pandas as pa

import geopandas

from power_plant_emissions.config import RAW_DATA_DIR, PROCESSED_DATA_DIR

fname = "geoBoundariesCGAZ_ADM0"

# Resolve the raw input location, then load it.
raw_path = RAW_DATA_DIR / fname
gdf = geopandas.read_file(raw_path)

# Export to the processed-data directory is currently disabled:
# gdf.to_file(PROCESSED_DATA_DIR / fname)

# Bare last expression so the notebook renders the loaded frame.
gdf