# Compare output from new branch to base branch

To prepare the inputs, run `make all` on the base branch and copy the output into a temporary folder named after that branch, then do the same for the new branch. The cells below read from `../../data/tmp/<branch name>`.

In [9]:
import os
import pandas as pd
from pprint import pprint

In [6]:
# Branch names; each one is also used below as the sub-folder of PATH_ROOT
# that holds that branch's output files.
BASE_BRANCH = "dev"
NEW_BRANCH = "refactor-eia860m-data"
# Relative path to the temporary folder containing one sub-folder per branch.
PATH_ROOT = "../../data/tmp"

In [13]:
# Collect the file names found in each branch's folder under PATH_ROOT.
base_branch_files = os.listdir(os.path.join(PATH_ROOT, BASE_BRANCH))
new_branch_files = os.listdir(os.path.join(PATH_ROOT, NEW_BRANCH))

# Print both listings, each under its own label, base branch first.
for label, file_names in (("base:", base_branch_files), ("new:", new_branch_files)):
    print(label)
    pprint(file_names)

base:
['pudl_eia860m_status_codes.parquet',
 'pudl_eia860m_changelog.parquet',
 'projects_history_860m.parquet',
 'projects_status_codes_860m.parquet',
 'projects_current_860m.parquet',
 'pudl_generators.parquet',
 'projects_transition_dates_860m.parquet']
new:
['projects_current_eia860m.parquet',
 'projects_status_yearly_eia860m.parquet',
 'pudl_eia860m_status_codes.parquet',
 'projects_status_quarterly_eia860m.parquet',
 'pudl_eia860m_changelog.parquet',
 'projects_status_transition_dates_eia860m.parquet',
 'pudl_generators.parquet',
 'projects_status_codes_eia860m.parquet']


# Compare similar tables before and after

## Changelog

In [50]:
# The changelog file has the same name on both branches, so only the branch
# folder differs between the two reads.
changelog_file = "pudl_eia860m_changelog.parquet"
changelog_base = pd.read_parquet(os.path.join(PATH_ROOT, BASE_BRANCH, changelog_file))
changelog_new = pd.read_parquet(os.path.join(PATH_ROOT, NEW_BRANCH, changelog_file))

In [51]:
# Column names present on one branch but not the other, in each table's own column order.
removed_columns = [name for name in changelog_base.columns if name not in changelog_new.columns]
added_columns = [name for name in changelog_new.columns if name not in changelog_base.columns]
print("columns removed:", removed_columns)
print("columns added:", added_columns)

columns removed: ['county', 'operational_status', 'state']
columns added: ['raw_county', 'operational_status_category', 'raw_state', 'iso_region']


In [52]:
# Rename the base-branch columns to the names used on the new branch so the
# two changelog tables can be compared column by column.
renamed_columns = {
    "county": "raw_county",
    "state": "raw_state",
    "operational_status": "operational_status_category",
}
changelog_base_compare = changelog_base.rename(columns=renamed_columns)

In [53]:
# Show the differences between the renamed base table and the new table,
# restricted to the base table's columns (in the base table's column order).
shared_columns = list(changelog_base_compare.columns)
changelog_base_compare.compare(changelog_new[shared_columns])

In [54]:
# Check that the renamed base table equals the new table once the new table
# is reduced to the base table's columns.
changelog_new_subset = changelog_new[list(changelog_base_compare.columns)]
changelog_base_compare.equals(changelog_new_subset)

True

## EIA860M current

In [62]:
# The "current" table is stored under a different file name on each branch.
base_dir = os.path.join(PATH_ROOT, BASE_BRANCH)
new_dir = os.path.join(PATH_ROOT, NEW_BRANCH)
current_base = pd.read_parquet(os.path.join(base_dir, "projects_current_860m.parquet"))
current_new = pd.read_parquet(os.path.join(new_dir, "projects_current_eia860m.parquet"))

In [63]:
# Preview the first rows of the base-branch "current" table.
current_base.head()

Unnamed: 0,report_date,plant_name_eia,plant_id_eia,generator_id,state,county,utility_id_eia,utility_name_eia,operational_status_code,operational_status_category,...,planned_derate_date,planned_generator_retirement_date,planned_net_summer_capacity_derate_mw,planned_net_summer_capacity_uprate_mw,planned_uprate_date,technology_description,state_id_fips,county_id_fips,raw_state,raw_county
0,2024-12-01,Cow Creek,229,1,California,Shasta,14328,Pacific Gas & Electric Co.,8,retired,...,NaT,NaT,,,NaT,Conventional Hydroelectric,6,6089,CA,Shasta
1,2024-12-01,Cow Creek,229,2,California,Shasta,14328,Pacific Gas & Electric Co.,8,retired,...,NaT,NaT,,,NaT,Conventional Hydroelectric,6,6089,CA,Shasta
2,2024-12-01,Kilarc,253,1,California,Shasta,14328,Pacific Gas & Electric Co.,8,retired,...,NaT,NaT,,,NaT,Conventional Hydroelectric,6,6089,CA,Shasta
3,2024-12-01,Indian River Generating Station,594,4,Delaware,Sussex,9332,Indian River Operations Inc,7,existing,...,NaT,2025-02-01,,,NaT,Conventional Steam Coal,10,10005,DE,Sussex
4,2024-12-01,Big Bend,645,ST4,Florida,Hillsborough,18454,Tampa Electric Co,7,existing,...,NaT,NaT,37.0,,NaT,Conventional Steam Coal,12,12057,FL,Hillsborough


In [64]:
# Preview the first rows of the new-branch "current" table.
current_new.head()

Unnamed: 0,report_date,plant_name_eia,plant_id_eia,generator_id,state,county,utility_id_eia,utility_name_eia,operational_status_code,operational_status_category,...,planned_derate_date,planned_generator_retirement_date,planned_net_summer_capacity_derate_mw,planned_net_summer_capacity_uprate_mw,planned_uprate_date,technology_description,state_id_fips,county_id_fips,raw_state,raw_county
0,2024-12-01,Cow Creek,229,1,California,Shasta,14328,Pacific Gas & Electric Co.,8,retired,...,NaT,NaT,,,NaT,Conventional Hydroelectric,6,6089,CA,Shasta
1,2024-12-01,Cow Creek,229,2,California,Shasta,14328,Pacific Gas & Electric Co.,8,retired,...,NaT,NaT,,,NaT,Conventional Hydroelectric,6,6089,CA,Shasta
2,2024-12-01,Kilarc,253,1,California,Shasta,14328,Pacific Gas & Electric Co.,8,retired,...,NaT,NaT,,,NaT,Conventional Hydroelectric,6,6089,CA,Shasta
3,2024-12-01,Indian River Generating Station,594,4,Delaware,Sussex,9332,Indian River Operations Inc,7,existing,...,NaT,2025-02-01,,,NaT,Conventional Steam Coal,10,10005,DE,Sussex
4,2024-12-01,Big Bend,645,ST4,Florida,Hillsborough,18454,Tampa Electric Co,7,existing,...,NaT,NaT,37.0,,NaT,Conventional Steam Coal,12,12057,FL,Hillsborough


In [65]:
# Check whether the base-branch and new-branch "current" tables are equal.
current_base.equals(current_new)

True

## Quarterly project status

In [66]:
# The quarterly status table is called "projects_history_860m" on the base
# branch and "projects_status_quarterly_eia860m" on the new branch.
quarterly_base_path = os.path.join(PATH_ROOT, BASE_BRANCH, "projects_history_860m.parquet")
quarterly_new_path = os.path.join(PATH_ROOT, NEW_BRANCH, "projects_status_quarterly_eia860m.parquet")
projects_quarterly_base = pd.read_parquet(quarterly_base_path)
projects_quarterly_new = pd.read_parquet(quarterly_new_path)

In [67]:
# Preview the first rows of the base-branch quarterly table.
projects_quarterly_base.head()

Unnamed: 0,plant_name_eia,plant_id_eia,generator_id,quarter_end,operational_status_code
0,Sand Point,1,1,2022-03-31,7
1,Sand Point,1,1,2022-06-30,7
2,Sand Point,1,1,2022-09-30,7
3,Sand Point,1,1,2022-12-31,7
4,Sand Point,1,1,2023-03-31,7


In [68]:
# Preview the first rows of the new-branch quarterly table.
projects_quarterly_new.head()

Unnamed: 0,plant_name_eia,plant_id_eia,generator_id,quarter_start,quarter_end,operational_status_code
0,Sand Point,1,1,2022-01-01,2022-03-31,7
1,Sand Point,1,1,2022-04-01,2022-06-30,7
2,Sand Point,1,1,2022-07-01,2022-09-30,7
3,Sand Point,1,1,2022-10-01,2022-12-31,7
4,Sand Point,1,1,2023-01-01,2023-03-31,7


In [69]:
# Column dtypes of the base-branch quarterly table.
projects_quarterly_base.dtypes

plant_name_eia                     string
plant_id_eia                        Int64
generator_id                       string
quarter_end                datetime64[ns]
operational_status_code             Int64
dtype: object

In [70]:
# Column dtypes of the new-branch quarterly table.
projects_quarterly_new.dtypes

plant_name_eia                     string
plant_id_eia                        Int64
generator_id                       string
quarter_start              datetime64[ns]
quarter_end                datetime64[ns]
operational_status_code             Int64
dtype: object

In [71]:
# (rows, columns) of each quarterly table, base branch first.
projects_quarterly_base.shape, projects_quarterly_new.shape

((429828, 5), (429828, 6))

In [72]:
# Preview the base-branch quarterly table again before the equality check below.
projects_quarterly_base.head()

Unnamed: 0,plant_name_eia,plant_id_eia,generator_id,quarter_end,operational_status_code
0,Sand Point,1,1,2022-03-31,7
1,Sand Point,1,1,2022-06-30,7
2,Sand Point,1,1,2022-09-30,7
3,Sand Point,1,1,2022-12-31,7
4,Sand Point,1,1,2023-03-31,7


In [73]:
# Preview the new-branch quarterly table again before the equality check below.
projects_quarterly_new.head()

Unnamed: 0,plant_name_eia,plant_id_eia,generator_id,quarter_start,quarter_end,operational_status_code
0,Sand Point,1,1,2022-01-01,2022-03-31,7
1,Sand Point,1,1,2022-04-01,2022-06-30,7
2,Sand Point,1,1,2022-07-01,2022-09-30,7
3,Sand Point,1,1,2022-10-01,2022-12-31,7
4,Sand Point,1,1,2023-01-01,2023-03-31,7


In [75]:
# The new table has more columns than the base table (see the shapes above),
# so reduce it to the base table's columns before testing equality.
base_columns = list(projects_quarterly_base.columns)
projects_quarterly_base.equals(projects_quarterly_new[base_columns])

True

# New tables: yearly and monthly

In [76]:
# Load the yearly and monthly status tables from the new branch's folder.
# NOTE(review): "projects_status_monthly_eia860m.parquet" is not in the
# new-branch file listing printed earlier in this notebook; confirm the file
# is generated before relying on this cell.
new_branch_dir = os.path.join(PATH_ROOT, NEW_BRANCH)
projects_yearly_new = pd.read_parquet(os.path.join(new_branch_dir, "projects_status_yearly_eia860m.parquet"))
projects_monthly_new = pd.read_parquet(os.path.join(new_branch_dir, "projects_status_monthly_eia860m.parquet"))

In [78]:
# Preview the first rows of the new yearly table.
projects_yearly_new.head()

Unnamed: 0,plant_name_eia,plant_id_eia,generator_id,year_start,year_end,operational_status_code
0,Sand Point,1,1,2022-01-01,2022-12-31,7
1,Sand Point,1,1,2023-01-01,2023-12-31,7
2,Sand Point,1,1,2024-01-01,2024-12-31,7
3,Sand Point,1,2,2022-01-01,2022-12-31,7
4,Sand Point,1,2,2023-01-01,2023-12-31,7


In [79]:
# Preview the first rows of the new monthly table.
projects_monthly_new.head()

Unnamed: 0,plant_name_eia,plant_id_eia,generator_id,month_start,month_end,operational_status_code
0,Sand Point,1,1,2022-02-01,2022-02-28,7
1,Sand Point,1,1,2022-03-01,2022-03-31,7
2,Sand Point,1,1,2022-04-01,2022-04-30,7
3,Sand Point,1,1,2022-05-01,2022-05-31,7
4,Sand Point,1,1,2022-06-01,2022-06-30,7


In [86]:
# Select the base-branch changelog rows for one generator (Sand Point,
# plant 1, generator '1'), then show only the report date and status code.
sand_point_rows = changelog_base.query("plant_name_eia == 'Sand Point' and plant_id_eia == 1 and generator_id == '1'")
sand_point_rows[["report_date", "operational_status_code"]]

Unnamed: 0,report_date,operational_status_code
153186,2020-07-01,7
178803,2022-07-01,7
196797,2024-04-01,7
