In [1]:
from dbcp.helpers import get_sql_engine
import pandas as pd


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [3]:
# Database engine handed to pd.read_sql for the data_warehouse queries.
engine = get_sql_engine()

In [9]:
# Largest valid_until_date in the changelog, as an ISO date string. It is substituted
# for NULL valid_until_date values in the query below.
end_date = (
    pd.read_sql(
        "SELECT max(valid_until_date) FROM data_warehouse.pudl_eia860m_changelog",
        engine,
    )
    .iat[0, 0]
    .strftime("%Y-%m-%d")
)
# One row per (plant, generator, status code): earliest report_date and latest
# valid_until_date seen for that combination.
# NOTE(review): a status that a generator leaves and later returns to collapses into
# one span that also covers the gap -- confirm this cannot happen or is acceptable.
query = f"""
SELECT
    plant_id_eia,
    generator_id,
    operational_status_code,
    min(report_date) as start_date,
    max(COALESCE(valid_until_date, timestamp '{end_date}')) as end_date
FROM data_warehouse.pudl_eia860m_changelog
GROUP BY 1,2,3
ORDER BY 1,2,3,4  -- must be sorted by date for the pandas groupby.first() to work
"""
status_history = pd.read_sql(query, engine)
# The date fields are literally the first day of each month but in reality they
# represent the whole month. I want to convert them to intervals, but first I need
# to change end_date to the last day of the month.
# MonthEnd(0) snaps a date to the end of its own month and leaves a date that is
# already a month end unchanged (plain MonthEnd() would push it into the next month).
status_history["end_date"] += pd.offsets.MonthEnd(0)
# Closed on both sides so that a month-end timestamp falls inside the interval that
# ends on it. Consequence: when one status ends in the month the next one starts,
# the two intervals overlap for that month.
date_intervals = pd.IntervalIndex.from_arrays(
    status_history['start_date'], status_history['end_date'], closed='both'
)

In [10]:
# Largest valid_until_date as a string; still a first-of-month date at this point.
end_date

'2023-12-01'

In [11]:
# end_date has been moved to the last day of its month; start_date is unchanged.
status_history

Unnamed: 0,plant_id_eia,generator_id,operational_status_code,start_date,end_date
0,1,1,7,2020-07-01,2023-12-31
1,1,2,7,2020-07-01,2023-12-31
2,1,3,7,2020-07-01,2023-12-31
3,1,5,7,2020-07-01,2023-02-28
4,1,5,8,2023-02-01,2023-12-31
...,...,...,...,...,...
50771,67117,EAST,7,2023-12-01,2023-12-31
50772,67117,WEST,7,2023-12-01,2023-12-31
50773,67118,1,4,2023-12-01,2023-12-31
50774,67119,686,4,2023-12-01,2023-12-31


In [12]:
# One closed [start_date, end_date] interval per status_history row, in row order.
date_intervals

IntervalIndex([[2020-07-01, 2023-12-31], [2020-07-01, 2023-12-31], [2020-07-01, 2023-12-31], [2020-07-01, 2023-02-28], [2023-02-01, 2023-12-31] ... [2023-12-01, 2023-12-31], [2023-12-01, 2023-12-31], [2023-12-01, 2023-12-31], [2023-12-01, 2023-12-31], [2023-12-01, 2023-12-31]], dtype='interval[datetime64[ns], both]', length=50776)

In [13]:
# Index the rows by their validity interval so that status_history.loc[timestamp]
# returns every row whose interval contains that timestamp.
# Rebind instead of inplace=True: the frame shown by earlier cells is not mutated
# behind the reader's back.
status_history = status_history.set_index(date_intervals)
status_history.head()

Unnamed: 0,plant_id_eia,generator_id,operational_status_code,start_date,end_date
"[2020-07-01, 2023-12-31]",1,1,7,2020-07-01,2023-12-31
"[2020-07-01, 2023-12-31]",1,2,7,2020-07-01,2023-12-31
"[2020-07-01, 2023-12-31]",1,3,7,2020-07-01,2023-12-31
"[2020-07-01, 2023-02-28]",1,5,7,2020-07-01,2023-02-28
"[2023-02-01, 2023-12-31]",1,5,8,2023-02-01,2023-12-31


In [31]:
# Check: adding MonthEnd() to a first-of-month date gives the last day of that same month.
pd.Timestamp(end_date) + pd.offsets.MonthEnd()

Timestamp('2023-12-31 00:00:00')

In [32]:
# Number of trailing quarter ends to sample.
N_QUARTERS = 12
# end_date is a first-of-month string; snap it to the last day of that month.
# MonthEnd(0) leaves a date that is already a month end where it is.
end_date_adjusted = pd.Timestamp(end_date) + pd.offsets.MonthEnd(0)
# NOTE: pandas >= 2.2 deprecates the "Q" alias in favour of "QE"; "Q" is kept here
# because it is what the pandas version this notebook was run with understands.
quarter_end_dates = pd.date_range(end=end_date_adjusted, periods=N_QUARTERS, freq="Q")
quarter_end_dates

DatetimeIndex(['2021-03-31', '2021-06-30', '2021-09-30', '2021-12-31', '2022-03-31', '2022-06-30', '2022-09-30', '2022-12-31', '2023-03-31', '2023-06-30', '2023-09-30', '2023-12-31'], dtype='datetime64[ns]', freq='Q-DEC')

In [16]:
# Rows whose interval contains the earliest sampled quarter end.
status_history.loc[quarter_end_dates[0],:].shape

(30593, 5)

In [17]:
# Rows whose interval contains the latest sampled quarter end.
status_history.loc[quarter_end_dates[-1],:].shape

(33912, 5)

In [33]:
out_cols = ["plant_id_eia", "generator_id", "operational_status_code"]
# For every quarter end, take the rows whose interval contains that date and tag
# them with it; then stack the snapshots into one long frame.
# The whole pipeline is one expression, so re-running the cell always rebuilds `out`
# from status_history instead of mutating a previous result in place.
out = (
    pd.concat(
        (status_history.loc[date, :].assign(quarter_end=date) for date in quarter_end_dates),
        ignore_index=True,
    )
    .sort_values(["plant_id_eia", "generator_id", "quarter_end"])
    .reset_index(drop=True)
)

In [34]:
# Long format: one row per status row in effect at each sampled quarter end.
out

Unnamed: 0,plant_id_eia,generator_id,operational_status_code,start_date,end_date,quarter_end
0,1,1,7,2020-07-01,2023-12-31,2021-03-31
1,1,1,7,2020-07-01,2023-12-31,2021-06-30
2,1,1,7,2020-07-01,2023-12-31,2021-09-30
3,1,1,7,2020-07-01,2023-12-31,2021-12-31
4,1,1,7,2020-07-01,2023-12-31,2022-03-31
...,...,...,...,...,...,...
390623,67117,EAST,7,2023-12-01,2023-12-31,2023-12-31
390624,67117,WEST,7,2023-12-01,2023-12-31,2023-12-31
390625,67118,1,4,2023-12-01,2023-12-31,2023-12-31
390626,67119,686,4,2023-12-01,2023-12-31,2023-12-31


In [48]:
# Flag rows whose (plant, generator, quarter_end) key occurs more than once.
# keep=False marks every member of a duplicate group, not just the repeats.
dupes = out.duplicated(subset=out_cols[:2] + ['quarter_end'], keep=False)
# Share and count of flagged rows.
dupes.agg(['mean', 'sum'])

mean       0.011661
sum     4555.000000
dtype: float64

In [40]:
# Some duplicates are caused by the COALESCE() in the SQL query, which is usually,
# but not always, appropriate. The root cause is NULL values in valid_until_date
# coming from PUDL, which shouldn't exist.
# NOTE(review): others look like plain overlaps -- the intervals are closed on both
# ends, so the month in which a generator changes status is covered by both the old
# and the new status (e.g. plant 51, generator 1 at 2021-12-31). Confirm.
out.loc[dupes,:]
# Workaround: keep the row with the latest start_date.
# I believe this is also equivalent to max(operational_status_code), but there may
# be exceptions. Using the dates is more principled.

Unnamed: 0,plant_id_eia,generator_id,operational_status_code,start_date,end_date,quarter_end
1585,51,1,7,2015-07-01,2021-12-31,2021-12-31
1586,51,1,8,2021-12-01,2023-12-31,2021-12-31
1809,56,LEC1,6,2023-08-01,2023-09-30,2023-09-30
1810,56,LEC1,7,2023-09-01,2023-12-31,2023-09-30
1822,56,LEC2,6,2023-08-01,2023-09-30,2023-09-30
...,...,...,...,...,...,...
390562,67055,706,5,2023-12-01,2023-12-31,2023-12-31
390563,67056,708,4,2023-11-01,2023-12-31,2023-12-31
390564,67056,708,5,2023-12-01,2023-12-31,2023-12-31
390565,67057,707,4,2023-11-01,2023-12-31,2023-12-31


In [59]:
# Within each duplicated (plant, generator, quarter_end) key, mark the row with the
# latest start_date. The built-in "max" transform is vectorized, unlike a per-group
# Python lambda, and always yields a Series aligned to the duplicate rows.
dupe_rows = out.loc[dupes, :]
latest_start = dupe_rows.groupby(out_cols[:2] + ['quarter_end'])['start_date'].transform('max')
# Rows that tie on the latest start_date are all marked True and would all be kept.
is_last_start_date = dupe_rows['start_date'].eq(latest_start)
is_last_start_date.head()

1585    False
1586     True
1809    False
1810     True
1822    False
Name: start_date, dtype: bool

In [61]:
# Row labels of the duplicates that do not carry the latest start_date.
idxs_to_drop = is_last_start_date.loc[~is_last_start_date].index
idxs_to_drop

Int64Index([  1585,   1809,   1822,   2804,   2847,   2851,   2855,   4370,   4376,   4384,
            ...
            390526, 390531, 390533, 390535, 390537, 390539, 390559, 390561, 390563, 390565], dtype='int64', length=2281)

In [62]:
# Remove the superseded duplicate rows; `out` itself is left untouched.
dedupe = out.drop(index=idxs_to_drop)
dedupe.shape

(388347, 6)

In [64]:
# Net change in row count; should equal minus the number of dropped labels.
len(dedupe) - len(out)

-2281

In [65]:
# Verify: no (plant, generator, quarter_end) key is duplicated after the drop.
dedupe.duplicated(subset=out_cols[:2] + ['quarter_end'], keep=False).agg(['mean', 'sum'])

mean    0.0
sum     0.0
dtype: float64

In [77]:
# Rejected approach: slow (about 43 s per run) and the result is not dense.
#dedupe.set_index("quarter_end").groupby(out_cols[:2], as_index=False).resample("Q", dropna=False).first()

42.6 s ± 382 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [67]:
# Long-format input to the reshaping experiments in the following cells.
dedupe.head()

Unnamed: 0,plant_id_eia,generator_id,operational_status_code,start_date,end_date,quarter_end
0,1,1,7,2020-07-01,2023-12-31,2021-03-31
1,1,1,7,2020-07-01,2023-12-31,2021-06-30
2,1,1,7,2020-07-01,2023-12-31,2021-09-30
3,1,1,7,2020-07-01,2023-12-31,2021-12-31
4,1,1,7,2020-07-01,2023-12-31,2022-03-31


In [68]:
# Wide view via pivot: one row per (plant, generator), one column per quarter end.
(
    dedupe[out_cols + ["quarter_end"]]
    .pivot(index=out_cols[:2], columns="quarter_end")
    .head()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,operational_status_code,operational_status_code,operational_status_code,operational_status_code,operational_status_code,operational_status_code,operational_status_code,operational_status_code,operational_status_code,operational_status_code,operational_status_code,operational_status_code
Unnamed: 0_level_1,quarter_end,2021-03-31,2021-06-30,2021-09-30,2021-12-31,2022-03-31,2022-06-30,2022-09-30,2022-12-31,2023-03-31,2023-06-30,2023-09-30,2023-12-31
plant_id_eia,generator_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
1,1.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
1,2.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
1,3.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
1,5.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,8.0,8.0,8.0,8.0
1,5.1,,,,,,,,,7.0,7.0,7.0,7.0


In [69]:
# Same wide view built with set_index + unstack (innermost level, quarter_end,
# moves to the columns).
(
    dedupe[out_cols + ["quarter_end"]]
    .set_index(out_cols[:2] + ["quarter_end"])
    .unstack()
    .head()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,operational_status_code,operational_status_code,operational_status_code,operational_status_code,operational_status_code,operational_status_code,operational_status_code,operational_status_code,operational_status_code,operational_status_code,operational_status_code,operational_status_code
Unnamed: 0_level_1,quarter_end,2021-03-31,2021-06-30,2021-09-30,2021-12-31,2022-03-31,2022-06-30,2022-09-30,2022-12-31,2023-03-31,2023-06-30,2023-09-30,2023-12-31
plant_id_eia,generator_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
1,1.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
1,2.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
1,3.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
1,5.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,8.0,8.0,8.0,8.0
1,5.1,,,,,,,,,7.0,7.0,7.0,7.0


In [73]:
# Round trip wide -> long while keeping the NaN cells (dropna=False), so every
# generator gets a row for every quarter end.
(
    dedupe[out_cols + ["quarter_end"]]
    .set_index(out_cols[:2] + ["quarter_end"])
    .unstack()
    .stack(dropna=False)
    .shape
)

(411096, 1)

In [80]:
# Dense long table: one row per generator per quarter end, with NaN status where
# the generator has no row for that quarter.
(
    dedupe[out_cols + ["quarter_end"]]
    .set_index(out_cols[:2] + ["quarter_end"])
    .unstack()
    .stack(dropna=False)
    .reset_index()
)

Unnamed: 0,plant_id_eia,generator_id,quarter_end,operational_status_code
0,1,1,2021-03-31,7.0
1,1,1,2021-06-30,7.0
2,1,1,2021-09-30,7.0
3,1,1,2021-12-31,7.0
4,1,1,2022-03-31,7.0
...,...,...,...,...
411091,67121,BAYCS,2022-12-31,
411092,67121,BAYCS,2023-03-31,
411093,67121,BAYCS,2023-06-30,
411094,67121,BAYCS,2023-09-30,
