From d555e592fb922454182cd6b4929f583a7c0d00d8 Mon Sep 17 00:00:00 2001 From: Pierre Segonne <32778266+pierresegonne@users.noreply.github.com> Date: Tue, 23 Apr 2024 09:33:32 +0200 Subject: [PATCH] chore(DO): update parser (#6675) * chore(DO): update parser * fix NaN checks * fix type error * format * update snapshots --------- Co-authored-by: Pierre Segonne Co-authored-by: Viktor Andersson <30777521+VIKTORVAV99@users.noreply.github.com> --- electricitymap/contrib/lib/models/events.py | 14 +- .../contrib/lib/tests/test_events.py | 16 + parsers/DO.py | 307 ++++++++---------- parsers/test/snapshots/snap_test_CNDC.py | 36 +- parsers/test/test_DO.py | 167 ++++++++++ 5 files changed, 342 insertions(+), 198 deletions(-) create mode 100644 parsers/test/test_DO.py diff --git a/electricitymap/contrib/lib/models/events.py b/electricitymap/contrib/lib/models/events.py index 4dd4317efd..8dd672a373 100644 --- a/electricitymap/contrib/lib/models/events.py +++ b/electricitymap/contrib/lib/models/events.py @@ -30,7 +30,7 @@ def _none_safe_round(value: float | None, precision: int = 6) -> float | None: If the value is None, it is returned as is. The default precision is 6 decimal places, which gives us a precision of 1 W. """ - return None if value is None else round(value, precision) + return None if value is None or math.isnan(value) else round(value, precision) class Mix(BaseModel, ABC): @@ -107,7 +107,9 @@ def __init__(self, **data: Any): for attr, value in data.items(): if value is not None and value < 0: self._corrected_negative_values.add(attr) - self.__setattr__(attr, None) + value = None + # Ensure that the value is rounded to 6 decimal places and set to None if it is NaN. + self.__setattr__(attr, value) def dict( # noqa: A003 self, @@ -232,6 +234,14 @@ class StorageMix(Mix): battery: float | None = None hydro: float | None = None + def __init__(self, **data: Any): + """ + Overriding the constructor to check for NaN values and set them to None. + """ + super().__init__(**data) + for attr, value in data.items(): + self.__setattr__(attr, value) + def __setattr__(self, name: str, value: float | None) -> None: """ Overriding the setattr method to raise an error if the mode is unknown. diff --git a/electricitymap/contrib/lib/tests/test_events.py b/electricitymap/contrib/lib/tests/test_events.py index c3c0d27834..96a17130e2 100644 --- a/electricitymap/contrib/lib/tests/test_events.py +++ b/electricitymap/contrib/lib/tests/test_events.py @@ -763,6 +763,14 @@ def test_production_with_nan_using_numpy(self): assert mix.wind == 10 assert mix.corrected_negative_modes == set() + def test_production_with_nan_init(self): + mix = ProductionMix(wind=math.nan) + assert mix.wind is None + + def test_production_with_nan_using_numpy_init(self): + mix = ProductionMix(wind=np.nan) + assert mix.wind is None + def test_storage(self): mix = StorageMix() mix.add_value("hydro", 10) @@ -804,6 +812,14 @@ def test_storage_with_nan_using_numpy(self): mix.add_value("hydro", np.nan) assert mix.hydro == -5 + def test_storage_with_nan_init(self): + mix = StorageMix(hydro=math.nan) + assert mix.hydro is None + + def test_storage_with_nan_using_numpy_init(self): + mix = StorageMix(hydro=np.nan) + assert mix.hydro is None + class TestMixUpdate: def test_update_production(self): diff --git a/parsers/DO.py b/parsers/DO.py index 49d311b93b..3c5e119bdb 100644 --- a/parsers/DO.py +++ b/parsers/DO.py @@ -1,28 +1,32 @@ #!/usr/bin/env python3 -from collections import defaultdict from datetime import datetime, timedelta from logging import Logger, getLogger -from math import isnan -from operator import itemgetter from zoneinfo import ZoneInfo -import numpy as np import pandas as pd from bs4 import BeautifulSoup from requests import Session +from electricitymap.contrib.lib.models.event_lists import ProductionBreakdownList +from electricitymap.contrib.lib.models.events import ProductionMix +from electricitymap.contrib.lib.types import ZoneKey + # This parser gets hourly electricity generation data from oc.org.do for the Dominican Republic. # The data is in MWh but since it is updated hourly we can view it as MW. # Solar generation now has some data available but multiple projects are planned/under construction. -url = "https://apps.oc.org.do/reportesgraficos/reportepostdespacho.aspx" +DO_SOURCE = "oc.org.do" +URL = "https://apps.oc.org.do/reportesgraficos/reportepostdespacho.aspx" + +TOTAL_RENEWABLES_MAPPING = { + "Total E\xf3lico": "wind", + "Total Hidroel\xe9ctrica": "hydro", + "Total Solar": "solar", +} -total_mapping = { +TOTAL_MAPPING = { "Total T\xe9rmico": "Thermal", - "Total E\xf3lico": "Wind", - "Total Hidroel\xe9ctrica": "Hydro", - "Total Solar": "Solar", "Total Generado": "Generated", } @@ -30,7 +34,7 @@ # http://www.sie.gob.do/images/Estadisticas/MEM/GeneracionDiariaEnero2017/ # Reporte_diario_de_generacion_31_enero_2017_merged2.pdf -thermal_plants = { +THERMAL_PLANTS = { "AES ANDRES": "gas", "BARAHONA CARBON": "coal", "BERSAL": "oil", @@ -87,23 +91,30 @@ } -def get_data(session: Session | None = None) -> list: +def get_datetime_from_hour(now: datetime, hour: int) -> datetime: + return now + timedelta(hours=int(hour) - 1) + + +def get_data(session: Session | None = None) -> list[list[str]]: """ Makes a request to source url. Finds main table and creates a list of all table elements in string format. """ - data = [] s = session or Session() - data_req = s.get(url) + data_req = s.get(URL) soup = BeautifulSoup(data_req.content, "lxml") tbs = soup.find("table", id="PostdespachoUnidadesTermicasGrid_DXMainTable") - rows = tbs.find_all("td") + rows = tbs.find_all("tr") + data = [] for row in rows: - num = row.getText().strip() - data.append(str(num)) + row_data = [] + cols = row.find_all("td") + for col in cols: + row_data.append(str(col.getText().strip())) + data.append(row_data) return data @@ -141,151 +152,101 @@ def chunker(big_lst) -> dict: return chunked_list -def data_formatter(data) -> dict: +def data_formatter(data: list[list[str]]) -> list[list[str]]: """ - Takes data and finds relevant sections. - Formats and breaks data into usable parts. + Aligns the tabular data to a standard format: (ID, hour_0, hour_1, ... , hour_23, hour_24) """ - find_thermal_index = data.index("GRUPO: T\xe9rmica") - find_totals_index = data.index("Total T\xe9rmico") - find_totals_end = data.index("Total Programado") - - ufthermal = data[find_thermal_index + 3 : find_totals_index - 59] - total_data = data[find_totals_index:find_totals_end] - - # Remove all company names. - for val in ufthermal: - if ":" in val: - i = ufthermal.index(val) - del ufthermal[i : i + 3] - - formatted_thermal = chunker([floater(item) for item in ufthermal]) - mapped_totals = [total_mapping.get(x, x) for x in total_data] - formatted_totals = chunker([floater(item) for item in mapped_totals]) + INIT_ROWS_TO_DROP = 26 + data = data[INIT_ROWS_TO_DROP:] + + def format_row(row: list[str]) -> list[str]: + # Case Grupo: X + match_grupo = len(row) == 2 and row[0] == "" and "grupo" in row[1].lower() + # Case Empresa: X + match_empresa = ( + len(row) == 3 + and all(c == "" for c in row[:2]) + and "empresa" in row[2].lower() + ) + # Case Unit: X + match_unit = len(row) == 27 and all(c == "" for c in row[:2]) + + if match_grupo: + return [row[1]] + [""] * 24 + elif match_empresa: + return [row[2]] + [""] * 24 + elif match_unit: + return row[2:] + else: + raise ValueError(f"Unexpected row format: {row}") - return {"totals": formatted_totals, "thermal": formatted_thermal} + data = [format_row(row) for row in data] + return data -def data_parser(formatted_data): +def correct_solar_production(production: pd.DataFrame) -> pd.DataFrame: """ - Converts formatted data into a pandas dataframe. - Removes any empty rows. - Returns a DataFrame. + Solar production is not reported when it's zero. """ - - hours = list(range(1, 24)) + [0] + [25, 26] - dft = pd.DataFrame(formatted_data, index=hours) - - dft = dft.drop(dft.index[[-1, -2]]) - dft = dft.replace("", np.nan) - dft = dft.dropna(how="all") - - return dft - - -def thermal_production(df, logger: Logger) -> list[dict]: + if production.solar.isnull().all() or production.solar.notnull().all(): + return production + production = production.copy() + null_production_index = production[production.solar.isnull()].index + max_non_null_solar_idx = production.solar.last_valid_index() + indices_to_set_to_zero = [ + idx for idx in null_production_index if idx < max_non_null_solar_idx + ] + # Replace all NaN values with 0 up to the first non-null value + production.loc[indices_to_set_to_zero] = 0 + return production + + +def extract_renewable_production(data: list[list[str]], dt: datetime) -> pd.DataFrame: """ - Takes DataFrame and finds thermal generation for each hour. - Removes any non generating plants then maps plants to type. + Extract renewable production data from the total rows. """ - - therms = [] - unmapped = set() - for hour in df.index.values: - dt = hour - currentt = df.loc[[hour]] - - # Create current plant output. - tp = {} - for item in list(df): - v = currentt.iloc[0][item] - tp[item] = v - - current_plants = {k: tp[k] for k in tp if not isnan(tp[k])} - - for plant in current_plants: - if plant not in thermal_plants: - unmapped.add(plant) - - mapped_plants = [ - (thermal_plants.get(plant, "unknown"), val) - for plant, val in current_plants.items() - ] - - thermalDict = defaultdict(lambda: 0.0) - - # Sum values for duplicate keys. - for key, val in mapped_plants: - thermalDict[key] += val - - thermalDict["datetime"] = dt - thermalDict = dict(thermalDict) - therms.append(thermalDict) - - for plant in unmapped: - logger.warning( - f"{plant} is missing from the DO plant mapping!", - extra={"key": "DO"}, - ) - - return therms - - -def total_production(df) -> list[dict]: - """Takes DataFrame and finds generation totals for each hour.""" - - vals = [] - # The Dominican Republic does not observe daylight savings time. - for hour in df.index.values: - dt = hour - current = df.loc[[hour]] - hydro = current.iloc[0]["Hydro"] - wind = current.iloc[0]["Wind"] - solar = current.iloc[0]["Solar"] - if wind > -10: - wind = max(wind, 0) - - # Wind and hydro totals do not always update exactly on the new hour. - # In this case we set them to None because they are unknown rather than zero. - if isnan(wind): - wind = None - if isnan(hydro): - hydro = None - - prod = {"wind": wind, "hydro": hydro, "solar": solar, "datetime": dt} - vals.append(prod) - - return vals - - -def merge_production(thermal, total) -> list[dict]: + renewable_indices = [ + i for i, row in enumerate(data) if row[0] in TOTAL_RENEWABLES_MAPPING + ] + renewable_data = [] + for i in renewable_indices: + row = data[i] + renewable_data.append([TOTAL_RENEWABLES_MAPPING[row[0]]] + row[1:]) + df = pd.DataFrame(renewable_data, columns=["mode"] + list(range(1, 25))) + # pivot to have hours as index and mode as columns + df = df.set_index("mode").T + df.index = [get_datetime_from_hour(dt, hour) for hour in df.index] + df.index.name = "datetime" + # Convert to numeric + df = df.apply(pd.to_numeric) + df = correct_solar_production(df) + return df + + +def extract_thermal_production(data: list[list[str]], dt: datetime) -> pd.DataFrame: """ - Takes thermal generation and total generation and merges them using 'datetime' key. + Extract thermal production from individual power plants. """ - - d = defaultdict(dict) - for each in (thermal, total): - for elem in each: - d[elem["datetime"]].update(elem) - - final = sorted(d.values(), key=itemgetter("datetime")) - - def get_datetime(hour): - return datetime.now(tz=ZoneInfo("America/Dominica")).replace( - hour=0, minute=0, second=0, microsecond=0 - ) + timedelta(hours=int(hour) - 1) - - for item in final: - i = item["datetime"] - j = get_datetime(i) - item["datetime"] = j - - return final + thermal_indices = [i for i, row in enumerate(data) if row[0] in THERMAL_PLANTS] + thermal_data = [] + for i in thermal_indices: + row = data[i] + thermal_data.append([THERMAL_PLANTS.get(row[0], "unknown")] + row[1:]) + df = pd.DataFrame(thermal_data, columns=["mode"] + list(range(1, 25))) + # Convert numeric + df = df.apply(pd.to_numeric, errors="ignore") + # Group by sum per mode + df = df.groupby("mode").sum(min_count=1) + # pivot to have hours as index and mode as columns + df = df.T + df.index = [get_datetime_from_hour(dt, hour) for hour in df.index] + df.index.name = "datetime" + return df def fetch_production( - zone_key: str = "DO", + zone_key: ZoneKey = ZoneKey("DO"), session: Session | None = None, target_datetime: datetime | None = None, logger: Logger = getLogger(__name__), @@ -294,38 +255,28 @@ def fetch_production( if target_datetime: raise NotImplementedError("This parser is not yet able to parse past dates") - dat = data_formatter(get_data(session=session)) - tot = data_parser(dat["totals"]) - th = data_parser(dat["thermal"]) - thermal = thermal_production(th, logger) - total = total_production(tot) - merge = merge_production(thermal, total) - - production_mix_by_hour = [] - for hour in merge: - production_mix = { - "zoneKey": zone_key, - "datetime": hour["datetime"], - "production": { - "biomass": hour.get("biomass", 0.0), - "coal": hour.get("coal", 0.0), - "gas": hour.get("gas", 0.0), - "hydro": hour.get("hydro", 0.0), - "nuclear": 0.0, - "oil": hour.get("oil", 0.0), - "solar": hour.get("solar", 0.0), - "wind": hour.get("wind", 0.0), - "geothermal": 0.0, - "unknown": hour.get("unknown", 0.0), - }, - "storage": { - "hydro": None, - }, - "source": "oc.org.do", - } - production_mix_by_hour.append(production_mix) - - return production_mix_by_hour + now = datetime.now(tz=ZoneInfo("America/Dominica")).replace( + hour=0, minute=0, second=0, microsecond=0 + ) + + data = data_formatter(get_data(session=session)) + renewable_production = extract_renewable_production(data, now) + thermal_production = extract_thermal_production(data, now) + production = pd.concat([renewable_production, thermal_production], axis=1) + # only keep rows with at least one non-null value + production = production.dropna(how="all") + + production_list = ProductionBreakdownList(logger) + for ts, mix in production.iterrows(): + production_mix = ProductionMix(**mix.to_dict()) + production_list.append( + zoneKey=zone_key, + datetime=ts.to_pydatetime(), + source=DO_SOURCE, + production=production_mix, + ) + + return production_list.to_list() if __name__ == "__main__": diff --git a/parsers/test/snapshots/snap_test_CNDC.py b/parsers/test/snapshots/snap_test_CNDC.py index 555fd04f91..c770bbbeb9 100644 --- a/parsers/test/snapshots/snap_test_CNDC.py +++ b/parsers/test/snapshots/snap_test_CNDC.py @@ -155,9 +155,9 @@ { "datetime": "2023-12-20T00:00:00-04:00", "production": { - "biomass": 0.0, + "biomass": 0, "hydro": 384.86, - "solar": 0.0, + "solar": 0, "unknown": 744.75, "wind": 45.39, }, @@ -168,9 +168,9 @@ { "datetime": "2023-12-20T01:00:00-04:00", "production": { - "biomass": 0.0, + "biomass": 0, "hydro": 402.78, - "solar": 0.0, + "solar": 0, "unknown": 696.4, "wind": 47.16, }, @@ -181,9 +181,9 @@ { "datetime": "2023-12-20T02:00:00-04:00", "production": { - "biomass": 0.0, + "biomass": 0, "hydro": 406.37, - "solar": 0.0, + "solar": 0, "unknown": 661.4, "wind": 48.76, }, @@ -194,9 +194,9 @@ { "datetime": "2023-12-20T03:00:00-04:00", "production": { - "biomass": 0.0, + "biomass": 0, "hydro": 380.18, - "solar": 0.0, + "solar": 0, "unknown": 662.76, "wind": 47.33, }, @@ -207,10 +207,10 @@ { "datetime": "2023-12-20T04:00:00-04:00", "production": { - "biomass": 0.0, + "biomass": 0, "hydro": 379.26, - "solar": 0.0, - "unknown": 661.1500000000001, + "solar": 0, + "unknown": 661.15, "wind": 51.32, }, "source": "cndc.bo", @@ -220,7 +220,7 @@ { "datetime": "2023-12-20T05:00:00-04:00", "production": { - "biomass": 0.0, + "biomass": 0, "hydro": 350.42, "solar": 0.39, "unknown": 647.5, @@ -233,7 +233,7 @@ { "datetime": "2023-12-20T06:00:00-04:00", "production": { - "biomass": 0.0, + "biomass": 0, "hydro": 422.66, "solar": 13.45, "unknown": 670.38, @@ -246,7 +246,7 @@ { "datetime": "2023-12-20T07:00:00-04:00", "production": { - "biomass": 0.0, + "biomass": 0, "hydro": 500.35, "solar": 47.85, "unknown": 717.29, @@ -259,7 +259,7 @@ { "datetime": "2023-12-20T08:00:00-04:00", "production": { - "biomass": 0.0, + "biomass": 0, "hydro": 505.62, "solar": 90.01, "unknown": 769.71, @@ -272,10 +272,10 @@ { "datetime": "2023-12-20T09:00:00-04:00", "production": { - "biomass": 0.0, + "biomass": 0, "hydro": 523.88, "solar": 113.07, - "unknown": 845.1199999999999, + "unknown": 845.12, "wind": 32.52, }, "source": "cndc.bo", @@ -285,7 +285,7 @@ { "datetime": "2023-12-20T10:00:00-04:00", "production": { - "biomass": 0.0, + "biomass": 0, "hydro": 511.72, "solar": 124.97, "unknown": 920.99, diff --git a/parsers/test/test_DO.py b/parsers/test/test_DO.py new file mode 100644 index 0000000000..242df8b1be --- /dev/null +++ b/parsers/test/test_DO.py @@ -0,0 +1,167 @@ +import pandas as pd +import pytest +from numpy import nan + +from electricitymap.contrib.parsers.DO import correct_solar_production + + +@pytest.fixture +def production_df(): + d = { + "wind": { + pd.Timestamp("2024-04-18 00:00:00-0400", tz="America/Dominica"): 117.42, + pd.Timestamp("2024-04-18 01:00:00-0400", tz="America/Dominica"): 76.37, + pd.Timestamp("2024-04-18 02:00:00-0400", tz="America/Dominica"): 53.67, + pd.Timestamp("2024-04-18 03:00:00-0400", tz="America/Dominica"): 57.76, + pd.Timestamp("2024-04-18 04:00:00-0400", tz="America/Dominica"): 64.95, + pd.Timestamp("2024-04-18 05:00:00-0400", tz="America/Dominica"): 52.9, + pd.Timestamp("2024-04-18 06:00:00-0400", tz="America/Dominica"): 46.25, + pd.Timestamp("2024-04-18 07:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 08:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 09:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 10:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 11:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 12:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 13:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 14:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 15:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 16:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 17:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 18:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 19:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 20:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 21:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 22:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 23:00:00-0400", tz="America/Dominica"): nan, + }, + "solar": { + pd.Timestamp("2024-04-18 00:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 01:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 02:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 03:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 04:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 05:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 06:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 07:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 08:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 09:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 10:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 11:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 12:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 13:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 14:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 15:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 16:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 17:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 18:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 19:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 20:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 21:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 22:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 23:00:00-0400", tz="America/Dominica"): nan, + }, + "hydro": { + pd.Timestamp("2024-04-18 00:00:00-0400", tz="America/Dominica"): 144.47, + pd.Timestamp("2024-04-18 01:00:00-0400", tz="America/Dominica"): 75.34, + pd.Timestamp("2024-04-18 02:00:00-0400", tz="America/Dominica"): 72.94, + pd.Timestamp("2024-04-18 03:00:00-0400", tz="America/Dominica"): 84.23, + pd.Timestamp("2024-04-18 04:00:00-0400", tz="America/Dominica"): 84.52, + pd.Timestamp("2024-04-18 05:00:00-0400", tz="America/Dominica"): 84.68, + pd.Timestamp("2024-04-18 06:00:00-0400", tz="America/Dominica"): 87.36, + pd.Timestamp("2024-04-18 07:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 08:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 09:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 10:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 11:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 12:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 13:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 14:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 15:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 16:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 17:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 18:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 19:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 20:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 21:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 22:00:00-0400", tz="America/Dominica"): nan, + pd.Timestamp("2024-04-18 23:00:00-0400", tz="America/Dominica"): nan, + }, + } + return pd.DataFrame(d) + + +def test_correct_solar_production_all_nan(production_df): + corrected_df = correct_solar_production(production_df) + assert corrected_df["solar"].isna().all() + + +def test_correct_solar_production_nan_then_prod(production_df): + production_df["solar"].loc[ + pd.Timestamp("2024-04-18 06:00:00-0400", tz="America/Dominica") + ] = 12 + corrected_df = correct_solar_production(production_df) + assert ( + corrected_df["solar"].loc[ + pd.Timestamp("2024-04-18 06:00:00-0400", tz="America/Dominica") + ] + == 12 + ) + assert all( + corrected_df["solar"].loc[ + : pd.Timestamp("2024-04-18 05:00:00-0400", tz="America/Dominica") + ] + == 0 + ) + assert all( + corrected_df["solar"] + .loc[pd.Timestamp("2024-04-18 07:00:00-0400", tz="America/Dominica") :] + .isnull() + ) + + +def test_correct_solar_production_prod_then_nan(production_df): + production_df["solar"].loc[ + : pd.Timestamp("2024-04-18 03:00:00-0400", tz="America/Dominica") + ] = 12 + corrected_df = correct_solar_production(production_df) + assert all( + corrected_df["solar"].loc[ + : pd.Timestamp("2024-04-18 03:00:00-0400", tz="America/Dominica") + ] + == 12 + ) + + +def test_correct_solar_production_prod_then_nan_then_prod(production_df): + production_df["solar"].loc[ + : pd.Timestamp("2024-04-18 03:00:00-0400", tz="America/Dominica") + ] = 12 + production_df["solar"].loc[ + pd.Timestamp("2024-04-18 06:00:00-0400", tz="America/Dominica") + ] = 14 + corrected_df = correct_solar_production(production_df) + assert all( + corrected_df["solar"].loc[ + : pd.Timestamp("2024-04-18 03:00:00-0400", tz="America/Dominica") + ] + == 12 + ) + assert ( + corrected_df["solar"].loc[ + pd.Timestamp("2024-04-18 06:00:00-0400", tz="America/Dominica") + ] + == 14 + ) + assert all( + corrected_df["solar"].loc[ + pd.Timestamp( + "2024-04-18 04:00:00-0400", tz="America/Dominica" + ) : pd.Timestamp("2024-04-18 05:00:00-0400", tz="America/Dominica") + ] + == 0 + ) + assert all( + corrected_df["solar"] + .loc[pd.Timestamp("2024-04-18 07:00:00-0400", tz="America/Dominica") :] + .isnull() + )