From d555e592fb922454182cd6b4929f583a7c0d00d8 Mon Sep 17 00:00:00 2001
From: Pierre Segonne <32778266+pierresegonne@users.noreply.github.com>
Date: Tue, 23 Apr 2024 09:33:32 +0200
Subject: [PATCH] chore(DO): update parser (#6675)

* chore(DO): update parser

* fix NaN checks

* fix type error

* format

* update snapshots

---------

Co-authored-by: Pierre Segonne <pierre.segonne@electricitymaps.com>
Co-authored-by: Viktor Andersson <30777521+VIKTORVAV99@users.noreply.github.com>
---
 electricitymap/contrib/lib/models/events.py   |  14 +-
 .../contrib/lib/tests/test_events.py          |  16 +
 parsers/DO.py                                 | 307 ++++++++----------
 parsers/test/snapshots/snap_test_CNDC.py      |  36 +-
 parsers/test/test_DO.py                       | 167 ++++++++++
 5 files changed, 342 insertions(+), 198 deletions(-)
 create mode 100644 parsers/test/test_DO.py

diff --git a/electricitymap/contrib/lib/models/events.py b/electricitymap/contrib/lib/models/events.py
index 4dd4317efd..8dd672a373 100644
--- a/electricitymap/contrib/lib/models/events.py
+++ b/electricitymap/contrib/lib/models/events.py
@@ -30,7 +30,7 @@ def _none_safe_round(value: float | None, precision: int = 6) -> float | None:
     If the value is None, it is returned as is.
     The default precision is 6 decimal places, which gives us a precision of 1 W.
     """
-    return None if value is None else round(value, precision)
+    return None if value is None or math.isnan(value) else round(value, precision)
 
 
 class Mix(BaseModel, ABC):
@@ -107,7 +107,9 @@ def __init__(self, **data: Any):
         for attr, value in data.items():
             if value is not None and value < 0:
                 self._corrected_negative_values.add(attr)
-                self.__setattr__(attr, None)
+                value = None
+            # Ensure that the value is rounded to 6 decimal places and set to None if it is NaN.
+            self.__setattr__(attr, value)
 
     def dict(  # noqa: A003
         self,
@@ -232,6 +234,14 @@ class StorageMix(Mix):
     battery: float | None = None
     hydro: float | None = None
 
+    def __init__(self, **data: Any):
+        """
+        Overriding the constructor to check for NaN values and set them to None.
+        """
+        super().__init__(**data)
+        for attr, value in data.items():
+            self.__setattr__(attr, value)
+
     def __setattr__(self, name: str, value: float | None) -> None:
         """
         Overriding the setattr method to raise an error if the mode is unknown.
diff --git a/electricitymap/contrib/lib/tests/test_events.py b/electricitymap/contrib/lib/tests/test_events.py
index c3c0d27834..96a17130e2 100644
--- a/electricitymap/contrib/lib/tests/test_events.py
+++ b/electricitymap/contrib/lib/tests/test_events.py
@@ -763,6 +763,14 @@ def test_production_with_nan_using_numpy(self):
         assert mix.wind == 10
         assert mix.corrected_negative_modes == set()
 
+    def test_production_with_nan_init(self):
+        mix = ProductionMix(wind=math.nan)
+        assert mix.wind is None
+
+    def test_production_with_nan_using_numpy_init(self):
+        mix = ProductionMix(wind=np.nan)
+        assert mix.wind is None
+
     def test_storage(self):
         mix = StorageMix()
         mix.add_value("hydro", 10)
@@ -804,6 +812,14 @@ def test_storage_with_nan_using_numpy(self):
         mix.add_value("hydro", np.nan)
         assert mix.hydro == -5
 
+    def test_storage_with_nan_init(self):
+        mix = StorageMix(hydro=math.nan)
+        assert mix.hydro is None
+
+    def test_storage_with_nan_using_numpy_init(self):
+        mix = StorageMix(hydro=np.nan)
+        assert mix.hydro is None
+
 
 class TestMixUpdate:
     def test_update_production(self):
diff --git a/parsers/DO.py b/parsers/DO.py
index 49d311b93b..3c5e119bdb 100644
--- a/parsers/DO.py
+++ b/parsers/DO.py
@@ -1,28 +1,32 @@
 #!/usr/bin/env python3
 
-from collections import defaultdict
 from datetime import datetime, timedelta
 from logging import Logger, getLogger
-from math import isnan
-from operator import itemgetter
 from zoneinfo import ZoneInfo
 
-import numpy as np
 import pandas as pd
 from bs4 import BeautifulSoup
 from requests import Session
 
+from electricitymap.contrib.lib.models.event_lists import ProductionBreakdownList
+from electricitymap.contrib.lib.models.events import ProductionMix
+from electricitymap.contrib.lib.types import ZoneKey
+
 # This parser gets hourly electricity generation data from oc.org.do for the Dominican Republic.
 # The data is in MWh but since it is updated hourly we can view it as MW.
 # Solar generation now has some data available but multiple projects are planned/under construction.
 
-url = "https://apps.oc.org.do/reportesgraficos/reportepostdespacho.aspx"
+DO_SOURCE = "oc.org.do"
+URL = "https://apps.oc.org.do/reportesgraficos/reportepostdespacho.aspx"
+
+TOTAL_RENEWABLES_MAPPING = {
+    "Total E\xf3lico": "wind",
+    "Total Hidroel\xe9ctrica": "hydro",
+    "Total Solar": "solar",
+}
 
-total_mapping = {
+TOTAL_MAPPING = {
     "Total T\xe9rmico": "Thermal",
-    "Total E\xf3lico": "Wind",
-    "Total Hidroel\xe9ctrica": "Hydro",
-    "Total Solar": "Solar",
     "Total Generado": "Generated",
 }
 
@@ -30,7 +34,7 @@
 # http://www.sie.gob.do/images/Estadisticas/MEM/GeneracionDiariaEnero2017/
 # Reporte_diario_de_generacion_31_enero_2017_merged2.pdf
 
-thermal_plants = {
+THERMAL_PLANTS = {
     "AES ANDRES": "gas",
     "BARAHONA CARBON": "coal",
     "BERSAL": "oil",
@@ -87,23 +91,30 @@
 }
 
 
-def get_data(session: Session | None = None) -> list:
+def get_datetime_from_hour(now: datetime, hour: int) -> datetime:
+    return now + timedelta(hours=int(hour) - 1)
+
+
+def get_data(session: Session | None = None) -> list[list[str]]:
     """
     Makes a request to source url.
     Finds main table and creates a list of all table elements in string format.
     """
 
-    data = []
     s = session or Session()
-    data_req = s.get(url)
+    data_req = s.get(URL)
     soup = BeautifulSoup(data_req.content, "lxml")
 
     tbs = soup.find("table", id="PostdespachoUnidadesTermicasGrid_DXMainTable")
-    rows = tbs.find_all("td")
+    rows = tbs.find_all("tr")
 
+    data = []
     for row in rows:
-        num = row.getText().strip()
-        data.append(str(num))
+        row_data = []
+        cols = row.find_all("td")
+        for col in cols:
+            row_data.append(str(col.getText().strip()))
+        data.append(row_data)
 
     return data
 
@@ -141,151 +152,101 @@ def chunker(big_lst) -> dict:
     return chunked_list
 
 
-def data_formatter(data) -> dict:
+def data_formatter(data: list[list[str]]) -> list[list[str]]:
     """
-    Takes data and finds relevant sections.
-    Formats and breaks data into usable parts.
+    Aligns the tabular data to a standard format: (ID, hour_1, hour_2, ..., hour_24)
     """
 
-    find_thermal_index = data.index("GRUPO: T\xe9rmica")
-    find_totals_index = data.index("Total T\xe9rmico")
-    find_totals_end = data.index("Total Programado")
-
-    ufthermal = data[find_thermal_index + 3 : find_totals_index - 59]
-    total_data = data[find_totals_index:find_totals_end]
-
-    # Remove all company names.
-    for val in ufthermal:
-        if ":" in val:
-            i = ufthermal.index(val)
-            del ufthermal[i : i + 3]
-
-    formatted_thermal = chunker([floater(item) for item in ufthermal])
-    mapped_totals = [total_mapping.get(x, x) for x in total_data]
-    formatted_totals = chunker([floater(item) for item in mapped_totals])
+    INIT_ROWS_TO_DROP = 26
+    data = data[INIT_ROWS_TO_DROP:]
+
+    def format_row(row: list[str]) -> list[str]:
+        # Case Grupo: X
+        match_grupo = len(row) == 2 and row[0] == "" and "grupo" in row[1].lower()
+        # Case Empresa: X
+        match_empresa = (
+            len(row) == 3
+            and all(c == "" for c in row[:2])
+            and "empresa" in row[2].lower()
+        )
+        # Case Unit: X
+        match_unit = len(row) == 27 and all(c == "" for c in row[:2])
+
+        if match_grupo:
+            return [row[1]] + [""] * 24
+        elif match_empresa:
+            return [row[2]] + [""] * 24
+        elif match_unit:
+            return row[2:]
+        else:
+            raise ValueError(f"Unexpected row format: {row}")
 
-    return {"totals": formatted_totals, "thermal": formatted_thermal}
+    data = [format_row(row) for row in data]
+    return data
 
 
-def data_parser(formatted_data):
+def correct_solar_production(production: pd.DataFrame) -> pd.DataFrame:
     """
-    Converts formatted data into a pandas dataframe.
-    Removes any empty rows.
-    Returns a DataFrame.
+    Solar production is not reported when it's zero.
     """
-
-    hours = list(range(1, 24)) + [0] + [25, 26]
-    dft = pd.DataFrame(formatted_data, index=hours)
-
-    dft = dft.drop(dft.index[[-1, -2]])
-    dft = dft.replace("", np.nan)
-    dft = dft.dropna(how="all")
-
-    return dft
-
-
-def thermal_production(df, logger: Logger) -> list[dict]:
+    if production.solar.isnull().all() or production.solar.notnull().all():
+        return production
+    production = production.copy()
+    null_production_index = production[production.solar.isnull()].index
+    max_non_null_solar_idx = production.solar.last_valid_index()
+    indices_to_set_to_zero = [
+        idx for idx in null_production_index if idx < max_non_null_solar_idx
+    ]
+    # Zero-fill only the solar gaps that precede the last reported solar value
+    production.loc[indices_to_set_to_zero, "solar"] = 0
+    return production
+
+
+def extract_renewable_production(data: list[list[str]], dt: datetime) -> pd.DataFrame:
     """
-    Takes DataFrame and finds thermal generation for each hour.
-    Removes any non generating plants then maps plants to type.
+    Extract renewable production data from the total rows.
     """
-
-    therms = []
-    unmapped = set()
-    for hour in df.index.values:
-        dt = hour
-        currentt = df.loc[[hour]]
-
-        # Create current plant output.
-        tp = {}
-        for item in list(df):
-            v = currentt.iloc[0][item]
-            tp[item] = v
-
-        current_plants = {k: tp[k] for k in tp if not isnan(tp[k])}
-
-        for plant in current_plants:
-            if plant not in thermal_plants:
-                unmapped.add(plant)
-
-        mapped_plants = [
-            (thermal_plants.get(plant, "unknown"), val)
-            for plant, val in current_plants.items()
-        ]
-
-        thermalDict = defaultdict(lambda: 0.0)
-
-        # Sum values for duplicate keys.
-        for key, val in mapped_plants:
-            thermalDict[key] += val
-
-        thermalDict["datetime"] = dt
-        thermalDict = dict(thermalDict)
-        therms.append(thermalDict)
-
-    for plant in unmapped:
-        logger.warning(
-            f"{plant} is missing from the DO plant mapping!",
-            extra={"key": "DO"},
-        )
-
-    return therms
-
-
-def total_production(df) -> list[dict]:
-    """Takes DataFrame and finds generation totals for each hour."""
-
-    vals = []
-    # The Dominican Republic does not observe daylight savings time.
-    for hour in df.index.values:
-        dt = hour
-        current = df.loc[[hour]]
-        hydro = current.iloc[0]["Hydro"]
-        wind = current.iloc[0]["Wind"]
-        solar = current.iloc[0]["Solar"]
-        if wind > -10:
-            wind = max(wind, 0)
-
-        # Wind and hydro totals do not always update exactly on the new hour.
-        # In this case we set them to None because they are unknown rather than zero.
-        if isnan(wind):
-            wind = None
-        if isnan(hydro):
-            hydro = None
-
-        prod = {"wind": wind, "hydro": hydro, "solar": solar, "datetime": dt}
-        vals.append(prod)
-
-    return vals
-
-
-def merge_production(thermal, total) -> list[dict]:
+    renewable_indices = [
+        i for i, row in enumerate(data) if row[0] in TOTAL_RENEWABLES_MAPPING
+    ]
+    renewable_data = []
+    for i in renewable_indices:
+        row = data[i]
+        renewable_data.append([TOTAL_RENEWABLES_MAPPING[row[0]]] + row[1:])
+    df = pd.DataFrame(renewable_data, columns=["mode"] + list(range(1, 25)))
+    # pivot to have hours as index and mode as columns
+    df = df.set_index("mode").T
+    df.index = [get_datetime_from_hour(dt, hour) for hour in df.index]
+    df.index.name = "datetime"
+    # Convert to numeric
+    df = df.apply(pd.to_numeric)
+    df = correct_solar_production(df)
+    return df
+
+
+def extract_thermal_production(data: list[list[str]], dt: datetime) -> pd.DataFrame:
     """
-    Takes thermal generation and total generation and merges them using 'datetime' key.
+    Extract thermal production from individual power plants.
     """
-
-    d = defaultdict(dict)
-    for each in (thermal, total):
-        for elem in each:
-            d[elem["datetime"]].update(elem)
-
-    final = sorted(d.values(), key=itemgetter("datetime"))
-
-    def get_datetime(hour):
-        return datetime.now(tz=ZoneInfo("America/Dominica")).replace(
-            hour=0, minute=0, second=0, microsecond=0
-        ) + timedelta(hours=int(hour) - 1)
-
-    for item in final:
-        i = item["datetime"]
-        j = get_datetime(i)
-        item["datetime"] = j
-
-    return final
+    thermal_indices = [i for i, row in enumerate(data) if row[0] in THERMAL_PLANTS]
+    thermal_data = []
+    for i in thermal_indices:
+        row = data[i]
+        thermal_data.append([THERMAL_PLANTS.get(row[0], "unknown")] + row[1:])
+    df = pd.DataFrame(thermal_data, columns=["mode"] + list(range(1, 25)))
+    # Convert numeric
+    df = df.apply(pd.to_numeric, errors="ignore")
+    # Group by sum per mode
+    df = df.groupby("mode").sum(min_count=1)
+    # pivot to have hours as index and mode as columns
+    df = df.T
+    df.index = [get_datetime_from_hour(dt, hour) for hour in df.index]
+    df.index.name = "datetime"
+    return df
 
 
 def fetch_production(
-    zone_key: str = "DO",
+    zone_key: ZoneKey = ZoneKey("DO"),
     session: Session | None = None,
     target_datetime: datetime | None = None,
     logger: Logger = getLogger(__name__),
@@ -294,38 +255,28 @@ def fetch_production(
     if target_datetime:
         raise NotImplementedError("This parser is not yet able to parse past dates")
 
-    dat = data_formatter(get_data(session=session))
-    tot = data_parser(dat["totals"])
-    th = data_parser(dat["thermal"])
-    thermal = thermal_production(th, logger)
-    total = total_production(tot)
-    merge = merge_production(thermal, total)
-
-    production_mix_by_hour = []
-    for hour in merge:
-        production_mix = {
-            "zoneKey": zone_key,
-            "datetime": hour["datetime"],
-            "production": {
-                "biomass": hour.get("biomass", 0.0),
-                "coal": hour.get("coal", 0.0),
-                "gas": hour.get("gas", 0.0),
-                "hydro": hour.get("hydro", 0.0),
-                "nuclear": 0.0,
-                "oil": hour.get("oil", 0.0),
-                "solar": hour.get("solar", 0.0),
-                "wind": hour.get("wind", 0.0),
-                "geothermal": 0.0,
-                "unknown": hour.get("unknown", 0.0),
-            },
-            "storage": {
-                "hydro": None,
-            },
-            "source": "oc.org.do",
-        }
-        production_mix_by_hour.append(production_mix)
-
-    return production_mix_by_hour
+    now = datetime.now(tz=ZoneInfo("America/Dominica")).replace(
+        hour=0, minute=0, second=0, microsecond=0
+    )
+
+    data = data_formatter(get_data(session=session))
+    renewable_production = extract_renewable_production(data, now)
+    thermal_production = extract_thermal_production(data, now)
+    production = pd.concat([renewable_production, thermal_production], axis=1)
+    # only keep rows with at least one non-null value
+    production = production.dropna(how="all")
+
+    production_list = ProductionBreakdownList(logger)
+    for ts, mix in production.iterrows():
+        production_mix = ProductionMix(**mix.to_dict())
+        production_list.append(
+            zoneKey=zone_key,
+            datetime=ts.to_pydatetime(),
+            source=DO_SOURCE,
+            production=production_mix,
+        )
+
+    return production_list.to_list()
 
 
 if __name__ == "__main__":
diff --git a/parsers/test/snapshots/snap_test_CNDC.py b/parsers/test/snapshots/snap_test_CNDC.py
index 555fd04f91..c770bbbeb9 100644
--- a/parsers/test/snapshots/snap_test_CNDC.py
+++ b/parsers/test/snapshots/snap_test_CNDC.py
@@ -155,9 +155,9 @@
     {
         "datetime": "2023-12-20T00:00:00-04:00",
         "production": {
-            "biomass": 0.0,
+            "biomass": 0,
             "hydro": 384.86,
-            "solar": 0.0,
+            "solar": 0,
             "unknown": 744.75,
             "wind": 45.39,
         },
@@ -168,9 +168,9 @@
     {
         "datetime": "2023-12-20T01:00:00-04:00",
         "production": {
-            "biomass": 0.0,
+            "biomass": 0,
             "hydro": 402.78,
-            "solar": 0.0,
+            "solar": 0,
             "unknown": 696.4,
             "wind": 47.16,
         },
@@ -181,9 +181,9 @@
     {
         "datetime": "2023-12-20T02:00:00-04:00",
         "production": {
-            "biomass": 0.0,
+            "biomass": 0,
             "hydro": 406.37,
-            "solar": 0.0,
+            "solar": 0,
             "unknown": 661.4,
             "wind": 48.76,
         },
@@ -194,9 +194,9 @@
     {
         "datetime": "2023-12-20T03:00:00-04:00",
         "production": {
-            "biomass": 0.0,
+            "biomass": 0,
             "hydro": 380.18,
-            "solar": 0.0,
+            "solar": 0,
             "unknown": 662.76,
             "wind": 47.33,
         },
@@ -207,10 +207,10 @@
     {
         "datetime": "2023-12-20T04:00:00-04:00",
         "production": {
-            "biomass": 0.0,
+            "biomass": 0,
             "hydro": 379.26,
-            "solar": 0.0,
-            "unknown": 661.1500000000001,
+            "solar": 0,
+            "unknown": 661.15,
             "wind": 51.32,
         },
         "source": "cndc.bo",
@@ -220,7 +220,7 @@
     {
         "datetime": "2023-12-20T05:00:00-04:00",
         "production": {
-            "biomass": 0.0,
+            "biomass": 0,
             "hydro": 350.42,
             "solar": 0.39,
             "unknown": 647.5,
@@ -233,7 +233,7 @@
     {
         "datetime": "2023-12-20T06:00:00-04:00",
         "production": {
-            "biomass": 0.0,
+            "biomass": 0,
             "hydro": 422.66,
             "solar": 13.45,
             "unknown": 670.38,
@@ -246,7 +246,7 @@
     {
         "datetime": "2023-12-20T07:00:00-04:00",
         "production": {
-            "biomass": 0.0,
+            "biomass": 0,
             "hydro": 500.35,
             "solar": 47.85,
             "unknown": 717.29,
@@ -259,7 +259,7 @@
     {
         "datetime": "2023-12-20T08:00:00-04:00",
         "production": {
-            "biomass": 0.0,
+            "biomass": 0,
             "hydro": 505.62,
             "solar": 90.01,
             "unknown": 769.71,
@@ -272,10 +272,10 @@
     {
         "datetime": "2023-12-20T09:00:00-04:00",
         "production": {
-            "biomass": 0.0,
+            "biomass": 0,
             "hydro": 523.88,
             "solar": 113.07,
-            "unknown": 845.1199999999999,
+            "unknown": 845.12,
             "wind": 32.52,
         },
         "source": "cndc.bo",
@@ -285,7 +285,7 @@
     {
         "datetime": "2023-12-20T10:00:00-04:00",
         "production": {
-            "biomass": 0.0,
+            "biomass": 0,
             "hydro": 511.72,
             "solar": 124.97,
             "unknown": 920.99,
diff --git a/parsers/test/test_DO.py b/parsers/test/test_DO.py
new file mode 100644
index 0000000000..242df8b1be
--- /dev/null
+++ b/parsers/test/test_DO.py
@@ -0,0 +1,167 @@
+import pandas as pd
+import pytest
+from numpy import nan
+
+from electricitymap.contrib.parsers.DO import correct_solar_production
+
+
+@pytest.fixture
+def production_df():
+    d = {
+        "wind": {
+            pd.Timestamp("2024-04-18 00:00:00-0400", tz="America/Dominica"): 117.42,
+            pd.Timestamp("2024-04-18 01:00:00-0400", tz="America/Dominica"): 76.37,
+            pd.Timestamp("2024-04-18 02:00:00-0400", tz="America/Dominica"): 53.67,
+            pd.Timestamp("2024-04-18 03:00:00-0400", tz="America/Dominica"): 57.76,
+            pd.Timestamp("2024-04-18 04:00:00-0400", tz="America/Dominica"): 64.95,
+            pd.Timestamp("2024-04-18 05:00:00-0400", tz="America/Dominica"): 52.9,
+            pd.Timestamp("2024-04-18 06:00:00-0400", tz="America/Dominica"): 46.25,
+            pd.Timestamp("2024-04-18 07:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 08:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 09:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 10:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 11:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 12:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 13:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 14:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 15:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 16:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 17:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 18:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 19:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 20:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 21:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 22:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 23:00:00-0400", tz="America/Dominica"): nan,
+        },
+        "solar": {
+            pd.Timestamp("2024-04-18 00:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 01:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 02:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 03:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 04:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 05:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 06:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 07:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 08:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 09:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 10:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 11:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 12:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 13:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 14:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 15:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 16:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 17:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 18:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 19:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 20:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 21:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 22:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 23:00:00-0400", tz="America/Dominica"): nan,
+        },
+        "hydro": {
+            pd.Timestamp("2024-04-18 00:00:00-0400", tz="America/Dominica"): 144.47,
+            pd.Timestamp("2024-04-18 01:00:00-0400", tz="America/Dominica"): 75.34,
+            pd.Timestamp("2024-04-18 02:00:00-0400", tz="America/Dominica"): 72.94,
+            pd.Timestamp("2024-04-18 03:00:00-0400", tz="America/Dominica"): 84.23,
+            pd.Timestamp("2024-04-18 04:00:00-0400", tz="America/Dominica"): 84.52,
+            pd.Timestamp("2024-04-18 05:00:00-0400", tz="America/Dominica"): 84.68,
+            pd.Timestamp("2024-04-18 06:00:00-0400", tz="America/Dominica"): 87.36,
+            pd.Timestamp("2024-04-18 07:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 08:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 09:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 10:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 11:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 12:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 13:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 14:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 15:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 16:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 17:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 18:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 19:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 20:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 21:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 22:00:00-0400", tz="America/Dominica"): nan,
+            pd.Timestamp("2024-04-18 23:00:00-0400", tz="America/Dominica"): nan,
+        },
+    }
+    return pd.DataFrame(d)
+
+
+def test_correct_solar_production_all_nan(production_df):
+    corrected_df = correct_solar_production(production_df)
+    assert corrected_df["solar"].isna().all()
+
+
+def test_correct_solar_production_nan_then_prod(production_df):
+    production_df.loc[
+        pd.Timestamp("2024-04-18 06:00:00-0400", tz="America/Dominica"), "solar"
+    ] = 12
+    corrected_df = correct_solar_production(production_df)
+    assert (
+        corrected_df["solar"].loc[
+            pd.Timestamp("2024-04-18 06:00:00-0400", tz="America/Dominica")
+        ]
+        == 12
+    )
+    assert all(
+        corrected_df["solar"].loc[
+            : pd.Timestamp("2024-04-18 05:00:00-0400", tz="America/Dominica")
+        ]
+        == 0
+    )
+    assert all(
+        corrected_df["solar"]
+        .loc[pd.Timestamp("2024-04-18 07:00:00-0400", tz="America/Dominica") :]
+        .isnull()
+    )
+
+
+def test_correct_solar_production_prod_then_nan(production_df):
+    production_df.loc[
+        : pd.Timestamp("2024-04-18 03:00:00-0400", tz="America/Dominica"), "solar"
+    ] = 12
+    corrected_df = correct_solar_production(production_df)
+    assert all(
+        corrected_df["solar"].loc[
+            : pd.Timestamp("2024-04-18 03:00:00-0400", tz="America/Dominica")
+        ]
+        == 12
+    )
+
+
+def test_correct_solar_production_prod_then_nan_then_prod(production_df):
+    production_df.loc[
+        : pd.Timestamp("2024-04-18 03:00:00-0400", tz="America/Dominica"), "solar"
+    ] = 12
+    production_df.loc[
+        pd.Timestamp("2024-04-18 06:00:00-0400", tz="America/Dominica"), "solar"
+    ] = 14
+    corrected_df = correct_solar_production(production_df)
+    assert all(
+        corrected_df["solar"].loc[
+            : pd.Timestamp("2024-04-18 03:00:00-0400", tz="America/Dominica")
+        ]
+        == 12
+    )
+    assert (
+        corrected_df["solar"].loc[
+            pd.Timestamp("2024-04-18 06:00:00-0400", tz="America/Dominica")
+        ]
+        == 14
+    )
+    assert all(
+        corrected_df["solar"].loc[
+            pd.Timestamp(
+                "2024-04-18 04:00:00-0400", tz="America/Dominica"
+            ) : pd.Timestamp("2024-04-18 05:00:00-0400", tz="America/Dominica")
+        ]
+        == 0
+    )
+    assert all(
+        corrected_df["solar"]
+        .loc[pd.Timestamp("2024-04-18 07:00:00-0400", tz="America/Dominica") :]
+        .isnull()
+    )