In [4]:
import contextlib
import logging
import math
from datetime import datetime, timedelta, timezone
from typing import Any

import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
from pydantic import BaseModel, ValidationError, root_validator, validator

#rom hetdesrun.utils import plotly_fig_to_json_dict

In [5]:
class ComponentInputValidationException(Exception):
    """In code input validation failures"""

    def __init__(
        self,
        *args: Any,
        invalid_component_inputs: list[str],
        error_code: int | str = "",
        **kwargs: Any
    ):
        raise ValueError("Hier könnte ihr Error stehen.")

In [108]:
class GapDetectionParameters(BaseModel):
    start_date: str  # pydantic kann auch direkt datetime, muss aber getestet werden
    end_date: str
    auto_stepsize: bool = True
    history_end_date: str = None
    step_size_str: str = None
    percentil: float = 0.95
    min_amount_datapoints: int
    interpolation_method: str = "nearest"
    fill_value: Any = None

    @validator(
        "start_date", "end_date"
    )  # TODO was ist mit dem Fall Attribut der Zeitreihe?
    def verify_date_strings(cls, date) -> datetime:
        date = datetime.fromisoformat(date).replace(tzinfo=timezone.utc)
        return date

    @validator("end_date")
    def verify_dates(cls, end_date, values: dict):
        start_date = values["start_date"]
        if start_date > end_date:
            raise ComponentInputValidationException(
                "The value start_date must not be later than the end_date, while it is "
                f"{start_date} > {end_date}.",
                error_code=422,
                invalid_component_inputs=["end_date_str", "start_date_str"],
            )
        return values

    @validator("history_end_date")
    def verify_history_end_date(cls, history_end_date, values: dict) -> datetime | None:
        start_date = values["start_date"]
        end_date = values["end_date"]
        if history_end_date is not None:
            try:
                history_end_date = datetime.fromisoformat(history_end_date).replace(
                    tzinfo=timezone.utc
                )
            except ValueError as err:
                raise ComponentInputValidationException(
                    "The date in history_end_date has to be formatted in iso format to allow "
                    "conversion to datetime type for gap detection.",
                    error_code=422,
                    invalid_component_inputs=["history_end_date_str"],
                ) from err

            if start_date > history_end_date:
                raise ComponentInputValidationException(
                    "The value history_end_date has to be inbetween start_date and end_date, while "
                    f"it is {history_end_date} < {start_date}.",
                    error_code=422,
                    invalid_component_inputs=["history_end_date_str"],
                )
            if end_date < history_end_date:
                raise ComponentInputValidationException(
                    "The value history_end_date has to be inbetween start_date and end_date, while "
                    f"it is {history_end_date} > {end_date}.",
                    error_code=422,
                    invalid_component_inputs=["history_end_date_str"],
                )
        else:
            history_end_date = None
        return history_end_date

    @validator("step_size_str")  # TODO auf freq string überprüfen
    def verify_step_size(cls, step_size, values: dict) -> str:
        auto_stepsize = values["auto_stepsize"]
        if (auto_stepsize is False) and (step_size is None):
            raise ComponentInputValidationException(
                "A step_size is required for gap detection, if it is not automatically determined.",
                error_code=422,
                invalid_component_inputs=["step_size_str"],
            )
        return step_size

    @validator("percentil")
    def verify_percentile(cls, percentil) -> float:
        if (percentil < 0) or (percentil > 1):
            raise ComponentInputValidationException(
                "The percentil value has to be a non-negative float less or equal to 1.",
                error_code=422,
                invalid_component_inputs=["percentil"],
            )
        return percentil

    @validator("min_amount_datapoints")
    def verify_min_amount_datapoints(cls, min_amount) -> int:
        if min_amount < 0:
            raise ComponentInputValidationException(
                "The minimum amount of datapoints has to be a non-negative integer.",
                error_code=422,
                invalid_component_inputs=["min_amount_datapoints"],
            )
        return min_amount

In [54]:
def constrict_series_to_dates(
    timeseries_data: pd.Series | pd.DataFrame, start_date: str, end_date: str
) -> pd.Series | pd.DataFrame:
    return timeseries_data[
        (timeseries_data.index >= pd.to_datetime(start_date, utc=True))
        & (timeseries_data.index <= pd.to_datetime(end_date, utc=True))
    ]

In [8]:
def check_amount_datapoints(series:pd.Series, min_amount_datapoints:int):
    if len(series) < min_amount_datapoints:
            raise ComponentInputValidationException(
                f"The timeseries must contain at least {min_amount_datapoints} datapoints.",
                error_code=422,
                invalid_component_inputs=["timeseries"],
            )

In [9]:
def determine_timestep_gapsize_percentile(
    timeseries_data: pd.Series | pd.DataFrame, percentil:float,interpolation_method:str
) -> pd.Timedelta:
    gaps = timeseries_data.index.to_series().diff().dropna()

    percentile_gapsize = gaps.quantile(
        percentil, interpolation= interpolation_method
    )

    return percentile_gapsize

In [10]:
def freqstr2dateoffset(freqstr: str) -> pd.DateOffset:
    """Transform frequency string to Pandas DateOffset."""
    return pd.tseries.frequencies.to_offset(freqstr)

def freqstr2timedelta(freqstr: str) -> pd.Timedelta:
    """Transform frequency string to Pandas Timedelta."""
    try:
        return pd.to_timedelta(freqstr)
    except ValueError:
        return pd.to_timedelta(freqstr2dateoffset(freqstr))

In [11]:
def determine_gap_length(
    timeseries: pd.Series, stepsize=timedelta(minutes=1)
) -> pd.DataFrame:
    gaps = timeseries.index.to_series().diff().to_numpy()

    stepsize_seconds = stepsize.total_seconds()

    normalized_gaps = [
        pd.Timedelta(gap).total_seconds() / stepsize_seconds if pd.notna(gap) else None
        for gap in gaps
    ] #TODO in Doku erklären was eine Gap sein soll

    result_df = pd.DataFrame(
        {"value": timeseries.to_numpy(), "gap": normalized_gaps}, index=timeseries.index
    )

    return result_df

In [101]:
def check_add_boundary_dates(
    timeseries: pd.Series, start_date: datetime, end_date: datetime, fill_value=None
) -> pd.Series:
    if start_date not in timeseries.index:
        timeseries[start_date] = fill_value

    if end_date not in timeseries.index:
        timeseries[end_date] = fill_value

    timeseries = timeseries.sort_index()

    return timeseries

In [102]:
def return_gap_boundary_timestamps(frame_with_gapsizes: pd.DataFrame,series:pd.Series) -> pd.DataFrame:
    # Identify rows where gap is greater than 1
    large_gap_indices = frame_with_gapsizes[frame_with_gapsizes["gap"] > 1].index.to_numpy()
    # Extract the start and end timestamps of the gaps
    gap_starts = [
        frame_with_gapsizes.index[index - 1]
        for index, large_gap_index in enumerate(frame_with_gapsizes.index)
        if large_gap_index in large_gap_indices
    ]

    left_values = series[gap_starts].to_numpy()
    right_values = series[large_gap_indices].to_numpy()

    # Create a DataFrame to store the results
    result_df = pd.DataFrame(
        {"start": gap_starts,
         "end": large_gap_indices,
         "start_inclusive":True,
         "end_inclusive":True,
         "gap_size": large_gap_indices-gap_starts,
         "value_to_left":left_values,
         "value_to_right":right_values,
         "mean_left_right":(left_values+right_values)/2
         }
         )

    return result_df

In [128]:
length_ts=366
timeseries = pd.Series(data=range(length_ts),index=pd.to_datetime(range(length_ts),utc=True,unit="D",origin="2020-01-01T01:15:27.000"))
start_date = "2019-01-01T01:15:27.000Z"
end_date= "2021-12-31T01:15:27.000Z"
auto_stepsize = False
history_end_date= "2020-01-01T01:15:27.000Z"
step_size_str= "D"
percentil = 0.95
min_amount_datapoints = 21
interpolation_method = "nearest"
fill_value = None

In [129]:
timeseries = timeseries.drop(timeseries.index[[9,13,17]])

In [130]:
def main(
    timeseries=timeseries,
    start_date_str=start_date,
    end_date_str=end_date,
    auto_stepsize=auto_stepsize,
    history_end_date_str=history_end_date,
    step_size_str=step_size_str,
    percentil=percentil,
    min_amount_datapoints=min_amount_datapoints,
    interpolation_method=interpolation_method,
    fill_value=fill_value,
):
    timeseries = timeseries.sort_index().dropna()

    input_params = GapDetectionParameters(
        start_date=start_date_str,
        end_date=end_date_str,
        auto_stepsize=auto_stepsize,
        history_end_date_str=history_end_date_str,
        step_size=step_size_str,
        percentil=percentil,
        min_amount_datapoints=min_amount_datapoints,
        interpolation_method=interpolation_method,
        fill_value=fill_value,
    )
    constricted_series = constrict_series_to_dates(
        timeseries, start_date_str, end_date_str
    )
    check_amount_datapoints(
        series=constricted_series,
        min_amount_datapoints=input_params.min_amount_datapoints,
    )
    if auto_stepsize:
        if input_params.history_end_date is not None:
            training_series = constrict_series_to_dates(
                timeseries, input_params.start_date, input_params.history_end_date
            )
        else:
            training_series = constricted_series
        step_size = determine_timestep_gapsize_percentile(training_series, percentil)
    else:
        step_size = freqstr2timedelta(step_size_str)
    series_with_bounds = check_add_boundary_dates(
        constricted_series, start_date_str, end_date_str
    )

    df_with_gaps = determine_gap_length(series_with_bounds, step_size)
    gap_boundaries = return_gap_boundary_timestamps(df_with_gaps, series_with_bounds)

    return gap_boundaries

In [131]:
main()

Unnamed: 0,start,end,start_inclusive,end_inclusive,gap_size,value_to_left,value_to_right,mean_left_right
0,2019-01-01 01:15:27+00:00,2020-01-01 00:00:00+00:00,True,True,364 days 22:44:33,,0.0,
1,2020-01-09 00:00:00+00:00,2020-01-11 00:00:00+00:00,True,True,2 days 00:00:00,8.0,10.0,9.0
2,2020-01-13 00:00:00+00:00,2020-01-15 00:00:00+00:00,True,True,2 days 00:00:00,12.0,14.0,13.0
3,2020-01-17 00:00:00+00:00,2020-01-19 00:00:00+00:00,True,True,2 days 00:00:00,16.0,18.0,17.0
4,2020-12-31 00:00:00+00:00,2021-12-31 01:15:27+00:00,True,True,365 days 01:15:27,365.0,,


In [18]:
GapDetectionParameters(start_date=start_date,end_date=end_date,
                       auto_stepsize=auto_stepsize,step_size_str=step_size_str,percentil=percentil,
                       min_amount_datapoints=min_amount_datapoints,add_gapsize_column=add_gapsize_column)

GapDetectionParameters(start_date=datetime.datetime(2020, 1, 1, 1, 15, 27, tzinfo=datetime.timezone.utc), end_date={'start_date': datetime.datetime(2020, 1, 1, 1, 15, 27, tzinfo=datetime.timezone.utc), 'end_date': {...}, 'auto_stepsize': True, 'history_end_date': None, 'step_size_str': '', 'percentil': 95.0, 'min_amount_datapoints': 21, 'add_gapsize_column': True}, auto_stepsize=True, history_end_date=None, step_size_str='', percentil=95.0, min_amount_datapoints=21, add_gapsize_column=True)

In [52]:

def timestamp_str_to_pd_timestamp(timestamp: str) -> datetime:
    try:
        date = pd.to_datetime(timestamp, utc=True)
    except ValueError as error:
        raise ComponentInputValidationException(
            str(error), error_code=422, invalid_component_inputs=["..."]
        ) from error
    return date
