In [None]:
import pandas as pd

def add_degree_day_features(
    df: pd.DataFrame,
    temp_col: str = "temp_c",
    hdd_base: float = 18.0,
    cdd_base: float = 22.0,
    gdd_base: float = 10.0,
    gdd_cap: float | None = 30.0,
    prefix: str = ""
) -> pd.DataFrame:
    """
    Add HDD, CDD and GDD columns to *df* and return the new DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a column with outdoor temperature in °C.
    temp_col : str
        Name of the temperature column in *df*.
    hdd_base : float
        Base temperature (°C) below which heating load starts.
    cdd_base : float
        Base temperature (°C) above which cooling load starts.
    gdd_base : float
        Base temperature (°C) used in agriculture to start plant growth.
    gdd_cap : float | None
        Optional upper cap on temperature when computing GDD
        (standard agronomic practice is to cap at 30 °C).
        Use None to disable the cap.
    prefix : str
        Optional prefix for the new feature names if you need to
        distinguish multiple weather stations.

    Returns
    -------
    pd.DataFrame
        Original DataFrame with three extra columns:
        • f"{prefix}HDD{int(hdd_base)}"
        • f"{prefix}CDD{int(cdd_base)}"
        • f"{prefix}GDD{int(gdd_base)}"
    """
    T = df[temp_col]

    # Heating Degree Days
    df[f"{prefix}HDD{int(hdd_base)}"] = (hdd_base - T).clip(lower=0)

    # Cooling Degree Days
    df[f"{prefix}CDD{int(cdd_base)}"] = (T - cdd_base).clip(lower=0)

    # Growing Degree Days
    # 1. Apply lower base
    gdd = (T - gdd_base).clip(lower=0)
    # 2. Apply optional upper cap (truncated GDD)
    if gdd_cap is not None:
        gdd = pd.Series(
            (T.clip(upper=gdd_cap) - gdd_base).clip(lower=0),
            index=df.index
        )
    df[f"{prefix}GDD{int(gdd_base)}"] = gdd

    return df


In [None]:
# Temperature ramps: row-to-row differences of the temperature column.
# (The "per hour" reading presumes one row per hour -- confirm upstream.)
df["T_ramp_1h"] = df["temp_c"].diff(periods=1)
df["T_ramp_3h"] = df["temp_c"].diff(periods=3) / 3   # mean change per row over 3 rows
df["T_accel_1h"] = df["T_ramp_1h"].diff(periods=1)   # second difference of temp_c

In [None]:
# Volatility descriptors over a trailing window of 24 rows of temperature:
# rolling mean, rolling standard deviation and rolling max-min range.
df["T_mean_24h"] = df["temp_c"].rolling(window=24).mean()
df["T_std_24h"] = df["temp_c"].rolling(window=24).std()
df["T_range_day"] = (
    df["temp_c"].rolling(window=24).max()
    - df["temp_c"].rolling(window=24).min()
)

#### Curve Features

In [None]:
"""
feature_momentum_meanrev_shape.py
---------------------------------
Tools for electricity‑price feature engineering:
• momentum / oscillator metrics
• mean‑reversion diagnostics
• intraday shape & ramp descriptors
All formulas work with negative prices (thanks to the optional asinh transform).

Author: <you> – July 2025
"""

from __future__ import annotations
import numpy as np
import pandas as pd


# ------------------------------------------------------------------
# helpers
# ------------------------------------------------------------------

def _safe_transform(x: pd.Series, fn):
    """Return *x* untouched when *fn* is None, otherwise return ``fn(x)``."""
    if fn is None:
        return x
    return fn(x)


def _ols_slope(y: np.ndarray) -> float:
    """
    Slope of the OLS line fitted to *y* against x = 0..n-1.

    Returns NaN when fewer than two points are supplied (x then has no
    spread, so the slope is undefined).  A flat window (constant *y*)
    yields 0.0, and any NaN inside *y* propagates to the result.
    """
    n = y.size
    # Guard first: avoids taking the mean of an empty array and makes the
    # "slope undefined" case explicit instead of relying on denom == 0.
    if n < 2:
        return np.nan
    x = np.arange(n)
    x_centered = x - x.mean()
    y_centered = y - y.mean()
    # For n >= 2 the x values are distinct, so the denominator is > 0.
    return np.dot(x_centered, y_centered) / np.dot(x_centered, x_centered)


# ------------------------------------------------------------------
# 1) Slope / regression‑based momentum
# ------------------------------------------------------------------

def rolling_slope(
    series: pd.Series,
    window: int = 12,
    by_hour: bool = False,
    transform=np.arcsinh,
    min_periods: int | None = None,
) -> pd.Series:
    """
    Rolling OLS slope of ``transform(series)`` over *window* observations.

    With *by_hour* set, the series is split by the hour of its index
    (``series.index.hour``), the rolling slope is computed inside each
    hour-of-day group, and the pieces are concatenated and sorted by index.
    *min_periods* defaults to *window*.  The result is named
    ``f"slope_{window}"``.
    """
    required = window if min_periods is None else min_periods
    transformed = _safe_transform(series, transform)

    def _windowed_slope(values: pd.Series) -> pd.Series:
        roller = values.rolling(window, min_periods=required)
        return roller.apply(_ols_slope, raw=True)

    if not by_hour:
        result = _windowed_slope(transformed)
    else:
        per_hour = [
            _windowed_slope(chunk)
            for _, chunk in transformed.groupby(series.index.hour)
        ]
        result = pd.concat(per_hour).sort_index()

    result.name = f"slope_{window}"
    return result


# ------------------------------------------------------------------
# 2) Rate of Change (ROC)
# ------------------------------------------------------------------

def rate_of_change(
    series: pd.Series,
    lag: int = 1,
    pct: bool = True,
) -> pd.Series:
    """
    Change of *series* relative to its value *lag* observations earlier.

    With *pct* True the difference is divided by the absolute value of the
    lagged observation (a fraction, not multiplied by 100); the absolute
    value keeps the sign of the change meaningful for negative levels.
    With *pct* False the plain difference is returned.  The result is named
    ``roc_{lag}pct`` or ``roc_{lag}diff``.
    """
    previous = series.shift(lag)
    change = series - previous

    if pct:
        result = change / previous.abs()
        suffix = "pct"
    else:
        result = change
        suffix = "diff"

    result.name = f"roc_{lag}{suffix}"
    return result


# ------------------------------------------------------------------
# 3) Relative Strength Index (RSI)
# ------------------------------------------------------------------

def rsi(
    series: pd.Series,
    window: int = 14,
    transform=np.arcsinh,
) -> pd.Series:
    """
    Relative Strength Index on ``transform(series)``.

    Gains and losses are averaged with a simple rolling mean over *window*
    observations (not Wilder's recursive smoothing).

    Edge cases
    ----------
    * A window with gains but no losses gives RSI = 100 (RS = +inf).
    * A completely flat window (no gains, no losses) gives NaN (RS = 0/0).
    * The first *window* observations are NaN (one is lost to the
      difference, the rest to the rolling warm-up).
    """
    s = _safe_transform(series, transform)
    delta = s.diff()

    gain = delta.clip(lower=0)
    loss = (-delta).clip(lower=0)

    avg_gain = gain.rolling(window).mean()
    avg_loss = loss.rolling(window).mean()

    # Deliberately no zero-guard on the denominator: pandas float division
    # yields +inf for gain/0 (-> RSI 100) and NaN for 0/0 (flat window),
    # which is exactly the conventional RSI behaviour.
    rs = avg_gain / avg_loss
    rsi_val = 100 - (100 / (1 + rs))
    rsi_val.name = f"rsi_{window}"
    return rsi_val


# ------------------------------------------------------------------
# 4) Half‑life estimator (rolling)
# ------------------------------------------------------------------

def _half_life_window(y: np.ndarray) -> float:
    """
    Half-life τ for the AR(1) style model:
        Δy_t = α - λ y_{t-1} + ε_t
    τ = ln(2)/λ.

    λ is estimated by OLS of Δy_t on y_{t-1} *with* an intercept, so the
    estimate is valid for series that revert to a non-zero level.

    Returns NaN if the window has fewer than 3 points, if y_{t-1} is
    constant over the window (slope undefined), if λ ≤ 0 (no mean
    reversion), or if the window contains NaN.
    """
    if y.size < 3:
        return np.nan
    dy = np.diff(y)
    ylag = y[:-1]

    # A constant regressor carries no information about λ.  Compare
    # max/min rather than a centred sum of squares so floating-point
    # residue from the mean cannot masquerade as variance.
    if ylag.max() == ylag.min():
        return np.nan

    # OLS slope with intercept: centring the regressor is sufficient,
    # because the centred regressor sums to zero and therefore removes
    # any constant offset in dy as well.
    ylag_centered = ylag - ylag.mean()
    denom = np.dot(ylag_centered, ylag_centered)
    if denom == 0.0:
        return np.nan
    beta = np.dot(ylag_centered, dy) / denom   # beta = -λ
    lam = -beta
    if lam <= 0:
        return np.nan
    return np.log(2) / lam                     # half-life


def rolling_half_life(
    series: pd.Series,
    window: int = 168,        # one week of hourly data
    transform=np.arcsinh,
    min_periods: int | None = None,
) -> pd.Series:
    """
    Rolling mean-reversion half-life of ``transform(series)``.

    Applies ``_half_life_window`` to every rolling window of *window*
    observations.  *min_periods* defaults to *window*.  The result is named
    ``f"half_life_{window}"``.
    """
    required = window if min_periods is None else min_periods
    transformed = _safe_transform(series, transform)

    roller = transformed.rolling(window, min_periods=required)
    result = roller.apply(_half_life_window, raw=True)

    result.name = f"half_life_{window}"
    return result


# ------------------------------------------------------------------
# 5) Spike‑decay slope
# ------------------------------------------------------------------

def spike_decay_slope(
    series: pd.Series,
    z_window: int = 24,
    z_thresh: float = 3.0,
    horizon: int = 6,
    agg_window: int = 168,
    transform=np.arcsinh,
    causal: bool = True,
) -> pd.Series:
    """
    Average post-spike slope, in units of ``transform(series)`` per
    observation.

    A spike is an observation whose rolling z-score (mean and std over
    *z_window* observations of the transformed series) satisfies
    |z| > z_thresh.  For every spike at position t the slope
    (P_{t+horizon} - P_t) / horizon is measured, and a rolling mean of those
    slopes over *agg_window* observations is returned (non-spike positions
    are ignored by the mean; NaN where the window holds no spike).

    Parameters
    ----------
    causal : bool
        If True (default), each slope is placed at position t + horizon,
        i.e. the first moment at which P_{t+horizon} is actually known, so
        the feature at any position uses only current and past values.
        If False, the slope is placed at the spike position t itself, which
        makes the feature depend on up to *horizon* future observations
        (look-ahead); use only for ex-post analysis.

    The result is named ``f"spike_decay_{horizon}h"``.
    """
    s = _safe_transform(series, transform)
    mu = s.rolling(z_window).mean()
    sig = s.rolling(z_window).std()
    z = (s - mu) / sig
    spikes = z.abs() > z_thresh

    # Slope from the spike to `horizon` observations later, kept only at
    # spike positions.
    decay = ((s.shift(-horizon) - s) / horizon).where(spikes)

    # Re-align each slope to the position where it becomes observable so
    # the rolling mean below never sees future prices.
    if causal:
        decay = decay.shift(horizon)

    decay_avg = decay.rolling(agg_window, min_periods=1).mean()
    decay_avg.name = f"spike_decay_{horizon}h"
    return decay_avg


# ------------------------------------------------------------------
# 6) Time‑since‑extreme (grouped by hour‑of‑day)
# ------------------------------------------------------------------

def time_since_extreme(
    series: pd.Series,
    z_window: int = 24,
    z_thresh: float = 2.0,
    transform=np.arcsinh,
) -> pd.Series:
    """
    Hours since last |z| > z_thresh *within the same hour-of-day*.

    z is the rolling z-score of ``transform(series)`` (mean and std over
    *z_window* observations).  For each timestamp the result is the elapsed
    time, in hours, since the most recent extreme observation that has the
    same ``index.hour``; it is 0.0 on an extreme observation itself and NaN
    until the first extreme event for that hour occurs.

    *series* must have a DatetimeIndex.  The result is named
    ``"time_since_extreme"``.
    """
    s = _safe_transform(series, transform)
    mu = s.rolling(z_window).mean()
    sig = s.rolling(z_window).std()
    z = (s - mu) / sig
    # NaN z-scores (warm-up, zero std) compare False and are not extreme.
    extreme = (z.abs() > z_thresh).to_numpy()

    idx = series.index
    stamps = pd.Series(idx, index=idx)

    # Keep the timestamp only where an extreme occurred (NaT elsewhere),
    # then carry it forward separately inside each hour-of-day group.
    last_extreme = stamps.where(extreme).groupby(idx.hour).ffill()

    # NaT (no extreme seen yet for that hour) becomes NaN here.
    out = (stamps - last_extreme).dt.total_seconds() / 3600.0
    out.name = "time_since_extreme"
    return out


# ------------------------------------------------------------------
# 7) Direct ramp metrics
# ------------------------------------------------------------------

def direct_ramp_metrics(
    series: pd.Series,
    morning_hours: range = range(5, 8),      # 05→07
    evening_hours: range = range(17, 21),    # 17→20
) -> pd.DataFrame:
    """
    Hourly ramp and per-day ramp totals for two blocks of hours.

    Returned columns (indexed like *series*):
      ramp_1h       – first difference of *series*
      morning_ramp  – for each calendar day, the sum of ramp_1h over the
                      rows whose hour is in *morning_hours*, repeated on
                      every row of that day
      evening_ramp  – same, for *evening_hours*

    Days are defined by ``series.index.normalize()``, so *series* needs a
    DatetimeIndex.
    """
    hourly_change = series.diff()
    hour_of_day = series.index.hour
    calendar_day = series.index.normalize()

    def _daily_total(hours) -> pd.Series:
        # Blank out rows outside the hour block, then broadcast each day's
        # sum back onto all rows of that day.
        in_block = hour_of_day.isin(hours)
        masked = hourly_change.where(in_block)
        return masked.groupby(calendar_day).transform("sum")

    return pd.DataFrame({
        "ramp_1h": hourly_change,
        "morning_ramp": _daily_total(morning_hours),
        "evening_ramp": _daily_total(evening_hours),
    })


# ------------------------------------------------------------------
# 8) Gradient & curvature of *yesterday* curve
# ------------------------------------------------------------------

def gradient_curvature_prev_day(series: pd.Series) -> pd.DataFrame:
    """
    First and second difference of the curve 24 observations back.

    grad_prev  = P_{d-1,h} - P_{d-1,h-1}
    curv_prev  = P_{d-1,h+1} - 2P_{d-1,h} + P_{d-1,h-1}

    The shifts are positional (23, 24 and 25 rows), so the "previous day"
    reading presumes one row per hour with no gaps.  The first 25 rows of
    both columns are NaN.
    """
    same_hour = series.shift(24)      # P_{d-1,h}
    hour_before = series.shift(25)    # P_{d-1,h-1}
    hour_after = series.shift(23)     # P_{d-1,h+1}

    features = {
        "grad_prev": same_hour - hour_before,
        "curv_prev": hour_after - 2 * same_hour + hour_before,
    }
    return pd.DataFrame(features)


# ------------------------------------------------------------------
# 9) Shape against baseline (yesterday vs yesterday’s mean)
# ------------------------------------------------------------------

def shape_against_baseline(series: pd.Series) -> pd.Series:
    """
    δ_{d-1,h} = P_{d-1,h} - mean(P_{d-1,·}).

    Each observation's deviation from the mean of its own calendar day
    (days taken from ``series.index.floor("D")``), shifted forward by 24
    rows so that it lines up with the following day.  The shift is
    positional, so the alignment presumes one row per hour.  The result is
    named ``"shape_dev_prev_day"``.
    """
    day_key = series.index.floor("D")
    deviation = series - series.groupby(day_key).transform("mean")

    result = deviation.shift(24)
    result.name = "shape_dev_prev_day"
    return result