### 강수 기반 파생특성 (최우선)

핵심 메커니즘: 강수 → 우수 유입 → 토사/유기물 동반 유입 → SS·TOC 급증

#### (1) 강수 변화율·집중도

- ΔRN_15m = RN_15m(t) − RN_15m(t−15m)

- ΔRN_60m, ΔRN_12H

- RN_15m / RN_60m (단기 집중도)

#### (2) 누적 강수 메모리 (Antecedent Rainfall)

- AR_3H = Σ RN_15m (최근 3시간)

- AR_6H, AR_12H, AR_24H

- log(1 + AR_x)

#### (3) 무강수 지속시간

- dry_duration = 마지막 강수 이후 경과 시간 (hour)

### 기상 결합 특성 (TA, HM, TD 활용)

핵심 메커니즘: 증발·응결·생물활성 → 유기물 상태 변화

#### (4) 열·습 복합지표

- HeatIndex (TA + HM 기반)

- VaporPressureDeficit (VPD ≈ f(TA, HM))

- DewPointDepression = TA − TD

#### (5) 기상 안정성

- rolling_std(TA, 3H / 6H)

- rolling_std(HM)

### 공정 내부 변수 조합 (PH, FLUX, TN, TP)

핵심 메커니즘: “같은 TOC라도 상태가 다르다”

#### (6) 부하 관련 특성 (아주 중요)

- TOC_proxy_load = FLUX × PH

- SS_proxy_load = FLUX × (TN + TP)

#### (7) 영양염 비율

- TN/TP

- log(TN + TP)

- PH × TN, PH × TP

#### (8) 공정 상태 플래그

- PH_zone = {산성 / 중성 / 염기성}

- TN_high_flag (상위 20%)

- TP_spike_flag (z-score > 2)

### 시계열 메모리 특성 (TOC·SS 예측에 결정적)

핵심 메커니즘: TOC·SS는 관성(inertia)이 큼

#### (9) Lag & Rolling

- lag_10m / 30m / 1H: PH, FLUX, TN, TP

- rolling_mean / max / std (30m, 1H, 3H)

#### (10) 변화율

- ΔPH, ΔFLUX, ΔTN, ΔTP

- |ΔFLUX| (급변 여부)

### 상호작용 특성 (트리 계열에서 특히 강력)

- RN_15m × FLUX

- RN_60m × SS(t−1)

- (TN/TP) × PH

- dry_duration × RN_15m

In [None]:
import numpy as np
import pandas as pd

In [None]:
def resample_5min(
    df,
    time_col = None,
    rule = "5min",
    sum_cols=("RN_15m", "RN_60m", "RN_12H", "RN_DAY", "FLUX_VU"),
    mean_cols=("TA", "HM", "TD", "PH_VU", "TN_VU", "TP_VU"),
    # Targets (TOC/SS) can be listed here to resample them alongside the
    # inputs when building a training set (excluded by default).
    extra_mean_cols=(),
    interp_limit: int = 12,  # with 5-minute bins, 12 steps = up to 1 hour of interpolation
):
    """
    Resample a (1-minute or irregular) time-indexed frame onto a regular grid.

    Parameters
    ----------
    df : DataFrame indexed by a DatetimeIndex, or carrying a datetime-like
        column named by ``time_col``. The input is copied, never mutated.
    time_col : optional column to parse with ``pd.to_datetime`` and use as index.
    rule : resampling rule passed to ``DataFrame.resample``.
    sum_cols : columns aggregated per bin with ``sum``; missing values in the
        result are replaced by 0.
    mean_cols, extra_mean_cols : columns aggregated per bin with ``mean`` and
        then time-interpolated across at most ``interp_limit`` consecutive
        missing bins (longer gaps stay NaN).
    interp_limit : maximum number of consecutive bins to interpolate.

    Returns
    -------
    DataFrame containing only the listed columns that exist in ``df``.
    A column listed under both sum and mean groups is mean-aggregated.

    Raises
    ------
    ValueError
        If no DatetimeIndex can be established, or none of the listed
        columns exist in ``df``.

    NOTE(review): the default ``sum_cols`` include RN_15m/RN_60m/RN_12H/RN_DAY;
    if those are already trailing accumulations, summing them per bin inflates
    the values — confirm against the data source.
    """
    frame = df.copy()

    # Establish a sorted DatetimeIndex.
    if time_col is not None:
        frame[time_col] = pd.to_datetime(frame[time_col])
        frame = frame.set_index(time_col)
    if not isinstance(frame.index, pd.DatetimeIndex):
        raise ValueError("df must have a DatetimeIndex or provide time_col.")
    frame = frame.sort_index()

    averaged = [*mean_cols, *extra_mean_cols]

    # Coerce every participating column to numeric; unparsable entries become NaN.
    for name in {*sum_cols, *averaged}:
        if name in frame.columns:
            frame[name] = pd.to_numeric(frame[name], errors="coerce")

    # Column -> aggregation mapping; "mean" wins when a column is in both groups.
    how = {name: "sum" for name in sum_cols if name in frame.columns}
    how.update({name: "mean" for name in averaged if name in frame.columns})
    if not how:
        raise ValueError("No columns found to resample based on provided col lists.")

    resampled = frame.resample(rule).agg(how)

    # Summed columns: treat missing as zero.
    for name in sum_cols:
        if name in resampled.columns:
            resampled[name] = resampled[name].fillna(0.0)

    # Averaged columns: bridge short gaps by time interpolation, keep long gaps.
    for name in averaged:
        if name in resampled.columns:
            resampled[name] = resampled[name].interpolate(
                method="time", limit=interp_limit
            )

    return resampled

In [None]:
def make_modelA_features(
    df,
    time_col = None,
    do_resample = True,
    rule = "5min",
    # Windows and lags are expressed in steps of the (resampled) time grid.
    roll_windows_steps=(6, 12, 36), # 6=30 min, 12=60 min, 36=3 h (at 5-minute spacing)
    lags_steps=(2, 6, 12),           # 2=10 min, 6=30 min, 12=60 min
    wet_thr_mm_5min = 0.1,    # per-step rain amount at or above which a step counts as "wet"
    # for safety
    clip_rain_max=(50.0, 200.0, 500.0, 1000.0),  # for RN_15m, RN_60m, RN_12H, RN_DAY
):
    """
    Feature engineering for Model A (TOC + SS).

    Parameters
    ----------
    df : DataFrame with a DatetimeIndex, or with a datetime-like column named
        by ``time_col``. Must contain TA, RN_15m, RN_60m, RN_12H, RN_DAY, HM,
        TD, PH_VU, FLUX_VU, TN_VU and TP_VU.
    time_col : optional name of the timestamp column.
    do_resample : if True, ``df`` is first passed through ``resample_5min``
        with ``rule``.
    rule : grid spacing. It is also used to convert the 3/6/12/24-hour
        antecedent-rain windows into step counts, so when ``do_resample`` is
        False it must match the actual spacing of ``df``.
    roll_windows_steps : rolling-window lengths, in steps.
    lags_steps : lag offsets, in steps.
    wet_thr_mm_5min : RN_15m threshold (inclusive) for flagging a wet step.
    clip_rain_max : upper caps applied to RN_15m, RN_60m, RN_12H and RN_DAY
        (in that order) before any feature is derived. ``None`` (for the whole
        argument or for a single entry) disables the cap.

    Returns
    -------
    DataFrame of features only (no targets), indexed like the (resampled)
    input. Infinite values are replaced by NaN. Warm-up rows of rolling/lag
    features are NaN, and ``dry_duration_*`` is NaN until the first wet step.

    Raises
    ------
    ValueError
        If no DatetimeIndex can be established, a required column is missing,
        or ``rule`` is coarser than one hour.
    """

    base_cols=[
        "TA", "RN_15m", "RN_60m", "RN_12H", "RN_DAY", "HM", "TD",
        "PH_VU", "FLUX_VU", "TN_VU", "TP_VU"
    ]
    rain_cols = ["RN_15m", "RN_60m", "RN_12H", "RN_DAY"]

    if do_resample:
        # NOTE(review): rain columns are summed per bin here; if they are
        # already trailing accumulations this inflates them — confirm.
        df = resample_5min(
            df, time_col=time_col, rule=rule,
            sum_cols=("RN_15m", "RN_60m", "RN_12H", "RN_DAY", "FLUX_VU"),
            mean_cols=("TA", "HM", "TD", "PH_VU", "TN_VU", "TP_VU"),
        )
        time_col = None  # the resampled frame is already time-indexed

    x = df.copy()

    if time_col is not None:
        x[time_col] = pd.to_datetime(x[time_col])
        x = x.set_index(time_col)
    if not isinstance(x.index, pd.DatetimeIndex):
        raise ValueError("df must have a DatetimeIndex or provide time_col.")
    x = x.sort_index()

    # Start from the base columns only.
    missing = [c for c in base_cols if c not in x.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}")

    X = x[base_cols].copy()

    # Safety caps on rain so a corrupt reading cannot dominate derived features.
    if clip_rain_max is not None:
        for c, upper in zip(rain_cols, clip_rain_max):
            if upper is not None:
                X[c] = X[c].clip(upper=upper)

    # ----------------------------
    # 1) Rain features (priority)
    # ----------------------------
    # (1) deltas: one-step change signals
    for c in ["RN_15m", "RN_60m", "RN_12H"]:
        X[f"d_{c}"] = X[c].diff()

    # (2) intensity ratio: short-term concentration (eps avoids division by zero)
    eps = 1e-6
    X["RN_15m_div_RN_60m"] = X["RN_15m"] / (X["RN_60m"] + eps)

    # (3) antecedent rainfall memory: rolling sums of RN_15m over 3/6/12/24 hours
    steps_per_hour = int(pd.Timedelta("1h") / pd.Timedelta(rule))
    if steps_per_hour < 1:
        raise ValueError(f"rule must be no coarser than 1 hour, got {rule!r}")
    for h in (3, 6, 12, 24):
        win = h * steps_per_hour
        X[f"AR_{h}H"] = X["RN_15m"].rolling(win, min_periods=max(2, win // 10)).sum()
    for h in (3, 12, 24):
        X[f"log1p_AR_{h}H"] = np.log1p(X[f"AR_{h}H"])

    # (4) dry duration: time elapsed since the most recent wet step
    wet = X["RN_15m"].fillna(0) >= wet_thr_mm_5min
    stamps = pd.Series(X.index, index=X.index)
    last_wet_ts = stamps.where(wet.to_numpy()).ffill()  # NaT before the first wet step
    # total_seconds() is used because pandas >= 2.0 rejects astype("timedelta64[m]").
    X["dry_duration_min"] = ((stamps - last_wet_ts).dt.total_seconds() / 60.0).to_numpy()
    X["dry_duration_hr"] = X["dry_duration_min"] / 60.0
    X["is_wet"] = wet.astype(np.int8)

    # ----------------------------
    # 2) Weather combined features
    # ----------------------------
    # dew point depression
    X["TA_minus_TD"] = X["TA"] - X["TD"]

    # approximate VPD (kPa) using TA (°C) and RH (%)
    # e_s(T)=0.6108*exp(17.27*T/(T+237.3)), VPD = e_s*(1-RH/100)
    T = X["TA"]
    RH = X["HM"].clip(0, 100)
    e_s = 0.6108 * np.exp((17.27 * T) / (T + 237.3))
    X["VPD_kPa"] = e_s * (1 - RH / 100.0)

    # ----------------------------
    # 3) Process combo / load-like features
    # ----------------------------
    # proxy loads (units are not physically exact; used as predictive signals)
    X["load_proxy_NP"] = X["FLUX_VU"] * (X["TN_VU"] + X["TP_VU"])
    X["PHxFLUX"] = X["PH_VU"] * X["FLUX_VU"]

    # TN/TP ratio
    X["TN_div_TP"] = X["TN_VU"] / (X["TP_VU"] + eps)
    X["log1p_TN_TP"] = np.log1p(X["TN_VU"] + X["TP_VU"])

    # pH zone flags (a NaN pH yields 0 in all three)
    X["pH_acid"] = (X["PH_VU"] < 6.5).astype(np.int8)
    X["pH_neutral"] = ((X["PH_VU"] >= 6.5) & (X["PH_VU"] <= 8.5)).astype(np.int8)
    X["pH_basic"] = (X["PH_VU"] > 8.5).astype(np.int8)

    # ----------------------------
    # 4) Rolling stats + lags (NO target lags)
    # ----------------------------
    # Collected in a dict and concatenated once: inserting ~85 columns one by
    # one fragments the frame and triggers pandas' PerformanceWarning.
    roll_targets = ["PH_VU", "TN_VU", "TP_VU", "TA", "HM", "TD"]
    lag_targets = [
        "RN_15m", "RN_60m", "RN_12H", "TA", "HM", "TD",
        "PH_VU", "FLUX_VU", "TN_VU", "TP_VU",
    ]
    extra = {}

    for w in roll_windows_steps:
        # min_periods may not exceed the window (matters only for w < 2)
        min_periods = min(w, max(2, w // 5))
        for c in roll_targets:
            roller = X[c].rolling(w, min_periods=min_periods)
            extra[f"{c}_rm{w}"] = roller.mean().to_numpy()
            extra[f"{c}_rs{w}"] = roller.std().to_numpy()
            extra[f"{c}_rmax{w}"] = roller.max().to_numpy()

    for lag in lags_steps:
        for c in lag_targets:
            extra[f"{c}_lag{lag}"] = X[c].shift(lag).to_numpy()

    if extra:
        X = pd.concat([X, pd.DataFrame(extra, index=X.index)], axis=1)

    # ----------------------------
    # 5) Interaction features
    # ----------------------------
    X["RN15xFLUX"] = X["RN_15m"] * X["FLUX_VU"]
    X["RN60xFLUX"] = X["RN_60m"] * X["FLUX_VU"]
    X["dryHr_x_RN15"] = X["dry_duration_hr"] * X["RN_15m"]

    # ----------------------------
    # 6) time features (optional but usually helpful)
    # ----------------------------
    X["hour"] = X.index.hour.astype(np.int16)
    X["dow"] = X.index.dayofweek.astype(np.int8)  # 0=Mon
    X["is_weekend"] = (X["dow"] >= 5).astype(np.int8)

    # cyclical encoding (keeps 23h -> 0h continuous)
    X["hour_sin"] = np.sin(2 * np.pi * X["hour"] / 24.0)
    X["hour_cos"] = np.cos(2 * np.pi * X["hour"] / 24.0)

    # ----------------------------
    # final: replace +/-inf with NaN
    # ----------------------------
    X = X.replace([np.inf, -np.inf], np.nan)

    return X

In [None]:
# Build the 5-minute feature matrix from the 1-minute frame.
# The feature builder defined in this file is `make_modelA_features`.
X5 = make_modelA_features(df_1min, time_col="SYS_TIME", do_resample=True, rule="5min")

# Resample the targets once (mean per 5-minute bin) and reuse the result.
targets_5min = resample_5min(
    df_1min, time_col="SYS_TIME", extra_mean_cols=("TOC_VU", "SS_VU")
)

# shift(-1): the label attached to each row is the target of the NEXT 5-minute step.
y_toc = targets_5min["TOC_VU"].shift(-1)
y_ss = targets_5min["SS_VU"].shift(-1)

# dropna() removes every row with any missing feature or target, which
# includes the NaN warm-up rows produced by rolling windows and lags.
data = X5.join(pd.DataFrame({"y_toc": y_toc, "y_ss": y_ss})).dropna()