### 강우 기반 파생

#### 강수 변화율·집중도

- ΔRN_15m, ΔRN_60m, ΔRN_12H

- RN_15m / RN_60m (단기 집중도)

#### 누적 강수 메모리(Antecedent Rainfall / AR)

- AR_3H, AR_6H, AR_12H, AR_24H = Σ RN_15m(최근 x시간)

- log(1 + AR_x)

#### 무강수 지속시간 + first flush

- dry_duration_min / dry_duration_hr (마지막 강수 이후 경과 시간, 분/시간 단위)

- rain_start / rain_end 플래그

- post_rain_6H 플래그(종료 후 잔류 효과)

#### API 지수(감쇠 누적)

- API(RN_15m, k, N) = Σ_{i=1..N} RN_15m(t−i)·exp(−k·i)  (N = api_hours × 시간당 스텝 수, 현재 시점 t는 제외)

In [35]:
import pandas as pd
import numpy as np

In [36]:
def _add_rain_features(X, station_ids, rain_cols, rule, antecedent_hours, wet_thr_mm, eps, add_api, api_k, api_hours):
    """Append rainfall-derived features for every station and return the widened frame.

    For each station id ``sid`` whose source columns exist in ``X`` this adds:

    * ``d_<rain>_<sid>``: first difference of the first three entries of ``rain_cols``.
    * ``RN_15m_div_RN_60m_<sid>``: ``RN_15m / (RN_60m + eps)`` (short-term intensity).
    * ``AR_<h>H_<sid>`` and ``log1p_AR_<h>H_<sid>``: rolling sum of ``RN_15m`` over ``h`` hours.
    * ``dry_duration_min_<sid>``, ``dry_duration_hr_<sid>``, ``is_wet_<sid>``,
      ``rain_start_<sid>``, ``rain_end_<sid>``, ``post_rain_6H_<sid>``.
    * ``API_RN_15m_k<api_k>_H<api_hours>_<sid>`` (only when ``add_api`` is true):
      ``sum(RN_15m[t - i] * exp(-api_k * i))`` for ``i = 1 .. api_hours * steps_per_hour``.

    Args:
        X: Frame with a ``DatetimeIndex`` sampled every ``rule``.
        station_ids: Station suffixes used in the column names.
        rain_cols: Rain column prefixes; only the first three get a delta feature.
        rule: Sampling step as a pandas offset string (e.g. ``"5min"``).
        antecedent_hours: Window lengths, in hours, for the antecedent rainfall sums.
        wet_thr_mm: A step is "wet" when ``RN_15m >= wet_thr_mm`` (NaN counts as 0).
        eps: Small constant added to the ratio denominator.
        add_api: Whether to compute the decayed antecedent precipitation index.
        api_k: Per-step exponential decay rate of the API.
        api_hours: Look-back horizon of the API, in hours (integer expected).

    Returns:
        A new frame: ``X`` with the feature columns concatenated on the right.
    """
    new_cols = {}

    # Delta (step-to-step change) for the first three rain columns only.
    for sid in station_ids:
        for rc in rain_cols[:3]:
            col = f"{rc}_{sid}"
            if col in X.columns:
                new_cols[f"d_{col}"] = X[col].diff()

    # Intensity ratio (short-term concentration of rainfall).
    for sid in station_ids:
        rn15 = f"RN_15m_{sid}"
        rn60 = f"RN_60m_{sid}"
        if rn15 in X.columns and rn60 in X.columns:
            new_cols[f"RN_15m_div_RN_60m_{sid}"] = X[rn15] / (X[rn60] + eps)

    # Antecedent rainfall (rolling accumulation).
    steps_per_hour = int(pd.Timedelta("1h") / pd.Timedelta(rule))
    for sid in station_ids:
        col = f"RN_15m_{sid}"
        if col in X.columns:
            for h in antecedent_hours:
                win = h * steps_per_hour
                ar_col = X[col].rolling(win, min_periods=max(2, win // 10)).sum()
                new_cols[f"AR_{h}H_{sid}"] = ar_col
                new_cols[f"log1p_AR_{h}H_{sid}"] = np.log1p(ar_col)

    # Dry duration + first-flush hints.
    for sid in station_ids:
        col = f"RN_15m_{sid}"
        if col in X.columns:
            wet = (X[col].fillna(0) >= wet_thr_mm)
            # Timestamp of the most recent wet step, carried forward over dry steps.
            last_wet_ts = pd.Series(X.index.where(wet), index=X.index).ffill()
            dry_timedelta = (X.index - last_wet_ts)
            dry_seconds = dry_timedelta.dt.total_seconds().fillna(0)

            # NOTE(review): rows before the first wet step have no reference
            # timestamp and are reported as 0 (same value as "raining now").
            new_cols[f"dry_duration_min_{sid}"] = dry_seconds / 60.0
            new_cols[f"dry_duration_hr_{sid}"] = dry_seconds / 3600.0
            new_cols[f"is_wet_{sid}"] = wet.astype(np.int8)

            # Transitions dry->wet and wet->dry.
            previous_wet = wet.shift(1, fill_value=False)
            rain_start = wet & (~previous_wet)
            rain_end = (~wet) & previous_wet
            new_cols[f"rain_start_{sid}"] = rain_start.astype(np.int8)
            new_cols[f"rain_end_{sid}"] = rain_end.astype(np.int8)

            # Flag the six hours that follow a rain end (residual effect).
            post_win = 6 * steps_per_hour
            new_cols[f"post_rain_6H_{sid}"] = (
                pd.Series(rain_end.values, index=X.index)
                .rolling(post_win, min_periods=1).max()
                .fillna(0).astype(np.int8)
            )

    # API (exponentially decayed accumulation) - optional.
    if add_api:
        api_steps = api_hours * steps_per_hour
        # weights[i - 1] is the weight of the sample i steps in the past.
        weights = np.exp(-api_k * np.arange(1, api_steps + 1, dtype=np.float32))
        for sid in station_ids:
            col = f"RN_15m_{sid}"
            if col in X.columns:
                rain = X[col].to_numpy(dtype=np.float32)
                api = np.zeros_like(rain)
                if rain.size > 1 and weights.size > 0:
                    # Convolving the series shifted by one step gives
                    # api[t] = sum_i rain[t - i] * weights[i - 1]; the newest
                    # past sample always gets weights[0], also near the start
                    # of the series where fewer than api_steps samples exist.
                    api[1:] = np.convolve(rain[:-1], weights)[: rain.size - 1]
                new_cols[f"API_RN_15m_k{api_k}_H{api_hours}_{sid}"] = api

    X = pd.concat([X, pd.DataFrame(new_cols, index=X.index)], axis=1)
    return X

### 기상 파생

#### 열·습 결합

- DewPointDepression = TA − TD (추천)

(선택) VPD / HeatIndex (효과는 케이스바이케이스 → uncertain)

#### 기상 안정성(변동성)

- rolling_std(TA, 3H/6H)

- rolling_std(HM, 3H/6H)

In [37]:
def _add_weather_features(X, station_ids, weather_cols, rule, stability_windows, eps):
    """Append per-station weather features and return the widened frame.

    For each station id whose columns exist in ``X``:

    * ``TA_minus_TD_<sid>``: dew point depression (``TA - TD``).
    * ``VPD_kPa_<sid>``: vapour pressure deficit from ``TA`` and ``HM``, with
      humidity clipped to ``[0, 100]``.
    * ``<col>_std_<window>``: rolling population standard deviation of ``TA``
      and ``HM`` for every entry of ``stability_windows``.

    ``weather_cols`` and ``eps`` are accepted for signature symmetry with the
    other feature builders and are not read here.
    """
    step = pd.Timedelta(rule)
    derived = {}

    for station in station_ids:
        temp_name = f"TA_{station}"
        dew_name = f"TD_{station}"
        hum_name = f"HM_{station}"
        has_temp = temp_name in X.columns

        # Dew point depression.
        if has_temp and dew_name in X.columns:
            derived[f"TA_minus_TD_{station}"] = X[temp_name] - X[dew_name]

        # Vapour pressure deficit: saturation pressure scaled by (1 - RH).
        if has_temp and hum_name in X.columns:
            air_temp = X[temp_name]
            rel_hum = X[hum_name].clip(0, 100)
            saturation = 0.6108 * np.exp((17.27 * air_temp) / (air_temp + 237.3))
            derived[f"VPD_kPa_{station}"] = saturation * (1 - rel_hum / 100.0)

        # Short-term variability of temperature and humidity.
        for name in (temp_name, hum_name):
            if name in X.columns:
                for window in stability_windows:
                    n_steps = int(pd.Timedelta(window) / step)
                    roller = X[name].rolling(
                        n_steps, min_periods=max(2, int(n_steps * 0.1))
                    )
                    derived[f"{name}_std_{window}"] = roller.std(ddof=0)

    return pd.concat([X, pd.DataFrame(derived, index=X.index)], axis=1)

### 시계열 메모리: lag & rolling

#### Lag features

추천 lag:

- 10min, 30min, 60min

적용 컬럼:

- TN_VU, TP_VU, SS_VU, TOC_VU, TA, RN_15m(또는 RN_60m)

- (선택) TN_lag, TP_lag 

#### Rolling 통계량

윈도우:

- 30min, 60min, 3H

통계:

- rolling_mean, rolling_std, rolling_max (강우는 rolling_sum도 추가 — 단, 아래 구현에는 rolling_sum이 아직 포함되어 있지 않음)


#### 변화율(derivative)

- ΔTN, ΔTP, ΔSS, ΔTOC, ΔTA



In [38]:
def _add_lag_roll_delta_features(X, process_cols, station_ids, rain_cols, weather_cols, rule, lag_list, roll_windows, eps):
    """Append lag, first-difference and rolling-statistic features.

    The source columns are ``process_cols``, every ``<weather>_<sid>`` column
    and every ``RN_15m_<sid>`` column, restricted to those present in ``X``.
    For each source column ``c`` this adds:

    * ``c_lag_<L>`` for every ``L`` in ``lag_list`` (shift by ``L / rule`` steps),
    * ``d_c`` (first difference),
    * ``c_roll_mean_<w>``, ``c_roll_std_<w>``, ``c_roll_max_<w>`` for every
      ``w`` in ``roll_windows``.

    A feature whose name already exists in ``X`` is not added again, so the
    result never carries duplicate column names (for example ``d_RN_15m_<sid>``
    may already have been produced by an earlier feature step).

    ``rain_cols`` and ``eps`` are accepted for signature symmetry and are not
    read here.

    Returns:
        A new frame: ``X`` with the feature columns concatenated on the right.
    """
    new_cols = {}
    step = pd.Timedelta(rule)

    # Assemble the list of source columns, keeping only those present.
    base_cols = list(process_cols) + [f"{wc}_{sid}" for sid in station_ids for wc in weather_cols]
    base_cols += [f"RN_15m_{sid}" for sid in station_ids]
    base_cols = [c for c in base_cols if c in X.columns]

    # Lag and delta.
    for c in base_cols:
        for L in lag_list:
            lag_steps = int(pd.Timedelta(L) / step)
            new_cols[f"{c}_lag_{L}"] = X[c].shift(lag_steps)

        new_cols[f"d_{c}"] = X[c].diff()

    # Rolling statistics.
    for c in base_cols:
        for w in roll_windows:
            win_steps = int(pd.Timedelta(w) / step)
            minp = max(2, int(win_steps * 0.1))
            r = X[c].rolling(win_steps, min_periods=minp)
            new_cols[f"{c}_roll_mean_{w}"] = r.mean()
            new_cols[f"{c}_roll_std_{w}"] = r.std(ddof=0)
            new_cols[f"{c}_roll_max_{w}"] = r.max()

    # Keep the existing column when a name is already taken; concatenating it
    # again would silently create two columns with the same label.
    fresh = {name: values for name, values in new_cols.items() if name not in X.columns}

    X = pd.concat([X, pd.DataFrame(fresh, index=X.index)], axis=1)
    return X

### 공정·수질 결합 특성 (누수 없이 재설계)

#### 조성/비율(Composition ratios)
- TOC_div_SS = TOC_VU / SS_VU
- SS_div_TOC = SS_VU / TOC_VU

- TN_div_TP = TN_VU / TP_VU
- TP_div_TN = TP_VU / TN_VU

- TOC_div_TN = TOC_VU / TN_VU
- TN_div_TOC = TN_VU / TOC_VU

#### 상호결합(비선형) — 모델이 약할수록(선형/트리 얕음) 도움 큼
- TOC_x_SS = TOC_VU * SS_VU
- TN_x_TP = TN_VU * TP_VU
    
#### Spike flags

#### 온도×(TN/TOC) : pH/반응 간접 대리로 자주 유효

#### 습도/이슬점 상호작용은 과하면 독. 최소만.

#### 강수 상호작용(희석/충격)

#### RN × (SS, TOC) 상호작용 (top pick)


In [39]:
def _add_process_features(X, process_cols, rule, spike_z, spike_window, eps):
    """Append process/water-quality features and return the widened frame.

    Works on the columns ``TOC_VU``, ``SS_VU``, ``TN_VU`` and ``TP_VU``; any of
    them that is absent from ``X`` is skipped together with the features that
    depend on it. Added features:

    * Composition ratios in both directions for the pairs TOC/SS, TN/TP and
      TOC/TN (``eps`` guards the denominators).
    * Products ``TOC_x_SS`` and ``TN_x_TP``.
    * ``<col>_spike_z<spike_z>``: 1 when the rolling z-score of the column over
      ``spike_window`` exceeds ``spike_z``, else 0 (NaN z-scores yield 0).

    Args:
        X: Frame with a ``DatetimeIndex`` sampled every ``rule``.
        process_cols: Not read here; the four columns above are fixed.
        rule: Sampling step as a pandas offset string.
        spike_z: z-score threshold for the spike flags.
        spike_window: Rolling window for the z-score, as a pandas offset string.
        eps: Small constant added to denominators.

    Raises:
        ValueError: If ``spike_window`` is shorter than one sampling step while
            at least one process column is present.
    """
    new_cols = {}

    req = ["TOC_VU", "SS_VU", "TN_VU", "TP_VU"]
    has = {c: (c in X.columns) for c in req}

    # 1) Composition ratios.
    if has["TOC_VU"] and has["SS_VU"]:
        new_cols["TOC_div_SS"] = X["TOC_VU"] / (X["SS_VU"] + eps)
        new_cols["SS_div_TOC"] = X["SS_VU"] / (X["TOC_VU"] + eps)

    if has["TN_VU"] and has["TP_VU"]:
        new_cols["TN_div_TP"] = X["TN_VU"] / (X["TP_VU"] + eps)
        new_cols["TP_div_TN"] = X["TP_VU"] / (X["TN_VU"] + eps)

    if has["TOC_VU"] and has["TN_VU"]:
        new_cols["TOC_div_TN"] = X["TOC_VU"] / (X["TN_VU"] + eps)
        new_cols["TN_div_TOC"] = X["TN_VU"] / (X["TOC_VU"] + eps)

    # 2) Non-linear couplings; most helpful for weak (linear / shallow) models.
    if has["TOC_VU"] and has["SS_VU"]:
        new_cols["TOC_x_SS"] = X["TOC_VU"] * X["SS_VU"]
    if has["TN_VU"] and has["TP_VU"]:
        new_cols["TN_x_TP"] = X["TN_VU"] * X["TP_VU"]

    # 3) Spike flags, only for the columns that actually exist.
    present = [c for c in req if has[c]]
    if present:
        win_steps = int(pd.Timedelta(spike_window) / pd.Timedelta(rule))
        if win_steps < 1:
            raise ValueError(
                f"spike_window={spike_window!r} is shorter than one step of rule={rule!r}."
            )
        # min_periods may not exceed the window length.
        minp = min(max(5, int(win_steps * 0.1)), win_steps)

        for c in present:
            roller = X[c].rolling(win_steps, min_periods=minp)
            mu = roller.mean()
            sd = roller.std(ddof=0)
            z = (X[c] - mu) / (sd + eps)
            new_cols[f"{c}_spike_z{spike_z:g}"] = (z > spike_z).astype(np.int8)

    X = pd.concat([X, pd.DataFrame(new_cols, index=X.index)], axis=1)
    return X

In [40]:
def _add_interaction_features(
    X,
    station_ids,
    eps = 1e-6,
    wet_thr_mm = 0.1,
    use_only_top = True,  # limit the number of interaction terms
):
    """Join weather x process and rain x process interaction columns onto ``X``.

    Per station, for the columns that exist in ``X``:

    * ``TA_x_TN_<sid>`` and ``TA_x_TOC_<sid>``.
    * ``HM_x_SS_<sid>`` and ``TD_x_TN_<sid>`` (only when ``use_only_top`` is false).
    * ``is_wet_60m_<sid>`` (``RN_60m > wet_thr_mm``) and its product with each
      process column.
    * ``RN15_x_SS``, ``RN60_x_SS``, ``RN15_x_TOC``, ``RN60_x_TOC`` (suffix ``_<sid>``).

    ``eps`` is accepted but not read.
    """

    def exists(name):
        return name in X.columns

    process_present = {
        name for name in ("TOC_VU", "SS_VU", "TN_VU", "TP_VU") if exists(name)
    }
    features = {}

    # Weather (temperature / humidity / dew point) x process, per station.
    for station in station_ids:
        temp = f"TA_{station}"

        # Temperature x (TN, TOC).
        if exists(temp):
            for proc_name, tag in (("TN_VU", "TN"), ("TOC_VU", "TOC")):
                if proc_name in process_present:
                    features[f"TA_x_{tag}_{station}"] = X[temp] * X[proc_name]

        # Humidity / dew point terms are optional; kept to a minimum.
        if use_only_top:
            continue
        humidity = f"HM_{station}"
        dew_point = f"TD_{station}"
        if exists(humidity) and "SS_VU" in process_present:
            features[f"HM_x_SS_{station}"] = X[humidity] * X["SS_VU"]
        if exists(dew_point) and "TN_VU" in process_present:
            features[f"TD_x_TN_{station}"] = X[dew_point] * X["TN_VU"]

    # Rain x process (dilution / shock load), per station.
    for station in station_ids:
        rain15 = f"RN_15m_{station}"
        rain60 = f"RN_60m_{station}"

        # Wet/dry regime flag and its effect on each process load.
        if exists(rain60):
            wet_flag = (X[rain60] > wet_thr_mm).astype("int8")
            features[f"is_wet_60m_{station}"] = wet_flag
            for proc_name in ("SS_VU", "TOC_VU", "TN_VU", "TP_VU"):
                if proc_name in process_present:
                    features[f"is_wet_60m_x_{proc_name}_{station}"] = wet_flag * X[proc_name]

        # Rain amount x (SS, TOC).
        for proc_name, tag in (("SS_VU", "SS"), ("TOC_VU", "TOC")):
            if proc_name not in process_present:
                continue
            for rain_name, rain_tag in ((rain15, "RN15"), (rain60, "RN60")):
                if exists(rain_name):
                    features[f"{rain_tag}_x_{tag}_{station}"] = X[rain_name] * X[proc_name]

    return X.join(pd.DataFrame(features, index=X.index))


### 시간(주기) 특성 (안 넣으면 손해)

- hour_sin, hour_cos

- is_weekend

In [41]:
def _add_time_features(X):
    """Append calendar features derived from the ``DatetimeIndex`` of ``X``.

    Adds ``hour``, ``dow`` (day of week), ``is_weekend`` (1 when ``dow >= 5``)
    and the cyclical pair ``hour_sin`` / ``hour_cos`` on a 24-hour circle.
    """
    stamps = X.index
    hours = stamps.hour
    weekdays = stamps.dayofweek

    # Position of the hour on the 24-hour circle, in radians.
    angle = 2 * np.pi * hours / 24.0

    calendar = pd.DataFrame(
        {
            "hour": hours.astype(np.int16),
            "dow": weekdays.astype(np.int8),
            "is_weekend": (weekdays >= 5).astype(np.int8),
            "hour_sin": np.sin(angle),
            "hour_cos": np.cos(angle),
        },
        index=stamps,
    )
    return pd.concat([X, calendar], axis=1)

In [42]:
def resample_5min(
    df,
    time_col=None,
    rule="5min",
    sum_cols=None,
    mean_cols=None,
    extra_mean_cols=(),
    interp_limit: int = 12,
):
    """Resample a time-indexed frame onto a regular grid (default: 5 minutes).

    Columns named in ``sum_cols`` are aggregated with ``sum``; columns named in
    ``mean_cols`` or ``extra_mean_cols`` with ``mean`` (mean wins when a column
    is listed in both). Requested columns missing from ``df`` are ignored.
    After resampling, gaps in summed columns become ``0.0`` and gaps in averaged
    columns are time-interpolated for at most ``interp_limit`` consecutive bins.

    Args:
        df: Input frame; not modified.
        time_col: Column to parse as datetime and use as index. When ``None``
            the frame must already have a ``DatetimeIndex``.
        rule: Target frequency as a pandas offset string.

    Raises:
        ValueError: If no ``DatetimeIndex`` is available, or none of the
            requested columns exists in ``df``.
    """
    frame = df.copy()
    if time_col is not None:
        frame[time_col] = pd.to_datetime(frame[time_col])
        frame = frame.set_index(time_col)
    if not isinstance(frame.index, pd.DatetimeIndex):
        raise ValueError("df must have a DatetimeIndex or provide time_col.")
    frame = frame.sort_index()

    # Requested columns that actually exist, in the order they were requested.
    summed = [c for c in (sum_cols or []) if c in frame.columns]
    averaged = [c for c in [*(mean_cols or []), *extra_mean_cols] if c in frame.columns]

    # Coerce to numeric; unparsable entries become NaN.
    for name in set(summed) | set(averaged):
        frame[name] = pd.to_numeric(frame[name], errors="coerce")

    # Aggregation spec; "mean" overrides "sum" for columns listed in both.
    how = dict.fromkeys(summed, "sum")
    how.update(dict.fromkeys(averaged, "mean"))
    if not how:
        raise ValueError("No columns found to resample.")

    out = frame.resample(rule).agg(how)

    # Gap handling.
    for name in summed:
        out[name] = out[name].fillna(0.0)
    for name in averaged:
        out[name] = out[name].interpolate(method="time", limit=interp_limit)

    return out

In [43]:
def make_modelC_features(
    df,
    time_col=None,
    do_resample=True,
    rule="5min",
    station_ids=("368", "541", "569"),
    rain_cols=("RN_15m", "RN_60m", "RN_12H", "RN_DAY"),
    weather_cols=("TA", "HM", "TD"),
    process_cols=("TOC_VU", "SS_VU", "TN_VU", "TP_VU"),
    lag_list=("10min", "30min", "1h"),
    roll_windows=("30min", "1h", "3h"),
    antecedent_hours=(3, 6, 12, 24),
    stability_windows=("3h", "6h"),
    wet_thr_mm=0.1,
    spike_z=2.0,
    spike_window="24h",
    add_api=False,
    api_k=0.01,
    api_hours=24,
    eps=1e-6,
):
    """Build the model-C feature matrix from merged process and weather data.

    Pipeline: optional resampling to ``rule`` -> datetime index -> column
    validation -> rain, weather, lag/rolling/delta, process, interaction and
    time features -> duplicate-column removal -> ``inf`` replaced by NaN.

    Args:
        df: Input frame holding ``<weather>_<sid>``, ``<rain>_<sid>`` and
            process columns.
        time_col: Timestamp column; ``None`` when ``df`` already has a
            ``DatetimeIndex``.
        do_resample: Resample with ``resample_5min`` first (rain columns are
            summed, weather and process columns averaged).
        rule: Sampling step as a pandas offset string.
        wet_thr_mm: Wet/dry rainfall threshold, forwarded to both the rain
            features and the interaction features.
        Remaining arguments are forwarded unchanged to the feature helpers.

    Returns:
        Feature frame indexed by time, with unique column names.

    Raises:
        ValueError: If no ``DatetimeIndex`` can be established or a required
            base column is missing.
    """

    # 1. Base columns required from the input.
    base_cols = []
    for sid in station_ids:
        base_cols.extend([f"{wc}_{sid}" for wc in weather_cols])
        base_cols.extend([f"{rc}_{sid}" for rc in rain_cols])
    base_cols.extend(process_cols)

    # 2. Resampling.
    # NOTE(review): rain columns are aggregated with "sum"; if the source
    # values are already running accumulations this multiplies them by the
    # number of samples per bin - confirm against the data source.
    if do_resample:
        sum_cols = tuple(f"{rc}_{sid}" for sid in station_ids for rc in rain_cols)
        # Average exactly the process columns the caller asked for rather than
        # a fixed list, so custom process_cols survive the resampling step.
        mean_cols = (
            tuple(f"{wc}_{sid}" for sid in station_ids for wc in weather_cols)
            + tuple(process_cols)
        )
        df = resample_5min(df, time_col=time_col, rule=rule, sum_cols=sum_cols, mean_cols=mean_cols)
        time_col = None  # the resampled frame is already time-indexed

    # 3. Index setup.
    x = df.copy()
    if time_col is not None:
        x[time_col] = pd.to_datetime(x[time_col])
        x = x.set_index(time_col)
    if not isinstance(x.index, pd.DatetimeIndex):
        raise ValueError("df must have a DatetimeIndex or provide time_col.")
    x = x.sort_index()

    # 4. Column validation.
    missing = [c for c in base_cols if c not in x.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}")

    X = x[base_cols].copy()

    # 5. Rain features.
    X = _add_rain_features(X, station_ids, rain_cols, rule, antecedent_hours, wet_thr_mm, eps, add_api, api_k, api_hours)

    # 6. Weather features.
    X = _add_weather_features(X, station_ids, weather_cols, rule, stability_windows, eps)

    # 7. Time-series features (lag, rolling, delta).
    X = _add_lag_roll_delta_features(X, process_cols, station_ids, rain_cols, weather_cols, rule, lag_list, roll_windows, eps)

    # 8. Process features.
    X = _add_process_features(X, process_cols, rule, spike_z, spike_window, eps)

    # 9. Rain/weather x process interactions; the wet threshold must match the
    #    one used for the rain features, so pass it explicitly.
    X = _add_interaction_features(X, station_ids, eps=eps, wet_thr_mm=wet_thr_mm)

    # 10. Time features.
    X = _add_time_features(X)

    # 11. Clean-up: the rain and lag/delta steps can both emit d_RN_15m_<sid>;
    #     keep the first occurrence so every column label is unique.
    X = X.loc[:, ~X.columns.duplicated()]
    X = X.replace([np.inf, -np.inf], np.nan)

    return X

In [44]:
# Load the cleaned TMS (process) and AWS (weather) tables, parse the shared
# timestamp column and order both chronologically, as merge_asof requires.
tms = (
    pd.read_csv("../../data/processed/TMS_cleaned.csv")
    .assign(SYS_TIME=lambda frame: pd.to_datetime(frame["SYS_TIME"]))
    .sort_values("SYS_TIME")
)
aws = (
    pd.read_csv("../../data/processed/AWS_cleaned.csv")
    .assign(SYS_TIME=lambda frame: pd.to_datetime(frame["SYS_TIME"]))
    .sort_values("SYS_TIME")
)

# Attach to every TMS row the latest AWS row at or before it, no older than one minute.
df = pd.merge_asof(
    tms,
    aws,
    on="SYS_TIME",
    direction="backward",
    tolerance=pd.Timedelta("1min"),
)

In [45]:
# Feature matrix on the 5-minute grid.
X = make_modelC_features(df, time_col="SYS_TIME", do_resample=True, rule="5min")

# Targets: resample FLUX_VU and PH_VU once (the original ran the identical
# resampling twice), then shift by -1 so each row is paired with the value of
# the next 5-minute bin.
targets = resample_5min(df, time_col="SYS_TIME", extra_mean_cols=("FLUX_VU", "PH_VU"))
y_flux = targets["FLUX_VU"].shift(-1)
y_ph = targets["PH_VU"].shift(-1)

# Align features with targets on the time index and drop incomplete rows.
data = X.join(pd.DataFrame({"y_flux": y_flux, "y_ph": y_ph})).dropna()

In [46]:
# Display the assembled feature/target table (notebook cell output).
data

Unnamed: 0_level_0,TA_368,HM_368,TD_368,RN_15m_368,RN_60m_368,RN_12H_368,RN_DAY_368,TA_541,HM_541,TD_541,...,RN60_x_SS_569,RN15_x_TOC_569,RN60_x_TOC_569,hour,dow,is_weekend,hour_sin,hour_cos,y_flux,y_ph
SYS_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-08-26 17:20:00,29.82,64.68,22.48,0.0,0.0,0.0,0.0,29.48,77.80,25.20,...,0.0,0.0,0.0,17,0,0,-0.965926,-0.258819,4600.8,7.10
2024-08-26 17:25:00,29.70,66.10,22.74,0.0,0.0,0.0,0.0,29.40,78.70,25.32,...,0.0,0.0,0.0,17,0,0,-0.965926,-0.258819,4626.6,7.10
2024-08-26 17:30:00,29.68,67.62,23.06,0.0,0.0,0.0,0.0,29.34,77.80,25.08,...,0.0,0.0,0.0,17,0,0,-0.965926,-0.258819,4653.2,7.10
2024-08-26 17:35:00,29.42,68.22,22.96,0.0,0.0,0.0,0.0,29.30,78.24,25.12,...,0.0,0.0,0.0,17,0,0,-0.965926,-0.258819,4679.0,7.10
2024-08-26 17:40:00,28.86,69.48,22.70,0.0,0.0,0.0,0.0,29.10,80.76,25.46,...,0.0,0.0,0.0,17,0,0,-0.965926,-0.258819,4704.8,7.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-09-29 04:55:00,17.48,81.50,14.28,0.0,0.0,0.0,0.0,16.24,95.42,15.52,...,0.0,0.0,0.0,4,0,0,0.866025,0.500000,1668.4,7.20
2025-09-29 05:00:00,17.40,82.10,14.32,0.0,0.0,0.0,0.0,16.26,95.74,15.56,...,0.0,0.0,0.0,5,0,0,0.965926,0.258819,1699.0,7.20
2025-09-29 05:05:00,17.40,82.70,14.40,0.0,0.0,0.0,0.0,16.20,95.76,15.50,...,0.0,0.0,0.0,5,0,0,0.965926,0.258819,1729.8,7.20
2025-09-29 05:10:00,17.40,82.94,14.48,0.0,0.0,0.0,0.0,16.18,95.88,15.48,...,0.0,0.0,0.0,5,0,0,0.965926,0.258819,1760.6,7.20


In [47]:
# Persist the dataset; index=True writes the time index as the first CSV column.
data.to_csv("../../data/processed/modelC_dataset.csv", index=True)