In [1]:
import sys
from pathlib import Path

# 🔹 반드시 import 전에 sys.path 조정
root_path = Path().resolve().parent.parent
if str(root_path) not in sys.path:
    sys.path.insert(0, str(root_path))
from src.boosting import XGBoost
from src.imputation import knn_imputer as knn

import pandas as pd  
import numpy as np
from sklearn.metrics import mean_squared_error

from sklearn.datasets import load_iris
from typing import Union

def load_time_series_from_csv(
    file_path: str,
    skiprows: Union[range, list[int], int, None] = None,
    nrows: Union[int, None] = None,
    time_col: str = 'msur_dt',
    value_cols: Union[str, tuple[str, ...], list[str], dict[str, str]] = 'inflow_flux_tot',
    freq: str = 'min'
) -> Union[pd.Series, dict[str, pd.Series]]:
    """Read a slice of a CSV file and return its value columns as time series.

    The timestamp column is parsed with ``pd.to_datetime`` and turned into a
    ``DatetimeIndex`` carrying ``freq``; every returned series shares that index.

    Args:
        file_path: Path of the CSV file to read.
        skiprows: Forwarded to ``pd.read_csv``; ``None`` (default) skips nothing.
            Pass e.g. ``range(1, k + 1)`` to keep the header line and drop the
            first ``k`` data rows.
        nrows: Forwarded to ``pd.read_csv``; ``None`` (default) reads to the end.
        time_col: Name of the column holding the timestamps.
        value_cols: Which columns to extract.
            * ``str`` -> a single ``pd.Series`` is returned.
            * ``tuple`` / ``list`` of names -> ``{column_name: series}``.
            * ``dict`` -> ``{key: series}`` where each dict value is the CSV
              column name and each dict key is the name used in the result.
        freq: Frequency alias attached to the index.

    Returns:
        A ``pd.Series`` when ``value_cols`` is a string, otherwise a dict of
        ``pd.Series`` objects.

    Raises:
        TypeError: If ``value_cols`` is not a str, tuple, list or dict.
        KeyError: If ``time_col`` or a requested column is missing from the file.
        ValueError: Raised by pandas when the parsed timestamps do not conform
            to ``freq`` (for example, when rows are missing).
    """
    # Fail fast on an unsupported selector instead of parsing the file first.
    if not isinstance(value_cols, (str, tuple, list, dict)):
        raise TypeError("value_cols는 str, tuple[str, ...], list[str], dict[str, str] 중 하나여야 합니다.")

    df = pd.read_csv(file_path, skiprows=skiprows, nrows=nrows)
    df[time_col] = pd.to_datetime(df[time_col])
    dt_index = pd.DatetimeIndex(df[time_col], freq=freq)

    def _as_series(column: str) -> pd.Series:
        # .values drops the integer row index so the series is keyed by time only.
        return pd.Series(df[column].values, index=dt_index)

    if isinstance(value_cols, str):
        return _as_series(value_cols)

    if isinstance(value_cols, dict):
        # key: name used in the result, value: column name in the CSV
        return {key: _as_series(col) for key, col in value_cols.items()}

    # tuple or list: several columns, keyed by their own column names
    return {col: _as_series(col) for col in value_cols}

# Quick check of the project's XGBoost classifier wrapper on the iris dataset.
# NOTE(review): the return value is printed as an accuracy; confirm against
# src.boosting.XGBoost.train_xgboost_classifier.
X, y = load_iris(return_X_y=True)
acc = XGBoost.train_xgboost_classifier(X, y)
print(f"Accuracy: {acc:.4f}")



Accuracy: 1.0000


In [2]:
# Load the data
file_path = r'D:\dev\modules\pt_eh_inflow_data.csv'

# Number of leading data rows to skip (the header on line 0 is kept).
# NOTE(review): at 1-minute sampling 1440 rows are one day, so this is
# presumably 10 days + 2 weeks of data — confirm the intended window.
n_skipped_rows = 14400 + (1440 * 7) * 2

series = load_time_series_from_csv(
    file_path=file_path,  # reuse the variable instead of repeating the literal
    skiprows=range(1, 1 + n_skipped_rows),
    nrows=14,
    time_col='msur_dt',
    value_cols=('inflow_flux_tot', 'inflow_toc_load_fst', 'inflow_ss_dnsty_fst'),
    freq='min'
)

# Build the feature frame X_raw from the two feature series
X_raw = pd.DataFrame({
    'inflow_toc_load_fst': series['inflow_toc_load_fst'],
    'inflow_ss_dnsty_fst': series['inflow_ss_dnsty_fst']
})
print(X_raw)

# Target y: the quantity to predict
y = series['inflow_flux_tot'].to_numpy()


                     inflow_toc_load_fst  inflow_ss_dnsty_fst
msur_dt                                                      
2025-05-25 00:00:00              1075.88               441.63
2025-05-25 00:01:00              1076.18               442.61
2025-05-25 00:02:00              1075.12               441.36
2025-05-25 00:03:00              1076.64               441.33
2025-05-25 00:04:00              1088.77               441.75
2025-05-25 00:05:00              1093.05               444.34
2025-05-25 00:06:00              1102.68               448.70
2025-05-25 00:07:00              1109.69               450.09
2025-05-25 00:08:00              1121.81               454.30
2025-05-25 00:09:00              1129.87               457.55
2025-05-25 00:10:00              1143.18               462.81
2025-05-25 00:11:00              1165.14               463.50
2025-05-25 00:12:00              1168.46               464.07
2025-05-25 00:13:00              1162.49               459.86


In [3]:

# Fill missing values with the KNN imputer.
# Blank out three entries of the second column (positional rows 2, 3 and 5)
# so that there is something to impute.
# NOTE: this edits X_raw in place, exactly as before.
X_raw.iloc[[2, 3, 5], 1] = np.nan

# Run the project's KNN imputer using 3 neighbours
X_imputed = knn.knn_impute(X_raw, n_neighbors=3)

# Show the frame with the gaps, then the imputed result
print(X_raw)
print(X_imputed)

# NumPy array for the model
X = X_imputed.to_numpy()


                     inflow_toc_load_fst  inflow_ss_dnsty_fst
msur_dt                                                      
2025-05-25 00:00:00              1075.88               441.63
2025-05-25 00:01:00              1076.18               442.61
2025-05-25 00:02:00              1075.12                  NaN
2025-05-25 00:03:00              1076.64                  NaN
2025-05-25 00:04:00              1088.77               441.75
2025-05-25 00:05:00              1093.05                  NaN
2025-05-25 00:06:00              1102.68               448.70
2025-05-25 00:07:00              1109.69               450.09
2025-05-25 00:08:00              1121.81               454.30
2025-05-25 00:09:00              1129.87               457.55
2025-05-25 00:10:00              1143.18               462.81
2025-05-25 00:11:00              1165.14               463.50
2025-05-25 00:12:00              1168.46               464.07
2025-05-25 00:13:00              1162.49               459.86
        

In [4]:

# Train and evaluate.
# The value returned by the regressor helper is used as an RMSE below, so it is
# named and printed as such rather than as an "accuracy".
# NOTE(review): confirm that train_xgboost_regressor really returns RMSE.
rmse = XGBoost.train_xgboost_regressor(X, y)
acc = rmse  # alias kept so any later cell that still reads `acc` keeps working
print(f"RMSE: {rmse:.4f}")

# Express the error relative to the mean of the target, in percent
mean_y = np.mean(y)
error_ratio = (rmse / mean_y) * 100
print(f"평균 대비 RMSE 비율: {error_ratio:.2f}%")


Accuracy: 20.3047
평균 대비 RMSE 비율: 2.44%
