In [1]:
import pandas as pd
import numpy as np
import warnings
import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
from lightning.pytorch.loggers import TensorBoardLogger
from pytorch_forecasting import (
    TemporalFusionTransformer,
    TimeSeriesDataSet,
    GroupNormalizer,
    MAE,
    QuantileLoss
)

# Columns used as the grouping key throughout the notebook
# (groupby keys and the TimeSeriesDataSet `group_ids`).
group_cols = [
    "station_location",
    "evse_name",
]

In [2]:
# Load the raw dataset from the working directory.
data_full = pd.read_csv('data_full.csv')

# Replace missing values in every numeric column with 0.
num_cols = data_full.select_dtypes(include=["number"]).columns
data_full[num_cols] = data_full[num_cols].fillna(0)

# Timestamp columns that are converted to integers when they arrive as text.
datetime_cols = [
    "connection_start_time_ts", "last_charge_end_time_ts", "charging_end_time_ts",
    "connection_end_time_ts", "expected_departure_time_ts"
]

for col in datetime_cols:
    # Only object-dtype columns are converted; columns that are absent or
    # already non-object are left untouched.
    if col in data_full.columns and data_full[col].dtype == 'object':
        parsed = pd.to_datetime(data_full[col])
        # astype('int64') maps NaT to the int64 minimum sentinel, which would
        # reach the model as a huge negative feature value. These columns were
        # still object dtype when the numeric fillna(0) above ran, so missing
        # timestamps are set to 0 here to follow the same fill policy.
        data_full[col] = parsed.astype('int64').where(parsed.notna(), 0)

# NOTE: categorical columns (duration_per_kwh_missing, kwh_per_usage_time_missing,
# evse_type_y, supports_discharge_y, scheduled_charge) are not filled here.
# The fillna('Missing') calls that used to be commented out at this point never
# assigned their result back, so they would have been no-ops even if enabled.

In [3]:
import pandas as pd

# Build an integer time index per group, ordered chronologically.
# The timestamp is parsed before sorting: ordering the unparsed column would
# compare raw values (lexicographically, if they are text), which is only
# chronological for some timestamp formats.
data_full["store_timestamp_dt"] = pd.to_datetime(data_full["store_timestamp"])
data_full = data_full.sort_values(group_cols + ["store_timestamp_dt"]).copy()
# cumcount numbers the rows 0, 1, 2, ... inside each group in the sorted order.
data_full["time_idx"] = data_full.groupby(group_cols).cumcount().astype(int)


In [4]:
# Train/validation split point, derived from the most recent timestamp.
data_full["store_timestamp_dt"] = pd.to_datetime(data_full["store_timestamp"])

# Hold-out window: 30 * 24 * 2 = 1440 hours, i.e. 60 days.
# NOTE(review): a later cell uses the same 30 * 24 * 2 expression as a count of
# 30-minute steps (30 days) - confirm that a 60-day window is intended here.
max_prediction_length_hours = 2 * 24 * 30
max_prediction_length_td = pd.Timedelta(max_prediction_length_hours, unit="h")

# Cutoff instant = latest timestamp in the data minus the hold-out window.
latest_timestamp = data_full["store_timestamp_dt"].max()
training_cutoff = latest_timestamp - max_prediction_length_td



In [5]:
# Static categorical columns. group_cols already contains station_location and
# evse_name, so dict.fromkeys is used to drop the repeated names while keeping
# first-seen order (each column is then processed exactly once).
static_cat_cols = list(dict.fromkeys(
    group_cols + ["station_location", "evse_name", "evse_type", "supports_discharge"]
))
cat_cols = list(dict.fromkeys(static_cat_cols + ["month", "weekday"]))

# Cast every categorical column that exists in the data to str.
for col in cat_cols:
    if col in data_full.columns:
        data_full[col] = data_full[col].astype(str)

# Split into training / validation by timestamp.
training_data = data_full[data_full["store_timestamp_dt"] <= training_cutoff].copy()
validation_data = data_full[data_full["store_timestamp_dt"] > training_cutoff].copy()

# Drop validation rows whose category value never occurs in the training data.
# All per-column conditions are combined into one mask and applied once; the
# existence guard mirrors the one used for the str cast above.
keep_mask = np.ones(len(validation_data), dtype=bool)
for col in static_cat_cols:
    if col in data_full.columns:
        seen_in_training = training_data[col].unique()
        keep_mask &= validation_data[col].isin(seen_in_training).to_numpy()
validation_data = validation_data[keep_mask].copy()


In [6]:
# Sanity check: for each static categorical column, print the categories that
# occur in the validation data but not in the training data.
# dict.fromkeys removes the names that group_cols already contributes, so each
# column is reported once instead of twice.
for col in dict.fromkeys(group_cols + ["station_location", "evse_name", "evse_type", "supports_discharge"]):
    unseen = set(validation_data[col].unique()) - set(training_data[col].unique())
    print(f"검증 데이터에만 있지만 학습 데이터에는 없는 {col} 범주:", unseen)


검증 데이터에만 있지만 학습 데이터에는 없는 station_location 범주: set()
검증 데이터에만 있지만 학습 데이터에는 없는 evse_name 범주: set()
검증 데이터에만 있지만 학습 데이터에는 없는 station_location 범주: set()
검증 데이터에만 있지만 학습 데이터에는 없는 evse_name 범주: set()
검증 데이터에만 있지만 학습 데이터에는 없는 evse_type 범주: set()
검증 데이터에만 있지만 학습 데이터에는 없는 supports_discharge 범주: set()


In [7]:
# Recompute the per-group integer time index using the parsed timestamps.
data_full["store_timestamp_dt"] = pd.to_datetime(data_full["store_timestamp"])
data_full = data_full.sort_values(group_cols + ["store_timestamp_dt"]).copy()
data_full["time_idx"] = data_full.groupby(group_cols).cumcount().astype(int)

# training_data / validation_data were copied out of data_full before this
# cell, so without the two assignments below they would keep the older
# time_idx and this recomputation would never reach the model.
# Series assignment aligns on the row index, which the sort / filter / copy
# steps preserve (assumes the default unique index produced by read_csv).
training_data["time_idx"] = data_full["time_idx"]
validation_data["time_idx"] = data_full["time_idx"]

In [8]:
# All lengths below are expressed in 30-minute time steps.
STEPS_PER_HOUR = 2
HOURS_PER_DAY = 24

# Encoder window: at most the past 24 hours (48 steps), at least half of that (24 steps).
max_encoder_length = HOURS_PER_DAY * STEPS_PER_HOUR
min_encoder_length = max_encoder_length // 2

# Prediction horizon: up to 30 days (1440 steps), down to a single step.
max_prediction_length = 30 * HOURS_PER_DAY * STEPS_PER_HOUR
min_prediction_length = 1


In [None]:

# --------------------
# Build the TimeSeriesDataSet objects
# --------------------
# Columns present in the frame that is actually handed to the dataset.
available_cols = set(training_data.columns)

# Each optional feature is kept only if its own column exists. Previously the
# three static reals were all enabled based on the presence of
# "usage_departure_range" alone, so a missing sibling column would break
# dataset construction.
static_real_cols = [
    col for col in ["usage_departure_range", "post_charge_departure_range", "cluster"]
    if col in available_cols
]
unknown_real_cols = [
    col for col in [
        "connection_start_time_ts", "last_charge_end_time_ts", "charging_end_time_ts", "connection_end_time_ts", "expected_departure_time_ts",
        "expected_departure_time_missing", "idle_time_ts", "expected_usage_duration_ts", "expected_usage_duration_missing",
        "expected_time_diff_ts", "expected_time_diff_missing", "actual_usage_duration_ts", "actual_charging_duration_ts",
        "actual_charging_duration_missing", "start_delay_duration_ts", "start_delay_duration_missing",
        "post_charge_departure_delay_ts", "post_charge_departure_delay_missing",
        "usage_departure_time_diff_ts", "usage_departure_time_diff_missing",
        "delivered_kwh", "requested_kwh", "kwh_request_diff", "kwh_per_usage_time"
    ] if col in available_cols
]

training = TimeSeriesDataSet(
    # Defensive re-filter: training_data is expected to already satisfy this.
    training_data[lambda x: x.store_timestamp_dt <= training_cutoff],
    time_idx="time_idx",                   # integer index instead of the datetime column
    target="requested_kwh",
    group_ids=group_cols,
    min_encoder_length=max_encoder_length // 2,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=["station_location", "evse_name", "evse_type", "supports_discharge"],
    static_reals=static_real_cols,
    # time_varying_known_categoricals=["month", "weekday"],
    time_varying_known_reals=[],
    time_varying_unknown_reals=unknown_real_cols,
    target_normalizer=GroupNormalizer(groups=group_cols, transformation="softplus"),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    allow_missing_timesteps=True,
)

# The validation set reuses the training set's configuration and encoders.
# NOTE(review): validation_data only holds rows after the cutoff, so encoder
# history for validation samples can only come from those rows - confirm this
# is intended rather than passing the full frame.
validation = TimeSeriesDataSet.from_dataset(training, validation_data, predict=True, stop_randomization=True)

# --------------------
# Dataloaders
# --------------------
batch_size = 128
loader_workers = 4

train_dataloader = training.to_dataloader(
    train=True,
    batch_size=batch_size,
    num_workers=loader_workers,
)

# Validation uses a batch ten times larger than training.
val_batch_size = batch_size * 10
val_dataloader = validation.to_dataloader(
    train=False,
    batch_size=val_batch_size,
    num_workers=loader_workers,
)

# --------------------
# Trainer & model definition
# --------------------
logger = TensorBoardLogger("lightning_logs")
lr_logger = LearningRateMonitor()
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, mode="min")

# NOTE(review): with max_epochs=1 the early-stopping patience of 10 can never
# be reached - confirm whether this is a deliberate smoke-test setting.
trainer_options = dict(
    max_epochs=1,
    accelerator="cuda",
    gradient_clip_val=0.1,
    enable_model_summary=True,
    callbacks=[lr_logger, early_stop_callback],
    logger=logger,
)
trainer = pl.Trainer(**trainer_options)

# Model hyperparameters; the network layout itself is derived from `training`.
tft_hparams = dict(
    learning_rate=0.03,
    hidden_size=16,
    attention_head_size=2,
    dropout=0.1,
    hidden_continuous_size=8,
    loss=QuantileLoss(),
    optimizer="ranger",
    reduce_on_plateau_patience=4,
)
tft = TemporalFusionTransformer.from_dataset(training, **tft_hparams)

# --------------------
# Training
# --------------------
trainer.fit(
    model=tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

# --------------------
# Prediction and evaluation
# --------------------
# Reload the checkpoint that the trainer's checkpoint callback recorded as best.
best_model_path = trainer.checkpoint_callback.best_model_path
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

# return_y=True makes the result carry the targets next to the model output.
predict_kwargs = dict(return_y=True, trainer_kwargs=dict(accelerator="cuda"))
predictions = best_tft.predict(val_dataloader, **predict_kwargs)

# Mean absolute error between the model output and the returned targets.
mae_value = MAE()(predictions.output, predictions.y).item()
print("MAE:", mae_value)


💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\user\.conda\envs\largeGarbage\Lib\site-packages\lightning\pytorch\utilities\parsing.py:209: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
c:\Users\user\.conda\envs\largeGarbage\Lib\site-packages\lightning\pytorch\utilities\parsing.py:209: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                               | Type                

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\user\.conda\envs\largeGarbage\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:420: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.
c:\Users\user\.conda\envs\largeGarbage\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:420: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Training: |          | 0/? [00:00<?, ?it/s]


Detected KeyboardInterrupt, attempting graceful shutdown ...


In [None]:
# Print the model output first, then the `y` returned alongside it.
for shown in (predictions.output, predictions.y):
    print(shown)

tensor([[34.9355, 37.9210, 35.8729, 40.0564, 34.3832, 34.4398, 39.4118, 36.7339,
         25.6542, 37.7875, 36.5588, 36.3540, 38.3027, 37.4756, 40.3747, 32.6620,
         36.2315, 37.2804, 22.2992, 39.5477]], device='cuda:0')
(tensor([[18.5000, 22.2100, 19.8400, 34.0100, 43.8200, 15.5000, 27.9900, 34.2600,
         25.9500, 40.6600, 20.3700, 26.1900, 12.6100, 22.2000, 10.8500, 11.8000,
         15.8100, 17.9500, 12.2500, 28.5100]], device='cuda:0'), None)


: 