# Steel Industry Energy Consumption × PyPOTS 集成示例

本笔记本以 Kaggle 数据集 `csafrit2/steel-industry-energy-consumption` 为例，演示如何：

1. 使用 `kagglehub` 拉取数据；
2. 构造滑动窗口时间序列样本；
3. 人工掩蔽部分观测值，使用 PyPOTS 的 SAITS 模型进行缺失值重构，并在被掩蔽的位置上评估重构误差（MAE）。

你可以在此基础上扩展异常检测、特征聚类或源荷耦合建模。


In [None]:
# 如运行于 Colab 或全新环境，请先安装依赖
# !pip install -q kagglehub[pandas-datasets] pypots

import numpy as np
import pandas as pd
from kagglehub import KaggleDatasetAdapter
import kagglehub


In [None]:
# Load the dataset into a pandas DataFrame (the latest version is fetched by
# default).
#
# The pandas adapter chooses its reader from the file extension, so the third
# argument must name a concrete file inside the dataset; an empty string cannot
# be resolved to a reader. The argument is positional (its parameter name is
# `path`, not `file_path`).
DATASET_HANDLE = "csafrit2/steel-industry-energy-consumption"
# NOTE(review): file name taken from the dataset's Kaggle page; confirm against
# the dataset's file listing if the download fails.
DATASET_FILE = "Steel_industry_data.csv"

# `dataset_load` is the current name of the deprecated `load_dataset`.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    DATASET_HANDLE,
    DATASET_FILE,
)

display(df.head())
print(df.columns)


In [None]:
# Basic cleaning: locate the timestamp column and the numeric feature columns.
df = df.copy()

# Treat the first column whose name contains "date" or "time" as the timestamp.
time_col_candidates = [col for col in df.columns if 'date' in col.lower() or 'time' in col.lower()]
time_col = time_col_candidates[0] if time_col_candidates else None
if time_col:
    # NOTE(review): this dataset stores timestamps day-first
    # ("DD/MM/YYYY HH:MM") -- confirm on df.head(). Without dayfirst=True
    # pandas assumes month-first, which either swaps day and month for days
    # 1-12 or fails on the first day greater than 12, depending on the version.
    df[time_col] = pd.to_datetime(df[time_col], dayfirst=True)
    # Sliding windows built later assume chronological row order.
    df = df.sort_values(time_col)

# Collect the numeric feature columns; the timestamp column is never a feature.
numeric_cols = [
    col
    for col in df.select_dtypes(include=[np.number]).columns
    if col != time_col
]

# If no numeric column is detected, list the features by hand, e.g.:
# numeric_cols = ['Usage_kWh', 'Lagging_Current_Reactive.Power_kVarh', ...]
print('Numeric columns:', numeric_cols)

# Coerce every feature to a numeric dtype. Unparseable entries become NaN and
# are deliberately left unfilled: they stay in the data as missing values.
data = df[numeric_cols].apply(pd.to_numeric, errors='coerce')


In [None]:
# Build sliding-window sequences
def build_sequences(values, n_steps=24, stride=1):
    """Cut a time-ordered array into fixed-length, possibly overlapping windows.

    Args:
        values: Array-like whose first axis is time. Any trailing axes
            (for example a feature axis) are carried through unchanged.
        n_steps: Number of consecutive time steps per window; must be >= 1.
        stride: Offset, in time steps, between the starts of two consecutive
            windows; must be >= 1.

    Returns:
        A new, writable array of shape ``(n_windows, n_steps, *values.shape[1:])``
        with the same dtype as ``values``. ``n_windows`` is 0 when ``values``
        holds fewer than ``n_steps`` rows.

    Raises:
        ValueError: If ``n_steps`` or ``stride`` is smaller than 1, or if
            ``values`` has no time axis (is 0-dimensional).
    """
    values = np.asarray(values)
    if values.ndim < 1:
        raise ValueError("values must have at least one (time) dimension")
    if n_steps < 1:
        raise ValueError(f"n_steps must be >= 1, got {n_steps}")
    if stride < 1:
        raise ValueError(f"stride must be >= 1, got {stride}")

    # Too short for even one window: return an empty array that still has the
    # right trailing shape and dtype so downstream code sees a consistent type.
    if values.shape[0] < n_steps:
        return np.empty((0, n_steps) + values.shape[1:], dtype=values.dtype)

    # One window per valid start index, then keep every `stride`-th window.
    windows = np.lib.stride_tricks.sliding_window_view(values, n_steps, axis=0)[::stride]
    # sliding_window_view appends the window axis last; move it next to the
    # sample axis. The copy detaches the result from the read-only view so
    # callers can modify it without touching `values`.
    return np.moveaxis(windows, -1, 1).copy()

# Window length: number of consecutive rows that make up one sample.
n_steps = 24

# Dense float32 matrix of the cleaned features, one row per record.
values = data.to_numpy(dtype=np.float32)

# Slide the window one row at a time over the whole series.
X = build_sequences(values, n_steps, 1)

print('X shape:', X.shape)


In [None]:
# Inject artificial missing values (for demonstration) and split the windows
# into train / validation / test sets.
rng = np.random.default_rng(42)  # fixed seed keeps the mask reproducible
missing_rate = 0.1

# Each entry is hidden independently with probability `missing_rate`.
mask = rng.random(X.shape) < missing_rate
X_missing = X.copy()
X_missing[mask] = np.nan

# Contiguous 70% / 15% / 15% split along the sample axis.
# NOTE(review): the windows are built with stride=1, so samples on either side
# of a split boundary share most of their time steps -- the splits are not
# fully independent.
n_samples = X.shape[0]
train_end, val_end = (int(n_samples * fraction) for fraction in (0.7, 0.85))
train_part = slice(None, train_end)
val_part = slice(train_end, val_end)
test_part = slice(val_end, None)

# Training only sees the corrupted data; validation and test also keep the
# uncorrupted windows under "X_ori".
train_set = dict(X=X_missing[train_part])
val_set = dict(X=X_missing[val_part], X_ori=X[val_part])
test_set = dict(X=X_missing[test_part], X_ori=X[test_part])


In [None]:
# Missing-value reconstruction with PyPOTS SAITS
from pypots.imputation import SAITS

# Read the window length and feature count off the training tensor so the
# model always matches the data it is given.
_, n_steps, n_features = train_set["X"].shape

saits_config = dict(
    n_steps=n_steps,
    n_features=n_features,
    n_layers=2,
    d_model=128,
    d_ffn=128,
    n_heads=4,
    d_k=32,
    d_v=32,
    dropout=0.1,
    epochs=10,
    saving_path="outputs/saits_kaggle",
    model_saving_strategy="best",
)
model = SAITS(**saits_config)

model.fit(train_set, val_set)
# Only the corrupted windows are passed in; the ground truth stays outside.
imputed = model.impute({"X": test_set["X"]})

# Score the reconstruction only where a value was hidden artificially, i.e.
# missing in the model input but present in the ground truth.
hidden_in_input = np.isnan(test_set["X"])
known_in_truth = ~np.isnan(test_set["X_ori"])
test_mask = hidden_in_input & known_in_truth

abs_error = np.abs(imputed - test_set["X_ori"])
mae = abs_error[test_mask].mean()
print('MAE on masked positions:', mae)
