In [8]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm

class LSTMModel(nn.Module):
    def __init__(self, batch_size_, win_len_, input_dim_, hidden_dim_, output_dim_):
        super(LSTMModel, self).__init__()
        self.hidden_dim_ = hidden_dim_
        self.batch_size = batch_size_
        self.lstm = nn.LSTM(input_dim_, hidden_dim_, batch_first=True)
        self.fc = nn.Linear(hidden_dim_, output_dim_)

    def forward(self, x_):
        lstm_out, _ = self.lstm(x_)
        return self.fc(lstm_out[:, -1, :])

In [10]:
file_path = "LD2011_2014.txt"
shift_unit = 24 * 4 * 30
total_rows = sum(1 for _ in open(file_path, encoding='utf-8')) - 1  # 减去1是为了排除标题行

data = pd.read_csv(file_path, sep=';', header=None, skiprows=1, low_memory=False, nrows=70000)  # 1个月有2880行

print(data.head())

# 假设时间戳列为第一列（索引为0）
timestamp_column = 0

# 将时间戳列转换为 pandas 的 datetime 类型
data[data.columns[timestamp_column]] = pd.to_datetime(data.iloc[:, timestamp_column])

# 去除数据中的逗号并转换为浮点数
data[data.columns[1:]] = data[data.columns[1:]].replace(',', '', regex=True).astype(float)

# 计算相对时间值（以分钟为单位）
# reference_time = data.iloc[0, timestamp_column]  # 参考时间点，这里选择第一个时间戳作为参考
# data['relative_time'] = (data.iloc[:, timestamp_column] - reference_time).dt.total_seconds() / 60 / 15

# 移除原始时间戳列
data = data.drop(columns=[timestamp_column])

# Extract the target variable (electricity consumption)
data_rolled = data.iloc[:, 4:7].rolling(window=shift_unit, min_periods=1).sum().shift(-shift_unit)
# To convert values in kWh values must be divided by 4.
target = data_rolled.iloc[:-shift_unit].sum(axis=1).values / 4
data = data.iloc[:-shift_unit].values

# Normalize the data
scaler = MinMaxScaler()
data_normalized = scaler.fit_transform(data)

window_size = 3000  # 滑动窗口的大小

# 使用 sliding_window_view 函数创建滑动窗口的视图
windowed_data = np.lib.stride_tricks.sliding_window_view(data_normalized, (window_size, data.shape[1])).squeeze(1)
windowed_target = target[window_size-1:]

batch_size = 64

X_train, X_test, y_train, y_test = train_test_split(windowed_data, windowed_target, test_size=0.2, random_state=42)

train_dataset = TensorDataset(torch.Tensor(X_train), torch.Tensor(y_train))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(torch.Tensor(X_test), torch.Tensor(y_test))
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Set the hyperparameters
input_dim = data.shape[1]  # Number of input features (excluding the timestamp column)
hidden_dim = 64  # Number of hidden units
output_dim = 1  # Number of output predictions
num_epochs = 1
learning_rate = 0.001

# Create the model
model = LSTMModel(batch_size, window_size, input_dim, hidden_dim, output_dim)

# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    pbar = tqdm(train_dataloader, desc='Training Progress', leave=False)
    pbar.set_description(f'Epoch {epoch + 1}/{num_epochs}')

    for batch_data in pbar:
        # Forward pass
        x, y = batch_data
        train_outputs = model(x)
        loss = criterion(train_outputs, y.unsqueeze(1))

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        pbar.set_postfix({'Loss': loss.item()})
        pbar.update()


                   0   1   2   3   4   5   6   7   8   9    ... 361  362  363  \
0  2011-01-01 00:15:00   0   0   0   0   0   0   0   0   0  ...   0  0.0    0   
1  2011-01-01 00:30:00   0   0   0   0   0   0   0   0   0  ...   0  0.0    0   
2  2011-01-01 00:45:00   0   0   0   0   0   0   0   0   0  ...   0  0.0    0   
3  2011-01-01 01:00:00   0   0   0   0   0   0   0   0   0  ...   0  0.0    0   
4  2011-01-01 01:15:00   0   0   0   0   0   0   0   0   0  ...   0  0.0    0   

  364 365  366 367 368 369 370  
0   0   0    0   0   0   0   0  
1   0   0    0   0   0   0   0  
2   0   0    0   0   0   0   0  
3   0   0    0   0   0   0   0  
4   0   0    0   0   0   0   0  

[5 rows x 371 columns]


MemoryError: Unable to allocate 424. GiB for an array with shape (51296, 3000, 370) and data type float64

In [3]:
# Evaluation
model.eval()

y_true = []  # 存储真实标签值
y_pred = []  # 存储预测值

# 禁用梯度计算
with torch.no_grad():
    for batch_data in test_dataloader:
        x, y = batch_data
        outputs = model(x)

        # 将预测值和真实值添加到列表中
        y_true.extend(y.tolist())
        y_pred.extend(outputs.squeeze().tolist())

# 转换为NumPy数组
y_true = np.array(y_true)
y_pred = np.array(y_pred)


判断均方根误差（Root Mean Square Error，RMSE）是否可接受的标准通常是与具体的应用场景和数据特征相关的。没有一个通用的阈值来衡量是否可接受，因为它取决于问题的上下文和预期的误差范围。一种常见的方法是将RMSE与目标变量的范围进行比较。如果RMSE远远小于目标变量的范围，那么可以认为结果是可接受的。另一种方法是与先前的模型或基准模型进行比较，如果新模型的RMSE显著优于先前的模型或基准模型，那么结果可以认为是可接受的。此外，还应该考虑问题的实际需求和对误差的容忍度。有些应用可能对误差非常敏感，需要较低的RMSE值，而其他应用可能对误差更容忍，可以接受较高的RMSE值。

判断平均绝对误差（Mean Absolute Error，MAE）是否可接受的标准也通常与具体的应用场景和数据特征相关。与均方根误差（RMSE）类似，没有一个通用的阈值来衡量是否可接受，因为它取决于问题的上下文和预期的误差范围。一种常见的方法是将MAE与目标变量的范围进行比较。如果MAE远远小于目标变量的范围，那么可以认为结果是可接受的。另一种方法是与先前的模型或基准模型进行比较，如果新模型的MAE显著优于先前的模型或基准模型，那么结果可以认为是可接受的。与RMSE类似，还应该考虑问题的实际需求和对误差的容忍度。有些应用可能对误差非常敏感，需要较低的MAE值，而其他应用可能对误差更容忍，可以接受较高的MAE值。

决定系数（Coefficient of Determination），也称为R-squared（R²），用于评估回归模型的拟合优度。它表示模型能够解释目标变量方差的比例，取值范围从0到1，越接近1表示模型拟合得越好，越接近0表示模型拟合较差。对于决定系数，通常没有一个固定的阈值来判断结果是否可接受，因为它也取决于具体的应用场景和数据特征。一般来说，较高的决定系数意味着模型能够较好地解释目标变量的变异性，而较低的决定系数则表示模型的解释能力较弱。

In [9]:
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
print(f'均方根误差: {rmse}')

mae = mean_absolute_error(y_true, y_pred)
print(f'平均绝对误差: {mae}')

r2 = r2_score(y_true, y_pred)
print(f'决定系数: {r2}')

均方根误差: 0.39495930109779853
平均绝对误差: 0.39418345001008775
决定系数: 0.0
