In [1]:
import json, torch
import lightning  as L
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from torch.nn import functional as F
import csv
from collections import defaultdict
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
import numpy as np
import pickle
import pandas as pd
import pprint

  from .autonotebook import tqdm as notebook_tqdm


#### load cwb data

In [2]:
from collections import defaultdict
import numpy as np
import csv
from scipy.interpolate import interp1d

# 定义月份和特征
months = ['01','02','03','04','05','06','07','08', '09', '10']
features = ['rain', 'raintime', 'solarpower', 'suntime', 'temp', 'uv']

# 初始化 cwb_data_dict，四层嵌套：特征 -> 月份 -> 天数 -> 时间索引
# 时间索引初始为小时（0-23），插值后为分钟（0-1439）
cwb_data_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float))))

# 读取气象数据
for month in months:
    for feature in features:
        csv_path = f'../cwbdata/{month}/{feature}-{month}.csv'
        try:
            with open(csv_path, 'r', encoding='utf-8') as csv_file:
                reader = csv.reader(csv_file)
                for row in reader:
                    if row[0].isdigit():
                        day = int(row[0])
                        for hour_index, value in enumerate(row[1:], start=0):
                            if hour_index < 24:
                                try:
                                    cwb_data_dict[feature][int(month)][day][hour_index] = float(value)
                                except (ValueError, TypeError):
                                    cwb_data_dict[feature][int(month)][day][hour_index] = None  # 标记为无效值
        except FileNotFoundError:
            print(f"文件未找到: {csv_path}")

# 初始化插值后的数据字典
cwb_data_interp_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list))))

def interpolate_minute_data(hourly_values):
    """
    对一天的24小时数据进行每分钟插值，返回1440分钟的浮点数列表。
    使用三次样条插值，并确保插值后的值不为负。
    """
    # 已知的分钟位置（每小时的第30分钟）
    known_minutes = np.array([h * 60 + 30 for h in range(24)])
    known_values = np.array(hourly_values)
    
    # 创建所有分钟的索引
    all_minutes = np.arange(1440)
    
    # 处理缺失值后进行插值
    # 使用三次样条插值
    try:
        # 创建插值函数
        spline = interp1d(known_minutes, known_values, kind='cubic', fill_value="extrapolate")
        interpolated = spline(all_minutes)
    except Exception as e:
        print(f"插值失败: {e}")
        interpolated = np.zeros(1440)
    
    # 确保插值后的值不为负
    interpolated = np.clip(interpolated, a_min=0, a_max=None)
    
    return interpolated.tolist()

# 处理每个特征、每个月、每天的数据
for feature in features:
    for month in range(1, 11):  # months '01' to '10' correspond to 1 to 10
        for day in cwb_data_dict[feature][month]:
            # 获取24小时的值
            hourly_values = [cwb_data_dict[feature][month][day].get(hour, None) for hour in range(24)]
            
            # 填充 None 值
            for hour in range(24):
                if hourly_values[hour] is None:
                    # 查找前一个有效值
                    prev = None
                    for h in range(hour - 1, -1, -1):
                        if hourly_values[h] is not None:
                            prev = hourly_values[h]
                            break
                    # 查找后一个有效值
                    next_val = None
                    for h in range(hour + 1, 24):
                        if hourly_values[h] is not None:
                            next_val = hourly_values[h]
                            break
                    # 计算平均值
                    if prev is not None and next_val is not None:
                        hourly_values[hour] = (prev + next_val) / 2
                    elif prev is not None:
                        hourly_values[hour] = prev
                    elif next_val is not None:
                        hourly_values[hour] = next_val
                    else:
                        hourly_values[hour] = 0.0  # 无有效值，填充0.0
            
            # 进行插值
            interpolated_minutes = interpolate_minute_data(hourly_values)
            
            # 将1440分钟的数据分成24个小时，每小时60分钟
            for hour in range(24):
                start_min = hour * 60
                end_min = start_min + 60
                cwb_data_interp_dict[feature][month][day][hour] = [cwb_data_dict[feature][month][day][hour]] * 60
                # cwb_data_interp_dict[feature][month][day][hour] = interpolated_minutes[start_min:end_min]

# 确保插值后的数据没有 None
for feature in features:
    for month in range(1, 11):
        for day in cwb_data_interp_dict[feature][month]:
            for hour in range(24):
                minute_data = cwb_data_interp_dict[feature][month][day][hour]
                if any([x is None for x in minute_data]):
                    # 进一步处理可能的 None（理论上已处理完毕）
                    cwb_data_interp_dict[feature][month][day][hour] = [0.0 if x is None else x for x in minute_data]

# 打印一个示例
print(cwb_data_interp_dict['rain'][1][1][1])  # 1月1日的1点的rain数据，60个浮点数


[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [3]:
class Cwb2LocModel(L.LightningModule):
    def __init__(self, input_dim=7, hidden_dim=128, num_layers=2, output_dim=1, learning_rate=1e-3, delta=1.0):
        super(Cwb2LocModel, self).__init__()
        self.save_hyperparameters()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.delta = delta  # 動態設置 delta

        # 定義 LSTM 層
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=0.15, bidirectional=True)

        # 定義全連接層
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim // 4)
        self.fc2 = nn.Linear(hidden_dim // 4, input_dim)
        self.fc3 = nn.Linear(input_dim, 1)

        # 定義損失函數（Huber Loss）
        # self.criterion = nn.HuberLoss(delta=self.delta)
        # self.criterion = nn.MSELoss()
        self.criterion = nn.L1Loss()
        # self.criterion = log_cosh_loss

    def log_cosh_loss(y_pred, y_true):
        loss = torch.mean(torch.log(torch.cosh(y_pred - y_true)))
        return loss

    def forward(self, x):
        # 初始化隱藏狀態和細胞狀態
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_dim).to(self.device)
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_dim).to(self.device)
        residual = x
        
        # 前向傳播 LSTM
        out, _ = self.lstm(x, (h0, c0))

        # 通過全連接層得到最終輸出
        out = self.fc1(out)
        # out = F.relu(out)
        out = self.fc2(out)
        out += residual
        out = self.fc3(out)
        
        return out

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.criterion(y_hat, y)  # 使用 Huber Loss
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.criterion(y_hat, y)  # 使用 Huber Loss
        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        optimizer = optim.AdamW(
            self.parameters(),
            lr=self.hparams.learning_rate,  # 初始学习率
            weight_decay=1e-2               # 权重衰减
        )
        
        # 定义调度器
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',             # 目标是最小化验证损失
            factor=0.8,             # 学习率每次调整的倍率
            patience=10,            # 验证损失未改善的 epoch 数
            threshold=1e-4,         # 改善的阈值
            cooldown=5,             # 调整后等待的冷却时间
            min_lr=1e-6,            # 学习率的下限
            verbose=True            # 打印学习率变化信息
        )
        
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "monitor": "val_loss",  # 监控的指标
                "frequency": 1          # 每个 epoch 检查一次
            }
        }



In [4]:
model_path = f'./saved_models_v3_3/best-checkpoint-AA3.ckpt'
model = Cwb2LocModel.load_from_checkpoint(model_path)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)  # 將模型移動到 GPU（如果可用）
model.eval()  # 切換模型到評估模式

Cwb2LocModel(
  (lstm): LSTM(23, 128, num_layers=2, batch_first=True, dropout=0.15, bidirectional=True)
  (fc1): Linear(in_features=256, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=23, bias=True)
  (fc3): Linear(in_features=23, out_features=1, bias=True)
  (criterion): L1Loss()
)

In [5]:
df = pd.read_csv('./up.csv')
pred_date_list = []
# pred_start_time = [9, 0]
# pred_end_time = [16, 59]
for seq_id in df['序號']:
    seq_id = str(seq_id)
    new = (int(seq_id[4:6]), int(seq_id[6:8]), int(seq_id[-2:]))
    if new not in pred_date_list:
        pred_date_list.append(new)
# for s in pred_date_list:
#     print(s)


In [6]:
max_len = 65
start_time = [6, 30]
end_time = [17, 19]

In [7]:
NUM_LOCS = 17

In [8]:
def one_hot_encode(loc, num_classes):
    one_hot = [0] * num_classes
    one_hot[int(loc) - 1] = 1  # 位置编号从1开始
    return one_hot

def extract_features(cwb_data_interp_dict, features, location, month, day, hour, minute):
    try:
        newx = [cwb_data_interp_dict[feature][month][day][hour][minute] for feature in features]
        # 添加位置的 One-Hot 编码
        loc_one_hot = one_hot_encode(location, NUM_LOCS)
        newx.extend(loc_one_hot)
    except Exception as e:
        print(f"Error at {location}-{month}-{day} {hour}:{minute}: {e}")
        # 如果出错，返回长度为 len(features) + NUM_LOCS 的零向量
        newx = [0.0] * (len(features) + NUM_LOCS)
    return newx

def collate_date_data(month, day, location, features):
    data = []
    for hour in range(start_time[0], end_time[0] + 1):
        minute_start = start_time[1] if hour == start_time[0] else 0
        minute_end = end_time[1]+1 if hour == end_time[0] else 60

        for start_minute in range(minute_start, minute_end, 10):  # 每10分钟一组
            group_x = []

            for minute in range(start_minute, start_minute + 10):
                if minute >= minute_end:
                    break
                # 提取特征数据
                newx = extract_features(cwb_data_interp_dict, features, location, month, day, hour, minute)
                # 提取目标数据
                group_x.append(newx)

                # 计算该时间段的平均值
            averaged_x = np.mean(group_x, axis=0).tolist()
            data.append(averaged_x)
    return data

In [9]:
pred_date_data = []
for month, day, loc in pred_date_list:
    # print(month, day, loc)
    pred_date_data.append(collate_date_data(month, day, loc, features))
for _i, data in enumerate(pred_date_data):
    if len(data) != 65:
        print(_i)
    for _j, feature in enumerate(data):
        if len(feature) != 23:
            print(_j)

In [10]:
print(pred_date_data[0])

[[0.0, 0.0, 0.009999999999999998, 0.0, 16.900000000000002, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.009999999999999998, 0.0, 16.900000000000002, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.009999999999999998, 0.0, 16.900000000000002, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.18999999999999997, 0.0, 17.400000000000002, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.18999999999999997, 0.0, 17.400000000000002, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.18999999999999997, 0.0, 17.400000000000002, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.18999999999999997, 0.0, 17.400000000000002, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0

In [11]:
# 加載標準化器
with open(f'./scalar_v3_3/x_scaler-AA3.pkl', 'rb') as f:
    x_scaler = pickle.load(f)
with open(f'./scalar_v3_3/y_scaler-AA3.pkl', 'rb') as f:
    y_scaler = pickle.load(f)

In [14]:
def get_single_day_output(x, x_scaler, y_scaler, device):
    # 转为 NumPy 数组
    x = np.array(x, dtype=np.float32)
    # print(x.shape)
    # 特征和位置的索引
    feature_indices = list(range(len(features)))
    # print(len(features))
    loc_indices = list(range(len(features), len(features) + NUM_LOCS))
    # print(NUM_LOCS)
    # 对特征部分进行标准化，排除 loc 的 One-Hot 编码部分
    x_features = x[:, feature_indices]
    x_locs = x[:, loc_indices]

    # 标准化特征部分
    x_features_scaled = x_scaler.transform(x_features)

    # 将标准化的特征和 One-Hot 编码的 loc 合并
    x_scaled = np.hstack([x_features_scaled, x_locs])

    # 转为 PyTorch 张量，移动到指定设备
    x_tensor = torch.tensor(x_scaled, dtype=torch.float32).to(device).unsqueeze(0)  # [batch=1, seq_len, features]

    # 确保模型处于评估模式
    model.eval()
    with torch.no_grad():
        # 模型预测
        y_pred = model(x_tensor)  # [batch=1, seq_len, 1]
        y_pred = y_pred.squeeze(0).cpu().numpy()  # [seq_len, 1]

    # 逆标准化
    y_pred = y_scaler.inverse_transform(y_pred)

    return y_pred.flatten().tolist()[15:63]


In [15]:
get_single_day_output(pred_date_data[0], x_scaler, y_scaler, device)

[78.16989135742188,
 76.76528930664062,
 76.80075073242188,
 79.172607421875,
 81.20252990722656,
 83.11767578125,
 83.87533569335938,
 96.41915893554688,
 107.638427734375,
 123.76954650878906,
 144.2232208251953,
 164.23439025878906,
 207.67727661132812,
 236.7650909423828,
 257.6512145996094,
 265.92779541015625,
 262.3405456542969,
 249.70069885253906,
 238.37843322753906,
 223.99285888671875,
 217.2252960205078,
 216.6966552734375,
 215.8753204345703,
 224.6254119873047,
 272.0615539550781,
 328.7159729003906,
 349.7405090332031,
 318.72589111328125,
 269.9266052246094,
 292.77642822265625,
 413.90325927734375,
 507.63494873046875,
 436.4362487792969,
 238.01544189453125,
 84.0609130859375,
 35.39421081542969,
 39.05577087402344,
 39.68025207519531,
 36.31230163574219,
 30.430892944335938,
 23.951492309570312,
 17.788742065429688,
 12.691696166992188,
 9.411941528320312,
 6.320358753204346,
 4.579056262969971,
 3.6509861946105957,
 2.0196385383605957]

In [16]:
# 存储预测结果
predictions = []

# 遍历所有需要预测的数据
for i, x in enumerate(pred_date_data):
    try:
        # 获取单天预测结果（返回 list）
        y_pred = get_single_day_output(x, x_scaler, y_scaler, device)
        y_pred = [max(0, val) for val in y_pred]
        for y in y_pred:
            predictions.append(y)
    except Exception as e:
        print(f"Error in prediction for index {i}: {e}")
        predictions.append([0.0] * max_len)  # 用全零填补


In [17]:
print((predictions[0]))

78.16989135742188


In [None]:
# 平均每段时间的预测值作为答案
submission_data = []
for seq_id, y in zip(df['序號'], predictions):
    # 获取对应时间段的平均值
    # seq_id = str(seq_id)
    # time_index = int(seq_id[8:10])  # 提取时间索引
    # pred_value = np.mean(y_pred[time_index])  # 平均值
    y_rounded = round(y, 2)
    submission_data.append([seq_id, y_rounded])

# 转为 DataFrame
submission_df = pd.DataFrame(submission_data, columns=['序號', '答案'])

# 保存为 CSV 文件
submission_df.to_csv('./upC-AA3r.csv', index=False)
print("提交文件已生成: ./submission.csv")


提交文件已生成: ./submission.csv


: 