In [1]:
import json, torch
import lightning  as L
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from torch.nn import functional as F
import csv
from collections import defaultdict
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
import numpy as np
import pickle
import pandas as pd
import pprint

  from .autonotebook import tqdm as notebook_tqdm


#### Load CWB data

In [2]:
from collections import defaultdict
import numpy as np
import csv
from scipy.interpolate import interp1d

# Zero-padded month directory names and the weather features stored per month.
months = ['01','02','03','04','05','06','07','08', '09', '10']
features = ['rain', 'raintime', 'solarpower', 'suntime', 'temp', 'uv']

# Raw hourly readings, nested four levels deep:
#   feature -> month (int) -> day (int) -> hour index (0-23) -> value.
# A cell that cannot be parsed as a float is stored as None.
cwb_data_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float))))


def parse_hourly_rows(rows):
    """
    Yield ``(day, hour_index, value)`` for every hourly cell in ``rows``.

    Args:
        rows: Iterable of CSV rows (lists of strings), e.g. a ``csv.reader``.

    Yields:
        Tuples of the day number (first column), the zero-based hour index of
        the cell (at most 24 cells per row are used) and the cell parsed as a
        float, or ``None`` when the cell is not a valid number.

    Rows that are empty or whose first column is not a plain number (headers,
    blank lines) are skipped.
    """
    for row in rows:
        # csv.reader yields [] for blank lines; indexing row[0] would raise IndexError.
        if not row or not row[0].isdigit():
            continue
        day = int(row[0])
        # Only the first 24 value columns are hourly readings.
        for hour_index, raw in enumerate(row[1:25]):
            try:
                value = float(raw)
            except (ValueError, TypeError):
                value = None  # mark as invalid; filled in before interpolation
            yield day, hour_index, value


# Read every <feature>-<month>.csv file; missing files are reported and skipped.
for month in months:
    for feature in features:
        csv_path = f'../cwbdata/{month}/{feature}-{month}.csv'
        try:
            # newline='' lets the csv module handle line endings itself.
            with open(csv_path, 'r', encoding='utf-8', newline='') as csv_file:
                for day, hour_index, value in parse_hourly_rows(csv.reader(csv_file)):
                    cwb_data_dict[feature][int(month)][day][hour_index] = value
        except FileNotFoundError:
            print(f"文件未找到: {csv_path}")

# Per-minute data produced by interpolation:
#   feature -> month -> day -> hour -> list of per-minute values.
cwb_data_interp_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list))))

def interpolate_minute_data(hourly_values):
    """
    Expand one day of 24 hourly readings into 1440 per-minute values.

    Each hourly reading is anchored at the middle of its hour (minute 30) and a
    cubic interpolant is fitted through those anchors. Minutes before 00:30 and
    after 23:30 are extrapolated. Negative results are clipped to zero.

    If the interpolation raises for any reason, the error is printed and a list
    of 1440 zeros is returned instead.

    Returns:
        A plain list of 1440 floats, one per minute of the day.
    """
    minute_axis = np.arange(1440)
    # Anchor positions: minute 30 of every hour (30, 90, ..., 1410).
    anchor_minutes = 60 * np.arange(24) + 30
    anchor_values = np.array(hourly_values)

    try:
        curve = interp1d(anchor_minutes, anchor_values, kind='cubic', fill_value="extrapolate")
        per_minute = curve(minute_axis)
    except Exception as err:
        # Best-effort: report the failure and fall back to an all-zero day.
        print(f"插值失败: {err}")
        per_minute = np.zeros(1440)

    # The cubic fit can dip below zero between anchors; floor the result at 0.
    return np.clip(per_minute, 0, None).tolist()

# Fill gaps in every (feature, month, day) and expand it to per-minute data.
for feature in features:
    for month in range(1, 11):  # months '01' to '10' correspond to 1 to 10
        for day in cwb_data_dict[feature][month]:
            # Collect the 24 hourly values; hours absent from the dict come back as None.
            hourly_values = [cwb_data_dict[feature][month][day].get(hour, None) for hour in range(24)]
            
            # Fill None entries in place. Because earlier hours are filled before
            # later ones are visited, the backward search below sees already-filled values.
            for hour in range(24):
                if hourly_values[hour] is None:
                    # Nearest non-None value before this hour.
                    prev = None
                    for h in range(hour - 1, -1, -1):
                        if hourly_values[h] is not None:
                            prev = hourly_values[h]
                            break
                    # Nearest non-None value after this hour.
                    next_val = None
                    for h in range(hour + 1, 24):
                        if hourly_values[h] is not None:
                            next_val = hourly_values[h]
                            break
                    # Average both neighbours when available, otherwise copy the one that exists.
                    if prev is not None and next_val is not None:
                        hourly_values[hour] = (prev + next_val) / 2
                    elif prev is not None:
                        hourly_values[hour] = prev
                    elif next_val is not None:
                        hourly_values[hour] = next_val
                    else:
                        hourly_values[hour] = 0.0  # no valid value anywhere in the day: use 0.0
            
            # Expand the 24 hourly values to per-minute values.
            interpolated_minutes = interpolate_minute_data(hourly_values)
            
            # Split the per-minute list into 24 chunks of 60 values, one per hour.
            for hour in range(24):
                start_min = hour * 60
                end_min = start_min + 60
                cwb_data_interp_dict[feature][month][day][hour] = interpolated_minutes[start_min:end_min]
                # Disabled alternative: repeat the raw hourly value 60 times instead of interpolating.
                # cwb_data_interp_dict[feature][month][day][hour] = [cwb_data_dict[feature][month][day][hour]] * 60

# Safety pass: replace any None left in the per-minute data with 0.0.
for feature in features:
    for month in range(1, 11):
        for day in cwb_data_interp_dict[feature][month]:
            for hour in range(24):
                minute_data = cwb_data_interp_dict[feature][month][day][hour]
                if any([x is None for x in minute_data]):
                    # Not expected to trigger, since gaps were filled before interpolation.
                    cwb_data_interp_dict[feature][month][day][hour] = [0.0 if x is None else x for x in minute_data]

# Print one sample: 'suntime' for month 5, day 15, hour 11.
print(cwb_data_dict['suntime'][5][15][11])  # the raw hourly value (a single number)
print(cwb_data_interp_dict['suntime'][5][15][11])  # the per-minute values stored for that hour


1.0
[0.9947946687553407, 0.9947747071946897, 0.994766484862803, 0.994770173585678, 0.9947859451893126, 0.9948139714997057, 0.9948544243428545, 0.9949074755447569, 0.9949732969314115, 0.995052060328816, 0.9951439375629682, 0.9952491004598667, 0.9953677208455086, 0.995499970545893, 0.9956460213870167, 0.9958060451948788, 0.9959802137954769, 0.9961686990148085, 0.9963716726788725, 0.9965893066136661, 0.9968217726451879, 0.9970692425994352, 0.9973318883024068, 0.9976098815800999, 0.9979033942585134, 0.9982125981636445, 0.9985376651214918, 0.9988787669580527, 0.999236075499326, 0.9996097625713087, 1.0, 1.0004062179453885, 1.0008248799034332, 1.0012517077040823, 1.0016824231772867, 1.0021127481529957, 1.0025384044611587, 1.0029551139317252, 1.0033585983946454, 1.0037445796798683, 1.004108779617344, 1.0044469200370216, 1.0047547227688514, 1.0050279096427823, 1.0052622024887647, 1.0054533231367475, 1.0055969934166806, 1.0056889351585143, 1.0057248701921973, 1.0057005203476799, 1.00561160745491

In [3]:
class Cwb2LocModel(L.LightningModule):
    """
    Feed-forward regressor: two ReLU hidden layers followed by a linear output.

    Layer widths are ``input_dim -> hidden_dim -> hidden_dim // 2 -> output_dim``.
    Training and validation both use the mean absolute error (L1 loss).

    Args:
        input_dim: Width of the input feature vector.
        hidden_dim: Width of the first hidden layer; the second uses half of it.
        output_dim: Width of the prediction.
        learning_rate: Initial learning rate handed to AdamW.
    """

    def __init__(self, input_dim=47, hidden_dim=128, output_dim=1, learning_rate=1e-3):
        super().__init__()
        # Keeps the constructor arguments on self.hparams; configure_optimizers
        # reads learning_rate back from there.
        self.save_hyperparameters()

        half_hidden = hidden_dim // 2

        # Fully connected layers. The attribute names end up in the parameter
        # names of the state dict, so they are kept as fc1/fc2/fc3.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, half_hidden)
        self.fc3 = nn.Linear(half_hidden, output_dim)

        # Shared activation for both hidden layers.
        self.relu = nn.ReLU()

        # Mean absolute error; could be swapped for MSELoss or another loss.
        self.criterion = nn.L1Loss()

    def forward(self, x):
        """Run ``x`` through both hidden layers and the linear output layer."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def _batch_loss(self, batch):
        """Return the L1 loss between the prediction and target of an ``(x, y)`` batch."""
        inputs, targets = batch
        return self.criterion(self.forward(inputs), targets)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss (per step and per epoch)."""
        loss = self._batch_loss(batch)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss (per epoch only)."""
        loss = self._batch_loss(batch)
        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        """Return AdamW plus a ReduceLROnPlateau scheduler driven by ``val_loss``."""
        optimizer = optim.AdamW(
            self.parameters(),
            lr=self.hparams.learning_rate,
            weight_decay=1e-2,
        )

        # Halve the learning rate when the monitored loss stops improving.
        # NOTE(review): `verbose` is deprecated in recent PyTorch releases —
        # confirm it is still accepted by the installed version.
        plateau_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=0.5,
            patience=3,
            threshold=1e-4,
            cooldown=3,
            min_lr=1e-6,
            verbose=True,
        )

        scheduler_config = {
            "scheduler": plateau_scheduler,
            "monitor": "val_loss",
            "frequency": 1,
        }
        return {"optimizer": optimizer, "lr_scheduler": scheduler_config}


In [4]:
# Restore the trained regressor from its checkpoint for inference.
model_path = './saved_models_v4/best-checkpoint-A.ckpt'
# Choose the device first so the checkpoint tensors are mapped straight onto it;
# this keeps loading independent of the device the checkpoint was saved from.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Cwb2LocModel.load_from_checkpoint(model_path, map_location=device)
model = model.to(device)  # move the model to the GPU when one is available
model.eval()  # switch to evaluation mode; the returned module is the cell's output

Cwb2LocModel(
  (fc1): Linear(in_features=47, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (criterion): L1Loss()
)

In [5]:
# Table of serial numbers ('序號') that need a prediction.
df = pd.read_csv('./up.csv')

# Distinct (month, day, location) triples in first-seen order. Each triple is
# cut from the serial's text: characters [4:6], [6:8] and the last two.
serial_texts = (str(raw_serial) for raw_serial in df['序號'])
pred_date_list = list(dict.fromkeys(
    (int(text[4:6]), int(text[6:8]), int(text[-2:]))
    for text in serial_texts
))


In [6]:

def get_time_range(month, day, hour, minute):
    """
    Return the five timestamps from two minutes before to two minutes after.

    Args:
        month, day, hour: Passed through; ``hour`` is shifted when the minute
            offset crosses an hour boundary.
        minute: Minute of the centre timestamp. Values outside 0-59 are
            accepted and normalised into the hour.

    Returns:
        A list of five ``[month, day, hour, minute]`` lists with every minute
        in 0-59. The hour is neither range-checked nor wrapped across days, so
        it can fall outside 0-23 near midnight.
    """
    window = []
    for delta in range(-2, 3):
        # divmod floors, so a negative total borrows from the hour and any
        # overflow (even more than one hour) carries into it.
        hour_shift, new_minute = divmod(minute + delta, 60)
        window.append([month, day, hour + hour_shift, new_minute])
    return window

# Example: a window starting at 09:59 crosses into the next hour.
example = get_time_range(1, 2, 9, 59)
print(example)


[[1, 2, 9, 57], [1, 2, 9, 58], [1, 2, 9, 59], [1, 2, 10, 0], [1, 2, 10, 1]]


In [7]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pickle

def _load_pickle(path):
    """Deserialize and return the object pickled at ``path``."""
    # NOTE(review): unpickling can execute arbitrary code — only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Previously saved one-hot encoder for the location code.
loc_encoder = _load_pickle('./scalar_v4/loc_encoder.pkl')

# Sanity check: show which categories the encoder knows.
print(f"One-Hot Encoded Categories: {loc_encoder.categories_}")

# Previously saved scaler applied to the continuous weather features.
x_scaler = _load_pickle('./scalar_v4/x_scaler.pkl')


def extract_features(cwb_data_interp_dict, features, location, month, day, hour, minute):
    """
    Look up one per-minute value per feature for a single timestamp.

    Args:
        cwb_data_interp_dict: Nested mapping indexed as
            ``[feature][month][day][hour][minute]``.
        features: Feature names to read, in output order.
        location: Only used in the error message.
        month, day, hour, minute: Timestamp to read.

    Returns:
        A list with exactly ``len(features)`` values. If any lookup fails the
        error is printed and a list of zeros of that same length is returned,
        so callers always receive a fixed-width vector.
    """
    try:
        newx = [cwb_data_interp_dict[feature][month][day][hour][minute] for feature in features]
    except Exception as e:
        print(f"Error at {location}-{month}-{day} {hour}:{minute}: {e}")
        # Must match the success path's width: location is no longer appended,
        # so the fallback is len(features) zeros, not len(features) + 1.
        newx = [0.0] * len(features)
    return newx
def collate_x_data(month, day, hour, minute, location, features):
    """
    Build the model input vector for one location at one timestamp.

    The weather features of every timestamp returned by ``get_time_range``
    (the requested minute and the two minutes either side) are laid end to
    end, standardised with ``x_scaler``, and followed by the one-hot encoding
    of ``location`` produced by ``loc_encoder``.

    Returns:
        A 1-D NumPy array: scaled weather values first, one-hot location last.
    """
    # Encode the location up front, before any weather lookup happens.
    location_vector = loc_encoder.transform([[location]]).flatten()

    # Flatten the features of all timestamps in the window into one list.
    weather_values = [
        value
        for t_month, t_day, t_hour, t_minute in get_time_range(month, day, hour, minute)
        for value in extract_features(cwb_data_interp_dict, features, location, t_month, t_day, t_hour, t_minute)
    ]

    # The scaler expects a 2-D (1, n_values) row.
    weather_row = np.array(weather_values).reshape(1, -1)
    scaled_weather = x_scaler.transform(weather_row).flatten()

    return np.concatenate([scaled_weather, location_vector])




# Spot check: input vector for month 1, day 2, 07:50 at location 1.
(collate_x_data(1,2,7,50, 1, features))

One-Hot Encoded Categories: [array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])]


array([-0.11767022, -0.36995235, -0.48790407,  0.80765128, -1.89731916,
       -0.8206524 , -0.11778042, -0.37022636, -0.4762824 ,  0.83127131,
       -1.88887866, -0.81733107, -0.1178918 , -0.37049831, -0.46457977,
        0.85471435, -1.8803706 , -0.81391606, -0.11800435, -0.37076801,
       -0.45279961,  0.87797162, -1.87179808, -0.81040746, -0.11811817,
       -0.37103554, -0.44094536,  0.90103431, -1.86316418, -0.8068055 ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ])

In [8]:
# Scaler whose inverse_transform is applied to the model's predictions.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
with open('./scalar_v4/y_scaler.pkl', 'rb') as f:
    y_scaler = pickle.load(f)
def get_single_x_output(month, day, hour, minute, loc, features):
    """
    Predict the target for one location at one timestamp.

    The input vector comes from ``collate_x_data``, is run through the global
    ``model`` without gradient tracking, and the output is passed through
    ``y_scaler.inverse_transform``.

    Returns:
        The first element of the inverse-transformed prediction, as a Python float.
    """
    feature_row = collate_x_data(month, day, hour, minute, loc, features)

    # Add a leading batch axis -> [1, feature_dim], then move to the model's device.
    batch = torch.tensor(feature_row, dtype=torch.float32)[None, :].to(device)

    # Make sure the model is in evaluation mode before predicting.
    model.eval()
    with torch.no_grad():
        scaled_pred = model(batch).cpu().numpy()

    # Undo the target scaling; the scaler expects a column vector.
    restored = y_scaler.inverse_transform(scaled_pred.reshape(-1, 1))
    return restored.ravel().tolist()[0]
# Spot check: one prediction for month 5, day 15, 13:00 at location 4.
get_single_x_output(5,15,13,00,4,features)

616.8399047851562

In [9]:
# 平均每段时间的预测值作为答案
from tqdm import tqdm
# One [serial, answer] row per requested slot, plus the summed magnitude of
# negative averages that were clamped to zero.
submission_data = []
error = 0
output_path = './upD-A4.csv'
for seq_id  in tqdm(df['序號']):
    seq_id = str(seq_id)
    # Serial layout by character position: [4:6] month, [6:8] day,
    # [8:10] hour, [10:12] minute, last two characters = location code.
    month, day, hour, minute, loc = int(seq_id[4:6]), int(seq_id[6:8]), int(seq_id[8:10]), int(seq_id[10:12]), int(seq_id[-2:])
    # Predict each of the 10 consecutive minutes starting at `minute`; the
    # offsets may run past 59 and rely on the time-window helper to carry
    # the overflow into the hour.
    y_list = [
        get_single_x_output(month, day, hour, i, loc, features)
        for i in range(minute, minute + 10)
    ]
    # The answer for the slot is the mean of its per-minute predictions.
    y = sum(y_list)/len(y_list)
    if y < 0:
        # Negative averages are clamped to 0; track how much was clamped.
        error -= y
        y = 0.0
    submission_data.append([int(seq_id), y])

# Convert to a DataFrame.
submission_df = pd.DataFrame(submission_data, columns=['序號', '答案'])

# Save as CSV and report the path that was actually written.
submission_df.to_csv(output_path, index=False)
print(f"提交文件已生成: {output_path}")
print(error)

100%|██████████| 9600/9600 [01:06<00:00, 144.88it/s]

提交文件已生成: ./submission.csv
6207.800587481989



