In [6]:
# 📦 导入必要的库
import pandas as pd

# 📁 加载 NONZERO_SUBSET 数据集
df = pd.read_csv("NONZERO_SUBSET.csv", parse_dates=['dateTimeStmp'])

# 🧭 Step 1: 按时间排序（非常关键，确保shift是对齐的）
df = df.sort_values('dateTimeStmp').reset_index(drop=True)
df

Unnamed: 0,dateTimeStmp,raw_MM_Wh,raw_Tenant_Wh,Exact_Date
0,2021-01-01 00:00:00,9516.17,6829.72,Jan 1st 2021
1,2021-01-01 00:15:00,8710.08,6124.53,Jan 1st 2021
2,2021-01-01 00:30:00,8123.25,6243.97,Jan 1st 2021
3,2021-01-01 00:45:00,8097.63,6234.43,Jan 1st 2021
4,2021-01-01 01:00:00,9140.92,6313.70,Jan 1st 2021
...,...,...,...,...
117599,2024-12-31 22:45:00,9144.98,6309.53,Dec 31st 2024
117600,2024-12-31 23:00:00,8488.17,5948.43,Dec 31st 2024
117601,2024-12-31 23:15:00,7900.97,5894.85,Dec 31st 2024
117602,2024-12-31 23:30:00,8359.67,6054.55,Dec 31st 2024


In [7]:
# 🎯 提取 2023 和 2024 年的数据，作为 TRAINING_DATA
df['year'] = df['dateTimeStmp'].dt.year
TRAINING_DATA = df[df['year'].isin([2023, 2024])].reset_index(drop=True)
TRAINING_DATA

Unnamed: 0,dateTimeStmp,raw_MM_Wh,raw_Tenant_Wh,Exact_Date,year
0,2023-01-01 00:00:00,7993.92,5970.00,Jan 1st 2023,2023
1,2023-01-01 00:15:00,7760.30,5674.25,Jan 1st 2023,2023
2,2023-01-01 00:30:00,7924.57,5721.50,Jan 1st 2023,2023
3,2023-01-01 00:45:00,8126.38,5802.72,Jan 1st 2023,2023
4,2023-01-01 01:00:00,7492.17,5129.45,Jan 1st 2023,2023
...,...,...,...,...,...
69115,2024-12-31 22:45:00,9144.98,6309.53,Dec 31st 2024,2024
69116,2024-12-31 23:00:00,8488.17,5948.43,Dec 31st 2024,2024
69117,2024-12-31 23:15:00,7900.97,5894.85,Dec 31st 2024,2024
69118,2024-12-31 23:30:00,8359.67,6054.55,Dec 31st 2024,2024


In [9]:
# 🔁 添加 D-3 到 D+3 的回归器特征
for d in [-3, -2, -1, 1, 2, 3]:
    shift_steps = d * 96
    TRAINING_DATA[f'D{d:+}'] = TRAINING_DATA['raw_MM_Wh'].shift(-shift_steps)

Unnamed: 0,dateTimeStmp,raw_MM_Wh,raw_Tenant_Wh,Exact_Date,year,D-3,D-2,D-1,D+1,D+2,D+3
0,2023-01-01 00:00:00,7993.92,5970.00,Jan 1st 2023,2023,,,,7755.88,7502.40,8276.55
1,2023-01-01 00:15:00,7760.30,5674.25,Jan 1st 2023,2023,,,,7824.42,7791.07,8278.63
2,2023-01-01 00:30:00,7924.57,5721.50,Jan 1st 2023,2023,,,,7137.95,7193.20,7957.79
3,2023-01-01 00:45:00,8126.38,5802.72,Jan 1st 2023,2023,,,,6942.70,7058.55,7464.58
4,2023-01-01 01:00:00,7492.17,5129.45,Jan 1st 2023,2023,,,,6575.33,6818.25,6754.35
...,...,...,...,...,...,...,...,...,...,...,...
69115,2024-12-31 22:45:00,9144.98,6309.53,Dec 31st 2024,2024,8101.37,7564.48,8433.07,,,
69116,2024-12-31 23:00:00,8488.17,5948.43,Dec 31st 2024,2024,7887.40,7694.98,7819.68,,,
69117,2024-12-31 23:15:00,7900.97,5894.85,Dec 31st 2024,2024,7880.20,7867.38,7166.87,,,
69118,2024-12-31 23:30:00,8359.67,6054.55,Dec 31st 2024,2024,9079.47,7427.48,7533.17,,,


In [10]:
# 🧹 去除包含 NaN 的行
regressor_columns = ['D-3', 'D-2', 'D-1', 'D+1', 'D+2', 'D+3']
TRAINING_DATA = TRAINING_DATA.dropna(subset=regressor_columns).reset_index(drop=True)
TRAINING_DATA

Unnamed: 0,dateTimeStmp,raw_MM_Wh,raw_Tenant_Wh,Exact_Date,year,D-3,D-2,D-1,D+1,D+2,D+3
0,2023-01-04 00:00:00,8276.55,6451.57,Jan 4th 2023,2023,7993.92,7755.88,7502.40,8829.72,9188.75,8592.70
1,2023-01-04 00:15:00,8278.63,6350.17,Jan 4th 2023,2023,7760.30,7824.42,7791.07,7415.50,8011.12,9013.97
2,2023-01-04 00:30:00,7957.79,5802.89,Jan 4th 2023,2023,7924.57,7137.95,7193.20,7401.62,6990.58,8084.63
3,2023-01-04 00:45:00,7464.58,5547.17,Jan 4th 2023,2023,8126.38,6942.70,7058.55,6854.28,7431.23,7588.02
4,2023-01-04 01:00:00,6754.35,5004.47,Jan 4th 2023,2023,7492.17,6575.33,6818.25,6289.82,6578.68,7448.90
...,...,...,...,...,...,...,...,...,...,...,...
68539,2024-12-28 22:45:00,8101.37,5868.28,Dec 28th 2024,2024,9735.42,9165.87,8632.47,7564.48,8433.07,9144.98
68540,2024-12-28 23:00:00,7887.40,5818.33,Dec 28th 2024,2024,8990.82,8880.90,8936.65,7694.98,7819.68,8488.17
68541,2024-12-28 23:15:00,7880.20,5750.53,Dec 28th 2024,2024,8176.53,8001.20,8317.73,7867.38,7166.87,7900.97
68542,2024-12-28 23:30:00,9079.47,5771.53,Dec 28th 2024,2024,8320.17,7232.02,7619.47,7427.48,7533.17,8359.67


In [11]:
# 💾 Step 4: 可选，保存结果到新 CSV
TRAINING_DATA.to_csv("TRAINING_DATA.csv", index=False)