# 資料前處理

In [3]:
import os
import pandas as pd
from google.colab import drive

# Mount Google Drive so the raw data folders below are reachable.
drive.mount('/content/drive')

base_path = '/content/drive/MyDrive/data'
add_path = '/content/drive/MyDrive/addit'

# Every raw CSV is read without a header row and given this column layout.
RAW_COLUMNS = [
    'Time', 'WindSpeed(m/s)', 'Pressure(hpa)', 'Temperature(°C)',
    'Humidity(%)', 'Sunlight(Lux)', 'Target',
]


def _read_raw_csv(csv_path):
    """Read one header-less raw CSV and label its columns with RAW_COLUMNS."""
    frame = pd.read_csv(csv_path, header=None, encoding='utf-8')
    frame.columns = RAW_COLUMNS
    return frame


def _split_time_key(frame):
    """Add 'ds' and 'Location' columns derived from the 'Time' key.

    'Time' is converted to a string; its last two characters become
    'Location' and the remaining prefix is parsed as a ``%Y%m%d%H%M``
    timestamp into 'ds'.  The frame is modified in place and returned.
    """
    frame['Time'] = frame['Time'].astype(str)
    frame['ds'] = pd.to_datetime(frame['Time'].str[:-2], format='%Y%m%d%H%M')
    frame['Location'] = frame['Time'].str[-2:]
    return frame


# Collect every piece in a list and concatenate once at the end: growing a
# DataFrame with pd.concat inside a loop re-copies all accumulated rows on
# each iteration, and starting from an empty DataFrame triggers pandas'
# deprecation warning about empty entries in concat.
collected_frames = []

# Merge the datasets of the 17 locations (folders l01 .. l17).
for i in range(1, 18):
    folder_name = f'l{i:02d}'
    folder_path = os.path.join(base_path, folder_name)

    train_file = os.path.join(folder_path, f'train_{i:02d}.csv')
    incomplete_file = os.path.join(folder_path, f'incom_{i:02d}.csv')
    test_file = os.path.join(folder_path, f'test_{i:02d}.csv')

    merged_data = pd.concat(
        [_read_raw_csv(path) for path in (train_file, incomplete_file, test_file)],
        ignore_index=True,
    )

    # Separate the timestamp from the location code.
    collected_frames.append(_split_time_key(merged_data))

# Additional data files 1.csv .. 14.csv.
for n in range(1, 15):
    add_file = os.path.join(add_path, f'{n:d}.csv')
    collected_frames.append(_split_time_key(_read_raw_csv(add_file)))

full_data = pd.concat(collected_frames, ignore_index=True)

# Sort chronologically.
full_data = full_data.sort_values(by='ds').reset_index(drop=True)

# Break the 'ds' timestamp into individual calendar/time features.
full_data['month'] = full_data['ds'].dt.month
full_data['weekday'] = full_data['ds'].dt.weekday
full_data['day'] = full_data['ds'].dt.day
full_data['hour'] = full_data['ds'].dt.hour
full_data['minute'] = full_data['ds'].dt.minute

# 新增季節欄位，根據月份來分配季節
def assign_season(month):
    """Map a month number to the numeric season code used as a feature.

    Codes: 1 = winter (Dec, Jan, Feb), 3 = spring (Mar-May),
    4 = summer (Jun-Aug), 2 = autumn (every other value).
    """
    season_table = (
        ((12, 1, 2), 1),  # winter
        ((3, 4, 5), 3),   # spring
        ((6, 7, 8), 4),   # summer
    )
    for months, code in season_table:
        if month in months:
            return code
    # Anything not matched above is treated as autumn.
    return 2

# Derive the season code of every row from its month.
full_data['season'] = full_data['month'].map(assign_season)

# The models need numeric inputs: express the timestamp as seconds elapsed
# since 1970-01-01 and turn the location code into an integer.
epoch_start = pd.Timestamp('1970-01-01')
elapsed = full_data['ds'] - epoch_start
full_data['ds_numeric'] = elapsed.dt.total_seconds()
full_data['Location'] = full_data['Location'].astype(int)

# Report how many rows the merged table holds.
print(f"資料合併完成，共 {len(full_data)} 筆資料。")

# Persist the merged table for the training cells.
output_file = '/content/full_train_data.csv'
full_data.to_csv(output_file, index=False)
print(f"合併後的資料已儲存至 {output_file}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
資料合併完成，共 103878 筆資料。
合併後的資料已儲存至 /content/full_train_data.csv


#最終測試集

In [18]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import os
import pickle
import numpy as np

# 讀取訓練資料
# Load the merged training table written by the preprocessing cell.
train_data = pd.read_csv('/content/full_train_data.csv')

# Column groups: time-derived inputs, and the environmental readings
# (plus the location id) that are themselves predicted for the test set.
time_columns = ['month', 'hour', 'season', 'weekday', 'day', 'minute', 'ds_numeric']
env_columns = ['Temperature(°C)', 'Humidity(%)', 'Sunlight(Lux)', 'Location']

# Full input set used by the second-stage (refinement) models.
features_2 = [*env_columns, *time_columns]

# 用時間跟地點預測三個特徵：溫度、濕度和光照
def _fit_and_report(X, y):
    """Fit an XGBoost regressor on (X, y) and print its validation error.

    5% of the rows are held out with a fixed seed.  The hold-out split is
    passed as ``eval_set`` so training stops once it has not improved for
    50 rounds, and the sum of absolute errors on the same split is printed.
    Because that split also drives early stopping, the printed figure is
    an optimistic estimate rather than an independent test score.

    Returns the fitted model.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.05, random_state=42)

    # XGBoost
    model = XGBRegressor(
        n_estimators=5000,
        learning_rate=0.05,
        max_depth=10,
        random_state=42,
        early_stopping_rounds=50
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    y_pred_val = model.predict(X_val)
    diff_sum = np.sum(np.abs(y_val - y_pred_val))
    print(f"Valid絕對誤差總和: {diff_sum}")

    return model


def f1(feature_name):
    """Train a first-stage model predicting ``feature_name`` from time and location.

    Inputs are ``time_columns`` plus 'Location' taken from the module-level
    ``train_data``; the target is the ``feature_name`` column.
    Returns the fitted XGBRegressor.
    """
    print(f"正在訓練 {feature_name} 預測模型...")

    features = time_columns + ['Location']
    X = train_data[features]
    y = train_data[feature_name]

    return _fit_and_report(X, y)


def f2(feature_name):
    """Train a second-stage model predicting ``feature_name`` from all other features.

    Inputs are every column in ``features_2`` except ``feature_name``
    itself, taken from the module-level ``train_data``.
    Returns the fitted XGBRegressor.
    """
    print(f"正在訓練 {feature_name} 預測模型...")

    features = features_2
    # Drop the target's own column so the model cannot see its answer.
    X = train_data[features].drop(columns=[feature_name])
    y = train_data[feature_name]

    return _fit_and_report(X, y)

# 保存模型
def save_model(model, model_name):
    """Pickle ``model`` to ``<model_name>.pkl`` and print a confirmation line."""
    destination = f"{model_name}.pkl"
    with open(destination, "wb") as handle:
        pickle.dump(model, handle)
    print(f"{model_name} 已保存到 {model_name}.pkl")

# 載入模型
def load_model(model_name):
    """Return the object unpickled from ``<model_name>.pkl``."""
    # NOTE: unpickling can execute arbitrary code; only load files that
    # this notebook (or another trusted source) produced.
    with open(f"{model_name}.pkl", "rb") as handle:
        restored = pickle.load(handle)
    return restored

# The three environmental readings that get their own predictors.
_env_targets = ('Temperature(°C)', 'Humidity(%)', 'Sunlight(Lux)')

# Stage 1: predict each reading from time + location only.
temperature_model, humidity_model, sunlight_model = [
    f1(target_name) for target_name in _env_targets
]

# Stage 2: predict each reading from all the other features.
temperature_model_v2, humidity_model_v2, sunlight_model_v2 = [
    f2(target_name) for target_name in _env_targets
]

# Save the models.
_models_by_name = {
    "temperature_model": temperature_model,
    "humidity_model": humidity_model,
    "sunlight_model": sunlight_model,
    "temperature_model_v2": temperature_model_v2,
    "humidity_model_v2": humidity_model_v2,
    "sunlight_model_v2": sunlight_model_v2,
}
for _file_stem, _fitted in _models_by_name.items():
    save_model(_fitted, _file_stem)

# Load the models back from disk.
temperature_model = load_model("temperature_model")
humidity_model = load_model("humidity_model")
sunlight_model = load_model("sunlight_model")
temperature_model_v2 = load_model("temperature_model_v2")
humidity_model_v2 = load_model("humidity_model_v2")
sunlight_model_v2 = load_model("sunlight_model_v2")

# 加上預測後的特徵數值
# data 資料 model 模型2 target w1 原特徵權重 w2 再預測模型2權重
def update(data, model, target, w1, w2):
    """Blend the current ``target`` column with a fresh second-stage prediction.

    Args:
        data: frame holding every column listed in ``features_2``.
        model: second-stage model; it is fed ``features_2`` minus ``target``.
        target: name of the column to refine.
        w1: weight applied to the current column values.
        w2: weight applied to the model's prediction.

    The blended values are floored at 0, written back into ``data[target]``
    (the frame is modified in place), and the updated column is returned.
    """
    model_inputs = data[features_2].drop(columns=[target])
    blended = data[target] * w1 + w2 * model.predict(model_inputs)
    data[target] = blended.clip(lower=0)

    return data[target]

def pred_test(test_data, temperature_model, humidity_model, sunlight_model):
    """Fill in temperature, humidity and sunlight for the test rows.

    Each first-stage model is given ``time_columns`` plus 'Location' and its
    prediction is stored in the matching column of ``test_data``.  The frame
    is modified in place and also returned.
    """
    predictor_columns = time_columns + ['Location']
    first_stage = (
        ('Temperature(°C)', temperature_model),
        ('Humidity(%)', humidity_model),
        ('Sunlight(Lux)', sunlight_model),
    )
    for column, estimator in first_stage:
        test_data[column] = estimator.predict(test_data[predictor_columns])

    return test_data


test_file = os.path.join(os.getcwd(), 'testset.csv')
test_data = pd.read_csv(test_file, header=None, encoding='utf-8')
test_data.columns = ['Time', 'Answer']

# Preprocess the test rows the same way as the training table:
# split the 'Time' key into a timestamp prefix and a two-character location.
test_data['Time'] = test_data['Time'].astype(str)
time_key = test_data['Time']
test_data['ds'] = pd.to_datetime(time_key.str[:-2], format='%Y%m%d%H%M')
test_data['Location'] = time_key.str[-2:]

# Calendar/time features taken from the timestamp.
for part in ('month', 'weekday', 'day', 'hour', 'minute'):
    test_data[part] = getattr(test_data['ds'].dt, part)
test_data['season'] = test_data['month'].map(assign_season)

# Numeric forms required by the models.
test_data['ds_numeric'] = (test_data['ds'] - pd.Timestamp('1970-01-01')).dt.total_seconds()
test_data['Location'] = test_data['Location'].astype(int)

# First pass: predict the environmental readings from time + location.
test_data = pred_test(test_data, temperature_model, humidity_model, sunlight_model)

# (w1, w2) and (y1, y2) are (current value weight, re-prediction weight) pairs.
w1, w2 = 0.9, 0.1
y1, y2 = 0.95, 0.05

# Repeat the re-prediction and blend with weights to reduce the error.
main_rounds = (
    ('Temperature(°C)', temperature_model_v2),
    ('Humidity(%)', humidity_model_v2),
    ('Sunlight(Lux)', sunlight_model_v2),
)
for _ in range(2):
    for column, refiner in main_rounds:
        test_data[column] = update(test_data, refiner, column, w1, w2)

# Extra passes with the gentler (y1, y2) weights: humidity, sunlight,
# then humidity three more times.
extra_rounds = [
    ('Humidity(%)', humidity_model_v2),
    ('Sunlight(Lux)', sunlight_model_v2),
] + [('Humidity(%)', humidity_model_v2)] * 3
for column, refiner in extra_rounds:
    test_data[column] = update(test_data, refiner, column, y1, y2)

# Save the enriched test table.
test_data.to_csv('xgbBoost_test_final.csv', index=False)
print("預測結果已儲存在 xgbBoost_test_final.csv 中")



正在訓練 Temperature(°C) 預測模型...
Valid絕對誤差總和: 2947.030661849976
正在訓練 Humidity(%) 預測模型...
Valid絕對誤差總和: 8401.283160438536
正在訓練 Sunlight(Lux) 預測模型...
Valid絕對誤差總和: 16921132.96212837
正在訓練 Temperature(°C) 預測模型...
Valid絕對誤差總和: 1813.66791015625
正在訓練 Humidity(%) 預測模型...
Valid絕對誤差總和: 5786.562985801696
正在訓練 Sunlight(Lux) 預測模型...
Valid絕對誤差總和: 19191885.902021028
temperature_model 已保存到 temperature_model.pkl
humidity_model 已保存到 humidity_model.pkl
sunlight_model 已保存到 sunlight_model.pkl
temperature_model_v2 已保存到 temperature_model_v2.pkl
humidity_model_v2 已保存到 humidity_model_v2.pkl
sunlight_model_v2 已保存到 sunlight_model_v2.pkl
預測結果已儲存在 xgbBoost_test_final.csv 中


#最終模型

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
import os
import numpy as np

# 讀取訓練資料
# Load the merged training table.
train_data = pd.read_csv('/content/full_train_data.csv')

# Environmental and time feature groups.
env_columns = ['Temperature(°C)', 'Sunlight(Lux)', 'Humidity(%)', 'Location']
time_columns = ['month', 'hour', 'weekday', 'minute', 'ds_numeric']

# Features finally adopted for the power-output model.
features = [*env_columns, *time_columns]

# Split into training and validation sets (5% of the rows held out).
X = train_data[features]
y = train_data['Target']
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.05, random_state=42
)

# XGBoost: the validation split is monitored with MAE and training stops
# after 50 rounds without improvement.
xgb_settings = {
    'subsample': 0.7,
    'n_estimators': 5000,
    'max_depth': 10,
    'learning_rate': 0.05,
    'random_state': 42,
    'early_stopping_rounds': 50,
    'eval_metric': "mae",
}
xgb_model = XGBRegressor(**xgb_settings)
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

save_model(xgb_model, "xgb_model")

# To reuse a previously saved model instead of retraining:
# xgb_model = load_model("xgb_model")

# Sum of absolute errors on the validation set.
y_pred_val = xgb_model.predict(X_val)
abs_errors = np.abs(y_val - y_pred_val)
diff_sum = np.sum(abs_errors)
print(f"Valid絕對誤差總和: {diff_sum}")

# Test table enriched with the predicted environmental readings.
test_file = os.path.join(os.getcwd(), 'xgbBoost_test_final.csv')
test_data = pd.read_csv(test_file, encoding='utf-8')

# Predict the power output for the test rows.
test_data['Answer'] = xgb_model.predict(test_data[features])

# Build the result file: integer time key plus predictions floored at 0.
output = pd.DataFrame({
    'Time': test_data['Time'].astype(int),
    'Pred': test_data['Answer'].clip(lower=0),
})

output.to_csv('XGBoost_Predictions_xgb.csv', index=False)
print("XGBoost 預測結果已儲存至 XGBoost_Predictions_xgb.csv")


[0]	validation_0-mae:371.90576
[1]	validation_0-mae:353.86666
[2]	validation_0-mae:336.77579
[3]	validation_0-mae:320.62305
[4]	validation_0-mae:305.31607
[5]	validation_0-mae:290.72429
[6]	validation_0-mae:276.96235
[7]	validation_0-mae:263.85159
[8]	validation_0-mae:251.42228
[9]	validation_0-mae:239.63793
[10]	validation_0-mae:228.47224
[11]	validation_0-mae:217.86294
[12]	validation_0-mae:207.81369
[13]	validation_0-mae:198.24376
[14]	validation_0-mae:189.20933
[15]	validation_0-mae:180.56955
[16]	validation_0-mae:172.38909
[17]	validation_0-mae:164.62825
[18]	validation_0-mae:157.32419
[19]	validation_0-mae:150.40468
[20]	validation_0-mae:143.84394
[21]	validation_0-mae:137.55461
[22]	validation_0-mae:131.62177
[23]	validation_0-mae:126.01791
[24]	validation_0-mae:120.68993
[25]	validation_0-mae:115.57460
[26]	validation_0-mae:110.78743
[27]	validation_0-mae:106.28017
[28]	validation_0-mae:101.91653
[29]	validation_0-mae:97.86038
[30]	validation_0-mae:94.00475
[31]	validation_0-ma