**預測最低價格**

In [3]:
# 重新載入必要的函式庫
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the cleaned flight dataset (short_flight_final.csv) from disk.
file_path = '/Users/yuchingchen/Documents/專題/cleaned_data/short_flight_final.csv'
data = pd.read_csv(file_path)

# Partition the rows by the cabin-class column "艙等": rows equal to "經濟艙"
# (economy) versus every other row. Each subset is copied so that the column
# assignments made later do not write into a slice of `data`.
economy_class_data = data.loc[data["艙等"].eq("經濟艙")].copy()
non_economy_class_data = data.loc[data["艙等"].ne("經濟艙")].copy()

# One-hot encode the categorical columns of each subset independently;
# drop_first=True omits the first level of every encoded column.
# NOTE(review): because the two subsets are encoded separately, the level that
# is dropped as the baseline may differ between them when they do not contain
# the same categories -- confirm this is acceptable for the downstream models.
categorical_cols = ['星期', '出發時段', '出發機場代號', '抵達時段', '抵達機場代號',
                    '航空公司', '航空聯盟', '機型分類', '是否過夜', '是否為平日', '機型', '假期', 'Region']

economy_class_data = pd.get_dummies(economy_class_data, columns=categorical_cols, drop_first=True)
non_economy_class_data = pd.get_dummies(non_economy_class_data, columns=categorical_cols, drop_first=True)

# Restrict both subsets to the columns they have in common. The shared columns
# are collected in the economy frame's own column order: building them via
# list(set(...) & set(...)) yields an arbitrary order that can change from one
# interpreter run to the next (string hashing is randomized per process), which
# makes the column order of everything derived from these frames
# non-reproducible.
common_cols = [col for col in economy_class_data.columns if col in non_economy_class_data.columns]
economy_class_data = economy_class_data[common_cols]
non_economy_class_data = non_economy_class_data[common_cols]

# Standardize the numeric columns. The scaler's statistics are learned from the
# economy subset only, and the same fitted scaler is then applied to both
# subsets.
# NOTE(review): the fit uses every economy row, i.e. it happens before the
# train/test split performed later in this script, and the non-economy rows
# are scaled with economy statistics -- confirm both are intended.
num_cols = ['飛行時間_分鐘', '經濟指標', '機場指標', 'competing_flights']
scaler = StandardScaler()
scaler.fit(economy_class_data[num_cols])
economy_class_data[num_cols] = scaler.transform(economy_class_data[num_cols])
non_economy_class_data[num_cols] = scaler.transform(non_economy_class_data[num_cols])

# Select the modelling features: every column whose name contains one of the
# substrings in target_keywords, followed by the four numeric columns.
target_keywords = ['出發時段_', '出發機場代號_', '抵達時段_', '航空聯盟_', '機型分類_', '是否為平日_', '假期_', 'Region_']


def _is_selected_dummy(column_name):
    """Return True if *column_name* contains any entry of target_keywords."""
    return any(marker in column_name for marker in target_keywords)


economy_class_data_dummy_cols = list(filter(_is_selected_dummy, economy_class_data.columns))
non_economy_class_data_dummy_cols = list(filter(_is_selected_dummy, non_economy_class_data.columns))
economy_class_data_feature_cols = [
    *economy_class_data_dummy_cols,
    '飛行時間_分鐘', '經濟指標', '機場指標', 'competing_flights',
]
non_economy_class_data_feature_cols = [
    *non_economy_class_data_dummy_cols,
    '飛行時間_分鐘', '經濟指標', '機場指標', 'competing_flights',
]

# Build the feature matrix X and the target y ("最低價格_log") for each cabin
# class, then split each pair 70/30 into train and test sets. Both splits use
# random_state=42.
X_economy = economy_class_data.loc[:, economy_class_data_feature_cols]
y_economy = economy_class_data.loc[:, "最低價格_log"]

X_non_economy = non_economy_class_data.loc[:, non_economy_class_data_feature_cols]
y_non_economy = non_economy_class_data.loc[:, "最低價格_log"]

(
    X_train_economy,
    X_test_economy,
    y_train_economy,
    y_test_economy,
) = train_test_split(
    X_economy,
    y_economy,
    test_size=0.3,
    random_state=42,
)
(
    X_train_non_economy,
    X_test_non_economy,
    y_train_non_economy,
    y_test_non_economy,
) = train_test_split(
    X_non_economy,
    y_non_economy,
    test_size=0.3,
    random_state=42,
)

# Re-attach the target column to its features so that each split can be stored
# as a single table (features first, target as the last column).
train_economy = pd.concat(
    [X_train_economy, y_train_economy],
    axis="columns",
)
test_economy = pd.concat(
    [X_test_economy, y_test_economy],
    axis="columns",
)
train_non_economy = pd.concat(
    [X_train_non_economy, y_train_non_economy],
    axis="columns",
)
test_non_economy = pd.concat(
    [X_test_non_economy, y_test_non_economy],
    axis="columns",
)

# Destination CSV files for the four train/test tables.
train_economy_path = "/Users/yuchingchen/Documents/專題/model/data/short_train_economy.csv"
test_economy_path = "/Users/yuchingchen/Documents/專題/model/data/short_test_economy.csv"
train_non_economy_path = "/Users/yuchingchen/Documents/專題/model/data/short_train_non_economy.csv"
test_non_economy_path = "/Users/yuchingchen/Documents/專題/model/data/short_test_non_economy.csv"

# Write each table without its index, then print a confirmation message.
for _frame, _csv_path in (
    (train_economy, train_economy_path),
    (test_economy, test_economy_path),
    (train_non_economy, train_non_economy_path),
    (test_non_economy, test_non_economy_path),
):
    _frame.to_csv(_csv_path, index=False)
print("檔案儲存成功")

檔案儲存成功
