In [4]:
# Step 1: 导入工具包
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error

# Step 2: Load the training and test sets
train_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test_data = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

# Step 3: Separate the target column from the feature columns
# "Id" is dropped together with the target so it is not used as a feature.
y = train_data["SalePrice"]
X = train_data.drop(columns=["Id", "SalePrice"])

# Step 4: Split the features into numeric and categorical subsets
X_num = X.select_dtypes(include=["int64", "float64"])
X_cat = X.select_dtypes(include=["object"])

# Step 5: Fill missing numeric values with the per-column median
# The imputer's output is wrapped back into a DataFrame. Both the column labels
# and the original row index are restored: the axis=1 concat in Step 7 aligns
# on index labels, so dropping the index would misalign rows (and introduce
# NaNs) whenever X does not carry a default RangeIndex.
num_imputer = SimpleImputer(strategy='median')
X_num_imputed = pd.DataFrame(
    num_imputer.fit_transform(X_num),
    columns=X_num.columns,
    index=X_num.index,
)

# Step 6: One-hot encode the categorical features
X_cat_dummies = pd.get_dummies(X_cat)

# Step 7: Concatenate numeric and categorical features column-wise
X_full = pd.concat([X_num_imputed, X_cat_dummies], axis=1)

# Step 8: Hold out a validation set
# NOTE(review): the imputer medians and the dummy columns above are computed
# from all rows before this split, so validation rows influence preprocessing
# and the reported MAE may be slightly optimistic.
X_train, X_valid, y_train, y_valid = train_test_split(X_full, y, random_state=0)

# Step 9: Build and fit the model
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train, y_train)

# Step 10: Evaluate the model on the validation set
preds = model.predict(X_valid)
mae = mean_absolute_error(y_valid, preds)
print("验证集 MAE:", mae)

# Step 11: Prepare the test set with the same preprocessing as the training data
# Numeric features are selected by the training column names (not by dtype) so
# the imputer always receives exactly the columns it was fitted on, in the same
# order, even if a column's dtype differs between train and test. "Id" is
# excluded automatically because it is not part of X_num.
test_num = test_data[X_num.columns]
test_num_imputed = pd.DataFrame(
    num_imputer.transform(test_num),
    columns=X_num.columns,
    index=test_data.index,  # keep row labels aligned with test_cat_dummies below
)

# Categorical columns: one-hot encode
test_cat = test_data.select_dtypes(include=["object"])
test_cat_dummies = pd.get_dummies(test_cat)

# Align with the training dummy columns: categories absent from the test set
# are added as all-zero columns, and categories unseen in training are dropped.
test_cat_dummies = test_cat_dummies.reindex(columns=X_cat_dummies.columns, fill_value=0)

# Assemble the final test feature matrix
X_test_final = pd.concat([test_num_imputed, test_cat_dummies], axis=1)

# Step 12: Predict and export the submission file
# NOTE(review): `model` was fitted on the X_train split only, not on the full
# training data — confirm this is intended for the final submission.
preds_test = model.predict(X_test_final)

submission = pd.DataFrame({
    "Id": test_data["Id"],
    "SalePrice": preds_test,
})
submission.to_csv("submission.csv", index=False)


验证集 MAE: 17168.589205479453
