In [48]:
import pandas as pd, numpy as np
from pathlib import Path
from IPython.display import display

DATA_DIR = Path(".")
train = pd.read_csv(DATA_DIR/"train.csv")
test  = pd.read_csv(DATA_DIR/"test.csv")
sample_sub = pd.read_csv(DATA_DIR/"sample_submission.csv")

print(train.shape, test.shape)
display(train.head(3))


(1460, 81) (1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [49]:
train_full = train.copy()
test_full  = test.copy()

# 这些列的 NA 表示"没有该设施"
na_none_cols = [
    "PoolQC","MiscFeature","Alley","Fence","FireplaceQu",
    "GarageType","GarageFinish","GarageQual","GarageCond",
    "BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2",
    "MasVnrType"
]
for df in (train_full, test_full):
    for c in na_none_cols:
        if c in df.columns:
            df[c] = df[c].fillna("None")

# LotFrontage 用 Neighborhood 中位数填
for df in (train_full, test_full):
    if "LotFrontage" in df.columns:
        med_map = df.groupby("Neighborhood")["LotFrontage"].transform("median")
        df["LotFrontage"] = df["LotFrontage"].fillna(med_map)


In [50]:
for df in (train_full, test_full):
    df["TotalSF"] = df.get("TotalBsmtSF",0) + df.get("1stFlrSF",0) + df.get("2ndFlrSF",0)
    df["TotalBath"] = df.get("FullBath",0) + 0.5*df.get("HalfBath",0) + df.get("BsmtFullBath",0) + 0.5*df.get("BsmtHalfBath",0)
    df["Age"] = df.get("YrSold",0) - df.get("YearBuilt",0)
    df["AgeSinceRemod"] = df.get("YrSold",0) - df.get("YearRemodAdd",0)
    df["HasPool"] = (df.get("PoolArea",0) > 0).astype(int)
    df["HasGarage"] = (df.get("GarageArea",0) > 0).astype(int)
    df["HasBsmt"] = (df.get("TotalBsmtSF",0) > 0).astype(int)
    df["HasFireplace"] = (df.get("Fireplaces",0) > 0).astype(int)

# 去除经典离群点
out_idx = train_full[(train_full["GrLivArea"]>4000) & (train_full["SalePrice"]<300000)].index
if len(out_idx):
    train_full = train_full.drop(index=out_idx).reset_index(drop=True)

y = train_full["SalePrice"].copy()
X = train_full.drop(columns=["SalePrice"])
X_test = test_full.copy()

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()
print(len(num_cols), "numeric,", len(cat_cols), "categorical")


45 numeric, 43 categorical


In [51]:
# 有序质量列映射
qual_map = {"Po":1, "Fa":2, "TA":3, "Gd":4, "Ex":5}
ordinal_cols = ["ExterQual","ExterCond","BsmtQual","BsmtCond","HeatingQC",
                "KitchenQual","FireplaceQu","GarageQual","GarageCond","PoolQC"]
for c in ordinal_cols:
    if c in X.columns:
        X[c] = X[c].map(qual_map).fillna(0)
        X_test[c] = X_test[c].map(qual_map).fillna(0)
        if c in cat_cols:
            cat_cols.remove(c)
        if c not in num_cols:
            num_cols.append(c)

# 对偏态的数值列做 log1p（仅正值）
skews = X[num_cols].apply(lambda s: s.dropna()).apply(pd.Series.skew)
skewed = skews[skews>0.75].index.tolist()
for df in (X, X_test):
    for c in skewed:
        if (df[c] > 0).any():
            df[c] = np.log1p(df[c].clip(lower=0))


In [52]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score

def make_ohe():
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.float32)
    except TypeError:
        return OneHotEncoder(handle_unknown="ignore", sparse=False, dtype=np.float32)

numeric_transformer = SimpleImputer(strategy="median")
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", make_ohe()),
])

preprocess = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
])

# 目标对数化封装
class LogTargetRegressor:
    def __init__(self, base_estimator): self.base_estimator = base_estimator
    def fit(self, X, y):
        self.base_estimator.fit(X, np.log1p(y)); return self
    def predict(self, X):
        return np.expm1(self.base_estimator.predict(X))
    def get_params(self, deep=True): return {"base_estimator": self.base_estimator}
    def set_params(self, **p):
        if "base_estimator" in p: self.base_estimator = p["base_estimator"]; return self

# 模型 A：ElasticNet（稳健线性）
enet = Pipeline([
    ("preprocess", preprocess),
    ("model", LogTargetRegressor(ElasticNet(alpha=0.0005, l1_ratio=0.05, max_iter=6000, random_state=42)))
])

# 模型 B：HistGradientBoosting（树模型，免安装，强）
hgb_params = dict(learning_rate=0.05, max_depth=6, l2_regularization=0.01,
                  min_samples_leaf=10, max_bins=255, random_state=42)
hgb = Pipeline([
    ("preprocess", preprocess),
    ("model", LogTargetRegressor(HistGradientBoostingRegressor(**hgb_params)))
])

def rmse_log1p(y_true, y_pred):
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true))**2))

def scorer(est, Xv, yv):
    return -rmse_log1p(yv, est.predict(Xv))

cv = KFold(n_splits=5, shuffle=True, random_state=42)
for name, est in [("ElasticNet", enet), ("HGB", hgb)]:
    s = cross_val_score(est, X, y, scoring=scorer, cv=cv)
    print(name, "CV RMSE:", (-s).mean(), "+/-", (-s).std())


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


ElasticNet CV RMSE: 0.11789516355814719 +/- 0.008273284549108367




HGB CV RMSE: 0.12722834242949604 +/- 0.007625326662395762




In [53]:
enet.fit(X, y)
hgb.fit(X, y)

pred_enet = enet.predict(X_test)
pred_hgb  = hgb.predict(X_test)

# 简单加权（可以把 w 调成 0.4~0.7 做小网格找最优）
w = 0.6
pred_blend = w*pred_enet + (1-w)*pred_hgb

sub = sample_sub.copy()
sub["SalePrice"] = pred_blend
out_path = Path("submission_houseprice.csv")
sub.to_csv(out_path, index=False)
print("Saved:", out_path.resolve())


  model = cd_fast.enet_coordinate_descent(


Saved: /Users/ciqrua/Desktop/Job/kaggle/houseprice/submission_houseprice.csv


