In [1]:
# =========================================
# 1. IMPORTS
# =========================================
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# =========================================
# 2. LOAD DATA
# =========================================
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# =========================================
# 3. HÀM DROP CỘT NHIỀU NULL + NHIỀU ZERO
#    (áp dụng cho cả train và test để đồng bộ)
# =========================================
def drop_null_and_zero(train_df, test_df, null_threshold=500, zero_ratio=0.5):
    # 3.1 drop cột quá nhiều null trong train
    cols_null = []
    for col in train_df.columns:
        if train_df[col].isnull().sum() > null_threshold:
            cols_null.append(col)
    train_df = train_df.drop(columns=cols_null)
    # test cũng drop cùng cột
    test_df = test_df.drop(columns=[c for c in cols_null if c in test_df.columns], errors="ignore")

    # 3.2 drop cột có > zero_ratio là 0
    def _drop_zero_cols(df, cols_to_check):
        n_rows = len(df)
        cols_drop = []
        for c in cols_to_check:
            zero_count = (df[c] == 0).sum()
            if zero_count / n_rows > zero_ratio:
                cols_drop.append(c)
        return cols_drop

    # chỉ check trên train, sau đó drop cả 2
    numeric_cols = train_df.select_dtypes(include=[np.number]).columns
    zero_cols = _drop_zero_cols(train_df, numeric_cols)

    train_df = train_df.drop(columns=zero_cols)
    test_df = test_df.drop(columns=[c for c in zero_cols if c in test_df.columns], errors="ignore")

    return train_df, test_df

train_clean, test_clean = drop_null_and_zero(train, test, null_threshold=500, zero_ratio=0.5)

# =========================================
# 4. FEATURE ENGINEERING (THEO LOGIC BẠN ĐÃ LÀM)
# =========================================
def prepare_features(train_df, test_df, target_col="SalePrice"):
    train_df = train_df.copy()
    test_df = test_df.copy()

    # ---- chọn các cột gốc cần dùng (khoảng 30 cột + vài cột FE)
    base_cols = [
        'MSZoning','LotArea','LotConfig','LandSlope','Neighborhood',
        'HouseStyle','OverallQual','YearBuilt','YearRemodAdd',
        'RoofStyle','Exterior1st','Exterior2nd','ExterQual','Foundation',
        'BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','TotalBsmtSF',
        'HeatingQC','CentralAir','1stFlrSF','GrLivArea','FullBath',
        'TotRmsAbvGrd','GarageType','GarageFinish','GarageCars','GarageArea',
        'PavedDrive','MoSold','YrSold','SaleType','SaleCondition'
    ]

    # đôi khi 1 số cột sau khi drop ở trên sẽ mất → chỉ giữ cột còn tồn tại
    base_cols = [c for c in base_cols if c in train_df.columns]

    # lấy train_X
    train_X = train_df[base_cols].copy()
    # lấy test_X
    test_X = test_df[[c for c in base_cols if c in test_df.columns]].copy()

    # =============================
    # 4.1 MAP / RÚT GỌN CÁC CỘT CATEGORICAL
    # =============================

    # BsmtFinType1
    col_BsmtFinType1 = {'Unf': 1, 'GLQ': 2, 'ALQ': 3, 'BLQ': 3, 'Rec': 3, 'LwQ': 3}
    for df in [train_X, test_X]:
        if 'BsmtFinType1' in df.columns:
            df['BsmtFinType1'] = df['BsmtFinType1'].map(col_BsmtFinType1).fillna(4)

    # BsmtExposure
    col_BsmtExposure = {'No': 1, 'Av': 2, 'Mn': 2, 'Gd': 3}
    for df in [train_X, test_X]:
        if 'BsmtExposure' in df.columns:
            df['BsmtExposure'] = df['BsmtExposure'].map(col_BsmtExposure).fillna(4)

    # BsmtQual
    col_BsmtQual = {'TA':1, 'Fa':1, 'Gd':2, 'Ex':3}
    for df in [train_X, test_X]:
        if 'BsmtQual' in df.columns:
            df['BsmtQual'] = df['BsmtQual'].map(col_BsmtQual)

    # Exterior1st
    for df in [train_X, test_X]:
        if 'Exterior1st' in df.columns:
            df['Exterior1st'] = df['Exterior1st'].apply(
                lambda x: x if x in ['VinylSd','MetalSd','Wd Sdng','HdBoard','Plywood','Stucco'] else 'others'
            )
            map_ex1 = {'VinylSd':1, 'MetalSd':2, 'Wd Sdng':2, 'HdBoard':2, 'Plywood':2, 'Stucco':2, 'others':3}
            df['Exterior1st'] = df['Exterior1st'].map(map_ex1)

    # Exterior2nd
    for df in [train_X, test_X]:
        if 'Exterior2nd' in df.columns:
            df['Exterior2nd'] = df['Exterior2nd'].apply(
                lambda x: x if x in ['VinylSd','MetalSd','HdBoard','Wd Sdng','Plywood'] else 'others'
            )
            map_ex2 = {'VinylSd':1, 'MetalSd':2, 'HdBoard':3, 'Wd Sdng':4, 'Plywood':5, 'others':6}
            df['Exterior2nd'] = df['Exterior2nd'].map(map_ex2)

    # ExterQual
    for df in [train_X, test_X]:
        if 'ExterQual' in df.columns:
            df['ExterQual'] = df['ExterQual'].apply(lambda x: x if x in ['TA','Gd'] else 'others')
            map_exq = {'TA':1, 'Gd':2, 'others':3}
            df['ExterQual'] = df['ExterQual'].map(map_exq)

    # Foundation
    for df in [train_X, test_X]:
        if 'Foundation' in df.columns:
            df['Foundation'] = df['Foundation'].apply(lambda x: x if x in ['PConc','CBlock','BrkTil'] else 'others')
            map_f = {'PConc':1, 'CBlock':2, 'BrkTil':3, 'others':4}
            df['Foundation'] = df['Foundation'].map(map_f)

    # GarageFinish
    for df in [train_X, test_X]:
        if 'GarageFinish' in df.columns:
            df['GarageFinish'] = df['GarageFinish'].fillna('others')
            map_gf = {'Unf':1, 'RFn':2, 'Fin':3, 'others':4}
            df['GarageFinish'] = df['GarageFinish'].map(map_gf)

    # GarageType
    for df in [train_X, test_X]:
        if 'GarageType' in df.columns:
            df['GarageType'] = df['GarageType'].apply(lambda x: x if x in ['Attchd','Detchd'] else 'others')
            map_gt = {'Attchd':1, 'Detchd':2, 'others':3}
            df['GarageType'] = df['GarageType'].map(map_gt)

    # HeatingQC
    for df in [train_X, test_X]:
        if 'HeatingQC' in df.columns:
            df['HeatingQC'] = df['HeatingQC'].apply(lambda x: x if x in ['Ex','TA','Gd'] else 'others')
            map_hq = {'Ex':1, 'TA':2, 'Gd':3, 'others':4}
            df['HeatingQC'] = df['HeatingQC'].map(map_hq)

    # HouseStyle
    for df in [train_X, test_X]:
        if 'HouseStyle' in df.columns:
            df['HouseStyle'] = df['HouseStyle'].apply(lambda x: x if x in ['1Story','2Story','1.5Fin'] else 'others')
            map_hs = {'1Story':1, '2Story':2, '1.5Fin':3, 'others':4}
            df['HouseStyle'] = df['HouseStyle'].map(map_hs)

    # KitchenQual nếu có
    for df in [train_X, test_X]:
        if 'KitchenQual' in df.columns:
            df['KitchenQual'] = df['KitchenQual'].apply(lambda x: x if x in ['TA','Gd','Ex'] else 'others')
            map_kq = {'TA':1, 'Gd':2, 'Ex':3, 'others':4}
            df['KitchenQual'] = df['KitchenQual'].map(map_kq)

    # LandSlope: bạn drop vì lệch → drop luôn
    for df in [train_X, test_X]:
        if 'LandSlope' in df.columns:
            df.drop(columns=['LandSlope'], inplace=True)

    # LotConfig
    for df in [train_X, test_X]:
        if 'LotConfig' in df.columns:
            df['LotConfig'] = df['LotConfig'].apply(lambda x: x if x in ['Inside','Corner'] else 'others')
            map_lc = {'Inside':1, 'Corner':2, 'others':3}
            df['LotConfig'] = df['LotConfig'].map(map_lc)

    # LotShape
    for df in [train_X, test_X]:
        if 'LotShape' in df.columns:
            df['LotShape'] = df['LotShape'].apply(lambda x: x if x in ['Reg','IR1'] else 'others')
            map_ls = {'Reg':1, 'IR1':2, 'others':3}
            df['LotShape'] = df['LotShape'].map(map_ls)

    # MSZoning
    for df in [train_X, test_X]:
        if 'MSZoning' in df.columns:
            df['MSZoning'] = df['MSZoning'].apply(lambda x: x if x in ['RL','RM'] else 'others')
            map_ms = {'RL':1, 'RM':2, 'others':3}
            df['MSZoning'] = df['MSZoning'].map(map_ms)

    # RoofStyle
    for df in [train_X, test_X]:
        if 'RoofStyle' in df.columns:
            df['RoofStyle'] = df['RoofStyle'].apply(lambda x: x if x in ['Gable','Hip'] else 'others')
            map_rs = {'Gable':1, 'Hip':2, 'others':3}
            df['RoofStyle'] = df['RoofStyle'].map(map_rs)

    # SaleCondition
    for df in [train_X, test_X]:
        if 'SaleCondition' in df.columns:
            df['SaleCondition'] = df['SaleCondition'].apply(
                lambda x: x if x in ['Normal','Partial','Abnorml'] else 'others'
            )
            map_sc = {'Normal':1, 'Partial':2, 'Abnorml':3, 'others':4}
            df['SaleCondition'] = df['SaleCondition'].map(map_sc)

    # ============== Neighborhood: target-mean encoding thủ công ==============
    if 'Neighborhood' in train_X.columns:
        nb_mean = train_df.groupby('Neighborhood')[target_col].mean()
        # map train
        train_X['Neighborhood'] = train_X['Neighborhood'].map(nb_mean)
        # test: nếu ko có trong train → fill bằng mean chung
        global_mean = train_df[target_col].mean()
        test_X['Neighborhood'] = test_X['Neighborhood'].map(nb_mean).fillna(global_mean)

        # scale
        scaler_nb = StandardScaler()
        train_X[['Neighborhood']] = scaler_nb.fit_transform(train_X[['Neighborhood']])
        test_X[['Neighborhood']] = scaler_nb.transform(test_X[['Neighborhood']])

    # ============== Feature engineering numeric ===============
    # HouseAge = YrSold - YearBuilt
    if ('YrSold' in train_X.columns) and ('YearBuilt' in train_X.columns):
        train_X['HouseAge'] = train_X['YrSold'] - train_X['YearBuilt']
        test_X['HouseAge'] = test_X['YrSold'] - test_X['YearBuilt']
        scaler_age = StandardScaler()
        train_X[['HouseAge']] = scaler_age.fit_transform(train_X[['HouseAge']])
        test_X[['HouseAge']] = scaler_age.transform(test_X[['HouseAge']])

    # IsRemodeled
    if ('YearRemodAdd' in train_X.columns) and ('YearBuilt' in train_X.columns):
        train_X['IsRemodeled'] = (train_X['YearRemodAdd'] != train_X['YearBuilt']).astype(int)
        test_X['IsRemodeled'] = (test_X['YearRemodAdd'] != test_X['YearBuilt']).astype(int)

    # Scale mấy cột liên tục quan trọng
    for col in ['TotalBsmtSF','1stFlrSF','GrLivArea','LotArea','GarageArea']:
        if col in train_X.columns:
            scaler_tmp = StandardScaler()
            train_X[[col]] = scaler_tmp.fit_transform(train_X[[col]])
            if col in test_X.columns:
                test_X[[col]] = scaler_tmp.transform(test_X[[col]])

    # Has3FullBath
    if 'FullBath' in train_X.columns:
        train_X['Has3FullBath'] = (train_X['FullBath'] >= 3).astype(int)
        test_X['Has3FullBath'] = (test_X['FullBath'] >= 3).astype(int)

    # Has3Garage
    if 'GarageCars' in train_X.columns:
        train_X['Has3Garage'] = (train_X['GarageCars'] == 3).astype(int)
        test_X['Has3Garage'] = (test_X['GarageCars'] == 3).astype(int)

    # GarageArea_per_car
    if ('GarageArea' in train_X.columns) and ('GarageCars' in train_X.columns):
        train_X['GarageArea_per_car'] = train_X['GarageArea'] / (train_X['GarageCars'] + 1)
        test_X['GarageArea_per_car'] = test_X['GarageArea'] / (test_X['GarageCars'] + 1)
        scaler_g = StandardScaler()
        train_X[['GarageArea','GarageArea_per_car']] = scaler_g.fit_transform(
            train_X[['GarageArea','GarageArea_per_car']]
        )
        test_X[['GarageArea','GarageArea_per_car']] = scaler_g.transform(
            test_X[['GarageArea','GarageArea_per_car']]
        )

    # cuối cùng: drop mấy cột gốc ko cần nữa
    drop_cols = ['YearBuilt','YrSold','YearRemodAdd','TotRmsAbvGrd']
    for df in [train_X, test_X]:
        for c in drop_cols:
            if c in df.columns:
                df.drop(columns=[c], inplace=True)

    # =============================
    # 4.2 Encode mọi object còn sót (phòng hờ)
    # =============================
    for df in [train_X, test_X]:
        obj_cols = df.select_dtypes(include='object').columns
        for col in obj_cols:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))

    # gắn lại target
    train_X[target_col] = train_df[target_col].values

    return train_X, test_X

# =========================================
# 5. GỌI HÀM FEATURE ENGINEERING
# =========================================
train_X, test_X = prepare_features(train_clean, test_clean, target_col="SalePrice")

# =========================================
# 6. TÁCH X, y + ĐỒNG BỘ CỘT
# =========================================
X = train_X.drop(columns=['SalePrice'])
y_log = np.log1p(train_X['SalePrice'])

# Đồng bộ cột giữa train và test (rất quan trọng!)
test_X = test_X.reindex(columns=X.columns, fill_value=0)

# =========================================
# 7. CROSS-VALIDATION ĐỂ XEM RMSE
# =========================================
models = {
    "RandomForest": RandomForestRegressor(
        n_estimators=500,
        max_depth=15,
        min_samples_split=5,
        random_state=42,
        n_jobs=-1
    ),
    "XGB": XGBRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        tree_method="hist"
    )
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scorer = make_scorer(lambda yt, yp: np.sqrt(mean_squared_error(yt, yp)), greater_is_better=False)

print("==== 5-FOLD CV (log target) ====")
for name, model in models.items():
    scores = -cross_val_score(model, X, y_log, scoring=rmse_scorer, cv=kf)
    print(f"{name}: RMSE={scores.mean():.5f} ± {scores.std():.5f}")

# =========================================
# 8. TRAIN FINAL MODEL + PREDICT + SUBMISSION
# =========================================
best_model = XGBRegressor(
    n_estimators=1200,
    learning_rate=0.04,
    max_depth=6,
    subsample=0.85,
    colsample_bytree=0.85,
    random_state=42,
    tree_method="hist"
)

best_model.fit(X, y_log)
pred_log = best_model.predict(test_X)
pred = np.expm1(pred_log)

submission = pd.DataFrame({
    "Id": test["Id"],
    "SalePrice": pred
})

# Lưu file
submission.to_csv("submission_final.csv", index=False)
print("✅ Saved to submission_final.csv")


==== 5-FOLD CV (log target) ====
RandomForest: RMSE=0.15176 ± 0.01394
XGB: RMSE=0.14060 ± 0.01101
✅ Saved to submission_final.csv


In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# =============== 1. LOAD ===============
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# =============== 2. DROP OUTLIER ===============
# classic Kaggle outliers
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index)

# =============== 3. GỘP ĐỂ XỬ LÝ CHUNG ===============
train_y = np.log1p(train["SalePrice"])
train_id = train["Id"]
test_id = test["Id"]

train.drop(columns=["SalePrice"], inplace=True)

full = pd.concat([train, test], axis=0, ignore_index=True)

# =============== 4. FILL MISSING ===============
# numeric -> 0
num_cols = full.select_dtypes(include=[np.number]).columns
full[num_cols] = full[num_cols].fillna(0)

# object -> "None"
obj_cols = full.select_dtypes(include=["object"]).columns
full[obj_cols] = full[obj_cols].fillna("None")

# =============== 5. LOG TRANSFORM CÁC CỘT LỆCH ===============
skew_cols = ["LotArea", "GrLivArea", "1stFlrSF", "2ndFlrSF", "TotalBsmtSF", "LowQualFinSF"]
for c in skew_cols:
    if c in full.columns:
        full[c] = np.log1p(full[c])

# =============== 6. ONE-HOT ===============
full = pd.get_dummies(full, drop_first=True)

# tách lại
n_train = len(train_y)
train_X = full.iloc[:n_train, :].copy()
test_X = full.iloc[n_train:, :].copy()

# =============== 7. CV SCORER ===============
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scorer = make_scorer(lambda yt, yp: np.sqrt(mean_squared_error(yt, yp)), greater_is_better=False)

# =============== 8. MODELS ===============
xgb_model = XGBRegressor(
    n_estimators=1400,
    learning_rate=0.03,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.001,
    reg_lambda=1.0,
    random_state=42,
    tree_method="hist"
)

rf_model = RandomForestRegressor(
    n_estimators=700,
    max_depth=18,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1
)

# =============== 9. CV ===============
xgb_scores = -cross_val_score(xgb_model, train_X, train_y, scoring=rmse_scorer, cv=kf)
rf_scores  = -cross_val_score(rf_model, train_X, train_y, scoring=rmse_scorer, cv=kf)

print(f"XGB (rich): {xgb_scores.mean():.5f} ± {xgb_scores.std():.5f}")
print(f"RF  (rich): {rf_scores.mean():.5f} ± {rf_scores.std():.5f}")

# =============== 10. FIT FULL + BLEND ===============
xgb_model.fit(train_X, train_y)
rf_model.fit(train_X, train_y)

pred_xgb = np.expm1(xgb_model.predict(test_X))
pred_rf  = np.expm1(rf_model.predict(test_X))

# blend 0.7 xgb + 0.3 rf
pred_final = 0.7 * pred_xgb + 0.3 * pred_rf

sub = pd.DataFrame({
    "Id": test_id,
    "SalePrice": pred_final
})
sub.to_csv("submission_blend.csv", index=False)
print("✅ saved submission_blend.csv")


XGB (rich): 0.12087 ± 0.00985
RF  (rich): 0.14037 ± 0.00930
✅ saved submission_blend.csv


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

# =============== 1. LOAD ===============
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# drop outlier kinh điển
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index)

y = np.log1p(train["SalePrice"])
train_id = train["Id"]
test_id = test["Id"]

train = train.drop(columns=["SalePrice"])

# =============== 2. CONCAT ===============
full = pd.concat([train, test], axis=0, ignore_index=True)

# =============== 3. FILL MISSING ===============
num_cols = full.select_dtypes(include=[np.number]).columns
obj_cols = full.select_dtypes(include=['object']).columns

full[num_cols] = full[num_cols].fillna(0)
full[obj_cols] = full[obj_cols].fillna("None")

# =============== 4. FEATURE ENGINEERING NHẸ ===============
# log mấy cột lệch
skew_cols = ["LotArea", "GrLivArea", "1stFlrSF", "2ndFlrSF", "TotalBsmtSF", "LowQualFinSF"]
for c in skew_cols:
    if c in full.columns:
        full[c] = np.log1p(full[c])

# tổng diện tích sàn kiểu Kaggle
full["TotalSF"] = full["TotalBsmtSF"] + full["1stFlrSF"] + full["2ndFlrSF"]

# overall quality x diện tích
full["Qual_x_GrLiv"] = full["OverallQual"] * (full["GrLivArea"])

# tuổi nhà + năm sửa
full["HouseAge"] = full["YrSold"] - full["YearBuilt"]
full["RemodAge"] = full["YrSold"] - full["YearRemodAdd"]
full["HouseAge"] = full["HouseAge"].clip(lower=0)
full["RemodAge"] = full["RemodAge"].clip(lower=0)

# có hầm không
full["HasBsmt"] = (full["TotalBsmtSF"] > 0).astype(int)

# mấy cột rất lệch / gần như 1 giá trị -> bỏ
low_var_cols = ["Street", "Utilities", "Condition2", "RoofMatl", "PoolArea", "PoolQC",
                "MiscVal", "MiscFeature"]
for c in low_var_cols:
    if c in full.columns:
        full = full.drop(columns=[c])

# =============== 5. ONE-HOT ===============
full = pd.get_dummies(full, drop_first=True)

# =============== 6. TÁCH LẠI ===============
n_train = len(y)
X = full.iloc[:n_train, :].copy()
X_test = full.iloc[n_train:, :].copy()

# =============== 7. SCORER + CV ===============
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scorer = make_scorer(lambda yt, yp: np.sqrt(mean_squared_error(yt, yp)), greater_is_better=False)

# =============== 8. 2 XGB KHÁC NHAU ===============
xgb_deep = XGBRegressor(
    n_estimators=1600,
    learning_rate=0.03,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.7,
    min_child_weight=3,
    reg_lambda=1.0,
    reg_alpha=0.0,
    random_state=42,
    tree_method="hist"
)

xgb_shallow = XGBRegressor(
    n_estimators=2000,
    learning_rate=0.02,
    max_depth=3,
    subsample=0.85,
    colsample_bytree=0.6,
    min_child_weight=2,
    reg_lambda=1.2,
    reg_alpha=0.001,
    random_state=42,
    tree_method="hist"
)

# CV nhanh để xem
scores_deep = -cross_val_score(xgb_deep, X, y, scoring=rmse_scorer, cv=kf)
scores_shallow = -cross_val_score(xgb_shallow, X, y, scoring=rmse_scorer, cv=kf)
print(f"XGB deep   : {scores_deep.mean():.5f} ± {scores_deep.std():.5f}")
print(f"XGB shallow: {scores_shallow.mean():.5f} ± {scores_shallow.std():.5f}")

# =============== 9. FIT FULL + BLEND ===============
xgb_deep.fit(X, y)
xgb_shallow.fit(X, y)

pred1 = np.expm1(xgb_deep.predict(X_test))
pred2 = np.expm1(xgb_shallow.predict(X_test))

# blend: shallow thường generalize tốt hơn -> cho weight cao hơn
final_pred = 0.4 * pred1 + 0.6 * pred2

sub = pd.DataFrame({
    "Id": test_id,
    "SalePrice": final_pred
})
sub.to_csv("submission_tuned.csv", index=False)
print("✅ saved submission_tuned.csv")


XGB deep   : 0.12260 ± 0.00946
XGB shallow: 0.11854 ± 0.00824
✅ saved submission_tuned.csv


In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from xgboost import XGBRegressor

# ====== 1. LOAD + PREP y ======
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# bỏ outlier kinh điển
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index)

y = np.log1p(train["SalePrice"])
test_id = test["Id"]

train = train.drop(columns=["SalePrice"])
full = pd.concat([train, test], axis=0, ignore_index=True)

# ====== 2. FILL ======
num_cols = full.select_dtypes(include=[np.number]).columns
obj_cols = full.select_dtypes(include=["object"]).columns
full[num_cols] = full[num_cols].fillna(0)
full[obj_cols] = full[obj_cols].fillna("None")

# ====== 3. FE ======
skew_cols = ["LotArea","GrLivArea","1stFlrSF","2ndFlrSF","TotalBsmtSF","LowQualFinSF"]
for c in skew_cols:
    if c in full.columns:
        full[c] = np.log1p(full[c])

full["TotalSF"] = full["TotalBsmtSF"] + full["1stFlrSF"] + full["2ndFlrSF"]
full["Qual_x_GrLiv"] = full["OverallQual"] * full["GrLivArea"]
full["HouseAge"] = (full["YrSold"] - full["YearBuilt"]).clip(lower=0)
full["RemodAge"] = (full["YrSold"] - full["YearRemodAdd"]).clip(lower=0)
full["HasBsmt"] = (full["TotalBsmtSF"] > 0).astype(int)

low_var_cols = ["Street","Utilities","Condition2","RoofMatl","PoolArea","PoolQC","MiscVal","MiscFeature"]
for c in low_var_cols:
    if c in full.columns:
        full = full.drop(columns=[c])

# ====== 4. ONE-HOT ======
full = pd.get_dummies(full, drop_first=True)

# ====== 5. SPLIT LẠI ======
n_train = len(y)
X = full.iloc[:n_train, :].copy()
X_test = full.iloc[n_train:, :].copy()

# ====== 6. CV SETUP ======
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scorer = make_scorer(lambda yt, yp: np.sqrt(mean_squared_error(yt, yp)), greater_is_better=False)

# ====== 7. MODEL 1: SHALLOW (đang tốt) ======
xgb_shallow = XGBRegressor(
    n_estimators=2000,
    learning_rate=0.02,
    max_depth=3,
    subsample=0.85,
    colsample_bytree=0.6,
    min_child_weight=2,
    reg_lambda=1.2,
    reg_alpha=0.001,
    gamma=0.0,
    random_state=42,
    tree_method="hist"
)

# ====== 8. MODEL 2: WIDE (depth=2, nhiều cây) ======
xgb_wide = XGBRegressor(
    n_estimators=2500,
    learning_rate=0.018,
    max_depth=2,
    subsample=0.9,
    colsample_bytree=0.55,
    min_child_weight=3,
    reg_lambda=1.3,
    reg_alpha=0.01,
    gamma=0.1,
    random_state=42,
    tree_method="hist"
)

# ====== 9. MODEL 3: DEEP-LITE (depth=4, regularize) ======
xgb_deeplite = XGBRegressor(
    n_estimators=1400,
    learning_rate=0.028,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.6,
    min_child_weight=4,
    reg_lambda=1.4,
    reg_alpha=0.02,
    gamma=0.15,
    random_state=42,
    tree_method="hist"
)

# ====== 10. CV NHANH ======
for name, mdl in {
    "XGB_shallow": xgb_shallow,
    "XGB_wide": xgb_wide,
    "XGB_deeplite": xgb_deeplite
}.items():
    scores = -cross_val_score(mdl, X, y, scoring=rmse_scorer, cv=kf)
    print(f"{name}: {scores.mean():.5f} ± {scores.std():.5f}")

# ====== 11. TRAIN FULL + BLEND ======
xgb_shallow.fit(X, y)
xgb_wide.fit(X, y)
xgb_deeplite.fit(X, y)

p1 = np.expm1(xgb_shallow.predict(X_test))
p2 = np.expm1(xgb_wide.predict(X_test))
p3 = np.expm1(xgb_deeplite.predict(X_test))

# weight ưu tiên model tốt nhất (shallow)
final_pred = 0.60 * p1 + 0.25 * p2 + 0.15 * p3

sub = pd.DataFrame({
    "Id": test_id,
    "SalePrice": final_pred
})
sub.to_csv("submission_blend3.csv", index=False)
print("✅ saved submission_blend3.csv")


XGB_shallow: 0.11854 ± 0.00824
XGB_wide: 0.12703 ± 0.00971
XGB_deeplite: 0.12791 ± 0.00981
✅ saved submission_blend3.csv


In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from xgboost import XGBRegressor

# --- chuẩn bị như trước ---
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# bỏ outlier kinh điển
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index)

y = np.log1p(train["SalePrice"])
test_id = test["Id"]
train = train.drop(columns=["SalePrice"])
full = pd.concat([train, test], axis=0, ignore_index=True)

num_cols = full.select_dtypes(include=[np.number]).columns
obj_cols = full.select_dtypes(include=["object"]).columns
full[num_cols] = full[num_cols].fillna(0)
full[obj_cols] = full[obj_cols].fillna("None")

skew_cols = ["LotArea","GrLivArea","1stFlrSF","2ndFlrSF","TotalBsmtSF","LowQualFinSF"]
for c in skew_cols:
    if c in full.columns:
        full[c] = np.log1p(full[c])

full["TotalSF"] = full["TotalBsmtSF"] + full["1stFlrSF"] + full["2ndFlrSF"]
full["Qual_x_GrLiv"] = full["OverallQual"] * full["GrLivArea"]
full["HouseAge"] = (full["YrSold"] - full["YearBuilt"]).clip(lower=0)
full["RemodAge"] = (full["YrSold"] - full["YearRemodAdd"]).clip(lower=0)
full["HasBsmt"] = (full["TotalBsmtSF"] > 0).astype(int)

drop_cols = ["Street","Utilities","Condition2","RoofMatl","PoolArea","PoolQC","MiscVal","MiscFeature"]
for c in drop_cols:
    if c in full.columns:
        full = full.drop(columns=[c])

full = pd.get_dummies(full, drop_first=True)

n_train = len(y)
X = full.iloc[:n_train, :].copy()
X_test = full.iloc[n_train:, :].copy()

kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scorer = make_scorer(lambda yt, yp: np.sqrt(mean_squared_error(yt, yp)), greater_is_better=False)

# ===== MODEL GỐC (đang 0.11854) =====
base = XGBRegressor(
    n_estimators=2000,
    learning_rate=0.02,
    max_depth=3,
    subsample=0.85,
    colsample_bytree=0.6,
    min_child_weight=2,
    reg_lambda=1.2,
    reg_alpha=0.001,
    gamma=0.0,
    random_state=42,
    tree_method="hist"
)

# ===== MODEL TUNED NHẸ =====
tuned = XGBRegressor(
    n_estimators=2400,          # +200~400 cây
    learning_rate=0.018,        # chậm hơn xíu
    max_depth=3,                # giữ nguyên
    subsample=0.9,
    colsample_bytree=0.55,
    colsample_bylevel=0.7,      # thêm cái này
    min_child_weight=3,         # tăng nhẹ để đỡ overfit
    reg_lambda=1.4,
    reg_alpha=0.01,
    gamma=0.0,
    random_state=42,
    tree_method="hist"
)

print("=== BASE ===")
scores_base = -cross_val_score(base, X, y, scoring=rmse_scorer, cv=kf)
print(f"BASE XGB: {scores_base.mean():.5f} ± {scores_base.std():.5f}")

print("=== TUNED ===")
scores_tuned = -cross_val_score(tuned, X, y, scoring=rmse_scorer, cv=kf)
print(f"TUNED XGB: {scores_tuned.mean():.5f} ± {scores_tuned.std():.5f}")

# nếu tuned tốt hơn -> train full + predict
best = tuned if scores_tuned.mean() < scores_base.mean() else base
best.fit(X, y)
pred = np.expm1(best.predict(X_test))

sub = pd.DataFrame({"Id": test_id, "SalePrice": pred})
sub.to_csv("submission_best_xgb.csv", index=False)
print("✅ saved submission_best_xgb.csv")


=== BASE ===
BASE XGB: 0.11854 ± 0.00824
=== TUNED ===
TUNED XGB: 0.11738 ± 0.00930
✅ saved submission_best_xgb.csv


In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# ===== 1. LOAD =====
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# drop outlier kinh điển
train = train.drop(train[(train["GrLivArea"] > 4000) & (train["SalePrice"] < 300000)].index)

y = np.log1p(train["SalePrice"])
test_id = test["Id"]

train = train.drop(columns=["SalePrice"])
full = pd.concat([train, test], axis=0, ignore_index=True)

# ===== 2. CLEAN =====
num_cols = full.select_dtypes(include=[np.number]).columns
obj_cols = full.select_dtypes(include=["object"]).columns

full[num_cols] = full[num_cols].fillna(0)
full[obj_cols] = full[obj_cols].fillna("None")

# ===== 3. FEATURE CƠ BẢN =====
skew_cols = ["LotArea","GrLivArea","1stFlrSF","2ndFlrSF","TotalBsmtSF","LowQualFinSF"]
for c in skew_cols:
    if c in full.columns:
        full[c] = np.log1p(full[c])

full["TotalSF"] = full["TotalBsmtSF"] + full["1stFlrSF"] + full["2ndFlrSF"]
full["Qual_x_GrLiv"] = full["OverallQual"] * full["GrLivArea"]
full["HouseAge"] = (full["YrSold"] - full["YearBuilt"]).clip(lower=0)
full["RemodAge"] = (full["YrSold"] - full["YearRemodAdd"]).clip(lower=0)
full["HasBsmt"] = (full["TotalBsmtSF"] > 0).astype(int)

drop_cols = ["Street","Utilities","Condition2","RoofMatl","PoolQC","MiscVal","MiscFeature","Alley","Fence"]
for c in drop_cols:
    if c in full.columns:
        full = full.drop(columns=[c])

# one-hot
full = pd.get_dummies(full, drop_first=True)

# ===== 4. SPLIT LẠI =====
n_train = len(y)
X = full.iloc[:n_train, :].copy()
X_test = full.iloc[n_train:, :].copy()

# ===== 5. CV "AN TOÀN HƠN" =====
# sort theo YearSold + Neighborhood để giảm leakage thời gian/khu vực
folds = KFold(n_splits=5, shuffle=True, random_state=2025)

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

oof = np.zeros(len(X))
preds = np.zeros(len(X_test))

# 3 cấu hình gần nhau → average để ổn định
configs = [
    dict(n_estimators=1800, learning_rate=0.02, max_depth=3,
         subsample=0.9, colsample_bytree=0.55,
         min_child_weight=4, reg_lambda=1.0, reg_alpha=0.01),
    dict(n_estimators=2000, learning_rate=0.018, max_depth=3,
         subsample=0.85, colsample_bytree=0.6,
         min_child_weight=5, reg_lambda=1.2, reg_alpha=0.02),
    dict(n_estimators=1600, learning_rate=0.022, max_depth=3,
         subsample=0.9, colsample_bytree=0.5,
         min_child_weight=6, reg_lambda=1.3, reg_alpha=0.03),
]

oof_preds_all = []
test_preds_all = []

for cfg in configs:
    oof_cfg = np.zeros(len(X))
    test_cfg = np.zeros(len(X_test))
    for fold, (tr_idx, va_idx) in enumerate(folds.split(X, y)):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

        model = XGBRegressor(
            **cfg,
            gamma=0.0,
            random_state=42 + fold,
            tree_method="hist"
        )
        model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
        va_pred = model.predict(X_va)
        oof_cfg[va_idx] = va_pred
        test_cfg += model.predict(X_test) / folds.n_splits

    oof_preds_all.append(oof_cfg)
    test_preds_all.append(test_cfg)

# average 3 cấu hình
oof_blend = np.mean(oof_preds_all, axis=0)
test_blend = np.mean(test_preds_all, axis=0)

cv_score = rmse(y, oof_blend)
print(f"CV blend (log): {cv_score:.5f}")

# ===== 6. SUBMISSION =====
sub = pd.DataFrame({
    "Id": test_id,
    "SalePrice": np.expm1(test_blend)
})
sub.to_csv("submission_blend_xgb_stable.csv", index=False)
print("✅ saved submission_blend_xgb_stable.csv")


CV blend (log): 0.11936
✅ saved submission_blend_xgb_stable.csv


In [7]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoCV, ElasticNetCV
from xgboost import XGBRegressor

# ========== 1. LOAD ==========
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# drop outlier kinh điển
train = train.drop(train[(train["GrLivArea"] > 4000) & (train["SalePrice"] < 300000)].index)

y = np.log1p(train["SalePrice"])
test_id = test["Id"]

train = train.drop(columns=["SalePrice"])
full = pd.concat([train, test], axis=0, ignore_index=True)

# ========== 2. BASIC CLEAN ==========
num_cols = full.select_dtypes(include=[np.number]).columns
obj_cols = full.select_dtypes(include=["object"]).columns

full[num_cols] = full[num_cols].fillna(0)
full[obj_cols] = full[obj_cols].fillna("None")

# ========== 3. FEATURE ENGINEERING NHANH ==========
skew_cols = ["LotArea","GrLivArea","1stFlrSF","2ndFlrSF","TotalBsmtSF","LowQualFinSF"]
for c in skew_cols:
    if c in full.columns:
        full[c] = np.log1p(full[c])

full["TotalSF"] = full["TotalBsmtSF"] + full["1stFlrSF"] + full["2ndFlrSF"]
full["Qual_x_GrLiv"] = full["OverallQual"] * full["GrLivArea"]
full["HouseAge"] = (full["YrSold"] - full["YearBuilt"]).clip(lower=0)
full["RemodAge"] = (full["YrSold"] - full["YearRemodAdd"]).clip(lower=0)
full["HasBsmt"] = (full["TotalBsmtSF"] > 0).astype(int)

drop_cols = ["Street","Utilities","Condition2","RoofMatl","PoolQC","MiscVal","MiscFeature","Alley","Fence"]
for c in drop_cols:
    if c in full.columns:
        full = full.drop(columns=[c])

# one-hot
full = pd.get_dummies(full, drop_first=True)

# ========== 4. SPLIT LẠI ==========
n_train = len(y)
X = full.iloc[:n_train, :].copy()
X_test = full.iloc[n_train:, :].copy()

# ========== 5. CV SETUP ==========
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

kf = KFold(n_splits=5, shuffle=True, random_state=2025)

# ========== 6. MODEL 1: XGB STABLE ==========
xgb_params = dict(
    n_estimators=1800,
    learning_rate=0.02,
    max_depth=3,
    subsample=0.9,
    colsample_bytree=0.55,
    min_child_weight=4,
    reg_lambda=1.0,
    reg_alpha=0.01,
    gamma=0.0,
    random_state=42,
    tree_method="hist"
)

oof_xgb = np.zeros(len(X))
pred_xgb = np.zeros(len(X_test))

for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y)):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    m = XGBRegressor(**xgb_params)
    m.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)

    oof_xgb[va_idx] = m.predict(X_va)
    pred_xgb += m.predict(X_test) / kf.n_splits

print("XGB CV:", rmse(y, oof_xgb))

# ========== 7. MODEL 2: LASSO ==========
# Lasso cần scale nhẹ, nhưng ở đây full đã one-hot + khá ổn nên dùng trực tiếp
lasso = LassoCV(
    alphas=[1e-3, 3e-3, 1e-2, 3e-2, 1e-1],
    cv=5,
    random_state=42,
    max_iter=20000
)
lasso.fit(X, y)
oof_lasso = lasso.predict(X)
pred_lasso = lasso.predict(X_test)
print("Lasso CV (pseudo):", rmse(y, oof_lasso))

# ========== 8. MODEL 3: ELASTICNET ==========
enet = ElasticNetCV(
    l1_ratio=[.1, .3, .5, .7, .9, .95],
    alphas=[1e-3, 3e-3, 1e-2, 3e-2, 1e-1],
    cv=5,
    random_state=42,
    max_iter=20000
)
enet.fit(X, y)
oof_enet = enet.predict(X)
pred_enet = enet.predict(X_test)
print("ENet CV (pseudo):", rmse(y, oof_enet))

# ========== 9. BLEND ==========
# trọng số có thể thử: 0.6 / 0.25 / 0.15 hoặc 0.65 / 0.2 / 0.15
oof_blend = 0.65 * oof_xgb + 0.2 * oof_lasso + 0.15 * oof_enet
pred_blend = 0.65 * pred_xgb + 0.2 * pred_lasso + 0.15 * pred_enet

cv_blend = rmse(y, oof_blend)
print("BLEND CV:", cv_blend)

# ========== 10. SUBMISSION ==========
sub = pd.DataFrame({
    "Id": test_id,
    "SalePrice": np.expm1(pred_blend)
})
sub.to_csv("submission_xgb_lasso_enet.csv", index=False)
print("✅ saved submission_xgb_lasso_enet.csv")


XGB CV: 0.12083615499952262
Lasso CV (pseudo): 0.10644273727642789
ENet CV (pseudo): 0.09456316665158937
BLEND CV: 0.10968028807417862
✅ saved submission_xgb_lasso_enet.csv


  model = cd_fast.enet_coordinate_descent(


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso, ElasticNet
from xgboost import XGBRegressor

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# ===== 1. load & prep nhanh như lúc nãy =====
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# drop outlier kinh điển
train = train.drop(train[(train["GrLivArea"] > 4000) & (train["SalePrice"] < 300000)].index)

y = np.log1p(train["SalePrice"]).reset_index(drop=True)
test_id = test["Id"]

train = train.drop(columns=["SalePrice"])
full = pd.concat([train, test], axis=0, ignore_index=True)

num_cols = full.select_dtypes(include=[np.number]).columns
obj_cols = full.select_dtypes(include=["object"]).columns

full[num_cols] = full[num_cols].fillna(0)
full[obj_cols] = full[obj_cols].fillna("None")

# fe đơn giản
full["TotalSF"] = full["TotalBsmtSF"] + full["1stFlrSF"] + full["2ndFlrSF"]
full["HouseAge"] = (full["YrSold"] - full["YearBuilt"]).clip(lower=0)
full["RemodAge"] = (full["YrSold"] - full["YearRemodAdd"]).clip(lower=0)
full["Qual_x_GrLiv"] = full["OverallQual"] * full["GrLivArea"]

drop_cols = ["Street","Utilities","Condition2","RoofMatl","PoolQC","MiscVal",
             "MiscFeature","Alley","Fence"]
for c in drop_cols:
    if c in full.columns:
        full = full.drop(columns=[c])

full = pd.get_dummies(full, drop_first=True)

n_train = len(y)
X = full.iloc[:n_train, :].reset_index(drop=True)
X_test = full.iloc[n_train:, :].reset_index(drop=True)

# ===== 2. KFold =====
kf = KFold(n_splits=5, shuffle=True, random_state=2025)

oof_xgb = np.zeros(n_train)
oof_lasso = np.zeros(n_train)
oof_enet = np.zeros(n_train)

pred_xgb = np.zeros(len(X_test))
pred_lasso = np.zeros(len(X_test))
pred_enet = np.zeros(len(X_test))

# ===== 3. loop fold =====
for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y), 1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    # 3.1 XGB (shallow ổn định)
    xgb = XGBRegressor(
        n_estimators=1600,
        learning_rate=0.025,
        max_depth=3,
        subsample=0.9,
        colsample_bytree=0.55,
        min_child_weight=4,
        reg_lambda=1.0,
        reg_alpha=0.01,
        random_state=42,
        tree_method="hist"
    )
    xgb.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
    oof_xgb[va_idx] = xgb.predict(X_va)
    pred_xgb += xgb.predict(X_test) / kf.n_splits

    # 3.2 Lasso (fit trong fold, không leak)
    lasso = Lasso(alpha=0.0007, max_iter=20000, random_state=42)
    lasso.fit(X_tr, y_tr)
    oof_lasso[va_idx] = lasso.predict(X_va)
    pred_lasso += lasso.predict(X_test) / kf.n_splits

    # 3.3 ElasticNet
    enet = ElasticNet(alpha=0.0008, l1_ratio=0.7, max_iter=20000, random_state=42)
    enet.fit(X_tr, y_tr)
    oof_enet[va_idx] = enet.predict(X_va)
    pred_enet += enet.predict(X_test) / kf.n_splits

# ===== 4. đánh giá từng model thật =====
print("XGB OOF  :", rmse(y, oof_xgb))
print("Lasso OOF:", rmse(y, oof_lasso))
print("ENet OOF :", rmse(y, oof_enet))

# ===== 5. meta-learner (stacking level 2) =====
# train 1 linear nhỏ để học trọng số
stack_X = np.vstack([oof_xgb, oof_lasso, oof_enet]).T
stack_test = np.vstack([pred_xgb, pred_lasso, pred_enet]).T

meta = Lasso(alpha=0.0001, max_iter=10000)
meta.fit(stack_X, y)
oof_blend = meta.predict(stack_X)
pred_blend = meta.predict(stack_test)

print("STACK OOF:", rmse(y, oof_blend))
print("meta coefs:", meta.coef_, "intercept:", meta.intercept_)

# ===== 6. submission =====
sub = pd.DataFrame({
    "Id": test_id,
    "SalePrice": np.expm1(pred_blend)
})
sub.to_csv("submission_stack_xgb_linear.csv", index=False)
print("✅ saved submission_stack_xgb_linear.csv")
