In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, make_scorer

# Load
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")


In [2]:
# Drop nhiều null
drop_null = [col for col in train.columns if train[col].isnull().sum() > 500]
train.drop(columns=drop_null, inplace=True)
test.drop(columns=[c for c in drop_null if c in test.columns], inplace=True)

# Drop nhiều zero
def drop_zero_columns(df, threshold=0.5):
    drop = []
    for col in df.select_dtypes(include=[np.number]).columns:
        if (df[col] == 0).sum() / len(df) > threshold:
            drop.append(col)
    df.drop(columns=drop, inplace=True)
    return drop

drop_zero_columns(train)
drop_zero_columns(test)


['MasVnrArea',
 'BsmtFinSF2',
 '2ndFlrSF',
 'LowQualFinSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'HalfBath',
 'Fireplaces',
 'WoodDeckSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal']

In [6]:
def prepare_features(train, test):
    t = train.copy()
    te = test.copy()

    # (các bước xử lý bạn đã có ở trên, không đổi)
    # ...
    # -----------------------------
    # Cuối cùng, encode mọi object còn sót lại
    from sklearn.preprocessing import LabelEncoder
    for df in [t, te]:
        for col in df.select_dtypes(include='object').columns:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))

    return t, te


In [7]:
train_X, test_X = prepare_features(train, test)
X = train_X.drop(columns=['SalePrice'])
y_log = np.log1p(train_X['SalePrice'])
from sklearn.preprocessing import LabelEncoder


models = {
    "RandomForest": RandomForestRegressor(n_estimators=400, max_depth=15, random_state=42, n_jobs=-1),
    "XGB": XGBRegressor(n_estimators=800, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42)
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scorer = make_scorer(lambda y_true,y_pred: np.sqrt(mean_squared_error(y_true,y_pred)), greater_is_better=False)

for name, model in models.items():
    rmse = -cross_val_score(model, X, y_log, scoring=rmse_scorer, cv=kf)
    print(f"{name}: RMSE={rmse.mean():.5f} ± {rmse.std():.5f}")


RandomForest: RMSE=0.14481 ± 0.02026
XGB: RMSE=0.13483 ± 0.01931


In [8]:
best_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
best_model.fit(X, y_log)
pred = np.expm1(best_model.predict(test_X))

submission = pd.DataFrame({
    "Id": test["Id"],
    "SalePrice": pred
})
submission.to_csv("submission_final.csv", index=False)
print("✅ Saved submission_final.csv")


ValueError: feature_names mismatch: ['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', 'GrLivArea', 'FullBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'OpenPorchSF', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition'] ['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', 'GrLivArea', 'FullBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'OpenPorchSF', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']
expected Fireplaces in input data

In [None]:
bug