In [1]:
# Cell 1: Imports & config
import os, joblib, numpy as np, pandas as pd

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

SEED = 42
os.makedirs("models", exist_ok=True)
os.makedirs("submissions", exist_ok=True)

# OneHotEncoder: t∆∞∆°ng th√≠ch c·∫£ sklearn c≈©/m·ªõi (sparse_output vs sparse)
try:
    OHE = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    OHE = OneHotEncoder(handle_unknown="ignore", sparse=False)

print("‚úÖ Libraries ready")


‚úÖ Libraries ready


In [2]:
# Cell 2: Ti·ªÅn x·ª≠ l√Ω & FE

# C√°c map tr∆∞·ªõc ƒë√≥
ordinal_maps = {
    'BsmtExposure': {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
    'BsmtFinType1': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'BsmtQual':     {'None': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'ExterQual':    {'Other': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'GarageFinish': {'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3},
    'HeatingQC':    {'Other': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'KitchenQual':  {'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'LandSlope':    {'Other': 0, 'Sev': 1, 'Mod': 2, 'Gtl': 3},
    'LotShape':     {'Other': 0, 'IR3': 1, 'IR2': 2, 'IR1': 3, 'Reg': 4},
}
ordinal_cols = list(ordinal_maps.keys())

rare_map = {
    'Exterior1st': ['BrkComm', 'Stone', 'AsphShn', 'ImStucc', 'CBlock'],
    'Exterior2nd': ['ImStucc', 'Brk Cmn', 'Stone', 'AsphShn', 'Other', 'CBlock'],
    'ExterQual':   ['Fa'],
    'Foundation':  ['Other'],
    'GarageType':  ['CarPort', '2Types'],
    'HeatingQC':   ['Po'],
    'HouseStyle':  ['1.5Unf', '2.5Unf', '2.5Fin'],
    'LandSlope':   ['Sev'],
    'LotConfig':   ['FR3'],
    'LotShape':    ['IR3'],
    'MSZoning':    ['C (all)'],
    'Neighborhood': ['Veenker', 'NPkVill', 'Blueste'],
    'RoofStyle':   ['Flat', 'Gambrel', 'Mansard', 'Shed'],
    'SaleCondition': ['Alloca', 'AdjLand']
}

nominal_cols = [
    'Exterior1st','Exterior2nd','Foundation','GarageType','HouseStyle',
    'LotConfig','MSZoning','Neighborhood','RoofStyle','SaleCondition'
]

num_base = ['OverallQual','GrLivArea','GarageCars','GarageArea','TotalBsmtSF',
            '1stFlrSF','FullBath','TotRmsAbvGrd','YearBuilt','YearRemodAdd',
            'GarageYrBlt','Fireplaces','YrSold']
skew_log1p = ['GrLivArea','TotalBsmtSF','1stFlrSF']  # c√°c c·ªôt skew n·∫∑ng

def apply_basic_fill(df):
    df = df.copy()
    cols_fill_none = ['GarageType','GarageFinish','BsmtQual','BsmtExposure','BsmtFinType1']
    for c in cols_fill_none:
        if c in df.columns: df[c] = df[c].fillna('None')
    if 'GarageYrBlt' in df.columns and 'YearBuilt' in df.columns:
        df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['YearBuilt'])
    return df

def merge_rare(df):
    df = df.copy()
    for col, vals in rare_map.items():
        if col in df.columns:
            df[col] = df[col].replace(vals, 'Other')
    return df

def map_ordinals(df):
    df = df.copy()
    for col, m in ordinal_maps.items():
        if col in df.columns:
            df[col] = df[col].map(m).fillna(0).astype(int)
    return df

def add_engineered(df):
    df = df.copy()
    if set(['YrSold','YearBuilt']).issubset(df.columns):
        df['HouseAge'] = df['YrSold'] - df['YearBuilt']
    else:
        df['HouseAge'] = 0
    if set(['YrSold','YearRemodAdd']).issubset(df.columns):
        df['RemodAge'] = df['YrSold'] - df['YearRemodAdd']
    else:
        df['RemodAge'] = 0
    if set(['YrSold','GarageYrBlt']).issubset(df.columns):
        df['GarageAge'] = df['YrSold'] - df['GarageYrBlt']
    else:
        df['GarageAge'] = 0
    if set(['TotalBsmtSF','1stFlrSF','GrLivArea']).issubset(df.columns):
        df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['GrLivArea']
    else:
        df['TotalSF'] = 0
    # ƒëi·ªÉm t·ªïng h·ª£p
    for need in ['BsmtQual','BsmtExposure','BsmtFinType1','GarageFinish','GarageCars','ExterQual','KitchenQual']:
        if need not in df.columns: df[need] = 0
    df['BasementScore'] = df['BsmtQual'] + df['BsmtExposure'] + df['BsmtFinType1']
    df['GarageScore']   = df['GarageFinish'] + df['GarageCars']
    df['ExteriorScore'] = df['ExterQual'] + df['KitchenQual']
    return df

def log1p_safe(df, cols):
    df = df.copy()
    for c in cols:
        if c in df.columns:
            df[c] = np.log1p(df[c].fillna(0))
    return df

def full_prepare(df):
    df = apply_basic_fill(df)
    df = merge_rare(df)          # g·ªôp gi√° tr·ªã hi·∫øm ‚Üí 'Other'
    df = map_ordinals(df)        # map ordinal ‚Üí s·ªë
    df = add_engineered(df)      # t·∫°o feature m·ªõi
    df = log1p_safe(df, skew_log1p + ['TotalSF'])  # gi·∫£m skew
    return df


In [3]:
# Cell 3: Load CSV & t·∫°o X/y

train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

train_prep = full_prepare(train)
test_prep  = full_prepare(test)

# One-hot cho nominal (fit tr√™n train, transform test)
nominal_present = [c for c in nominal_cols if c in train_prep.columns]
ohe = OHE.fit(train_prep[nominal_present])

X_ohe_train = pd.DataFrame(ohe.transform(train_prep[nominal_present]),
                           columns=ohe.get_feature_names_out(nominal_present),
                           index=train_prep.index)
X_ohe_test  = pd.DataFrame(ohe.transform(test_prep[nominal_present]),
                           columns=ohe.get_feature_names_out(nominal_present),
                           index=test_prep.index)

# Numeric (g·ªìm c·∫£ engineered)
engineered = ['HouseAge','RemodAge','GarageAge','TotalSF','BasementScore','GarageScore','ExteriorScore']
num_all = list(dict.fromkeys(num_base + engineered))
num_present = [c for c in num_all if c in train_prep.columns]

train_prep[num_present] = train_prep[num_present].fillna(0)
test_prep[num_present]  = test_prep[num_present].fillna(0)

X_num_train = train_prep[num_present]
X_num_test  = test_prep[num_present]

# H·ª£p nh·∫•t
X_train = pd.concat([X_num_train.reset_index(drop=True), X_ohe_train.reset_index(drop=True)], axis=1)
X_test  = pd.concat([X_num_test.reset_index(drop=True),  X_ohe_test.reset_index(drop=True)],  axis=1)

y = train_prep['SalePrice'].values
test_ids = test_prep['Id'].values

print("‚úÖ Shapes:", X_train.shape, X_test.shape, "(train, test)")


‚úÖ Shapes: (1460, 101) (1459, 101) (train, test)


In [4]:
# Cell 4: Train & ch·ªçn model

cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

def train_grid(name, est, grid):
    print(f"\n>>> Training {name}")
    gs = GridSearchCV(est, grid, scoring="r2", cv=cv, n_jobs=-1, verbose=1)
    gs.fit(X_train, y)
    print("Best params:", gs.best_params_)
    print("Best CV R2:", gs.best_score_)
    joblib.dump(gs.best_estimator_, f"models/{name}.pkl")
    return gs.best_estimator_, gs.best_score_

results = {}

# 1) Linear (scale tr∆∞·ªõc khi h·ªìi quy)
linear = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("lr", LinearRegression())
])
linear_grid = {
    "lr__fit_intercept": [True, False]
}
best_linear, r2_linear = train_grid("linear", linear, linear_grid)
results["linear"] = r2_linear

# 2) RandomForest
rf = RandomForestRegressor(random_state=SEED, n_jobs=-1)
rf_grid = {
    "n_estimators": [400, 800],
    "max_depth": [None, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", 0.8]
}
best_rf, r2_rf = train_grid("random_forest", rf, rf_grid)
results["random_forest"] = r2_rf

# 3) GradientBoosting
gbr = GradientBoostingRegressor(random_state=SEED)
gbr_grid = {
    "n_estimators": [600, 900],
    "learning_rate": [0.05, 0.1],
    "max_depth": [2, 3],
    "subsample": [0.8, 1.0]
}
best_gbr, r2_gbr = train_grid("gbr", gbr, gbr_grid)
results["gbr"] = r2_gbr

print("\nLeaderboard (CV R2):", sorted(results.items(), key=lambda x: x[1], reverse=True))

# Champion
champ_name = max(results, key=results.get)
champ_model = {"linear": best_linear, "random_forest": best_rf, "gbr": best_gbr}[champ_name]
joblib.dump(champ_model, f"models/model_best_{champ_name}.pkl")
print(f"üèÜ Champion: {champ_name}")

# (T√πy ch·ªçn) Voting top-2/3
models_sorted = sorted(results.items(), key=lambda x: x[1], reverse=True)
voters = [(name, {"linear": best_linear, "random_forest": best_rf, "gbr": best_gbr}[name])
          for name, _ in models_sorted[:min(3, len(models_sorted))]]
voter = VotingRegressor(estimators=voters)
voter.fit(X_train, y)
joblib.dump(voter, "models/voting.pkl")

# ƒê√°nh gi√° nhanh tr√™n train
def eval_train(model, label):
    pred = model.predict(X_train)
    rmse = mean_squared_error(y, pred, squared=False)
    mae  = mean_absolute_error(y, pred)
    r2   = r2_score(y, pred)
    print(f"[{label}] Train RMSE={rmse:.2f} | MAE={mae:.2f} | R2={r2:.4f}")

eval_train(champ_model, f"champion_{champ_name}")
eval_train(voter, "voting")
print("‚úÖ Models saved in ./models")



>>> Training linear
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params: {'lr__fit_intercept': True}
Best CV R2: 0.806748852198296

>>> Training random_forest
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
Best CV R2: 0.8532303345778312

>>> Training gbr
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 600, 'subsample': 1.0}
Best CV R2: 0.841786330288838

Leaderboard (CV R2): [('random_forest', np.float64(0.8532303345778312)), ('gbr', np.float64(0.841786330288838)), ('linear', np.float64(0.806748852198296))]
üèÜ Champion: random_forest


TypeError: got an unexpected keyword argument 'squared'

In [5]:
# Cell 5: Predict & Submit
# Champion
pred_champ = champ_model.predict(X_test)
sub_champ = pd.DataFrame({"Id": test_ids, "SalePrice": pred_champ})
sub_champ.to_csv("submissions/submission_champion.csv", index=False)
print("üíæ Saved:", "submissions/submission_champion.csv")

# Voting (th∆∞·ªùng ·ªïn ƒë·ªãnh h∆°n)
pred_vote = voter.predict(X_test)
sub_vote = pd.DataFrame({"Id": test_ids, "SalePrice": pred_vote})
sub_vote.to_csv("submissions/submission_voting.csv", index=False)
print("üíæ Saved:", "submissions/submission_voting.csv")


üíæ Saved: submissions/submission_champion.csv
üíæ Saved: submissions/submission_voting.csv
