In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder,RobustScaler
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide display and plotting configuration.
pd.options.display.max_columns = None  # never truncate columns when a frame is displayed
plt.style.use('seaborn-v0_8')
print("Setup completed")

Setup completed


In [None]:
# Load the train and test CSVs (paths are the Colab working directory).
df_train, df_test = pd.read_csv('/content/train.csv'), pd.read_csv('/content/test.csv')
print(f"Train shape: {df_train.shape}")
print(f"Test shape: {df_test.shape}")


Train shape: (1460, 81)
Test shape: (1459, 80)


In [None]:
def drop_missing_columns(df, threshold=0.4):
    """Return ``df`` without the columns whose share of NaN exceeds ``threshold``.

    Args:
        df: DataFrame to filter; it is not modified.
        threshold: Fraction of missing rows strictly above which a column
            is dropped.

    Returns:
        A new DataFrame with the sparse columns removed. A short summary of
        what was dropped is printed.
    """
    na_share = df.isna().mean()
    is_sparse = na_share > threshold
    too_sparse = na_share.index[is_sparse.to_numpy()].tolist()

    print(f'Dropping {len(too_sparse)} columns with >{threshold*100}% missing values.')
    if len(too_sparse) > 0:
        print(f'Columns: {too_sparse}')

    return df.drop(columns=too_sparse)

def drop_zero_columns(df, threshold=0.5):
    """Return ``df`` without the columns whose share of zeros exceeds ``threshold``.

    Args:
        df: DataFrame to filter; it is not modified.
        threshold: Fraction of rows equal to 0 strictly above which a column
            is dropped.

    Returns:
        A new DataFrame with the mostly-zero columns removed. A short summary
        of what was dropped is printed.
    """
    zero_share = df.eq(0).astype(int).mean()
    is_mostly_zero = zero_share > threshold
    mostly_zero = zero_share.index[is_mostly_zero.to_numpy()].tolist()

    print(f'Dropping {len(mostly_zero)} columns with >{threshold*100}% zeros.')
    if len(mostly_zero) > 0:
        print(f'Columns: {mostly_zero}')

    return df.drop(columns=mostly_zero)

def merge_rare_categories(df):
    """Collapse hand-picked rare category levels into a single 'Other' level.

    The levels to merge are listed per column in ``rare_map`` below; columns
    that are not present in ``df`` are skipped.

    Args:
        df: DataFrame holding the categorical columns. It is left untouched:
            the replacement is done on a copy so that calling this function
            never silently rewrites the caller's frame.

    Returns:
        A new DataFrame in which every listed rare level is replaced by
        the string 'Other'.
    """
    rare_map = {
        'Exterior1st': ['BrkComm', 'Stone', 'AsphShn', 'ImStucc', 'CBlock'],
        'Exterior2nd': ['ImStucc', 'Brk Cmn', 'Stone', 'AsphShn', 'Other', 'CBlock'],
        'ExterQual': ['Fa'],
        'Foundation': ['Wood'],
        'GarageType': ['CarPort', '2Types'],
        'HeatingQC': ['Po'],
        'HouseStyle': ['1.5Unf', '2.5Unf', '2.5Fin'],
        'LandSlope': ['Sev'],
        'LotConfig': ['FR3'],
        'LotShape': ['IR3'],
        'MSZoning': ['C (all)'],
        'Neighborhood': ['Veenker', 'NPkVill', 'Blueste'],
        'RoofStyle': ['Flat', 'Gambrel', 'Mansard', 'Shed'],
        'SaleCondition': ['Alloca', 'AdjLand']
    }

    # Work on a copy: the original assigned into the argument in place.
    merged = df.copy()
    for col, rare_values in rare_map.items():
        if col in merged.columns:
            merged[col] = merged[col].replace(rare_values, 'Other')
    return merged

In [None]:
# Hand-picked predictors (chosen from correlation with the target and domain knowledge).
numeric_features = [
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    'GarageArea',
    'TotalBsmtSF',
    '1stFlrSF',
    'FullBath',
    'TotRmsAbvGrd',
    'YearBuilt',
    'YearRemodAdd',
    'GarageYrBlt',
    'YrSold',
]

categorical_features = [
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtQual',
    'Exterior1st',
    'Exterior2nd',
    'ExterQual',
    'Foundation',
    'GarageFinish',
    'GarageType',
    'HeatingQC',
    'HouseStyle',
    'KitchenQual',
    'LandSlope',
    'LotConfig',
    'LotShape',
    'MSZoning',
    'Neighborhood',
    'RoofStyle',
    'SaleCondition',
]

# Numeric names first, then categorical, in the order listed above.
all_features = [*numeric_features, *categorical_features]
print(f"Total features selected: {len(all_features)}")

Total features selected: 31


In [None]:
def preprocess_data(df, is_train=True):
    """Clean and feature-engineer a raw house-price frame.

    Works on a copy of ``df``. Steps, in order: fill selected missing values,
    merge rare categories, derive age features, capture the raw total area,
    log-transform skewed area columns, ordinal-encode quality columns, build
    composite scores and finally attach ``TotalSF``. Every step is skipped
    for columns that are absent from ``df``.

    Args:
        df: Raw train or test DataFrame.
        is_train: Accepted for call-site symmetry; the body does not read it,
            so train and test frames are processed identically.

    Returns:
        A new DataFrame with the transformed and added columns.
    """
    df = df.copy()

    # 1. Fill missing values for specific columns
    # Presumably NaN in these columns means "no garage / no basement" -- the
    # explicit 'None' label is what the ordinal maps below key on.
    cols_fill_none = ['GarageType', 'GarageFinish', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1']
    for col in cols_fill_none:
        if col in df.columns:
            df[col] = df[col].fillna('None')

    # A missing garage year falls back to the house's build year.
    if 'GarageYrBlt' in df.columns and 'YearBuilt' in df.columns:
        df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['YearBuilt'])

    # 2. Merge rare categories
    df = merge_rare_categories(df)

    # 3. Create new features BEFORE any transformation
    if 'YrSold' in df.columns and 'YearBuilt' in df.columns:
        df['HouseAge'] = df['YrSold'] - df['YearBuilt']
    if 'YrSold' in df.columns and 'YearRemodAdd' in df.columns:
        df['RemodAge'] = df['YrSold'] - df['YearRemodAdd']
    if 'YrSold' in df.columns and 'GarageYrBlt' in df.columns:
        df['GarageAge'] = df['YrSold'] - df['GarageYrBlt']

    # The total area has to be summed from the raw columns. It is captured
    # here, before step 4 overwrites them with log1p values (a sum of logs is
    # not an area), and attached in step 7 so the column order is unchanged.
    total_sf = None
    if {'TotalBsmtSF', '1stFlrSF', 'GrLivArea'}.issubset(df.columns):
        total_sf = df['TotalBsmtSF'] + df['1stFlrSF'] + df['GrLivArea']

    # 4. Log transform skewed features
    cols_log = ['GrLivArea', 'TotalBsmtSF', '1stFlrSF']
    for col in cols_log:
        if col in df.columns:
            # clip guards log1p against negative inputs
            df[col] = np.log1p(df[col].clip(lower=0))

    # 5. Ordinal encoding for ordinal categorical features
    ordinal_maps = {
        'BsmtExposure': {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
        'BsmtFinType1': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
        'BsmtQual': {'None': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
        'ExterQual': {'Other': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
        'GarageFinish': {'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3},
        'HeatingQC': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
        'KitchenQual': {'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
        'LandSlope': {'Sev': 0, 'Mod': 1, 'Gtl': 2},
        'LotShape': {'IR3': 0, 'IR2': 1, 'IR1': 2, 'Reg': 3}
    }

    for col, mapping in ordinal_maps.items():
        if col in df.columns:
            # Any value absent from the mapping (NaN, or an 'Other' label
            # produced by step 2 that the map does not list) is encoded as 0.
            df[col] = df[col].map(mapping).fillna(0).astype(int)

    # 6. Create quality scores (sums of the ordinal codes from step 5)
    if 'BsmtQual' in df.columns and 'BsmtExposure' in df.columns and 'BsmtFinType1' in df.columns:
        df['BasementScore'] = df['BsmtQual'] + df['BsmtExposure'] + df['BsmtFinType1']
    if 'GarageFinish' in df.columns and 'GarageCars' in df.columns:
        df['GarageScore'] = df['GarageFinish'] + df['GarageCars']
    if 'ExterQual' in df.columns and 'KitchenQual' in df.columns:
        df['ExteriorScore'] = df['ExterQual'] + df['KitchenQual']

    # 7. Create total square footage (raw sum captured before the log step)
    if total_sf is not None:
        df['TotalSF'] = total_sf

    return df

In [None]:
# Apply the identical preprocessing pipeline to the train and test frames.
df_train_processed, df_test_processed = (
    preprocess_data(frame, is_train=train_flag)
    for frame, train_flag in ((df_train, True), (df_test, False))
)

print("Preprocessing completed")
print(f"Train shape: {df_train_processed.shape}")
print(f"Test shape: {df_test_processed.shape}")

Preprocessing completed
Train shape: (1460, 88)
Test shape: (1459, 87)


In [None]:
# Every numeric column of the processed train frame is a feature, except the
# target ('SalePrice') and the row identifier ('Id').
numeric_cols = [
    col
    for col in df_train_processed.select_dtypes(include=[np.number]).columns.tolist()
    if col not in ('SalePrice', 'Id')
]

# Whatever is still object-typed after preprocessing is treated as nominal.
categorical_cols = list(df_train_processed.select_dtypes(include=['object']).columns)

print(f"Numeric features: {len(numeric_cols)}")
print(f"Categorical features: {len(categorical_cols)}")

Numeric features: 52
Categorical features: 34


In [None]:
# One-hot encode the nominal columns and join them to the numeric block.
if categorical_cols:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    # The category vocabulary is learned from the train frame only.
    train_dummies = encoder.fit_transform(df_train_processed[categorical_cols])
    dummy_names = encoder.get_feature_names_out(categorical_cols)
    encoded_train = pd.DataFrame(
        train_dummies, columns=dummy_names, index=df_train_processed.index
    )

    # The test frame reuses that vocabulary (unknown levels are ignored).
    test_dummies = encoder.transform(df_test_processed[categorical_cols])
    encoded_test = pd.DataFrame(
        test_dummies, columns=dummy_names, index=df_test_processed.index
    )

    # Positional indexes on both halves so the column-wise concat aligns row by row.
    train_parts = [df_train_processed[numeric_cols], encoded_train]
    X_train = pd.concat([part.reset_index(drop=True) for part in train_parts], axis=1)

    test_parts = [df_test_processed[numeric_cols], encoded_test]
    X_test = pd.concat([part.reset_index(drop=True) for part in test_parts], axis=1)
else:
    # No nominal columns: the design matrix is just the numeric block.
    X_train = df_train_processed[numeric_cols].copy()
    X_test = df_test_processed[numeric_cols].copy()

# Target variable (log transform)
y_train = np.log1p(df_train['SalePrice'])

print(f"\nFinal X_train shape: {X_train.shape}")
print(f"Final X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")


Final X_train shape: (1460, 260)
Final X_test shape: (1459, 260)
y_train shape: (1460,)


In [None]:
# Impute remaining gaps with per-column medians computed on the training set
# only, and reuse those same medians for the test set.
train_medians = X_train.median()
X_train, X_test = (frame.fillna(train_medians) for frame in (X_train, X_test))

# Sanity check: no NaN or infinite values should be left in either matrix.
print(f"Train NaN count: {X_train.isna().sum().sum()}")
print(f"Test NaN count: {X_test.isna().sum().sum()}")
print(f"Train Inf count: {np.isinf(X_train.values).sum()}")
print(f"Test Inf count: {np.isinf(X_test.values).sum()}")

Train NaN count: 0
Test NaN count: 0
Train Inf count: 0
Test Inf count: 0


In [None]:
# RobustScaler is preferred over StandardScaler here because it copes better
# with outliers. It is fitted on train only and then applied to both sets.
scaler = RobustScaler()
scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    pd.DataFrame(scaler.transform(frame), columns=frame.columns, index=frame.index)
    for frame in (X_train, X_test)
)

print("Robust Standardization completed")

Robust Standardization completed


In [None]:
# ===================== OPTIMIZED ENSEMBLE STACKING =====================
from sklearn.model_selection import KFold
from sklearn.linear_model import LassoCV, ElasticNetCV, RidgeCV
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb

# === Utility functions ===
def rmse_log(y_log_true, y_log_pred):
    """Root mean squared error between two vectors.

    Both arguments are expected to already be on the log scale (the name
    comes from the callers, which pass log-transformed prices), so the
    result reads as an RMSLE.
    """
    mse = mean_squared_error(y_log_true, y_log_pred)
    return np.sqrt(mse)

# === Prepare data ===
# Plain NumPy arrays so folds can be sliced by position.
Xtrn, Xtst = X_train_scaled.values, X_test_scaled.values
y = y_train.values
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# One out-of-fold vector per base model (same length as the target) ...
oof_lasso, oof_ridge, oof_enet, oof_xgb, oof_lgb = (
    np.zeros_like(y, dtype=float) for _ in range(5)
)

# ... and one test-prediction accumulator per base model.
tst_lasso, tst_ridge, tst_enet, tst_xgb, tst_lgb = (
    np.zeros(Xtst.shape[0], dtype=float) for _ in range(5)
)

print("\n=== 10-Fold CV with Multiple Models ===")

# === Cross-validation ===
# Each base model is fitted on the training part of every fold; its
# predictions on the held-out part fill the OOF vector, and its test
# predictions are averaged over the folds.
n_splits = kf.get_n_splits()

# Hyper-parameters and the test DMatrix are identical for every fold, so they
# are built once here instead of being rebuilt inside the loop.
xgb_params = {
    "eta": 0.02,
    "max_depth": 3,
    "subsample": 0.75,
    "colsample_bytree": 0.6,
    "min_child_weight": 3,
    "reg_alpha": 0.5,
    "reg_lambda": 2.0,
    "gamma": 0.1,
    "objective": "reg:squarederror",
    "seed": 42,
    "eval_metric": "rmse"
}

lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.02,
    'max_depth': 3,
    'num_leaves': 10,
    'subsample': 0.8,
    # LightGBM only performs row subsampling when the bagging frequency is
    # non-zero; without this entry 'subsample' is silently ignored.
    'subsample_freq': 1,
    'colsample_bytree': 0.7,
    'min_child_samples': 20,
    'reg_alpha': 0.5,
    'reg_lambda': 2.0,
    'verbose': -1,
    'seed': 42
}

dtst = xgb.DMatrix(Xtst.astype(np.float32))

# NOTE(review): both boosters early-stop on the same fold that is then scored
# as out-of-fold, so their OOF RMSE (and the blend weights derived from it)
# is optimistic compared with the linear models, which never see that fold.
for fold, (tr_idx, val_idx) in enumerate(kf.split(Xtrn, y), 1):
    X_tr, X_va = Xtrn[tr_idx], Xtrn[val_idx]
    y_tr, y_va = y[tr_idx],    y[val_idx]

    # ----- LassoCV -----
    lasso = LassoCV(
        alphas=np.logspace(-4.5, -2.0, 80),
        cv=5,
        random_state=42,
        max_iter=10000,
        n_jobs=-1
    )
    lasso.fit(X_tr, y_tr)
    oof_lasso[val_idx] = lasso.predict(X_va)
    tst_lasso += lasso.predict(Xtst) / n_splits

    # ----- RidgeCV -----
    ridge = RidgeCV(
        alphas=np.logspace(-2, 3, 100),
        cv=5
    )
    ridge.fit(X_tr, y_tr)
    oof_ridge[val_idx] = ridge.predict(X_va)
    tst_ridge += ridge.predict(Xtst) / n_splits

    # ----- ElasticNetCV -----
    enet = ElasticNetCV(
        l1_ratio=[0.05, 0.1, 0.3, 0.5, 0.7, 0.85, 0.95, 1.0],
        alphas=np.logspace(-4.5, -2.0, 60),
        cv=5,
        random_state=42,
        max_iter=10000,
        n_jobs=-1
    )
    enet.fit(X_tr, y_tr)
    oof_enet[val_idx] = enet.predict(X_va)
    tst_enet += enet.predict(Xtst) / n_splits

    # ----- XGBoost (optimized params) -----
    dtr = xgb.DMatrix(X_tr.astype(np.float32), label=y_tr.astype(np.float32))
    dva = xgb.DMatrix(X_va.astype(np.float32), label=y_va.astype(np.float32))

    xgb_model = xgb.train(
        xgb_params,
        dtr,
        num_boost_round=5000,
        evals=[(dva, "valid")],
        early_stopping_rounds=200,
        verbose_eval=False
    )

    # Predict with the trees up to (and including) the best early-stopping round.
    best_range = (0, xgb_model.best_iteration + 1)
    oof_xgb[val_idx] = xgb_model.predict(dva, iteration_range=best_range)
    tst_xgb += xgb_model.predict(dtst, iteration_range=best_range) / n_splits

    # ----- LightGBM -----
    lgb_train = lgb.Dataset(X_tr, y_tr)
    lgb_valid = lgb.Dataset(X_va, y_va, reference=lgb_train)

    lgb_model = lgb.train(
        lgb_params,
        lgb_train,
        num_boost_round=5000,
        valid_sets=[lgb_valid],
        # verbose=False keeps the early-stopping callback from printing its
        # own messages, matching the 'verbose': -1 / log_evaluation(0) intent.
        callbacks=[lgb.early_stopping(200, verbose=False), lgb.log_evaluation(0)]
    )

    oof_lgb[val_idx] = lgb_model.predict(X_va, num_iteration=lgb_model.best_iteration)
    tst_lgb += lgb_model.predict(Xtst, num_iteration=lgb_model.best_iteration) / n_splits

    # --- Fold scores ---
    s_lasso = rmse_log(y_va, oof_lasso[val_idx])
    s_ridge = rmse_log(y_va, oof_ridge[val_idx])
    s_enet  = rmse_log(y_va, oof_enet[val_idx])
    s_xgb   = rmse_log(y_va, oof_xgb[val_idx])
    s_lgb   = rmse_log(y_va, oof_lgb[val_idx])

    print(f"Fold {fold:>2}: Lasso={s_lasso:.5f} Ridge={s_ridge:.5f} ENet={s_enet:.5f} XGB={s_xgb:.5f} LGB={s_lgb:.5f}")

# === OOF Scores ===
print("\n=== OOF Scores (RMSLE) ===")
print(f"Lasso : {rmse_log(y, oof_lasso):.5f}")
print(f"Ridge : {rmse_log(y, oof_ridge):.5f}")
print(f"ENet  : {rmse_log(y, oof_enet):.5f}")
print(f"XGB   : {rmse_log(y, oof_xgb):.5f}")
print(f"LGB   : {rmse_log(y, oof_lgb):.5f}")

# === Optimal Stacking Weights (minimize RMSLE on OOF) ===
from scipy.optimize import minimize

# Columns follow the weight order used everywhere below:
# Lasso, Ridge, ENet, XGB, LGB.
oof_matrix = np.column_stack([oof_lasso, oof_ridge, oof_enet, oof_xgb, oof_lgb])

def blend_loss(weights):
    """RMSE of the weighted OOF blend; ``weights`` are normalised to sum to 1.

    Weights whose sum is not positive cannot be normalised, so they are
    scored as infinitely bad instead of triggering a division by zero.
    """
    weights = np.asarray(weights, dtype=float)
    total = weights.sum()
    if total <= 0:
        return float(np.inf)
    return rmse_log(y, oof_matrix @ (weights / total))

initial_weights = np.array([0.2, 0.1, 0.2, 0.25, 0.25])
bounds = [(0, 1)] * 5
# The loss is invariant to the scale of the weights, so the optimiser is
# pinned to the sum-to-one simplex instead of being left free to drift
# (including towards the all-zero corner the bounds would otherwise allow).
sum_to_one = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1.0}
result = minimize(blend_loss, initial_weights, method='SLSQP',
                  bounds=bounds, constraints=[sum_to_one])

# Use the optimiser's answer only if it converged to usable weights;
# otherwise fall back to the hand-set starting weights.
raw_weights = np.clip(result.x, 0.0, None)
if not result.success or raw_weights.sum() <= 0:
    print(f"Weight optimisation did not converge ({result.message}); using initial weights.")
    raw_weights = initial_weights

optimal_weights = raw_weights / raw_weights.sum()
print("\n=== Optimal Ensemble Weights ===")
print(f"Lasso: {optimal_weights[0]:.3f}")
print(f"Ridge: {optimal_weights[1]:.3f}")
print(f"ENet : {optimal_weights[2]:.3f}")
print(f"XGB  : {optimal_weights[3]:.3f}")
print(f"LGB  : {optimal_weights[4]:.3f}")

# Final blended OOF
oof_blend = oof_matrix @ optimal_weights

print(f"\n🎯 Blended OOF RMSLE: {rmse_log(y, oof_blend):.5f}")

# === Final Test Predictions ===
# Blend the fold-averaged test predictions with the OOF-optimised weights
# (still on the log scale), then invert the log1p applied to the target.
test_pred_log = (optimal_weights[0] * tst_lasso +
                 optimal_weights[1] * tst_ridge +
                 optimal_weights[2] * tst_enet +
                 optimal_weights[3] * tst_xgb +
                 optimal_weights[4] * tst_lgb)

test_pred = np.expm1(test_pred_log)

# Post-processing
# Only guard against extrapolation beyond prices actually seen in training.
# The previous IQR fence (floor 50000, cap q3 + 2*IQR) was never applied when
# the blend was validated on OOF predictions, and it flattened every
# prediction above the cap no matter what the models predicted.
train_prices = df_train['SalePrice'].values
lo, hi = train_prices.min(), train_prices.max()
test_pred = np.clip(test_pred, lo, hi)

# Save submission
submission_ensemble = pd.DataFrame({'Id': df_test['Id'], 'SalePrice': test_pred})
submission_ensemble.to_csv('submission_optimized_ensemble.csv', index=False)
print("\n🚀 Saved submission_optimized_ensemble.csv")
print(f"Price range: ${test_pred.min():.0f} - ${test_pred.max():.0f}")
print(f"Mean price: ${test_pred.mean():.0f}")
# ======================================================================


=== 10-Fold CV with Multiple Models ===
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1102]	valid_0's rmse: 0.126354
Fold  1: Lasso=0.10645 Ridge=0.11162 ENet=0.10665 XGB=0.12037 LGB=0.12635
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[616]	valid_0's rmse: 0.140101
Fold  2: Lasso=0.13791 Ridge=0.14501 ENet=0.13937 XGB=0.13892 LGB=0.14010
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[989]	valid_0's rmse: 0.0994824
Fold  3: Lasso=0.10664 Ridge=0.10904 ENet=0.10768 XGB=0.09906 LGB=0.09948
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2900]	valid_0's rmse: 0.117545
Fold  4: Lasso=0.12829 Ridge=0.13022 ENet=0.13101 XGB=0.13057 LGB=0.11754
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[247]	valid_0's rmse: 0.154803
Fold  5: Lasso=0.153

In [None]:
# # ===================== FINAL BOOST PACK (OOF average optimized) =====================
# from sklearn.model_selection import KFold
# from sklearn.linear_model import LassoCV, ElasticNetCV
# from sklearn.metrics import mean_squared_error
# from scipy.optimize import nnls
# import numpy as np
# import pandas as pd
# import xgboost as xgb

# def rmse_log(y_log_true, y_log_pred):
#     return np.sqrt(mean_squared_error(y_log_true, y_log_pred))

# def rmse_orig(y_log_true, y_log_pred):
#     return np.sqrt(mean_squared_error(np.expm1(y_log_true), np.expm1(y_log_pred)))

# # === Data ===
# Xtrn = X_train_scaled.values
# Xtst = X_test_scaled.values
# y    = y_train.values
# kf = KFold(n_splits=10, shuffle=True, random_state=42)

# # Containers
# oof_lasso = np.zeros_like(y, dtype=float)
# oof_enet  = np.zeros_like(y, dtype=float)
# oof_xgb   = np.zeros_like(y, dtype=float)
# tst_lasso = np.zeros(Xtst.shape[0], dtype=float)
# tst_enet  = np.zeros(Xtst.shape[0], dtype=float)
# tst_xgb   = np.zeros(Xtst.shape[0], dtype=float)

# print("\n=== 10-Fold OOF training (LassoCV / ElasticNetCV / XGB) ===")

# for fold, (tr_idx, val_idx) in enumerate(kf.split(Xtrn, y), 1):
#     X_tr, X_va = Xtrn[tr_idx], Xtrn[val_idx]
#     y_tr, y_va = y[tr_idx], y[val_idx]

#     # --- Lasso ---
#     lasso = LassoCV(
#         alphas=np.logspace(-4.5, -2.0, 60),
#         cv=5,
#         random_state=42,
#         max_iter=10000,
#         n_jobs=-1
#     ).fit(X_tr, y_tr)
#     oof_lasso[val_idx] = lasso.predict(X_va)
#     tst_lasso += lasso.predict(Xtst) / kf.get_n_splits()

#     # --- ElasticNet ---
#     enet = ElasticNetCV(
#         l1_ratio=[0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
#         alphas=np.logspace(-4.5, -2.0, 40),
#         cv=5,
#         random_state=42,
#         max_iter=10000,
#         n_jobs=-1
#     ).fit(X_tr, y_tr)
#     oof_enet[val_idx] = enet.predict(X_va)
#     tst_enet += enet.predict(Xtst) / kf.get_n_splits()

#     # --- XGBoost ---
#     dtr = xgb.DMatrix(X_tr.astype(np.float32), label=y_tr)
#     dva = xgb.DMatrix(X_va.astype(np.float32), label=y_va)
#     dtst = xgb.DMatrix(Xtst.astype(np.float32))
#     params = {
#         "eta": 0.03,
#         "max_depth": 4,
#         "subsample": 0.7,
#         "colsample_bytree": 0.7,
#         "min_child_weight": 1.0,
#         "reg_alpha": 1e-3,
#         "reg_lambda": 1.0,
#         "objective": "reg:squarederror",
#         "seed": 42
#     }
#     booster = xgb.train(params, dtr, 10000, evals=[(dva, "val")],
#                         early_stopping_rounds=200, verbose_eval=False)
#     oof_pred = booster.predict(dva, iteration_range=(0, booster.best_iteration + 1))
#     tst_pred = booster.predict(dtst, iteration_range=(0, booster.best_iteration + 1))
#     oof_xgb[val_idx] = oof_pred
#     tst_xgb += tst_pred / kf.get_n_splits()

#     print(f"Fold {fold:>2}: "
#           f"Lasso={rmse_log(y_va, oof_lasso[val_idx]):.5f}, "
#           f"ENet={rmse_log(y_va, oof_enet[val_idx]):.5f}, "
#           f"XGB={rmse_log(y_va, oof_xgb[val_idx]):.5f}")

# # === Overall OOF summary ===
# print("\n=== OOF Results (RMSLE) ===")
# print(f"Lasso : {rmse_log(y, oof_lasso):.5f}")
# print(f"ENet  : {rmse_log(y, oof_enet):.5f}")
# print(f"XGB   : {rmse_log(y, oof_xgb):.5f}")

# # === Blend with NNLS ===
# oof_mat = np.column_stack([oof_lasso, oof_enet, oof_xgb])
# w, _ = nnls(oof_mat, y)
# w = w / (w.sum() + 1e-9)
# print(f"\nBlend Weights (NNLS): Lasso={w[0]:.3f}, ENet={w[1]:.3f}, XGB={w[2]:.3f}")

# oof_blend = oof_mat @ w
# print(f"Blended OOF RMSLE : {rmse_log(y, oof_blend):.5f}")
# print(f"Blended OOF RMSE$ : {rmse_orig(y, oof_blend):,.2f}")

# # === Predict Test ===
# tst_mat = np.column_stack([tst_lasso, tst_enet, tst_xgb])
# test_pred_log = tst_mat @ w
# test_pred = np.expm1(test_pred_log)

# # === Light post-processing ===
# train_prices = df_train['SalePrice'].values
# q1, q3 = np.percentile(train_prices, 25), np.percentile(train_prices, 75)
# iqr = q3 - q1
# lo, hi = q1 - 1.5*iqr, q3 + 1.5*iqr
# test_pred = np.clip(test_pred, lo, hi)

# submission_boost = pd.DataFrame({'Id': df_test['Id'], 'SalePrice': test_pred})
# submission_boost.to_csv('submission_boost_final.csv', index=False)
# print("\n✅ Saved submission_boost_final.csv (Stable averaged blend)")
# # ==============================================================================



=== 10-Fold OOF training (LassoCV / ElasticNetCV / XGB) ===
Fold  1: Lasso=0.10579, ENet=0.10605, XGB=0.11280
Fold  2: Lasso=0.13808, ENet=0.13911, XGB=0.13557
Fold  3: Lasso=0.10650, ENet=0.10776, XGB=0.09908
Fold  4: Lasso=0.12844, ENet=0.13108, XGB=0.12728
Fold  5: Lasso=0.15303, ENet=0.15305, XGB=0.14969
Fold  6: Lasso=0.20339, ENet=0.20306, XGB=0.11845
Fold  7: Lasso=0.13123, ENet=0.13037, XGB=0.13365
Fold  8: Lasso=0.10357, ENet=0.10368, XGB=0.10612
Fold  9: Lasso=0.11632, ENet=0.11909, XGB=0.12530
Fold 10: Lasso=0.08719, ENet=0.08701, XGB=0.07428

=== OOF Results (RMSLE) ===
Lasso : 0.13114
ENet  : 0.13174
XGB   : 0.11996

Blend Weights (NNLS): Lasso=0.276, ENet=0.000, XGB=0.724
Blended OOF RMSLE : 0.11795
Blended OOF RMSE$ : 26,447.34

âœ… Saved submission_boost_final.csv (Stable averaged blend)
