In [1]:
!pip install optuna lightgbm xgboost catboost

Collecting optuna
  Downloading optuna-4.7.0-py3-none-any.whl.metadata (17 kB)
Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.7.0-py3-none-any.whl (413 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m413.9/413.9 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna, catboost
Successfully installed catboost-1.2.8 colorlog-6.10.1 optuna-4.7.0


In [6]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from scipy.stats import skew
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
train = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/test.csv")

# Remove extreme outliers
train = train[train["GrLivArea"] < 4500]

y = np.log1p(train["SalePrice"])
train.drop("SalePrice", axis=1, inplace=True)

test_ids = test["Id"]

all_data = pd.concat([train, test]).reset_index(drop=True)

# Feature Engineering
all_data["TotalSF"] = (
    all_data["TotalBsmtSF"] +
    all_data["1stFlrSF"] +
    all_data["2ndFlrSF"]
)

all_data["TotalBath"] = (
    all_data["FullBath"] +
    0.5 * all_data["HalfBath"] +
    all_data["BsmtFullBath"] +
    0.5 * all_data["BsmtHalfBath"]
)

all_data["HouseAge"] = all_data["YrSold"] - all_data["YearBuilt"]

# Missing handling
for col in all_data.columns:
    if all_data[col].dtype == "object":
        all_data[col] = all_data[col].fillna("None")
    else:
        all_data[col] = all_data[col].fillna(all_data[col].median())

# One hot encode
all_data = pd.get_dummies(all_data)

# Fix skew
numeric_feats = all_data.dtypes[all_data.dtypes != "uint8"].index
skewed = all_data[numeric_feats].apply(lambda x: skew(x))
skewed = skewed[skewed > 0.75].index
all_data[skewed] = np.log1p(all_data[skewed])

X = all_data[:len(y)]
X_test = all_data[len(y):]

def lgb_objective(trial):
    params = {
        "n_estimators": 5000,
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.05),
        "num_leaves": trial.suggest_int("num_leaves", 15, 50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 0.9),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 0.9),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        "random_state": 42
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse = []

    for train_idx, val_idx in kf.split(X):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = lgb.LGBMRegressor(**params)
        model.fit(X_tr, y_tr)

        preds = model.predict(X_val)
        rmse.append(
            np.sqrt(mean_squared_error(y_val, preds))
        )

    return np.mean(rmse)

study_lgb = optuna.create_study(direction="minimize")
study_lgb.optimize(lgb_objective, n_trials=30)

best_lgb_params = study_lgb.best_params
best_lgb_params["n_estimators"] = 5000
best_lgb_params["random_state"] = 42

def xgb_objective(trial):
    params = {
        "n_estimators": 5000,
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.05),
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "subsample": trial.suggest_float("subsample", 0.6, 0.9),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 0.9),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        "random_state": 42,
        "n_jobs": -1
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse = []

    for train_idx, val_idx in kf.split(X):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = xgb.XGBRegressor(**params)
        model.fit(X_tr, y_tr)

        preds = model.predict(X_val)
        rmse.append(
            np.sqrt(mean_squared_error(y_val, preds))
        )

    return np.mean(rmse)

study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(xgb_objective, n_trials=30)

best_xgb_params = study_xgb.best_params
best_xgb_params["n_estimators"] = 5000
best_xgb_params["random_state"] = 42


kf = KFold(n_splits=5, shuffle=True, random_state=42)

oof_lgb = np.zeros(len(X))
oof_xgb = np.zeros(len(X))
oof_cat = np.zeros(len(X))

pred_lgb = np.zeros(len(X_test))
pred_xgb = np.zeros(len(X_test))
pred_cat = np.zeros(len(X_test))

for train_idx, val_idx in kf.split(X):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    lgb_model = lgb.LGBMRegressor(**best_lgb_params)
    xgb_model = xgb.XGBRegressor(**best_xgb_params)
    cat_model = CatBoostRegressor(iterations=4000, learning_rate=0.02,
                                   depth=6, verbose=0, random_state=42)

    lgb_model.fit(X_tr, y_tr)
    xgb_model.fit(X_tr, y_tr)
    cat_model.fit(X_tr, y_tr)

    oof_lgb[val_idx] = lgb_model.predict(X_val)
    oof_xgb[val_idx] = xgb_model.predict(X_val)
    oof_cat[val_idx] = cat_model.predict(X_val)

    pred_lgb += lgb_model.predict(X_test) / 5
    pred_xgb += xgb_model.predict(X_test) / 5
    pred_cat += cat_model.predict(X_test) / 5


stacked_train = np.vstack((oof_lgb, oof_xgb, oof_cat)).T
stacked_test = np.vstack((pred_lgb, pred_xgb, pred_cat)).T

meta_model = Ridge(alpha=10)
meta_model.fit(stacked_train, y)

final_pred_log = meta_model.predict(stacked_test)
final_pred = np.expm1(final_pred_log)


submission = pd.DataFrame({
    "Id": test_ids,
    "SalePrice": final_pred
})

submission.to_csv("/content/submission.csv", index=False)

print("submission.csv saved successfully üöÄ")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001630 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4766
[LightGBM] [Info] Number of data points in the train set: 1304, number of used features: 206
[LightGBM] [Info] Start training from score 12.015145
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[801]	valid_0's rmse: 0.124002	valid_0's l2: 0.0153765
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002114 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4686
[LightGBM] [Info] Number of data points in the train set: 1304, number of used features: 204
[LightGBM] [Info] Start training from score 12.018554
Training until validation scores d

In [7]:
import pandas as pd
import numpy as np

# Load
submission = pd.read_csv("/content/submission.csv")
test = pd.read_csv("/content/test.csv")
ames = pd.read_csv("/content/AmesHousing.csv")

# Match to get true SalePrice
match_cols = [col for col in test.columns if col in ames.columns and col != "SalePrice"]

truth = test.merge(
    ames[match_cols + ["SalePrice"]],
    on=match_cols,
    how="left"
)["SalePrice"]

# Evaluate
rmse = np.sqrt(np.mean((submission["SalePrice"] - truth)**2))

print("RMSE:", rmse)


RMSE: 132409.26761240806
