In [48]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

## 1. Dataset

In [49]:
df = pd.read_csv('house-prices-advanced-regression-techniques/preprocessed.csv')
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotArea,LotShape,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,OverallQual,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0.071562,RL,-0.264868,Reg,Inside,CollgCr,Norm,1Fam,2Story,0.707029,...,0.298465,-0.364226,-0.033312,-0.253081,-0.144348,-1.606014,0.137217,WD,Normal,0.479517
1,-0.870615,RL,0.008674,Reg,FR2,Veenker,Feedr,1Fam,1Story,-0.03811,...,-0.749164,-0.364226,-0.033312,-0.253081,-0.144348,-0.495788,-0.618051,WD,Normal,0.091359
2,0.071562,RL,0.401148,IR1,Inside,CollgCr,Norm,1Fam,2Story,0.707029,...,-0.027846,-0.364226,-0.033312,-0.253081,-0.144348,0.984513,0.137217,WD,Normal,0.69516
3,0.071562,RL,1.117115,IR1,FR2,NoRidge,Norm,1Fam,2Story,1.452168,...,0.693472,-0.364226,-0.033312,-0.253081,-0.144348,2.094739,0.137217,WD,Normal,1.076129
4,-0.870615,RL,0.1238,Reg,Inside,Somerst,Norm,1Fam,1Story,1.452168,...,0.229768,-0.364226,-0.033312,-0.253081,-0.144348,0.614438,-0.618051,WD,Normal,1.895573


In [92]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

for col in X.select_dtypes(include="object").columns:
    X[col] = X[col].astype("category")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print(f"Shapes: train {X_train.shape} {y_train.shape}, test {X_test.shape, y_test.shape}")

Shapes: train (917, 66) (917,), test ((393, 66), (393,))


## 2. Model Optimization

In [51]:
import optuna
from sklearn.model_selection import cross_val_score

from optuna.visualization import plot_contour
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_rank

def optimization_plots(study):
    fig1 = plot_optimization_history(study)
    fig1.show()

    fig2 = plot_contour(study)
    fig2.show()

    fig3 = plot_rank(study)
    fig3.show()

    fig4 = plot_param_importances(study)
    fig4.show()

In [52]:
from sklearn.model_selection import KFold

k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

### 2.1 Gradient Boosting Regressor

In [53]:
import math

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor

from skopt import gp_minimize
from skopt.space import Real, Integer

def objective(x):
    model = GradientBoostingRegressor(loss='squared_error', learning_rate=x[0], n_estimators=x[1], max_depth=x[2])
    scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')

    print("LR: " + str(x[0]) + " \tEstimators: " + str(x[1]) + " \tDepth: " + str(x[2]) + "\tScore: " + str(math.sqrt(-scores.mean())))

    return -scores.mean()

space = [Real(0.001, 0.1),
         Integer(500, 2000),
         Integer(2, 5)]

# result = gp_minimize(objective, space, n_calls=10, random_state=42)

# print("Best parameters: x1 = {:.4f}, x2 = {:.4f}, x3= {:.4f}".format(result.x[0], result.x[1], result.x[2]))
# print("Minimum value: {:.4f}".format(result.fun))

In [54]:
import math

study_name = 'kag1-gradient-boosting-2'
storage_name = "sqlite:///{}.db".format(study_name)

def objective(trial):
    estimators = trial.suggest_int("estimators", 16, 2048, log=True)
    max_depth = trial.suggest_int("max_depth", 2, 32, log=True)
    lr = trial.suggest_float("lr", 1e-5, 0.1, log=True)

    model = GradientBoostingRegressor(loss='squared_error', learning_rate=lr, n_estimators=estimators, max_depth=max_depth)
    scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')

    return math.sqrt(-scores.mean())

study = optuna.create_study(study_name=study_name, storage=storage_name, direction='minimize', load_if_exists=True)

[I 2025-08-18 19:56:45,945] Using an existing study with name 'kag1-gradient-boosting-2' instead of creating a new one.


In [55]:
# study.optimize(objective, n_trials=20)
# print(study.best_trial)

In [56]:
# optimization_plots(study)

### 2.2 Support Vector Regressor

In [57]:
from sklearn import svm

study_name = 'kag1-SVR-2'
storage_name = "sqlite:///{}.db".format(study_name)

def optimize_svr(trial):
    c = trial.suggest_float("c", 0.1, 100, log=True)
    gamma = trial.suggest_float("gamma", 0.001, 1, log=True)
    epsilon = trial.suggest_float("epsilon", 0.001, 1, log=True)
    
    model = svm.SVR(C=c, gamma=gamma, epsilon=epsilon)
    scores = cross_val_score(model, X, y, cv=kf, scoring="neg_mean_squared_error")
    
    return math.sqrt(-scores.mean())

study_svr = optuna.create_study(study_name=study_name, storage=storage_name, direction='minimize', load_if_exists=True)

[I 2025-08-18 19:56:45,978] Using an existing study with name 'kag1-SVR-2' instead of creating a new one.


In [58]:
# study_svr.optimize(optimize_svr, n_trials=50)
# print(study_svr.best_trial)

In [59]:
# optimization_plots(study_svr)

### 2.3 XGBoost

In [95]:
import optuna
import xgboost as xgb

study_name = 'kag1-XGB-3'
storage_name = "sqlite:///{}.db".format(study_name)

def optimize_xgb(trial):
    num_estimators = trial.suggest_int("num_estimators", 100, 2000, log=True)
    learning_rate = trial.suggest_float("learning_rate", 0.005, 0.1, log=True)
    max_depth = trial.suggest_int("max_depth", 2, 16, log=True)
    
    model = xgb.XGBRegressor(n_estimators=num_estimators, max_depth=max_depth, learning_rate=learning_rate, n_jobs=-1, tree_method="hist", enable_categorical=True)
    scores = cross_val_score(model, X, y, cv=kf, scoring="neg_mean_squared_error", )
    
    return math.sqrt(-scores.mean())

study_xgb = optuna.create_study(study_name=study_name, storage=storage_name, direction='minimize', load_if_exists=True)

[I 2025-08-18 21:55:00,787] Using an existing study with name 'kag1-XGB-3' instead of creating a new one.


In [78]:
study_xgb.optimize(optimize_xgb, n_trials=25)
print(study_xgb.best_trial)

[I 2025-08-18 21:30:44,177] Trial 0 finished with value: 0.2957007998365573 and parameters: {'num_estimators': 179, 'learning_rate': 0.09797619635922152}. Best is trial 0 with value: 0.2957007998365573.
[I 2025-08-18 21:30:51,740] Trial 1 finished with value: 0.2976939773010488 and parameters: {'num_estimators': 1944, 'learning_rate': 0.059314824033938986}. Best is trial 0 with value: 0.2957007998365573.
[I 2025-08-18 21:30:57,349] Trial 2 finished with value: 0.29138801563462796 and parameters: {'num_estimators': 1371, 'learning_rate': 0.0497201614076454}. Best is trial 2 with value: 0.29138801563462796.
[I 2025-08-18 21:30:58,855] Trial 3 finished with value: 0.31657223012432334 and parameters: {'num_estimators': 358, 'learning_rate': 0.018531871997730875}. Best is trial 2 with value: 0.29138801563462796.
[I 2025-08-18 21:31:00,734] Trial 4 finished with value: 0.397405529606148 and parameters: {'num_estimators': 462, 'learning_rate': 0.005876955214472248}. Best is trial 2 with value

FrozenTrial(number=11, state=1, values=[0.28780779776919563], datetime_start=datetime.datetime(2025, 8, 18, 21, 31, 29, 369597), datetime_complete=datetime.datetime(2025, 8, 18, 21, 31, 33, 387626), params={'num_estimators': 965, 'learning_rate': 0.04336485142586406}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'num_estimators': IntDistribution(high=2000, log=True, low=100, step=1), 'learning_rate': FloatDistribution(high=0.1, log=True, low=0.005, step=None)}, trial_id=12, value=None)


In [82]:
optimization_plots(study_xgb)

## 3. Prediction

In [104]:
X_test = pd.read_csv('house-prices-advanced-regression-techniques/preprocessed_test.csv')

for col in X_test.select_dtypes(include="object").columns:
    X_test[col] = X_test[col].astype("category")
    
print(X_test.shape)
X_test.head()

(1459, 66)


Unnamed: 0,MSSubClass,MSZoning,LotArea,LotShape,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,OverallQual,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,-0.870615,RH,0.489633,Reg,Inside,NAmes,Feedr,1Fam,1Story,-0.783249,...,0.41583,-0.749164,-0.364226,-0.033312,2.725899,-0.144348,-0.125713,1.647753,WD,Normal
1,-0.870615,RL,1.11878,IR1,Corner,NAmes,Norm,1Fam,1Story,-0.03811,...,2.571841,-0.130891,-0.364226,-0.033312,-0.253081,90.963525,-0.125713,1.647753,WD,Normal
2,0.071562,RL,1.014834,IR1,Inside,Gilbert,Norm,1Fam,2Story,-0.783249,...,1.029398,-0.16524,-0.364226,-0.033312,-0.253081,-0.144348,-1.235938,1.647753,WD,Normal
3,0.071562,RL,0.098587,IR1,Inside,Gilbert,Norm,1Fam,2Story,-0.03811,...,2.290622,-0.130891,-0.364226,-0.033312,-0.253081,-0.144348,-0.125713,1.647753,WD,Normal
4,1.484829,RL,-1.084305,IR1,Inside,StoneBr,Norm,TwnhsE,1Story,1.452168,...,-0.777219,0.659124,-0.364226,-0.033312,3.321695,-0.144348,-1.976089,1.647753,WD,Normal


### 3.1 baseline: linear regression

In [None]:
from sklearn import linear_model

lin_model = linear_model.LinearRegression()
lin_model.fit(X, y)

raw_predictions = lin_model.predict(X_test)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


### 3.2 Gradient Boosting Regressor

In [None]:
from sklearn import ensemble


gbr_model = ensemble.GradientBoostingRegressor(loss='squared_error', learning_rate=0.018, n_estimators=1200, max_depth=2)
gbr_model.fit(X, y)

raw_predictions = gbr_model.predict(X_test)

0,1,2
,loss,'squared_error'
,learning_rate,0.018
,n_estimators,1200
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,2
,min_impurity_decrease,0.0


### 3.3 Support Vector Regressor

In [112]:
from sklearn import svm

svm_model = svm.SVR(C=22, gamma=0.00102, epsilon=0.00606)
svm_model.fit(X, y)

raw_predictions = svm_model.predict(X_test[most_rel_attr])

ValueError: could not convert string to float: 'CollgCr'

### 3.4 XGBoost

In [109]:
import xgboost as xgb

xgb_model = xgb.XGBRegressor(n_estimators=848, max_depth=2, learning_rate=0.031, n_jobs=-1, tree_method="hist", enable_categorical=True)
xgb_model.fit(X, y)

raw_predictions = xgb_model.predict(X_test[most_rel_attr])

In [85]:
import shap
import numpy as np

explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X)

# Compute mean absolute SHAP values per feature
mean_shap = np.abs(shap_values).mean(axis=0)
importance_df = pd.DataFrame({
    "feature": X_train.columns,
    "mean_abs_shap": mean_shap
}).sort_values("mean_abs_shap", ascending=False)

In [89]:
importance_df.head(20)

Unnamed: 0,feature,mean_abs_shap
9,OverallQual,0.24042
36,GrLivArea,0.221251
5,Neighborhood,0.103874
25,BsmtFinSF1,0.062546
29,TotalBsmtSF,0.060779
10,OverallCond,0.047379
43,KitchenQual,0.042928
2,LotArea,0.037347
24,BsmtFinType1,0.034139
51,GarageCars,0.032669


In [93]:
most_rel_attr = importance_df.head(20)["feature"].tolist()

X = X[most_rel_attr]
X.head()

Unnamed: 0,OverallQual,GrLivArea,Neighborhood,BsmtFinSF1,TotalBsmtSF,OverallCond,KitchenQual,LotArea,BsmtFinType1,GarageCars,GarageArea,GarageFinish,YearRemodAdd,FireplaceQu,YearBuilt,1stFlrSF,BsmtQual,ExterQual,GarageType,OpenPorchSF
0,0.707029,0.524208,CollgCr,0.652122,-0.437515,-0.517033,Gd,-0.264868,GLQ,0.33313,0.390836,RFn,0.874129,na,1.039376,-0.794398,Gd,Gd,Attchd,0.298465
1,-0.03811,-0.445658,Veenker,1.296497,0.596812,2.195835,TA,0.008674,ALQ,0.33313,-0.036584,RFn,-0.426584,TA,0.134492,0.36308,Gd,TA,Attchd,-0.749164
2,0.707029,0.688739,CollgCr,0.130937,-0.274468,-0.517033,Gd,0.401148,GLQ,0.33313,0.682259,RFn,0.825955,TA,0.972347,-0.611938,Gd,Gd,Attchd,-0.027846
3,1.452168,1.58067,NoRidge,0.531302,0.298743,-0.517033,Gd,1.117115,GLQ,1.696882,1.789665,RFn,0.729606,TA,0.938833,0.029521,Gd,Gd,Attchd,0.693472
4,1.452168,0.48957,Somerst,2.222786,1.676997,-0.517033,Gd,0.1238,GLQ,0.33313,0.818256,RFn,0.970478,Gd,1.07289,1.594683,Ex,Gd,Attchd,0.229768


In [94]:
study_name = 'kag1-XGB-5'
storage_name = "sqlite:///{}.db".format(study_name)
study_xgb2 = optuna.create_study(study_name=study_name, storage=storage_name, direction='minimize', load_if_exists=True)

[I 2025-08-18 21:53:46,760] A new study created in RDB with name: kag1-XGB-5


In [108]:
# study_xgb2.optimize(optimize_xgb, n_trials=75)
print(study_xgb2.best_trial)

FrozenTrial(number=85, state=1, values=[0.2953023777093471], datetime_start=datetime.datetime(2025, 8, 18, 22, 4, 0, 208210), datetime_complete=datetime.datetime(2025, 8, 18, 22, 4, 3, 121890), params={'num_estimators': 848, 'learning_rate': 0.03112848552409498, 'max_depth': 2}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'num_estimators': IntDistribution(high=2000, log=True, low=100, step=1), 'learning_rate': FloatDistribution(high=0.1, log=True, low=0.005, step=None), 'max_depth': IntDistribution(high=16, log=True, low=2, step=1)}, trial_id=86, value=None)


In [99]:
optimization_plots(study_xgb2)

## 4 Submission

In [110]:
import json

with open('house-prices-advanced-regression-techniques/normalization_values.json', 'r') as f:
    norm_values = json.load(f)

predictions = raw_predictions * norm_values['std']['SalePrice'] + norm_values['mean']['SalePrice']

In [111]:
submission = pd.DataFrame({
    'Id': range(1461, 2920),
    'SalePrice': predictions,
})

submission.to_csv('house-prices-advanced-regression-techniques/submission.csv', index=False)