In [5]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns



# Load the training table and discard the row identifier, which is not a feature.
train_data = pd.read_csv("train.csv").drop(columns='Id')
print(list(train_data.columns))


# Disabled target-distribution plot, kept for reference.
# NOTE(review): the snippet refers to `data`, which is not defined in this cell;
# presumably `train_data` is meant - confirm before re-enabling.
# print(data['SalePrice'].describe())
# plt.figure(figsize=(9, 8))
# sns.distplot(data['SalePrice'], color='g', bins=100, hist_kws={'alpha': 0.4});

# Separate the regression target from the feature columns.
y = train_data['SalePrice']
X = train_data.drop('SalePrice', axis=1)

# Text columns: missing entries become the literal level 'None', then the
# column is stored with pandas' `category` dtype.
text_columns = X.select_dtypes(include='object').columns
X = X.assign(**{
    name: X[name].fillna('None').astype('category')
    for name in text_columns
})

# int64/float64 columns: missing entries are replaced by that column's median.
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns
X = X.assign(**{
    name: X[name].fillna(X[name].median())
    for name in numeric_columns
})

# Native XGBoost container built from the prepared features and target.
data_dmatrix = xgb.DMatrix(data=X, label=y, enable_categorical=True)

['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fen

In [6]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, KFold

# Base learner: squared-error objective evaluated with RMSE, with pandas
# `category` columns enabled. The fixed seed keeps refits repeatable.
xgb_reg = XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    enable_categorical=True,
    random_state=42
)

# Discrete search space: 3 values for each of 4 hyperparameters (81 combinations).
parameter_grid = {
    'n_estimators': [100, 400, 800],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.05, 0.1, 0.20],
    'min_child_weight': [1, 10, 100]
    }


# Shuffled 5-fold split with a fixed seed so every candidate sees the same folds.
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

# Only n_iter=10 of the 81 combinations are sampled. Without `random_state`
# the sampled candidates differ on every run, so `best_params_` and the
# resulting model were not reproducible; the seed below pins the draw.
random_search = RandomizedSearchCV(
    estimator=xgb_reg,
    param_distributions=parameter_grid,
    n_iter=10,
    scoring='neg_root_mean_squared_error',
    cv=cv_strategy,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Run the search on the prepared training features/target and keep the
# estimator fitted with the best-scoring parameter combination.
random_search.fit(X, y)
print(random_search.best_params_)
model = random_search.best_estimator_


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END learning_rate=0.2, max_depth=6, min_child_weight=100, n_estimators=400; total time=   0.6s
[CV] END learning_rate=0.2, max_depth=6, min_child_weight=100, n_estimators=400; total time=   0.5s
[CV] END learning_rate=0.2, max_depth=6, min_child_weight=100, n_estimators=400; total time=   0.6s
[CV] END learning_rate=0.2, max_depth=6, min_child_weight=100, n_estimators=400; total time=   0.5s
[CV] END learning_rate=0.2, max_depth=6, min_child_weight=100, n_estimators=400; total time=   0.5s
[CV] END learning_rate=0.1, max_depth=9, min_child_weight=1, n_estimators=100; total time=   1.3s
[CV] END learning_rate=0.1, max_depth=9, min_child_weight=1, n_estimators=100; total time=   1.4s
[CV] END learning_rate=0.1, max_depth=9, min_child_weight=1, n_estimators=100; total time=   1.4s
[CV] END learning_rate=0.1, max_depth=9, min_child_weight=1, n_estimators=100; total time=   1.4s
[CV] END learning_rate=0.1, max_depth=9, min_ch

In [None]:
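# Disabled sanity check: log-RMSE of the tuned model on the data it was fitted on.
# NOTE(review): `mean_squared_error` is not imported in the cells shown here;
# re-enabling this needs `from sklearn.metrics import mean_squared_error`.
# y_pred_train = model.predict(X)
# train_rmse = np.sqrt(mean_squared_error(np.log1p(y), np.log1p(y_pred_train)))
# print("Log-RMSE on training data:", train_rmse)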

Log-RMSE on training data: 0.05205835529625534


In [None]:
# Load the held-out table and discard the row identifier, mirroring the
# treatment of the training table.
test_data = pd.read_csv("test.csv")
test_data = test_data.drop('Id', axis=1)
X_test = test_data.copy()


# Categorical columns: reuse the *training* column's CategoricalDtype instead
# of a bare astype('category'). A bare cast derives the category list from the
# values present in the test file, so the integer code behind a given label
# could differ from the one used during fitting. Casting to the training dtype
# keeps label -> code identical; labels never seen in training become NaN.
for col in X.select_dtypes(include='category').columns:
    if col in X_test.columns:
        X_test[col] = X_test[col].fillna('None').astype(X[col].dtype)

# Numeric columns: fill gaps with the median of the training column so the
# imputation value comes from the training data, not the test data.
for col in X.select_dtypes(include=['int64','float64']).columns:
    if col in X_test.columns:
        X_test[col] = X_test[col].fillna(X[col].median())

# Predict with the estimator selected by the hyperparameter search.
y_pred = model.predict(X_test)  # use tuned model
