In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns



train_data = pd.read_csv("train.csv")
train_data = train_data.drop('Id', axis=1)
print(train_data.columns.tolist())


# print(data['SalePrice'].describe())
# plt.figure(figsize=(9, 8))
# sns.distplot(data['SalePrice'], color='g', bins=100, hist_kws={'alpha': 0.4});

X = train_data.drop(columns=['SalePrice'])
y = train_data['SalePrice']

for col in X.select_dtypes(include='object').columns:
    X[col] = X[col].fillna('None').astype('category')

for col in X.select_dtypes(include=['int64', 'float64']).columns:
    X[col] = X[col].fillna(X[col].median())

data_dmatrix = xgb.DMatrix(data=X, label=y, enable_categorical=True)

['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fen

In [3]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, KFold

xgb_reg = XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    enable_categorical=True
)
parameter_grid = {
    'n_estimators': [800, 1200, 1600, 2000],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.005],
    'min_child_weight': [1, 10, 100]
    }


cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    estimator=xgb_reg,
    param_distributions=parameter_grid,
    n_iter=25,
    scoring='neg_root_mean_squared_error',
    cv=cv_strategy,
    verbose=2,
    n_jobs=-1
)

random_search.fit(X, y)
print(random_search.best_params_)
model = random_search.best_estimator_


Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] END learning_rate=0.005, max_depth=3, min_child_weight=10, n_estimators=1200; total time=   3.9s
[CV] END learning_rate=0.005, max_depth=3, min_child_weight=10, n_estimators=1200; total time=   4.1s
[CV] END learning_rate=0.005, max_depth=3, min_child_weight=10, n_estimators=1200; total time=   4.1s
[CV] END learning_rate=0.005, max_depth=3, min_child_weight=10, n_estimators=1200; total time=   4.2s
[CV] END learning_rate=0.005, max_depth=3, min_child_weight=10, n_estimators=1200; total time=   4.4s
[CV] END learning_rate=0.005, max_depth=3, min_child_weight=1, n_estimators=2000; total time=   6.6s
[CV] END learning_rate=0.005, max_depth=3, min_child_weight=1, n_estimators=2000; total time=   6.7s
[CV] END learning_rate=0.005, max_depth=3, min_child_weight=1, n_estimators=2000; total time=   7.0s
[CV] END learning_rate=0.005, max_depth=6, min_child_weight=100, n_estimators=1200; total time=   3.5s
[CV] END learning_rate

In [4]:
# y_pred_train = model.predict(X)
# train_rmse = np.sqrt(mean_squared_error(np.log1p(y), np.log1p(y_pred_train)))
# print("Log-RMSE on training data:", train_rmse)

In [None]:
test_data = pd.read_csv("test.csv")
ids = test_data['Id']
X_test = test_data.drop('Id', axis=1).copy()


for col in X.select_dtypes(include='category').columns:
    if col in X_test.columns:
        X_test[col] = X_test[col].fillna('None').astype('category')

for col in X.select_dtypes(include=['int64','float64']).columns:
    if col in X_test.columns:
        X_test[col] = X_test[col].fillna(X[col].median())


y_pred = model.predict(X_test)  

#Submission
# submission = pd.DataFrame({
#     "Id": ids,
#     "SalePrice": y_pred
# })

# submission.to_csv("submission.csv", index=False)


     Id      SalePrice
0  1461  140683.125000
1  1462  160548.562500
2  1463  200898.375000
3  1464  194583.593750
4  1465  199757.984375


In [4]:
# * Checking for overfitting

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.02,
random_state=42)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=50
)
train_pred = model.predict(X_train)
val_pred = model.predict(X_val)

train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))

print("Training RMSE:", train_rmse)
print("Validation RMSE:", val_rmse)

[0]	validation_0-rmse:62226.45508
[1]	validation_0-rmse:61730.32616
[2]	validation_0-rmse:61210.31512
[3]	validation_0-rmse:60722.58282
[4]	validation_0-rmse:60189.46331
[5]	validation_0-rmse:59729.22534
[6]	validation_0-rmse:59286.86441
[7]	validation_0-rmse:58837.11873
[8]	validation_0-rmse:58352.07783
[9]	validation_0-rmse:57941.10581
[10]	validation_0-rmse:57470.86028
[11]	validation_0-rmse:57041.79537
[12]	validation_0-rmse:56662.78500
[13]	validation_0-rmse:56206.38503
[14]	validation_0-rmse:55766.55138
[15]	validation_0-rmse:55324.86703
[16]	validation_0-rmse:54951.53141
[17]	validation_0-rmse:54513.23341
[18]	validation_0-rmse:54185.66797
[19]	validation_0-rmse:53745.32193
[20]	validation_0-rmse:53390.02676
[21]	validation_0-rmse:53019.88878
[22]	validation_0-rmse:52688.55852
[23]	validation_0-rmse:52283.58569
[24]	validation_0-rmse:51965.07363
[25]	validation_0-rmse:51610.90268
[26]	validation_0-rmse:51235.06256
[27]	validation_0-rmse:50927.83603
[28]	validation_0-rmse:50594.3



[82]	validation_0-rmse:37546.76083
[83]	validation_0-rmse:37383.24602
[84]	validation_0-rmse:37227.91682
[85]	validation_0-rmse:37045.94912
[86]	validation_0-rmse:36854.92239
[87]	validation_0-rmse:36693.12539
[88]	validation_0-rmse:36539.56282
[89]	validation_0-rmse:36350.94240
[90]	validation_0-rmse:36276.48829
[91]	validation_0-rmse:36128.52685
[92]	validation_0-rmse:35950.89195
[93]	validation_0-rmse:35770.33299
[94]	validation_0-rmse:35681.53981
[95]	validation_0-rmse:35510.25512
[96]	validation_0-rmse:35350.77829
[97]	validation_0-rmse:35177.93198
[98]	validation_0-rmse:35084.19256
[99]	validation_0-rmse:34945.62573
[100]	validation_0-rmse:34896.34737
[101]	validation_0-rmse:34768.56321
[102]	validation_0-rmse:34557.20794
[103]	validation_0-rmse:34401.49339
[104]	validation_0-rmse:34314.68119
[105]	validation_0-rmse:34175.38276
[106]	validation_0-rmse:34009.31476
[107]	validation_0-rmse:33858.90443
[108]	validation_0-rmse:33815.44471
[109]	validation_0-rmse:33688.12607
[110]	vali

In [None]:
# * Saving to CSV