In [1]:
import os 
import numpy as np
import pandas as pd

In [2]:
os.chdir("/kaggle/input")


from housepricesxgbregressortuning.xgboost_hyperparameter_tuning import *

os.chdir("/kaggle/working")

In [3]:
tuner = HousePriceXGBoostTuner(data_path="/kaggle/input/house-prices-advanced-regression-techniques/train.csv", random_state=42)

# Run complete pipeline
tuner.run_complete_pipeline(
    tuning_method='randomized',  # Change to 'grid' for grid search
    n_iter=50  # Adjust based on available time/compute
)

XGBoost Hyperparameter Tuning Pipeline
Start time: 2025-09-14 15:16:01.405344

Loading and exploring data...
Dataset shape: (1460, 81)
Target variable: SalePrice

Target variable statistics:
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

Features with missing values (19 total):
  PoolQC: 1453 (99.5%)
  MiscFeature: 1406 (96.3%)
  Alley: 1369 (93.8%)
  Fence: 1179 (80.8%)
  MasVnrType: 872 (59.7%)
  FireplaceQu: 690 (47.3%)
  LotFrontage: 259 (17.7%)
  GarageType: 81 (5.5%)
  GarageYrBlt: 81 (5.5%)
  GarageFinish: 81 (5.5%)

Numeric features: 36
Categorical features: 43

Preprocessing data...
Applied log1p transformation to target variable
Preprocessed features shape: (1460, 79)
Target shape: (1460,)

Performing feature engineering...
Added 3 engineered features
Final feature set: 82 features

Splitting data (test_size=

In [4]:
best_model = tuner.best_model
df = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

numeric_features, categorical_features = tuner.load_and_explore_data()
all_features = numeric_features + categorical_features

X = df[all_features].copy()
X_processed = X.copy()

# Process numeric features
if numeric_features:
    numeric_imputer = SimpleImputer(strategy='median')
    scaler = StandardScaler()
    X_processed[numeric_features] = scaler.fit_transform(
        numeric_imputer.fit_transform(X_processed[numeric_features])
    )

# Process categorical features
for feature in categorical_features:
    # Fill missing values
    X_processed[feature] = X_processed[feature].fillna('Missing')
    # Label encode
    le = LabelEncoder()
    X_processed[feature] = le.fit_transform(X_processed[feature].astype(str))

X = X_processed
feature_names = all_features

original_features = len(X.columns)

# Total area features
area_features = [col for col in X.columns if 'SF' in str(col) or 'Area' in str(col)]
if len(area_features) >= 2:
    X['TotalArea'] = X[area_features].sum(axis=1)

# Age features
if 'YearBuilt' in X.columns and 'YrSold' in X.columns:
    X['HouseAge'] = X['YrSold'] - X['YearBuilt']

# Quality-related features
quality_features = [col for col in X.columns if 'Qual' in str(col)]
if len(quality_features) >= 2:
    X['OverallQuality'] = X[quality_features].mean(axis=1)

new_features = len(X.columns) - original_features
print(f"Added {new_features} engineered features")
print(f"Final feature set: {X.shape[1]} features")

# Update feature names
feature_names = list(X.columns)

X.shape

Loading and exploring data...
Dataset shape: (1460, 81)
Target variable: SalePrice

Target variable statistics:
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

Features with missing values (19 total):
  PoolQC: 1453 (99.5%)
  MiscFeature: 1406 (96.3%)
  Alley: 1369 (93.8%)
  Fence: 1179 (80.8%)
  MasVnrType: 872 (59.7%)
  FireplaceQu: 690 (47.3%)
  LotFrontage: 259 (17.7%)
  GarageType: 81 (5.5%)
  GarageYrBlt: 81 (5.5%)
  GarageFinish: 81 (5.5%)

Numeric features: 36
Categorical features: 43
Added 3 engineered features
Final feature set: 82 features


(1459, 82)

In [5]:
logpreds = best_model.predict(X)
preds = np.expm1(logpreds)
preds

array([121793.75, 161312.36, 164678.58, ..., 163478.55, 117322.73,
       199022.19], dtype=float32)

In [6]:
df["SalePrice"] = preds
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2010,WD,Normal,121793.750000
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,,,Gar2,12500,6,2010,WD,Normal,161312.359375
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,3,2010,WD,Normal,164678.578125
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,,,,0,6,2010,WD,Normal,189590.671875
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,0,,,,0,1,2010,WD,Normal,153658.796875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2006,WD,Normal,76849.453125
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Abnorml,83232.945312
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,9,2006,WD,Abnorml,163478.546875
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,117322.726562


In [7]:
submission_df = df[["Id", "SalePrice"]]
submission_df.to_csv("/kaggle/working/submission.csv")

In [8]:
submission_df

Unnamed: 0,Id,SalePrice
0,1461,121793.750000
1,1462,161312.359375
2,1463,164678.578125
3,1464,189590.671875
4,1465,153658.796875
...,...,...
1454,2915,76849.453125
1455,2916,83232.945312
1456,2917,163478.546875
1457,2918,117322.726562
