# House Prices - Advanced Regression Techniques


In [None]:
from google.colab import drive
import sys

# Mount Google Drive so files stored under MyDrive (the CSVs read later and
# the project-local `utils` module) are reachable from this Colab runtime.
drive.mount('/content/drive', force_remount=True)

# Make `utils` importable. The membership check keeps the cell idempotent:
# re-running it no longer stacks duplicate entries on sys.path.
if '/content/drive/MyDrive' not in sys.path:
    sys.path.append('/content/drive/MyDrive')
import utils


Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from utils import *
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.metrics import make_scorer
!pip install scikit-optimize
!pip install shap
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import xgboost as xgb
from sklearn.model_selection import KFold
import shap



## Load Data

In [None]:
# Read the labelled training set; 'Id' is dropped right away so it is not
# carried along as a feature.
data = pd.read_csv('/content/drive/MyDrive/data/train.csv').drop(columns='Id')
print(data)

# Split the training frame into the target (kept as a one-column DataFrame)
# and the feature matrix.
train_price = data[['SalePrice']]
train_data = data.drop(columns='SalePrice')

# Read the test set. Its Ids are set aside for the submission file before the
# column is removed from the features.
test_data = pd.read_csv('/content/drive/MyDrive/data/test.csv')
ids_test = test_data['Id']
test_data = test_data.drop(columns='Id')

      MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0             60       RL         65.0     8450   Pave   NaN      Reg   
1             20       RL         80.0     9600   Pave   NaN      Reg   
2             60       RL         68.0    11250   Pave   NaN      IR1   
3             70       RL         60.0     9550   Pave   NaN      IR1   
4             60       RL         84.0    14260   Pave   NaN      IR1   
...          ...      ...          ...      ...    ...   ...      ...   
1455          60       RL         62.0     7917   Pave   NaN      Reg   
1456          20       RL         85.0    13175   Pave   NaN      Reg   
1457          70       RL         66.0     9042   Pave   NaN      Reg   
1458          20       RL         68.0     9717   Pave   NaN      Reg   
1459          20       RL         75.0     9937   Pave   NaN      Reg   

     LandContour Utilities LotConfig  ... PoolArea PoolQC  Fence MiscFeature  \
0            Lvl    AllPub    Inside  ...  

## Transforming Time-Related Features

In [None]:
def _add_time_features(frame, baseline_year):
    """Return a copy of `frame` with the raw date columns replaced by derived ones.

    Added columns (appended in this order):
      * HouseAge, RemodAge, GarageAge -- years between the sale and the
        build / remodel / garage-build year. GarageAge is NaN wherever
        GarageYrBlt is missing.
      * sin_MoSold, cos_MoSold -- cyclical encoding of the sale month on a
        12-month circle.
      * TimeSinceSold -- sale year expressed as an offset from `baseline_year`.

    Removed columns: YearBuilt, YearRemodAdd, GarageYrBlt, MoSold, YrSold.
    """
    sale_year = frame['YrSold']
    month_angle = 2 * np.pi * frame['MoSold'] / 12

    with_features = frame.assign(
        HouseAge=sale_year - frame['YearBuilt'],
        RemodAge=sale_year - frame['YearRemodAdd'],
        GarageAge=sale_year - frame['GarageYrBlt'],
        sin_MoSold=np.sin(month_angle),
        cos_MoSold=np.cos(month_angle),
        TimeSinceSold=sale_year - baseline_year,
    )
    # The raw year/month columns are fully represented by the derived ones.
    return with_features.drop(
        columns=['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'MoSold', 'YrSold']
    )


# Reference year that TimeSinceSold is measured from.
baseline_year = 2000

# Same transformation for both splits so their columns stay in step.
train_data = _add_time_features(train_data, baseline_year)
test_data = _add_time_features(test_data, baseline_year)


In [None]:
# Columns that are deliberately left on their original scale: the engineered
# time features plus OverallQual, OverallCond and MSSubClass.
time_features = ['HouseAge', 'RemodAge', 'GarageAge', 'TimeSinceSold', 'sin_MoSold', 'cos_MoSold', 'OverallQual','OverallCond','MSSubClass']  # Adjust this list if you have more

# Column groups by dtype, taken from the training frame only.
object_columns = train_data.select_dtypes(include=['object']).columns
int_columns = train_data.select_dtypes(include=['int64']).columns
float_columns = train_data.select_dtypes(include=['float64']).columns

# Everything numeric (ints first, then floats) that is not in the exclusion list.
numeric_columns = [
    col
    for col in [*int_columns, *float_columns]
    if col not in time_features
]

# Learn mean/std on the training split only, then apply the same statistics
# to both splits so no information from the test set leaks into the scaler.
scaler = StandardScaler()
scaler.fit(train_data[numeric_columns])

for _frame in (train_data, test_data):
    _frame[numeric_columns] = scaler.transform(_frame[numeric_columns])

In [None]:
# Rating scales shared by several features. Each feature below receives its
# own dict copy, so the resulting `ordinal_mappings` is equal to spelling
# every mapping out by hand.
_quality_scale = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
_quality_scale_with_na = {**_quality_scale, 'NA': 0}
_basement_finish_scale = {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0}

# NOTE(review): the CSVs are read with pandas' default NA handling, which
# parses the literal string "NA" as NaN, so the 'NA' keys below may never
# match. Presumably those rows stay NaN here and only become 0 through the
# later fillna(0) -- confirm against map_ordinal_features in utils.
ordinal_mappings = {
    'GarageFinish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'NA': 0},
    'ExterQual': dict(_quality_scale),
    'ExterCond': dict(_quality_scale),
    'PoolQC': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'NA': 0},
    'Fence': {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, 'NA': 0},
    'GarageQual': dict(_quality_scale_with_na),
    'GarageCond': dict(_quality_scale_with_na),
    'FireplaceQu': dict(_quality_scale_with_na),
    'HeatingQC': dict(_quality_scale),
    'KitchenQual': dict(_quality_scale),
    'LandSlope': {'Gtl': 3, 'Mod': 2, 'Sev': 1},
    'BsmtQual': dict(_quality_scale_with_na),
    'BsmtCond': dict(_quality_scale_with_na),
    'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'NA': 0},
    'BsmtFinType1': dict(_basement_finish_scale),
    'BsmtFinType2': dict(_basement_finish_scale),
}

# Encode the ordinal columns of both splits with the same mappings.
train_data = map_ordinal_features(train_data, ordinal_mappings)
test_data = map_ordinal_features(test_data, ordinal_mappings)

In [None]:
# Integer codes for the low-cardinality categorical columns. Despite the
# name, Alley and PavedDrive each have three levels.
binary_mappings = {
    'Street':     {'Grvl': 0, 'Pave': 1},
    'Alley':      {'Grvl': 1, 'Pave': 2, 'NA': 0},
    'CentralAir': {'N': 0, 'Y': 1},
    'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},
}

# Encode train first, then test, with the same mappings.
train_data, test_data = (
    binary_encoding(train_data, binary_mappings),
    binary_encoding(test_data, binary_mappings),
)

In [None]:
# Unordered categorical features that get one indicator column per level.
nominal_columns = [
    'MSZoning', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'Foundation', 'Heating', 'Electrical', 'Functional', 'GarageType',
    'MiscFeature', 'SaleType', 'SaleCondition',
]

print('Before One-Hot Encoding', train_data.shape)
print('Before One-Hot Encoding', test_data.shape)

# Each split is encoded independently, so the two frames can end up with
# different indicator columns (a level present in only one of them).
train_data = pd.get_dummies(train_data, columns=nominal_columns)
test_data = pd.get_dummies(test_data, columns=nominal_columns)

# join='left' makes the training columns authoritative: test-only columns are
# discarded and train-only columns appear in the test frame as all-NaN.
aligned_train, aligned_test = train_data.align(test_data, join='left', axis=1)
train_data, test_data = aligned_train, aligned_test

print('After One-Hot Encoding', train_data.shape)
print('After One-Hot Encoding', test_data.shape)

Before One-Hot Encoding (1460, 80)
Before One-Hot Encoding (1459, 80)
After One-Hot Encoding (1460, 229)
After One-Hot Encoding (1459, 229)


In [None]:
# Replace every remaining missing value with 0 in both splits. Rebinding the
# names (rather than filling in place) keeps the cell free of hidden mutation.
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

In [None]:
# Check what GPU is available in Google Colab
!nvidia-smi

# Log-transform the target variable
price_log = np.log(train_price)

# Define parameter search space
search_spaces = {
    'learning_rate': Real(0.001, 0.5, 'uniform'),
    'max_depth': Integer(1, 70),
    'n_estimators': Integer(100, 800),
}

# Initialize the XGBRegressor model with GPU support
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='gpu_hist',  # Use GPU for training
    verbosity=2
)

kf = KFold(n_splits=10)  # 10-fold cross-validation

# Using negative RMSE as the scoring metric
optimizer = BayesSearchCV(
    estimator=model,
    search_spaces=search_spaces,
    n_iter=64,
    cv=kf,
    scoring='neg_root_mean_squared_error',  # RMSE scoring
    verbose=1,
    n_jobs=-1,
    return_train_score=True
)

# Fit the optimizer using X_train and log-transformed y_train
optimizer.fit(train_data, price_log)

# Get the best model
best_model = optimizer.best_estimator_

# Save the best model
model_path = '/content/drive/MyDrive/data/XGB_1.json'
best_model.save_model(model_path)

# Get cross-validation errors
cv_results = optimizer.cv_results_

# Calculate and print the mean RMSE for each fold (convert negative RMSE back to positive)
mean_rmse = -cv_results['mean_test_score']  # This is negative RMSE, so we negate it to get RMSE

print(f"Best parameters: {optimizer.best_params_}")
print(f"Mean Cross-validation RMSE: {mean_rmse.mean():.4f}")
print(f"Cross-validation RMSE for each iteration: {mean_rmse}")

print("Optimization complete. Best XGBoost model saved.")


Mon Oct 21 19:43:11 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   78C    P0              35W /  70W |    361MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    


    E.g. tree_method = "hist", device = "cuda"



Best parameters: OrderedDict([('learning_rate', 0.17379598960219678), ('max_depth', 1), ('n_estimators', 800)])
Mean Cross-validation RMSE: 0.1441
Cross-validation RMSE for each iteration: [0.14867353 0.15439319 0.15635527 0.15008357 0.15595755 0.14913277
 0.14603746 0.14558938 0.14851843 0.14995746 0.1478415  0.13711305
 0.12250037 0.12223658 0.12151542 0.22852083 0.15637727 0.12278206
 0.23855627 0.12995254 0.36962345 0.14354801 0.12376342 0.12536379
 0.14762268 0.12190767 0.13710334 0.13551159 0.13711595 0.12146636
 0.14739707 0.1232596  0.12099542 0.12700137 0.15647132 0.14569983
 0.13681333 0.12096653 0.15004862 0.12189518 0.14457065 0.15651156
 0.15018764 0.12204164 0.12193031 0.14785522 0.12684956 0.12132789
 0.12278792 0.14781197 0.12157715 0.12240428 0.15566704 0.12180245
 0.12084234 0.14599974 0.13556061 0.14912456 0.14571546 0.14717737
 0.12408788 0.14927759 0.12201216 0.15633012]
Optimization complete. Best XGBoost model saved.



    E.g. tree_method = "hist", device = "cuda"



In [None]:
# Reload the model saved by the training step as a raw Booster.
model_path = '/content/drive/MyDrive/data/XGB_1.json'
model = xgb.Booster(model_file=model_path)

# A Booster predicts on DMatrix input, so wrap the prepared test features.
dtest = xgb.DMatrix(test_data)

# The model was trained on log(SalePrice); undo the log to get prices.
preds_log = model.predict(dtest)
preds = np.exp(preds_log)

# Every test Id must receive exactly one prediction.
assert len(preds) == len(ids_test), "Mismatch between number of predictions and test data IDs"

# Submission table: original Id next to the predicted SalePrice, keeping only
# the first row for any repeated Id.
output = pd.DataFrame({'Id': ids_test, 'SalePrice': preds.squeeze()})
output = output.drop_duplicates(subset='Id', keep='first')

output.to_csv('/content/drive/MyDrive/data/predictions.csv', index=False)