Let's clean our data

In [109]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

df = pd.read_csv('data/train.csv')

# Separate features and target
X = df.drop('SalePrice', axis=1)  # assuming 'SalePrice' is your target
y = df['SalePrice']

def encode_categorical_features(X):
    """
    Encodes categorical features in the DataFrame X using OneHotEncoder.
    Returns a DataFrame with encoded features.
    """
    # Get categorical columns from features only
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Handle missing values first
    X_clean = X.copy()
    # for col in categorical_cols:
    #     X_clean[col] = X_clean[col].fillna('Missing')

    # Fill numeric missing values
    numeric_cols = X_clean.select_dtypes(include=[np.number]).columns.tolist()
    for col in numeric_cols:
        X_clean[col] = X_clean[col].fillna(X_clean[col].median())

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
        ],
        remainder='passthrough'  # keep other (non-categorical) columns
    )

    transformed = preprocessor.fit_transform(X_clean)
    feature_names = preprocessor.get_feature_names_out()
    return pd.DataFrame(transformed, columns=feature_names) # type: ignore

final_df = encode_categorical_features(X)



Let's use a baseline model first:

In [110]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression()
# Use all your training data for CV
cv_scores = cross_val_score(lr_model, final_df, y, cv=5, scoring='neg_mean_squared_error')



def print_scores(scores):
    rmse = np.sqrt(-scores)
    # Calculate percentage error
    mean_target = y.mean()
    rmse_mean = rmse.mean()
    rmse_std = rmse.std()

    # Calculate percentage error
    percentage_error = (rmse_mean / mean_target) * 100
    percentage_error_std = (rmse_std * 2 / mean_target) * 100

    print(f"Model Performance (CV): ${rmse_mean:,.2f} (+/- ${rmse_std * 2:,.2f})")
    print(f"Percentage Error (CV): {percentage_error:.2f}% (+/- {percentage_error_std:.2f}%)")
    print(f"Target variable mean: ${mean_target:,.2f}")
    
print_scores(cv_scores)


Model Performance (CV): $35,433.85 (+/- $16,530.80)
Percentage Error (CV): 19.59% (+/- 9.14%)
Target variable mean: $180,921.20


Now that we have a baseline let's try more models and choose one that performs best

In [111]:
# Random forest
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

rf_cv_scores = cross_val_score(rf_model, final_df, y, cv=5, scoring='neg_mean_squared_error')
print_scores(rf_cv_scores)

Model Performance (CV): $30,095.84 (+/- $7,696.55)
Percentage Error (CV): 16.63% (+/- 4.25%)
Target variable mean: $180,921.20


In [112]:
# XGBoost
import xgboost as xgb

xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42, learning_rate=0.1)

xgb_cv_scores = cross_val_score(xgb_model, final_df, y, cv=5, scoring='neg_mean_squared_error')
print_scores(xgb_cv_scores)


Model Performance (CV): $27,268.95 (+/- $8,965.59)
Percentage Error (CV): 15.07% (+/- 4.96%)
Target variable mean: $180,921.20


In [113]:
from sklearn.ensemble import GradientBoostingRegressor

gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42, learning_rate=0.1)

gb_cv_scores = cross_val_score(gb_model, final_df, y, cv=5, scoring='neg_mean_squared_error')
print_scores(gb_cv_scores)


Model Performance (CV): $26,219.22 (+/- $6,816.72)
Percentage Error (CV): 14.49% (+/- 3.77%)
Target variable mean: $180,921.20


In [114]:
import lightgbm as lgb

lgb_model = lgb.LGBMRegressor(n_estimators=100, random_state=42, learning_rate=0.1, verbose=-1)

lgb_cv_scores = cross_val_score(lgb_model, final_df, y, cv=5, scoring='neg_mean_squared_error') # type: ignore

print_scores(lgb_cv_scores)

Model Performance (CV): $28,685.10 (+/- $8,337.02)
Percentage Error (CV): 15.86% (+/- 4.61%)
Target variable mean: $180,921.20


In [115]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# SVR works better with scaled features
svr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR(kernel='rbf', C=100, gamma='scale'))
])

svr_cv_scores = cross_val_score(svr_pipeline, final_df, y, cv=5, scoring='neg_mean_squared_error')
print_scores(svr_cv_scores)

Model Performance (CV): $79,331.77 (+/- $12,720.58)
Percentage Error (CV): 43.85% (+/- 7.03%)
Target variable mean: $180,921.20


From all our models GradientBoosting gives the best results. Let's do feature engineering now to improve our results

In [116]:
# Let's remove Id
from sklearn.ensemble import GradientBoostingRegressor

X_clean = X.copy()
X_clean = X_clean.drop(columns=["Id"]) # remove numerical features with 0.03 < correlation > -0.03. Removing more gives worse results

final_df = encode_categorical_features(X_clean)

gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42, learning_rate=0.1)

gb_cv_scores = cross_val_score(gb_model, final_df, y, cv=5, scoring='neg_mean_squared_error')
print_scores(gb_cv_scores)

Model Performance (CV): $26,702.86 (+/- $7,965.73)
Percentage Error (CV): 14.76% (+/- 4.40%)
Target variable mean: $180,921.20


In [117]:
# Let's remove features with very weak correlation (close to zero)
from sklearn.ensemble import GradientBoostingRegressor

X_clean = X.copy()
X_clean = X_clean.drop(columns=['BsmtFinSF2', 'BsmtHalfBath', 'MiscVal', "Id"]) # remove numerical features with 0.03 < correlation > -0.03. Removing more gives worse results

final_df = encode_categorical_features(X_clean)

gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42, learning_rate=0.1)

gb_cv_scores = cross_val_score(gb_model, final_df, y, cv=5, scoring='neg_mean_squared_error')
print_scores(gb_cv_scores)

Model Performance (CV): $26,289.06 (+/- $7,933.59)
Percentage Error (CV): 14.53% (+/- 4.39%)
Target variable mean: $180,921.20


In [118]:
# Let's remove features with very weak correlation (close to zero)
from sklearn.ensemble import GradientBoostingRegressor

X_clean = X.copy()
X_clean = X_clean.drop(columns=['BsmtFinSF2', 'BsmtHalfBath', 'MiscVal', "Id", 'LowQualFinSF', 'YrSold']) # remove numerical features with 0.03 < correlation > -0.03. Removing more gives worse results

final_df = encode_categorical_features(X_clean)

gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42, learning_rate=0.1)

gb_cv_scores = cross_val_score(gb_model, final_df, y, cv=5, scoring='neg_mean_squared_error')
print_scores(gb_cv_scores)

Model Performance (CV): $26,589.31 (+/- $6,461.66)
Percentage Error (CV): 14.70% (+/- 3.57%)
Target variable mean: $180,921.20


In [119]:
# Let's try to remove weak categorical features

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency
from sklearn.ensemble import RandomForestRegressor

# Method 4: Tree-based Feature Importance
def get_low_importance_categorical(X, y, importance_threshold=0.001):
    """
    Use Random Forest to identify categorical features with low importance
    """
    # Encode categorical features temporarily
    temp_df = encode_categorical_features(X)
    
    # Fit Random Forest to get feature importances
    rf_temp = RandomForestRegressor(n_estimators=50, random_state=42)
    rf_temp.fit(temp_df, y)
    
    # Get feature importances
    feature_importance = pd.DataFrame({
        'feature': temp_df.columns,
        'importance': rf_temp.feature_importances_
    }).sort_values('importance', ascending=False)
    
    # Identify original categorical column names from encoded features
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
    categorical_importance = {}
    
    for cat_col in categorical_cols:
        # Find all encoded features that belong to this categorical column
        encoded_features = [f for f in temp_df.columns if f.startswith(f'cat__{cat_col}')]
        # Sum importances of all encoded features for this categorical column
        total_importance = feature_importance[feature_importance['feature'].isin(encoded_features)]['importance'].sum()
        categorical_importance[cat_col] = total_importance
    
    # Identify low importance categorical features
    low_importance_features = [col for col, imp in categorical_importance.items() 
                              if imp < importance_threshold]
    
    return low_importance_features, categorical_importance

# Apply Method 4: Tree-based Feature Importance
print("\n=== Method 4: Tree-based Feature Importance ===")
low_importance_features, cat_importance = get_low_importance_categorical(X_clean, y)

print("Categorical feature importances:")
sorted_importance = sorted(cat_importance.items(), key=lambda x: x[1], reverse=True)
for feature, importance in sorted_importance:
    print(f"{feature}: {importance:.4f}")

print(f"\nLow importance features: {low_importance_features}")


X_clean_clean = X.copy()
X_clean_clean.drop(columns=low_importance_features, inplace=True)

final_df_clean = encode_categorical_features(X_clean_clean)
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42, learning_rate=0.1)
gb_cv_scores_clean = cross_val_score(gb_model, final_df_clean, y, cv=5, scoring='neg_mean_squared_error')
print_scores(gb_cv_scores_clean)




=== Method 4: Tree-based Feature Importance ===
Categorical feature importances:
BsmtQual: 0.0054
Neighborhood: 0.0049
GarageType: 0.0039
KitchenQual: 0.0039
ExterQual: 0.0034
BsmtExposure: 0.0025
Exterior1st: 0.0023
GarageFinish: 0.0022
CentralAir: 0.0022
FireplaceQu: 0.0021
Exterior2nd: 0.0021
MSZoning: 0.0019
LandContour: 0.0016
MasVnrType: 0.0016
LotShape: 0.0015
BsmtFinType1: 0.0013
SaleCondition: 0.0012
SaleType: 0.0012
LotConfig: 0.0011
Condition1: 0.0011
HouseStyle: 0.0010
RoofStyle: 0.0010
GarageQual: 0.0009
Heating: 0.0008
Foundation: 0.0008
ExterCond: 0.0008
BsmtFinType2: 0.0007
HeatingQC: 0.0007
GarageCond: 0.0007
Functional: 0.0005
BldgType: 0.0005
Fence: 0.0005
RoofMatl: 0.0005
LandSlope: 0.0004
BsmtCond: 0.0004
PavedDrive: 0.0004
Alley: 0.0003
Condition2: 0.0002
Electrical: 0.0002
PoolQC: 0.0001
MiscFeature: 0.0001
Street: 0.0000
Utilities: 0.0000

Low importance features: ['Street', 'Alley', 'Utilities', 'LandSlope', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',

Removing these features worsen our score. Let's remove only the limited ['BsmtFinSF2', 'BsmtHalfBath', 'MiscVal', "Id", 'LowQualFinSF', 'YrSold'] that give us a base score of 14.70% (+/- 3.57%)

In [120]:
# Let's feature engineer new columns

X_new_features = X.copy()

# Aggregate Features

# Total bathrooms
X_new_features['TotalBath'] = X_new_features['FullBath'] + X_new_features['HalfBath'] * 0.5 + X_new_features['BsmtFullBath'] + X_new_features['BsmtHalfBath'] * 0.5

# Total porch area
X_new_features['TotalPorchSF'] = X_new_features['OpenPorchSF'] + X_new_features['EnclosedPorch'] + X_new_features['3SsnPorch'] + X_new_features['ScreenPorch']

# Living area per room
X_new_features['LivAreaPerRoom'] = X_new_features['GrLivArea'] / X_new_features['TotRmsAbvGrd']

# Age-Based Features

# House age at time of sale
X_new_features['HouseAge'] = X_new_features['YrSold'] - X_new_features['YearBuilt']

# Years since remodel
X_new_features['YearsSinceRemodel'] = X_new_features['YrSold'] - X_new_features['YearRemodAdd']

# Garage age
X_new_features['GarageAge'] = X_new_features['YrSold'] - X_new_features['GarageYrBlt']

# Was house remodeled?
X_new_features['IsRemodeled'] = (X_new_features['YearRemodAdd'] != X_new_features['YearBuilt']).astype(int)


# Quality/Condition Combinations

# Overall quality * condition interaction
X_new_features['QualCondProduct'] = X_new_features['OverallQual'] * X_new_features['OverallCond']

# Quality scores (convert categorical to numerical)
quality_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
X_new_features['ExterQual_num'] = X_new_features['ExterQual'].map(quality_map)
X_new_features['KitchenQual_num'] = X_new_features['KitchenQual'].map(quality_map)

# Average quality score
X_new_features['AvgQuality'] = (X_new_features['OverallQual'] + X_new_features['ExterQual_num'] + X_new_features['KitchenQual_num']) / 3


# Has specific features
X_new_features['HasPool'] = (X_new_features['PoolArea'] > 0).astype(int)
X_new_features['HasGarage'] = (X_new_features['GarageArea'] > 0).astype(int)
X_new_features['HasBasement'] = (X_new_features['TotalBsmtSF'] > 0).astype(int)
X_new_features['HasFireplace'] = (X_new_features['Fireplaces'] > 0).astype(int)
X_new_features['HasWoodDeck'] = (X_new_features['WoodDeckSF'] > 0).astype(int)
X_new_features['HasFence'] = (X_new_features['Fence'] != 'NA').astype(int)


# Garage to lot area ratio
X_new_features['GarageRatio'] = X_new_features['GarageArea'] / X_new_features['LotArea']

# Living area to lot area ratio
X_new_features['LivAreaRatio'] = X_new_features['GrLivArea'] / X_new_features['LotArea']


# Is corner lot
X_new_features['IsCornerLot'] = (X_new_features['LotConfig'] == 'Corner').astype(int)

# Sale season
X_new_features['SaleSeason'] = X_new_features['MoSold'].apply(lambda x: 'Spring' if x in [3,4,5] 
                                      else 'Summer' if x in [6,7,8]
                                      else 'Fall' if x in [9,10,11] 
                                      else 'Winter')

# House size categories
X_new_features['HouseSizeCategory'] = pd.cut(X_new_features['GrLivArea'], 
                                bins=[0, 1200, 1800, 2500, float('inf')], 
                                labels=['Small', 'Medium', 'Large', 'XLarge'])


# Create indicators for missing values in key features
missing_features = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
for feature in missing_features:
    X_new_features[f'{feature}_Missing'] = X_new_features[feature].isnull().astype(int)
    
final_df = encode_categorical_features(X_new_features)
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42, learning_rate=0.1)
gb_cv_scores_clean = cross_val_score(gb_model, final_df, y, cv=5, scoring='neg_mean_squared_error')
print_scores(gb_cv_scores_clean)


Model Performance (CV): $25,893.63 (+/- $6,970.47)
Percentage Error (CV): 14.31% (+/- 3.85%)
Target variable mean: $180,921.20
