In [1]:
import os, sys
import numpy as np
import pandas as pd
import joblib

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

def concat_df(train_data, test_data):
    # Returns a concatenated df of training and test set
    return pd.concat([train_data, test_data], sort=True).reset_index(drop=True)

def divide_df(all_data, train_len):
    # Returns divided dfs of training and test set
    return all_data.loc[:train_len-1].copy(), all_data.loc[train_len:].drop(['SalePrice'], axis=1).copy()

# Load data
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Store train length and target
train_len = len(df_train)
y_train = df_train['SalePrice'].copy()

# Log transform target for better distribution
y_train_log = np.log1p(y_train)

df_all = concat_df(df_train, df_test)

print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)
print("\nMissing values in train:")
print(df_train.isnull().sum()[df_train.isnull().sum() > 0].sort_values(ascending=False))

# ============================================
# FEATURE ENGINEERING
# ============================================

# 1. Handle missing values strategically
# Categorical features - fill with 'None' or mode
categorical_none = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 
                    'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 
                    'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']

for col in categorical_none:
    df_all[col] = df_all[col].fillna('None')

# MSZoning - fill with mode
df_all['MSZoning'] = df_all['MSZoning'].fillna(df_all['MSZoning'].mode()[0])

# Utilities - mostly same value, can drop
df_all = df_all.drop(['Utilities'], axis=1)

# Functional - fill with most common
df_all['Functional'] = df_all['Functional'].fillna('Typ')

# Exterior - fill with mode
df_all['Exterior1st'] = df_all['Exterior1st'].fillna(df_all['Exterior1st'].mode()[0])
df_all['Exterior2nd'] = df_all['Exterior2nd'].fillna(df_all['Exterior2nd'].mode()[0])

# Electrical - fill with mode
df_all['Electrical'] = df_all['Electrical'].fillna(df_all['Electrical'].mode()[0])

# KitchenQual - fill with mode
df_all['KitchenQual'] = df_all['KitchenQual'].fillna(df_all['KitchenQual'].mode()[0])

# SaleType - fill with mode
df_all['SaleType'] = df_all['SaleType'].fillna(df_all['SaleType'].mode()[0])

# Numeric features - fill with 0 or median
df_all['LotFrontage'] = df_all.groupby('Neighborhood')['LotFrontage'].transform(
    lambda x: x.fillna(x.median()))

numeric_zero = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 
                'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars', 
                'GarageArea', 'MasVnrArea']

for col in numeric_zero:
    df_all[col] = df_all[col].fillna(0)

df_all['MasVnrType'] = df_all['MasVnrType'].fillna('None')

# 2. Create new features
# Total square footage
df_all['TotalSF'] = df_all['TotalBsmtSF'] + df_all['1stFlrSF'] + df_all['2ndFlrSF']

# Total bathrooms
df_all['TotalBath'] = (df_all['FullBath'] + 0.5 * df_all['HalfBath'] +
                       df_all['BsmtFullBath'] + 0.5 * df_all['BsmtHalfBath'])

# Total porch area
df_all['TotalPorchSF'] = (df_all['OpenPorchSF'] + df_all['3SsnPorch'] +
                          df_all['EnclosedPorch'] + df_all['ScreenPorch'] +
                          df_all['WoodDeckSF'])

# Has pool, garage, basement, fireplace
df_all['HasPool'] = (df_all['PoolArea'] > 0).astype(int)
df_all['HasGarage'] = (df_all['GarageArea'] > 0).astype(int)
df_all['HasBsmt'] = (df_all['TotalBsmtSF'] > 0).astype(int)
df_all['HasFireplace'] = (df_all['Fireplaces'] > 0).astype(int)

# House age and remodel age
df_all['HouseAge'] = df_all['YrSold'] - df_all['YearBuilt']
df_all['RemodelAge'] = df_all['YrSold'] - df_all['YearRemodAdd']

# Is remodeled
df_all['IsRemodeled'] = (df_all['YearBuilt'] != df_all['YearRemodAdd']).astype(int)

# Quality-Area interactions
df_all['OverallQual_TotalSF'] = df_all['OverallQual'] * df_all['TotalSF']
df_all['OverallQual_GrLivArea'] = df_all['OverallQual'] * df_all['GrLivArea']
df_all['OverallQual_GarageCars'] = df_all['OverallQual'] * df_all['GarageCars']

# Condition features
df_all['TotalQual'] = df_all['OverallQual'] + df_all['OverallCond']
df_all['QualCondDiff'] = df_all['OverallQual'] - df_all['OverallCond']

# Neighborhood quality (based on median price in training data)
df_train_temp, _ = divide_df(df_all, train_len)
neighborhood_price = df_train_temp.groupby('Neighborhood')['SalePrice'].median()
df_all['NeighborhoodPrice'] = df_all['Neighborhood'].map(neighborhood_price)

# Living area per room
df_all['TotRmsAbvGrd_Safe'] = df_all['TotRmsAbvGrd'].replace(0, 1)
df_all['LivAreaPerRoom'] = df_all['GrLivArea'] / df_all['TotRmsAbvGrd_Safe']
df_all = df_all.drop('TotRmsAbvGrd_Safe', axis=1)

# Garage age
df_all['GarageAge'] = df_all['YrSold'] - df_all['GarageYrBlt']
df_all['GarageAge'] = df_all['GarageAge'].replace(-np.inf, 0).replace(np.inf, 0).fillna(0)

# Basement finish ratio
df_all['BsmtFinRatio'] = df_all['BsmtFinSF1'] / (df_all['TotalBsmtSF'] + 1)

# 3. Ordinal encoding for quality features
quality_map = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
quality_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 
                'HeatingQC', 'KitchenQual', 'FireplaceQu', 
                'GarageQual', 'GarageCond', 'PoolQC']

for col in quality_cols:
    df_all[col] = df_all[col].map(quality_map)

# Basement exposure
exposure_map = {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}
df_all['BsmtExposure'] = df_all['BsmtExposure'].map(exposure_map)

# Basement finish type
bsmtfin_map = {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}
df_all['BsmtFinType1'] = df_all['BsmtFinType1'].map(bsmtfin_map)
df_all['BsmtFinType2'] = df_all['BsmtFinType2'].map(bsmtfin_map)

# Garage finish
garagefin_map = {'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}
df_all['GarageFinish'] = df_all['GarageFinish'].map(garagefin_map)

# Fence
fence_map = {'None': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}
df_all['Fence'] = df_all['Fence'].map(fence_map)

# 4. Get remaining categorical features for one-hot encoding
categorical_features = df_all.select_dtypes(include=['object']).columns.tolist()

# Remove Id columns if present
if 'Id' in categorical_features:
    categorical_features.remove('Id')

# One-hot encode
df_all = pd.get_dummies(df_all, columns=categorical_features, drop_first=True)

# 5. Handle any remaining missing values
df_all = df_all.fillna(0)

# Divide back to train and test
df_train, df_test = divide_df(df_all, train_len)

print("\nFinal train shape:", df_train.shape)
print("Final test shape:", df_test.shape)

# ============================================
# MODEL TRAINING
# ============================================

# Drop unnecessary columns
drop_cols = ['Id', 'SalePrice']
feature_cols = [col for col in df_train.columns if col not in drop_cols]

X = df_train[feature_cols]
y = y_train_log
X_test = df_test[feature_cols]

# Robust scaling for features
scaler = RobustScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Define models
models = {
    'RandomForest': RandomForestRegressor(
        n_estimators=500,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=3,
        max_features='sqrt',
        random_state=42,
        n_jobs=-1
    ),
    'GradientBoosting': GradientBoostingRegressor(
        n_estimators=800,
        learning_rate=0.05,
        max_depth=4,
        min_samples_split=5,
        min_samples_leaf=3,
        subsample=0.8,
        random_state=42
    ),
    'XGBoost': XGBRegressor(
        n_estimators=800,
        learning_rate=0.05,
        max_depth=4,
        min_child_weight=3,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    ),
    'LightGBM': LGBMRegressor(
        n_estimators=800,
        learning_rate=0.05,
        max_depth=4,
        num_leaves=31,
        min_child_samples=5,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )
}

# K-Fold Cross-Validation
N = 5
kf = KFold(n_splits=N, shuffle=True, random_state=42)

model_predictions = {}
model_scores = {}

for model_name, model in models.items():
    print(f"\n{'='*50}")
    print(f"Training {model_name}")
    print(f"{'='*50}")
    
    preds_test = np.zeros((len(X_test_scaled), N))
    rmse_scores = []
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled), 1):
        X_tr, X_val = X_scaled.iloc[train_idx], X_scaled.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        # Train model
        model.fit(X_tr, y_tr)
        
        # Validation predictions
        y_val_pred = model.predict(X_val)
        
        # Calculate RMSE on log scale
        rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
        rmse_scores.append(rmse)
        
        print(f"  Fold {fold}/{N} - RMSE: {rmse:.5f}")
        
        # Test predictions
        preds_test[:, fold - 1] = model.predict(X_test_scaled)
    
    # Average predictions
    preds_mean = preds_test.mean(axis=1)
    model_predictions[model_name] = preds_mean
    model_scores[model_name] = np.mean(rmse_scores)
    
    print(f"\n{model_name} - Average CV RMSE: {np.mean(rmse_scores):.5f}")

# ============================================
# ENSEMBLE PREDICTIONS
# ============================================

print(f"\n{'='*50}")
print("Model Performance Summary")
print(f"{'='*50}")
for model_name, score in sorted(model_scores.items(), key=lambda x: x[1]):
    print(f"{model_name}: {score:.5f}")

# Weighted ensemble based on inverse RMSE
weights = {}
total_inv_rmse = sum(1/score for score in model_scores.values())
for model_name, score in model_scores.items():
    weights[model_name] = (1/score) / total_inv_rmse
    print(f"\n{model_name} weight: {weights[model_name]:.4f}")

# Create ensemble prediction
ensemble_pred_log = sum(model_predictions[name] * weight 
                        for name, weight in weights.items())

# Convert back from log scale
ensemble_pred = np.expm1(ensemble_pred_log)

# Create submission
submission = pd.DataFrame({
    "Id": df_test["Id"],
    "SalePrice": ensemble_pred
})

submission.to_csv("submission_house_prices.csv", index=False)

print("\n" + "="*50)
print("Submission file 'submission_house_prices.csv' created.")
print("="*50)

# Also save individual model predictions
for model_name, preds_log in model_predictions.items():
    preds = np.expm1(preds_log)
    sub = pd.DataFrame({
        "Id": df_test["Id"],
        "SalePrice": preds
    })
    filename = f"submission_{model_name.lower()}.csv"
    sub.to_csv(filename, index=False)
    print(f"Individual submission saved: {filename}")

Train shape: (1460, 81)
Test shape: (1459, 80)

Missing values in train:
PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
BsmtFinType2      38
BsmtExposure      38
BsmtFinType1      37
BsmtCond          37
BsmtQual          37
MasVnrArea         8
Electrical         1
dtype: int64

Final train shape: (1460, 231)
Final test shape: (1459, 230)

Training RandomForest
  Fold 1/5 - RMSE: 0.14888
  Fold 2/5 - RMSE: 0.12309
  Fold 3/5 - RMSE: 0.15016
  Fold 4/5 - RMSE: 0.14605
  Fold 5/5 - RMSE: 0.11635

RandomForest - Average CV RMSE: 0.13691

Training GradientBoosting
  Fold 1/5 - RMSE: 0.13514
  Fold 2/5 - RMSE: 0.11355
  Fold 3/5 - RMSE: 0.15835
  Fold 4/5 - RMSE: 0.12236
  Fold 5/5 - RMSE: 0.10908

GradientBoosting - Average CV RMSE: 0.12769

Training XGBoost
  Fold 1/5 - RMSE: 0.13314
  