In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here"s several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk("/kaggle/input"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won"t be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
# Load files train.csv and test.csv
train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")
print("Train shape:", train.shape)
print("Test shape:", test.shape)
train.head()

Train shape: (1460, 81)
Test shape: (1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Import libraries
from scipy import stats
from scipy.special import boxcox1p
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor, StackingRegressor
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
# Outlier handling - Remove clear outliers
y = train["SalePrice"].copy()
outliers = train[(train["GrLivArea"] > 4000) & (y < 300000)].index
if len(outliers) > 0:
    train = train.drop(outliers)
    y = y.drop(outliers)
    train.reset_index(drop=True, inplace=True)
    y.reset_index(drop=True, inplace=True)
    print(f"Removed {len(outliers)} outliers")

Removed 2 outliers


In [6]:
# Extract IDs and features
train_ids = train["Id"]
test_ids = test["Id"]
train = train.drop(["Id", "SalePrice"], axis=1)
test = test.drop(["Id"], axis=1)

# Combine train and test for joint preprocessing
all_data = pd.concat([train, test], axis=0).reset_index(drop=True)
print(f"All data shape: {all_data.shape}")

All data shape: (2917, 79)


In [7]:
# Missing values handling
# LotFrontage - fill by Neighborhood median
if "LotFrontage" in all_data.columns:
    all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
        lambda x: x.fillna(x.median())
    )

# Numeric columns - fill with 0
num_cols = all_data.select_dtypes(include=["number"]).columns
all_data[num_cols] = all_data[num_cols].fillna(0)

# Categorical columns - fill with "None"
cat_cols = all_data.select_dtypes(exclude=["number"]).columns
all_data[cat_cols] = all_data[cat_cols].fillna("None")

print(f"Missing values remaining: {all_data.isnull().sum().sum()}")

Missing values remaining: 0


In [8]:
# Feature Engineering - Create new features
# 1. Total area
all_data["TotalSF"] = all_data["TotalBsmtSF"] + all_data["1stFlrSF"] + all_data["2ndFlrSF"]

# 2. Total bathrooms
all_data["TotalBath"] = (all_data["FullBath"] + 0.5 * all_data["HalfBath"] + 
                         all_data["BsmtFullBath"] + 0.5 * all_data["BsmtHalfBath"])

# 3. House age
all_data["HouseAge"] = all_data["YrSold"] - all_data["YearBuilt"]
all_data["RemodAge"] = all_data["YrSold"] - all_data["YearRemodAdd"]

# 4. Interaction features (product)
all_data["QualSF"] = all_data["OverallQual"] * all_data["TotalSF"]
all_data["QualCond"] = all_data["OverallQual"] * all_data["OverallCond"]

# 5. Squared features
all_data["TotalSF_sq"] = all_data["TotalSF"] ** 2
all_data["GrLivArea_sq"] = all_data["GrLivArea"] ** 2

# 6. Ratios
all_data["LivLotRatio"] = all_data["GrLivArea"] / (all_data["LotArea"] + 1)

# 7. Binary features
all_data["HasGarage"] = (all_data["GarageArea"] > 0).astype(int)
all_data["HasBsmt"] = (all_data["TotalBsmtSF"] > 0).astype(int)

print(f"Features after engineering: {all_data.shape[1]}")

Features after engineering: 90


In [9]:
# Categorical encoding
# 1. Ordinal encoding - ordered variables
quality_map = {"None": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
ordinal_cols = ["ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "HeatingQC", 
                "KitchenQual", "FireplaceQu", "GarageQual", "GarageCond", "PoolQC"]

for col in ordinal_cols:
    if col in all_data.columns:
        all_data[col] = all_data[col].map(quality_map)

# 2. One-hot encoding - nominal variables
all_data = pd.get_dummies(all_data, drop_first=True)
print(f"Features after encoding: {all_data.shape[1]}")

Features after encoding: 246


In [10]:
# Split back to train and test
X = all_data.iloc[:len(train), :].copy()
X_test = all_data.iloc[len(train):, :].copy()

# Log transformation for the target to handle skewness
y_log = np.log1p(y)
print(f"Target skewness - Before: {y.skew():.4f}, After log: {y_log.skew():.4f}")
print(f"X: {X.shape}, X_test: {X_test.shape}")

Target skewness - Before: 1.8813, After log: 0.1216
X: (1458, 246), X_test: (1459, 246)


In [11]:
# Scale features using RobustScaler
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)
print("Data scaled")

Data scaled


In [12]:
# RMSE evaluation function with Cross-Validation
def rmse_cv(model, n_folds=5):
    kf = KFold(n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X_scaled, y_log, 
                                     scoring="neg_mean_squared_error", cv=kf))
    return rmse.mean(), rmse.std()

In [13]:
# Regularization models - models with regularization
# Ridge (L2), Lasso (L1), ElasticNet (L1 + L2)
ridge = Ridge(alpha=10, random_state=42)
lasso = Lasso(alpha=0.0005, max_iter=10000, random_state=42)
elastic = ElasticNet(alpha=0.001, l1_ratio=0.5, max_iter=10000, random_state=42)

print("Evaluate Regularization models:")
mean, std = rmse_cv(ridge)
print(f"Ridge: {mean:.5f} (+/-{std:.5f})")
mean, std = rmse_cv(lasso)
print(f"Lasso: {mean:.5f} (+/-{std:.5f})")
mean, std = rmse_cv(elastic)
print(f"ElasticNet: {mean:.5f} (+/-{std:.5f})")

Evaluate Regularization models:
Ridge: 0.11404 (+/-0.00742)
Lasso: 0.11440 (+/-0.00681)
ElasticNet: 0.11434 (+/-0.00685)


In [14]:
# Ensemble models - combined learning
# Random Forest (Bagging based)
rf = RandomForestRegressor(n_estimators=500, max_depth=15, random_state=42, n_jobs=-1)

# Gradient Boosting (Boosting)
gbr = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.03, 
                               max_depth=4, loss="huber", random_state=42)

# Bagging with Ridge
bag_ridge = BaggingRegressor(estimator=Ridge(alpha=10), n_estimators=50, 
                            random_state=42, n_jobs=-1)

print("\nEvaluate Ensemble models:")
mean, std = rmse_cv(rf)
print(f"Random Forest: {mean:.5f} (+/-{std:.5f})")
mean, std = rmse_cv(gbr)
print(f"Gradient Boosting: {mean:.5f} (+/-{std:.5f})")
mean, std = rmse_cv(bag_ridge)
print(f"Bagging Ridge: {mean:.5f} (+/-{std:.5f})")


Evaluate Ensemble models:
Random Forest: 0.13397 (+/-0.00917)
Gradient Boosting: 0.12450 (+/-0.00805)
Bagging Ridge: 0.11356 (+/-0.00755)


In [15]:
# Stacking Regressor - combine multiple models
stacking = StackingRegressor(
    estimators=[
        ("ridge", ridge),
        ("lasso", lasso),
        ("elastic", elastic),
        ("rf", rf),
        ("gbr", gbr)
    ],
    final_estimator=Ridge(alpha=5),
    cv=5,
    n_jobs=-1
)

print("\nEvaluate Stacking:")
mean, std = rmse_cv(stacking)
print(f"Stacking: {mean:.5f} (+/-{std:.5f})")

# Train the Stacking model on full training data
print("\nTraining stacking model...")
stacking.fit(X_scaled, y_log)
print("\nFinished training stacking model.")


Evaluate Stacking:
Stacking: 0.11256 (+/-0.00797)

Training stacking model...

Finished training stacking model.


In [16]:
# Predict and create submission
preds_log = stacking.predict(X_test_scaled)
preds = np.expm1(preds_log)  # Invert log transformation

submission = pd.DataFrame({
    "Id": test_ids,
    "SalePrice": preds
})
submission.to_csv("submission.csv", index=False)
print(f"Saved submission.csv with {len(submission)} predictions")


Saved submission.csv with 1459 predictions


In [17]:
# !cat submission.csv
submission.head(10)

Unnamed: 0,Id,SalePrice
0,1461,116011.868882
1,1462,157036.846074
2,1463,179719.574112
3,1464,196148.685919
4,1465,191555.444338
5,1466,172544.133451
6,1467,181110.903642
7,1468,165401.065966
8,1469,193906.460411
9,1470,118668.397911
