# House Price Prediction using Machine Learning

In [2]:

!pip install scikit-learn joblib

# Imports
import io

import joblib
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder




Loading the training dataset (`training_set.csv`) used to train the ML model

In [3]:
# Colab-only: ask the user to upload a file from the browser and keep the
# result in `uploaded`.
# The next cell reads 'training_set.csv' by name, so the uploaded file is
# expected to end up in the working directory under exactly that name.
from google.colab import files

uploaded = files.upload()


Saving training_set.csv to training_set.csv


In [4]:

# Read the uploaded training data, report its size and preview the first rows.
df = pd.read_csv('training_set.csv')

print(f'Dataset shape: {df.shape}')
df.head(n=5)


Dataset shape: (1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Remove the row identifier so it is not used as a feature.
# errors='ignore' keeps this cell idempotent: `df` is reassigned here, so a
# second run would otherwise raise KeyError because 'Id' is already gone.
df = df.drop(columns='Id', errors='ignore')
print(f'Dataset shape after dropping Id column: {df.shape}')
df.head()

Dataset shape after dropping Id column: (1460, 80)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


Feature Splitting

In [6]:

# Target is the sale price; the features are every other column. 'Id' is listed
# too (and tolerated when absent) so it can never leak into the feature matrix.
y = df['SalePrice']
X = df.drop(columns=['SalePrice', 'Id'], errors='ignore')

print(f'Features shape: {X.shape}')
print(f'Target shape: {y.shape}')


Features shape: (1460, 79)
Target shape: (1460,)


In [7]:
# Missing-value count per feature; show only the features that have any,
# ordered from most to fewest missing values.
nulls = X.isna().sum()
nulls.loc[nulls.gt(0)].sort_values(ascending=False)

Unnamed: 0,0
PoolQC,1453
MiscFeature,1406
Alley,1369
Fence,1179
MasVnrType,872
FireplaceQu,690
LotFrontage,259
GarageType,81
GarageYrBlt,81
GarageFinish,81


In [8]:
# Partition the feature names by dtype: int64/float64 columns are treated as
# numerical, object columns as categorical.
numerical = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical = list(X.select_dtypes(include=['object']).columns)

# Show a small sample of each group plus the group sizes.
print('Numerical:', numerical[:5])
print('Categorical:', categorical[:5])
print(f'Numerical columns: {len(numerical)}, Categorical columns: {len(categorical)}')


Numerical: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond']
Categorical: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour']
Numerical columns: 36, Categorical columns: 43


In [9]:
# Rating labels shared by every quality column, ordered from worst to best.
ordinal_map = ['Po', 'Fa', 'TA', 'Gd', 'Ex']

# Columns whose values are quality ratings and therefore have a natural order.
ordinal_cols = [
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'HeatingQC',
    'KitchenQual',
    'FireplaceQu',
    'GarageQual',
    'GarageCond',
    'PoolQC',
]

print('Ordinal columns:', ordinal_cols)


Ordinal columns: ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']


In [10]:
# Column groups that are actually routed to each branch. Computing them once
# guarantees that the ordinal encoder receives exactly one category list per
# column it is given (sizing the list by len(ordinal_cols) breaks as soon as
# one of those columns is missing from X).
ordinal_present = [c for c in ordinal_cols if c in X.columns]
nominal_cols = [c for c in categorical if c not in ordinal_cols]

# Numeric: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Nominal categorical (One-Hot Encoding): missing values become their own
# 'Missing' level; labels unseen during fit are ignored at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Ordinal (quality ratings): missing ratings are filled with 'Po', i.e. they
# share the lowest level of `ordinal_map`.
# NOTE(review): a missing rating may mean "feature not present" rather than
# "poor" — confirm whether it deserves a level of its own.
# Labels outside `ordinal_map` are encoded as -1 instead of raising, so that
# prediction on new data is as tolerant here as in the one-hot branch.
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Po')),
    ('ordinal', OrdinalEncoder(
        categories=[ordinal_map] * len(ordinal_present),
        handle_unknown='use_encoded_value',
        unknown_value=-1,
    ))
])

# Combine all: each branch only sees its own columns; columns not listed in any
# branch are not passed on.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical),
    ('cat', categorical_transformer, nominal_cols),
    ('ord', ordinal_transformer, ordinal_present)
])


Test-Train split

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f'Train shape: {X_train.shape}, Test shape: {X_test.shape}')


Train shape: (1168, 79), Test shape: (292, 79)


In [12]:
# SelectKBest: keep the 50 features with the highest univariate F-score.
feature_selector = SelectKBest(score_func=f_regression, k=50)


def build_pipeline(name, regressor):
    """Return a preprocessing -> feature-selection -> regressor pipeline.

    The preprocessor and the selector are cloned so that every pipeline owns
    its own unfitted copies. Sharing the same instances between pipelines
    means that fitting one pipeline silently overwrites the fitted state
    used by the other.

    Parameters
    ----------
    name : str
        Step name for the final estimator (e.g. 'ridge').
    regressor : estimator
        The final regression estimator.
    """
    return Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('feature_selection', clone(feature_selector)),
        (name, regressor),
    ])


# Ridge Pipeline
ridge_pipeline = build_pipeline('ridge', RidgeCV(alphas=[0.1, 1.0, 10.0]))

# Lasso Pipeline
lasso_pipeline = build_pipeline('lasso', LassoCV(alphas=[0.1, 1.0, 10.0]))


In [13]:
# Fit both candidates on the training split (Ridge first, then Lasso).
for pipeline in (ridge_pipeline, lasso_pipeline):
    pipeline.fit(X_train, y_train)

print('Models trained.')


Models trained.


In [14]:
# Ridge: predictions and metrics on the held-out split.
ridge_pred = ridge_pipeline.predict(X_test)
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
ridge_r2 = r2_score(y_test, ridge_pred)

# Lasso: same evaluation.
lasso_pred = lasso_pipeline.predict(X_test)
lasso_rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
lasso_r2 = r2_score(y_test, lasso_pred)

print(f'Ridge RMSE: {ridge_rmse:.2f}, R2: {ridge_r2:.4f}')
print(f'Lasso RMSE: {lasso_rmse:.2f}, R2: {lasso_r2:.4f}')


Ridge RMSE: 32867.04, R2: 0.8592
Lasso RMSE: 32638.26, R2: 0.8611


Selecting the best model: the one with the lowest RMSE on the held-out test split

In [34]:
# Ridge is chosen only when its test RMSE is strictly lower; a tie goes to Lasso.
ridge_wins = ridge_rmse < lasso_rmse
best_pipeline = ridge_pipeline if ridge_wins else lasso_pipeline
print("Best model: Ridge" if ridge_wins else "Best model: Lasso")



Best model: Lasso


# Testing the Model

In [37]:
# Colab-only: ask the user to upload the file with the houses to predict.
# This reassigns `uploaded`, replacing the result of the earlier upload.
# NOTE(review): `files` was already imported in an earlier cell; the repeated
# import is redundant.
from google.colab import files
uploaded = files.upload()


Saving sample_set.csv to sample_set (1).csv


In [38]:
# Load the data to predict on from the upload made in the previous cell.
# When a file with the same name already exists in the working directory, Colab
# stores the new upload under a different name ("sample_set (1).csv"), so
# reading the fixed path would silently pick up the older copy. Prefer the
# uploaded content held in memory and only fall back to the path when this name
# is not part of the last upload.
# NOTE(review): relies on `files.upload()` returning a mapping of the original
# file name to the file's bytes — confirm against the Colab documentation.
test_csv = 'sample_set.csv'
if test_csv in uploaded:
    test_df = pd.read_csv(io.BytesIO(uploaded[test_csv]))
else:
    test_df = pd.read_csv(test_csv)
print(test_df.shape)
test_df.head()

(1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


Now we predict `SalePrice` using the best pipeline, which is the Lasso pipeline according to the results above

In [40]:
# Predict with whichever pipeline was selected as best.
test_preds = best_pipeline.predict(test_df)

# Pair every house Id with its predicted SalePrice in one DataFrame.
output = test_df[['Id']].assign(SalePrice=test_preds)

output.head()


Unnamed: 0,Id,SalePrice
0,1461,112483.023978
1,1462,156546.668138
2,1463,175356.895895
3,1464,182743.689738
4,1465,201400.92052


Downloading the Predictions for individual IDs of houses

In [21]:
# Write the Id/SalePrice table to disk; index=False leaves the DataFrame index
# out of the file.
output.to_csv('submission.csv', index=False)

# Colab-only: hand the written file to the browser for download.
files.download('submission.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Checking for overfitting with the training data

# Predicting on training data with lasso and ridge regression

In [43]:
# Predictions on the data the models were fitted on (Ridge first, then Lasso).
ridge_train_pred, lasso_train_pred = (
    pipeline.predict(X_train) for pipeline in (ridge_pipeline, lasso_pipeline)
)

In [45]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Ridge: error and fit quality on the training split.
ridge_train_r2 = r2_score(y_train, ridge_train_pred)
ridge_train_rmse = np.sqrt(mean_squared_error(y_train, ridge_train_pred))

# Lasso: same metrics.
lasso_train_r2 = r2_score(y_train, lasso_train_pred)
lasso_train_rmse = np.sqrt(mean_squared_error(y_train, lasso_train_pred))

# One report, one line per item, with a rule between the two models.
print(
    "Ridge Train",
    f'RMSE: {ridge_train_rmse:.2f}, R2: {ridge_train_r2:.4f}',
    "-"*50,
    "Lasso Train",
    f'RMSE: {lasso_train_rmse:.2f}, R2: {lasso_train_r2:.4f}',
    sep="\n",
)


Ridge Train
RMSE: 31831.66, R2: 0.8301
--------------------------------------------------
Lasso Train
RMSE: 31708.56, R2: 0.8314


# Now Comparing both the Test and Train dataset predictions

In [46]:
# Side-by-side train/test metrics for both models, printed as one report.
print(
    "Ridge Regression",
    f'Ridge Train RMSE: {ridge_train_rmse:.2f}, Test RMSE: {ridge_rmse:.2f}',
    f'Ridge Train R2: {ridge_train_r2:.4f}, Test R2: {ridge_r2:.4f}',
    "-"*50,
    "\nLasso Regression",
    f'Lasso Train RMSE: {lasso_train_rmse:.2f}, Test RMSE: {lasso_rmse:.2f}',
    f'Lasso Train R2: {lasso_train_r2:.4f}, Test R2: {lasso_r2:.4f}',
    sep="\n",
)


Ridge Regression
Ridge Train RMSE: 31831.66, Test RMSE: 32867.04
Ridge Train R2: 0.8301, Test R2: 0.8592
--------------------------------------------------

Lasso Regression
Lasso Train RMSE: 31708.56, Test RMSE: 32638.26
Lasso Train R2: 0.8314, Test R2: 0.8611


In [33]:

# Predict once more with the selected pipeline and pair each Id with its price.
test_preds = best_pipeline.predict(test_df)
submission = test_df[['Id']].assign(SalePrice=test_preds)

# Save without the DataFrame index, then offer the file for download (Colab-only).
submission.to_csv('submission.csv', index=False)

from google.colab import files
files.download('submission.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 📊 Model Evaluation Summary

We compared **Ridge Regression** and **Lasso Regression** using RMSE and R² metrics for both training and test sets.

**Ridge Regression**
- **Train RMSE:** 31,831.66  
- **Test RMSE:** 32,867.04  
- **Train R²:** 0.8301  
- **Test R²:** 0.8592

**Lasso Regression**
- **Train RMSE:** 31,708.56  
- **Test RMSE:** 32,638.26  
- **Train R²:** 0.8314  
- **Test R²:** 0.8611

The results indicate that both models generalize well, with minimal performance drop from training to test data. The **Test RMSE is close to the Train RMSE** (about 3% higher for both models), and the **Test R² is slightly higher** than the Train R², which suggests there is no significant overfitting.

✅ **Best model:** Based on slightly better test metrics, **Lasso Regression** was selected as the final model for predicting house prices on the out-of-sample test data.

---


## 🎉 **Conclusion**

This notebook demonstrates an end-to-end workflow for:
- **Data preprocessing**
- **Feature selection**
- **Ridge and Lasso model training**
- **Model selection**
- **Evaluation**
- **Out-of-sample prediction**
- **Exporting final results**

With clean feature engineering, proper regularization, and evaluation, the model should provide robust predictions for house prices in the given dataset.
