In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

In [2]:
# Load the housing listings from a CSV located relative to the working directory.
DATA_PATH = 'Housing-Prices-in-Chennai.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows of the loaded frame.
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

      Price  Area        Location  No. of Bedrooms  Resale  MaintenanceStaff  \
0   5500000  1310   Perungalathur                3       0                 0   
1   5350000  1126      Madhavaram                2       0                 0   
2   8205000  1307      Karapakkam                3       0                 0   
3  23400000  3600  Thiruvidandhai                3       0                 0   
4  10100000  1700  Iyappanthangal                3       0                 0   

   Gymnasium  SwimmingPool  LandscapedGardens  JoggingTrack  ...  \
0          0             0                  0             0  ...   
1          1             1                  1             0  ...   
2          1             1                  1             1  ...   
3          1             1                  0             1  ...   
4          1             1                  1             1  ...   

   LiftAvailable  BED  VaastuCompliant  Microwave  GolfCourse  TV  \
0              0    0                0   

In [4]:
# Engineered feature 1: floor area divided by the number of bedrooms.
df["AreaPerBedroom"] = df["Area"] / df["No. of Bedrooms"]

# Engineered feature 2: 1 when the four amenity columns below sum to more
# than 2, otherwise 0.
# NOTE(review): this assumes the amenity columns hold only 0/1; any other code
# (e.g. a missing-value placeholder) would inflate the sum -- confirm against the data.
amenity_total = df["SwimmingPool"] + df["Gymnasium"] + df["AC"] + df["PowerBackup"]
df["IsLuxury"] = (amenity_total > 2).astype(int)

In [5]:
# Columns fed to the model. The order is significant: it fixes the column
# order of X and therefore of the encoded training matrix.
features = [
    # Core listing attributes
    "Area",
    "Location",
    "No. of Bedrooms",
    "Resale",
    # Further dataset columns (nearby facilities / amenities)
    "Hospital",
    "LiftAvailable",
    "School",
    "CarParking",
    "Gymnasium",
    "SwimmingPool",
    "PowerBackup",
    "ClubHouse",
    "ShoppingMall",
    "VaastuCompliant",
    "Children'splayarea",
    "GolfCourse",
    "AC",
    # Engineered columns added to df in the feature-engineering cell
    "AreaPerBedroom",
    "IsLuxury",
]

In [6]:
# Design matrix: the selected feature columns, in the order given by `features`.
X = df[features]

# Target: log1p(Price). Predictions must be mapped back with np.expm1.
price = df["Price"]
y = np.log1p(price)

In [7]:
# Expand Location into indicator columns; drop_first=True omits the column
# for the first category, which becomes the implicit baseline.
X_encoded = pd.get_dummies(X, drop_first=True, columns=["Location"])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
split_parts = train_test_split(X_encoded, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

# Fit the gradient-boosted regressor on the training portion.
gbr_params = {
    "n_estimators": 150,
    "learning_rate": 0.1,
    "max_depth": 5,
    "random_state": 42,
}
model = GradientBoostingRegressor(**gbr_params)
model.fit(X_train, y_train)

In [None]:
print("\n📥 Enter property details to predict price:")

# User inputs
user_area = int(input("Enter Area (in sq.ft): "))
user_location = input("Enter Location: ")
user_bedrooms = int(input("Enter No. of Bedrooms: "))

# Reject non-positive numbers up front: zero bedrooms would otherwise surface
# as a bare ZeroDivisionError when AreaPerBedroom is computed below.
if user_area <= 0 or user_bedrooms <= 0:
    raise ValueError("Area and No. of Bedrooms must both be positive whole numbers.")

# Resolve the location case-insensitively against the spellings present in the
# dataset. An unmatched name would yield no matching dummy column, and the
# reindex below would zero-fill every Location_* column -- silently scoring the
# row as the baseline location removed by drop_first=True during training.
known_locations = {
    str(name).strip().casefold(): name for name in df["Location"].dropna().unique()
}
location_key = user_location.strip().casefold()
if location_key not in known_locations:
    raise ValueError(
        f"Unknown location {user_location!r}. "
        "Use one of the names in df['Location'].unique()."
    )
matched_location = known_locations[location_key]

# Start from the most frequent (mode) value of every model feature, then
# overwrite the three fields the user supplied.
default_values = df[features].mode().iloc[0].to_dict()
input_dict = {feature: default_values[feature] for feature in features}
input_dict.update(
    {
        "Area": user_area,
        "No. of Bedrooms": user_bedrooms,
        "Location": matched_location,  # exact spelling used in the training data
    }
)

# Recompute the engineered features so they agree with this row's values
# (same formulas as the feature-engineering cell).
input_dict["AreaPerBedroom"] = user_area / user_bedrooms
row_amenity_total = sum(
    input_dict[name] for name in ("SwimmingPool", "Gymnasium", "AC", "PowerBackup")
)
input_dict["IsLuxury"] = int(row_amenity_total > 2)

# Create a one-row DataFrame and one-hot encode Location
input_df = pd.DataFrame([input_dict])
input_df_encoded = pd.get_dummies(input_df, columns=["Location"])

# Align with the training columns: a single row only produces one Location_*
# column, so every other training column is added and filled with 0.
input_df_encoded = input_df_encoded.reindex(columns=X_encoded.columns, fill_value=0)

# The model predicts log1p(Price); invert with expm1 to get the price itself
predicted_price_log = model.predict(input_df_encoded)[0]
predicted_price = np.expm1(predicted_price_log)

print(f"\n🏡 Predicted House Price: ₹ {int(predicted_price):,}")


📥 Enter property details to predict price:


In [None]:
# --- Model Evaluation ---
# The model was fit on log1p(Price), so both the predictions and the held-out
# targets are mapped back with expm1 before scoring; errors are therefore
# reported on the original price scale.
y_pred_log = model.predict(X_test)
y_pred, y_test_actual = np.expm1(y_pred_log), np.expm1(y_test)

# Evaluation metrics on the test split
r2 = r2_score(y_test_actual, y_pred)
mae = mean_absolute_error(y_test_actual, y_pred)
mse = mean_squared_error(y_test_actual, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE, in the same units as Price

print(
    "\n📈 Model Evaluation :",
    f"R² Score: {r2:.4f}",
    f"MAE : ₹ {mae:,.2f}",
    f"RMSE: ₹ {rmse:,.2f}",
    sep="\n",
)

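In the code, we take the Area, Location and No. of Bedrooms (BHK count) as input from the user, fill every other feature with its most frequent value in the dataset, and then use the features
[Area, Location, No. of Bedrooms, Resale, Hospital, LiftAvailable, School, CarParking, Gymnasium, SwimmingPool, PowerBackup, ClubHouse, ShoppingMall, VaastuCompliant, Children'splayarea, GolfCourse, AC, AreaPerBedroom, IsLuxury] to predict the price as per the dataset.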

We verify the working of the model on the held-out test set using the R² score (r2_score), together with the MAE and RMSE.
With an R² score of 0.3265, the model explains about 32.65 % of the variance in house prices on the test set with enhanced feature engineering (R² measures the share of variance explained, not how often the model is right).

The ideal target is an R² score above 0.5, which could be reached by using more advanced ML models than GradientBoostingRegressor and by including more features.