# 02 – Data Cleaning & Feature Engineering
Preparing data for machine learning.


In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Load the raw house-price CSV into the working DataFrame `df`.
# The path is relative, so it is resolved against the kernel's working directory.
raw_csv_path = "../data/raw/house_prices_srilanka.csv"
df = pd.read_csv(raw_csv_path)

In [2]:
# Feature engineering, expressed as one chained transformation:
#   1. house_age  - years between `year_built` and the fixed reference year 2025
#   2. year_built - dropped once house_age has been derived from it
#   3. has_garden / has_ac - cast to int (Boolean to integer)
df = (
    df.assign(house_age=2025 - df["year_built"])
      .drop(columns="year_built")
      .astype({"has_garden": int, "has_ac": int})
)

In [3]:
# Count the null entries in each column and print the per-column totals.
missing_per_column = df.isna().sum()
print("Missing values:\n", missing_per_column)

Missing values:
 district             0
area                 0
perch                0
bedrooms             0
bathrooms            0
kitchen_area_sqft    0
parking_spots        0
has_garden           0
has_ac               0
water_supply         0
electricity          0
floors               0
price_lkr            0
house_age            0
dtype: int64


In [4]:
# Outlier detection using IQR for price.
# A price counts as an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile.
Q1 = df['price_lkr'].quantile(0.25)
Q3 = df['price_lkr'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = df[(df['price_lkr'] < lower_bound) | (df['price_lkr'] > upper_bound)]
print(f"Number of outliers in price: {len(outliers)}")

# Remove outliers: keep only rows whose price lies within the bounds
# (`between` is inclusive on both ends, matching `>=` / `<=`).
# The explicit .copy() makes the filtered frame independent of the original,
# so later column assignments on `df` do not write into a slice and do not
# raise SettingWithCopyWarning.
df = df[df['price_lkr'].between(lower_bound, upper_bound)].copy()

Number of outliers in price: 949


In [5]:
# Additional features.
# Built with .assign so a new frame is returned instead of writing columns
# into the filtered frame in place. Later keyword arguments can see columns
# created by earlier ones, so `total_rooms` is available to `area_per_room`.
#   total_rooms   - bedrooms + bathrooms
#   area_per_room - perch divided by total_rooms; a zero room count is masked
#                   to NaN first so the division yields NaN rather than inf
#   luxury_score  - has_garden + has_ac + 1 if there is more than one parking spot
df = df.assign(
    total_rooms=lambda d: d['bedrooms'] + d['bathrooms'],
    area_per_room=lambda d: d['perch'] / d['total_rooms'].mask(d['total_rooms'].eq(0)),
    luxury_score=lambda d: (
        d['has_garden'] + d['has_ac'] + d['parking_spots'].gt(1).astype(int)
    ),
)

In [6]:
# One-hot encode the categorical columns into indicator columns.
# drop_first=True omits the first level of each encoded column, so a column
# with k categories contributes k-1 indicators.
categorical_cols = ["district", "area", "water_supply", "electricity"]
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [7]:
# Save processed data.
# to_csv does not create missing folders, so make sure the output directory
# exists first (it may be absent on a fresh checkout).
processed_path = Path("../data/processed/cleaned_data.csv")
processed_path.parent.mkdir(parents=True, exist_ok=True)
# index=False leaves the row index out of the written file.
df_encoded.to_csv(processed_path, index=False)