Load the CSV and Keep Only 'For Sale' Listings

In [31]:
import pandas as pd

# Read the listings file, then in one chain: drop the 'Unnamed: 0' and
# 'property_id' columns, keep only rows whose purpose is 'For Sale', and drop
# 'purpose' itself (it holds a single value after the filter).
housing_raw = pd.read_csv("housing.csv")

df = (
    housing_raw
    .drop(columns=['Unnamed: 0', 'property_id'])
    .loc[lambda listings: listings['purpose'] == 'For Sale']
    .drop(columns=['purpose'])
)

Removing Outliers

In [32]:
# Tukey-style fences: keep listings whose price lies within 1.5 * IQR of the
# first and third quartiles. Rows with a missing price compare as False and
# are therefore dropped as well.
# NOTE(review): the fences are computed on the full dataset, before any
# train/test split — confirm that is intended.
sale_prices = df['price']
q1, q3 = sale_prices.quantile(0.25), sale_prices.quantile(0.75)
iqr = q3 - q1

lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr

# Series.between is inclusive on both ends by default, matching >= and <=.
df_sale = df[sale_prices.between(lower_fence, upper_fence)]

Select Final Features

In [40]:
# Model input columns, in the order used to build X.
features = [
    'Total_Area', 'bedrooms', 'baths',
    'latitude', 'longitude',   # geographic coordinates
    'location_id',
]

# Regression target (price) and design matrix (the selected feature columns),
# both taken from the outlier-filtered frame. The two selections are independent.
y = df_sale.loc[:, 'price']
X = df_sale.loc[:, features]

Import Preprocessing Tools

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

Separate Numeric and Categorical Columns

In [34]:
# Column groups handled by different preprocessing branches: 'location_id'
# is treated as categorical, every other input as numeric.
cat_features = ['location_id']
num_features = [
    'Total_Area', 'bedrooms', 'baths',
    'latitude', 'longitude',
]

Numeric Pipeline

In [35]:
# Numeric branch: a single step that fills missing values with the median
# learned for each column during fit.
num_steps = [('imputer', SimpleImputer(strategy='median'))]
num_pipeline = Pipeline(steps=num_steps)

Categorical Pipeline

In [21]:
# Categorical branch: one-hot encode. With handle_unknown='ignore', a category
# not seen during fit is encoded as all zeros instead of raising at predict time.
cat_steps = [('encoder', OneHotEncoder(handle_unknown='ignore'))]
cat_pipeline = Pipeline(steps=cat_steps)

Combining Pipelines

In [22]:
# Apply each branch to its own column group and concatenate the results.
# Columns listed in neither group are dropped (ColumnTransformer's default
# remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features),
    ]
)

Full Model Pipeline

In [43]:
from sklearn.ensemble import RandomForestRegressor

# Random forest settings: 300 trees, depth capped at 20, a fixed seed so fits
# are repeatable, and all available cores (n_jobs=-1).
rf_regressor = RandomForestRegressor(
    n_estimators=300, max_depth=20, random_state=42, n_jobs=-1
)

# Chain preprocessing and the regressor so fit/predict accept the raw
# feature columns directly.
model_pipeline = Pipeline(
    steps=[
        ('preprocess', preprocessor),
        ('model', rf_regressor),
    ]
)

Train & Evaluate

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)

# Report hold-out error (same units as price) and explained variance.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MAE:", mae)
print("R²:", r2)

MAE: 6517500.0
R²: 0.7213500338154867


Save Model

In [45]:
import joblib

# Persist the fitted pipeline (preprocessing + model) as a single artifact.
# joblib.dump returns the list of files it wrote, shown as the cell output.
MODEL_PATH = 'pakistan_housing_price_model.pkl'
joblib.dump(model_pipeline, MODEL_PATH)

['pakistan_housing_price_model.pkl']

Load Model & Predict New Data

In [46]:
# Reload the persisted pipeline from disk. joblib files are pickle-based, so
# only load artifacts from a source you trust.
saved_model_file = 'pakistan_housing_price_model.pkl'
loaded_model = joblib.load(saved_model_file)

# A single listing to score, wrapped in a one-row frame. The keys are the
# same column names used as model features.
sample_listing = {
    'Total_Area': 1200, 'bedrooms': 3, 'baths': 2,
    'latitude': 31.5204, 'longitude': 74.3587,
    'location_id': 3852,
}
new_property = pd.DataFrame([sample_listing])

# predict returns one value per input row; the frame holds a single listing,
# so the first element is its predicted price.
prediction = loaded_model.predict(new_property)
predicted_price = prediction[0]
print("Predicted Price:", predicted_price)

Predicted Price: 6919000.0


Try Edge Cases

In [47]:
# Two probe listings, written column-wise (position 0 = first listing,
# position 1 = second listing).
# NOTE(review): location_id 9999 is presumably an ID absent from the training
# data, exercising the encoder's unknown-category handling — confirm.
test_cases = pd.DataFrame({
    'Total_Area': [500, 3000],
    'bedrooms': [1, 6],
    'baths': [1, 5],
    'latitude': [31.5, 24.86],
    'longitude': [74.3, 67.01],
    'location_id': [9999, 3852],
})

# Score both probe listings; the bare last expression displays the predictions
# as the cell output.
edge_case_predictions = loaded_model.predict(test_cases)
edge_case_predictions

array([ 8569333.33333333, 34522333.33333334])