## Poland House Price Prediction


Given data about houses Poland, let's try to predict the price of a given house.

We will use a gradient boosting regression model to make our prediction

In [1]:
# Loading libraries

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


from sklearn.ensemble import GradientBoostingRegressor

In [2]:
data = pd.read_csv("C:/Users/Ashraf/Documents/Zoom/Houses.csv", encoding='latin-1')
data

Unnamed: 0.1,Unnamed: 0,address,city,floor,id,latitude,longitude,price,rooms,sq,year
0,0,Podgórze Zab³ocie Stanis³awa Klimeckiego,Kraków,2.0,23918.0,50.049224,19.970379,749000.0,3.0,74.05,2021.0
1,1,Praga-Po³udnie Grochowska,Warszawa,3.0,17828.0,52.249775,21.106886,240548.0,1.0,24.38,2021.0
2,2,Krowodrza Czarnowiejska,Kraków,2.0,22784.0,50.066964,19.920025,427000.0,2.0,37.00,1970.0
3,3,Grunwald,Poznañ,2.0,4315.0,52.404212,16.882542,1290000.0,5.0,166.00,1935.0
4,4,Ochota Gotowy budynek. Stan deweloperski. Osta...,Warszawa,1.0,11770.0,52.212225,20.972630,996000.0,5.0,105.00,2020.0
...,...,...,...,...,...,...,...,...,...,...,...
23759,23759,Stare Miasto Naramowice,Poznañ,0.0,3976.0,52.449649,16.949408,543000.0,4.0,77.00,2020.0
23760,23760,W³ochy,Warszawa,4.0,10206.0,52.186109,20.948438,910000.0,3.0,71.00,2017.0
23761,23761,Nowe Miasto Malta ul. Katowicka,Poznañ,0.0,4952.0,52.397345,16.961939,430695.0,3.0,50.67,2022.0
23762,23762,Podgórze Duchackie Walerego S³awka,Kraków,6.0,24148.0,50.024231,19.959569,359000.0,2.0,38.86,2021.0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23764 entries, 0 to 23763
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  23764 non-null  int64  
 1   address     23764 non-null  object 
 2   city        23764 non-null  object 
 3   floor       23764 non-null  float64
 4   id          23764 non-null  float64
 5   latitude    23764 non-null  float64
 6   longitude   23764 non-null  float64
 7   price       23764 non-null  float64
 8   rooms       23764 non-null  float64
 9   sq          23764 non-null  float64
 10  year        23764 non-null  float64
dtypes: float64(8), int64(1), object(2)
memory usage: 2.0+ MB


## Preprocessing

In [6]:
def preprocess_inputs(df):
    df = df.copy()
    df = df.drop(['Unnamed: 0', 'address','id'],axis=1)
    
    # split df into X and y
    y = df['price']
    X = df.drop('price',axis=1)
    
    # Train_test_split
    X_train,X_test,y_train,y_test = train_test_split(X,y, train_size=0.7, shuffle=True, random_state=42)
    
    return  X_train,X_test,y_train,y_test 

In [7]:
 X_train,X_test,y_train,y_test  = preprocess_inputs(data)

In [8]:
X_train

Unnamed: 0,city,floor,latitude,longitude,rooms,sq,year
1576,Kraków,3.0,50.058788,19.950060,3.0,59.00,2016.0
7101,Warszawa,1.0,52.201321,20.962718,4.0,94.00,2000.0
14853,Warszawa,8.0,52.229876,20.909551,2.0,45.80,1980.0
16091,Kraków,3.0,50.052768,19.928241,2.0,45.80,2019.0
12218,Kraków,0.0,50.048456,19.959778,3.0,84.00,2010.0
...,...,...,...,...,...,...,...
21575,Poznañ,0.0,52.407393,16.979467,2.0,36.72,2021.0
5390,Warszawa,1.0,52.174545,21.111017,4.0,108.00,2020.0
860,Kraków,3.0,50.059040,19.937168,4.0,140.00,1900.0
15795,Kraków,4.0,50.010297,20.008570,4.0,69.19,2021.0


In [9]:
y_train

1576      989000.0
7101     1175000.0
14853     540000.0
16091    1144954.2
12218    1250000.0
           ...    
21575     332316.0
5390      953300.0
860      1350000.0
15795     664224.0
23654    1100000.0
Name: price, Length: 16634, dtype: float64

## Building Pipeline

In [10]:
nominal_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse=False))
])

preprocessor = ColumnTransformer(transformers=[
    ('nominal', nominal_transformer,['city'])
], remainder='passthrough')

model = Pipeline(steps=[
    ('preprocessor', preprocessor ),
    ('scaler', StandardScaler()),
    ('regressor', GradientBoostingRegressor())
])

## Training

In [11]:
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('nominal',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(sparse=False))]),
                                                  ['city'])])),
                ('scaler', StandardScaler()),
                ('regressor', GradientBoostingRegressor())])

## Results 

In [12]:
y_pred = model.predict(X_test)
rmse = np.sqrt(np.mean(y_test - y_pred)**2)
print('RMSE: {:.5f}'.format(rmse))

RMSE: 1151.38613


In [13]:
baseline_errors = np.sum((y_test - np.mean(y_test))**2)
model_errors = np.sum((y_test - y_pred)**2)

In [14]:
r2 = 1 - (model_errors / baseline_errors)
print("R^2 Score: {:.5f}".format(r2))

R^2 Score: 0.78779


In [15]:
model.score(X_test, y_test)

0.7877943140679815