# Final selected model
This file contains the final model selected for predicting a house price. It uses `sklearn`'s `DecisionTreeRegressor()` from notebook 3.

In [6]:
# Autoload module updates
%load_ext autoreload
%autoreload 2

# Import modules
import numpy as np
import pandas as pd
import sklearn
import sys
sys.path.insert(0, '../')
import src.cleanup as cleanup
# Load the cleaned modelling dataset. Forward slashes are used so the path
# resolves on every OS and contains no backslash escape sequences
# (the backslash form embeds the invalid escapes "\d" and "\c").
properties = pd.read_csv('../data/cleaned_output_modeling.csv', skip_blank_lines=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Import modules
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# One seed shared by the train/test split and the trees, so the selected
# hyper-parameters and the reported scores are reproducible on a fresh
# "Restart Kernel -> Run All".
RANDOM_STATE = 42

# Define houses df & subsets
houses = properties
print("There are", houses.shape[0], "rows before cleaning.")
houses = cleanup.clean_outliers(houses)
# Drop the columns that are not used as model features
houses = cleanup.drop_column(houses, ['id', 'type_of_property', 'subtype_of_property', 'province', 'locality', 'postalCode'])
houses = cleanup.drop_column(houses, ['garden_area', 'terrace_area','number_of_facades','open_fire'])
# Remove all properties missing a living area
houses = houses[houses['living_area'] != 0]

print("There are", houses.shape[0], "rows after cleaning.\n")
# Features are every remaining column except the target; the target is `price`
X = cleanup.drop_column(houses, 'price').to_numpy()
y = houses['price'].to_numpy()

# Split data: 25% of the rows are held out for the final score
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE
)

# Hyper-parameter search with GridSearchCV (10-fold cross-validation on the
# training rows only)
params = {'max_depth': [ 4, 6, 8, 10, 12], 'min_weight_fraction_leaf': [0.002, 0.003, 0.0035, 0.004, 0.0045, 0.006]}
grid_search = GridSearchCV(DecisionTreeRegressor(random_state=RANDOM_STATE), params, cv=10)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print(best_params)

# GridSearchCV refits the best parameter combination on the full training set
# by default (refit=True), so the fitted model is taken from the search
# instead of building and fitting a second, identical tree by hand.
regressor = grid_search.best_estimator_
print("The score of the training set is", regressor.score(X_train, y_train))
print("The score of the testing set is", regressor.score(X_test, y_test))

There are 6347 rows before cleaning.
There were 201 properties with a price value of more than 2000000 that have been removed.
There were 19 properties with a number_of_bedrooms value of more than 10 that have been removed.
There were 16 properties with a living_area value of more than 100 that have been removed.
There were 20 properties with a total_property_area value of more than 750 that have been removed.
There were 19 properties with a total_land_area value of more than 20000 that have been removed.
There were 7 properties with a terrace_area value of more than 250 that have been removed.
There were 3 properties with a number_of_facades value of more than 4 that have been removed.
There are 3804 rows after cleaning.

{'max_depth': 6, 'min_weight_fraction_leaf': 0.006}
The score of the training set is 0.6056829209020435
The score of the testing set is 0.5751440641192962


In [8]:
# Predictor

# Mean of the model's predictions over the held-out test rows
y_pred = regressor.predict(X_test)
mean_predicted_price = round(y_pred.mean(), 2)
print("For the test data the model predicts a mean average house price of", mean_predicted_price, "€.\n")

# One hand-written example with ten feature values.
# NOTE(review): presumably in the same column order as X - confirm. Only the
# bedrooms, living space, property space and land lines below read from
# `data`; the remaining description lines are hard-coded text.
data = [4, 80, 1, 0, 1, 1, 180, 230, 0, 1]
example_prices = regressor.predict([data])
prediction_1 = round(example_prices[0], 2)

# Assemble the report pieces first, then print them in one call; print()
# joins them with single spaces exactly as separate arguments would be.
report_parts = (
    "For a house with:\n\n",
    data[0], " bedrooms\n",
    data[1], "m² of living space\n",
    "A fully equipped kitchen\n",
    "Comes unfurnished\n",
    "Has a terrace\n",
    "Has a garden\n",
    data[6], "m² property space\n",
    data[7], "m² land\n",
    "No swimming pool\n",
    "Comes renovated\n\n",
    "The model predicts a price of: ", prediction_1, "€.",
)
print(*report_parts)

For the test data the model predicts a mean average house price of 464329.48 €.

For a house with:

 4  bedrooms
 80 m² of living space
 A fully equipped kitchen
 Comes unfurnished
 Has a terrace
 Has a garden
 180 m² property space
 230 m² land
 No swimming pool
 Comes renovated

 The model predicts a price of:  790846.15 €.


In [9]:
import pickle

# Persist the fitted regressor. The context manager closes the file even when
# pickle.dump raises, so the handle is never leaked.
with open("../data/model.pkl", "wb") as pickle_out:
    pickle.dump(regressor, pickle_out)


In [10]:
# Preview the first three rows of the cleaned `houses` frame
houses.head(n=3)

Unnamed: 0,price,number_of_bedrooms,living_area,fully_equipped_kitchen,furnished,terrace,garden,total_property_area,total_land_area,swimming_pool,state_of_the_building
2397,750000.0,6.0,1.0,0.0,0.0,1.0,1.0,513.0,415.0,0.0,0
2398,1390000.0,3.0,1.0,1.0,0.0,1.0,0.0,153.0,0.0,0.0,1
2399,670000.0,4.0,1.0,1.0,0.0,1.0,1.0,189.0,0.0,0.0,1


In [11]:
from pydantic import BaseModel

class propmodel(BaseModel):
    """Pydantic schema for a single property record.

    The field names match the columns of the cleaned `houses` DataFrame:
    the `price` column plus the ten remaining feature columns. Every field
    is declared as a required float.

    NOTE(review): `price` is the prediction target in this notebook
    (`y = houses['price']`); confirm whether it should be a required field
    of this schema.
    """
    # Target column of the model
    price: float
    # Feature columns, listed in the same order as the `houses` columns
    number_of_bedrooms: float
    living_area: float
    fully_equipped_kitchen: float
    furnished: float
    terrace: float
    garden: float
    total_property_area: float
    total_land_area: float
    swimming_pool: float
    state_of_the_building: float