# Final selected model
This file contains the final model selected to be used for predicted a house price. It uses `sklearn`'s `DecisionTreeRegressor()` from notebook 3.

In [201]:
# Autoload module updates
%load_ext autoreload
%autoreload 2

# Import modules
import numpy as np
import pandas as pd
import sklearn
import sys
sys.path.insert(0, '../')
import src.cleanup as cleanup
properties = pd.read_csv('..\data\cleaned_output_modeling.csv', skip_blank_lines=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [202]:
# Import modules
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split



# Define houses df & subsets
properties = properties #[properties['type of property'] == 'HOUSE']
print("There are", properties.shape[0], "rows before cleaning.")
# properties = cleanup.clean_outliers(properties)
properties = cleanup.drop_column(properties, ['id', 'type_of_property', 'subtype_of_property', 'province', 'locality', 'postalCode'])
properties = cleanup.drop_column(properties, ['garden_area', 'terrace_area','number_of_facades','open_fire'])
# Remove all properties missing a living area
properties = properties[properties['living_area'] != 0]

print("There are", properties.shape[0], "rows after cleaning.\n")
X = cleanup.drop_column(properties, 'price').to_numpy()
y = properties['price'].to_numpy()

#Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)


# With GridsearchCV
from sklearn.model_selection import GridSearchCV
regressor = DecisionTreeRegressor()
params = {'max_depth':  [10, 12, 14, 16, 18], 'min_weight_fraction_leaf': [0.001, 0.002, 0.003, 0.0035, 0.004, 0.0045, 0.006]}
grid_search = GridSearchCV(regressor, params, cv=10)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
regressor = DecisionTreeRegressor(max_depth=best_params['max_depth'], min_weight_fraction_leaf=best_params['min_weight_fraction_leaf'])
print(best_params)
regressor.fit(X_train, y_train)
print("The score of the training set is", regressor.score(X_train, y_train))
print("The score of the testing set is", regressor.score(X_test, y_test))

There are 6347 rows before cleaning.
There are 3950 rows after cleaning.

{'max_depth': 10, 'min_weight_fraction_leaf': 0.006}
The score of the training set is 0.6776353739731492
The score of the testing set is 0.5850227675341833


In [203]:
# Predictor

y_pred = regressor.predict(X_test)
print("For the test data the model predicts a mean average house price of", round(y_pred.mean(),2), "€.\n")

data = [4, 80, 1, 0, 1, 1, 180, 230, 0, 1]
prediction_1 = round(regressor.predict([data])[0], 2)
print(
    "For a house with:\n\n",
    data[0], " bedrooms\n",
    data[1], "m² of living space\n",
    "A fully equipped kitchen\n",
    "Comes unfurnished\n",
    "Has a terrace\n",
    "Has a garden\n",
    data[6], "m² property space\n",
    data[7], "m² land\n", 
    "No swimming pool\n",
    "Comes renovated\n\n",
    "The model predicts a price of: ", prediction_1 ,"€."
    )

For the test data the model predicts a mean average house price of 539611.48 €.

For a house with:

 4  bedrooms
 80 m² of living space
 A fully equipped kitchen
 Comes unfurnished
 Has a terrace
 Has a garden
 180 m² property space
 230 m² land
 No swimming pool
 Comes renovated

 The model predicts a price of:  539833.33 €.


In [204]:
import pickle
pickle_out = open("../data/model.pkl", "wb")
pickle.dump(regressor, pickle_out)
pickle_out.close()


In [205]:
properties.head(3)

Unnamed: 0,price,number_of_bedrooms,living_area,fully_equipped_kitchen,furnished,terrace,garden,total_property_area,total_land_area,swimming_pool,state_of_the_building
2397,750000.0,6.0,1.0,0.0,0.0,1.0,1.0,513.0,415.0,0.0,0
2398,1390000.0,3.0,1.0,1.0,0.0,1.0,0.0,153.0,0.0,0.0,1
2399,670000.0,4.0,1.0,1.0,0.0,1.0,1.0,189.0,0.0,0.0,1


In [206]:
from pydantic import BaseModel

class propmodel(BaseModel):
    number_of_bedrooms: float
    living_area: float
    fully_equipped_kitchen: float
    furnished: float
    terrace: float
    garden: float
    total_property_area: float
    total_land_area: float
    swimming_pool: float
    state_of_the_building: float

In [207]:
from fastapi import FastAPI
import uvicorn
app = FastAPI()
pickle_in = open('../data/model.pkl', 'rb')
regressor = pickle.load(pickle_in)

@app.get('/')
def home():
    return {"Intro" : "Welcome to the model. To use this model you need to input the following parameters as a json"}

@app.get('/pricepredict')
def predict(data: propmodel):
    print(data)
    number_of_bedrooms = data['number_of_bedrooms']
    living_area = data['living_area']
    fully_equipped_kitchen = data['fully_equipped_kitchen']
    furnished = data['furnished']
    terrace = data['terrace']
    garden = data['garden']
    total_property_area = data['total_property_area']
    total_land_area = data['total_land_area']
    swimming_pool = data['swimming_pool']
    state_of_the_building = data['state_of_the_building']
    print(
        "You can expect the price to be", 
        regressor.predict([[
            number_of_bedrooms,
            living_area,
            fully_equipped_kitchen,
            furnished,
            terrace,
            garden,
            total_property_area,
            total_land_area,
            swimming_pool,
            state_of_the_building
        ]]),   
        "€ for a property with the above variables."
    )
    prediction = regressor.predict([[
        number_of_bedrooms,
        living_area,
        fully_equipped_kitchen,
        furnished,
        terrace,
        garden,
        total_property_area,
        total_land_area,
        swimming_pool,
        state_of_the_building
    ]])
    return {
        'prediction': round(prediction[0],2)
        }

In [208]:
predict(
    {
        'number_of_bedrooms' : 2,
        'living_area': 500,
        'fully_equipped_kitchen': 0,
        'furnished': 0,
        'terrace': 0,
        'garden': 0,
        'total_property_area': 10000,
        'total_land_area': 205,
        'swimming_pool': 0,
        'state_of_the_building': 0
    }
    )

{'number_of_bedrooms': 2, 'living_area': 500, 'fully_equipped_kitchen': 0, 'furnished': 0, 'terrace': 0, 'garden': 0, 'total_property_area': 10000, 'total_land_area': 205, 'swimming_pool': 0, 'state_of_the_building': 0}
You can expect the price to be [3452500.] € for a property with the above variables.


{'prediction': 3452500.0}

In [209]:
regressor.feature_importances_

array([0.0115482 , 0.04940154, 0.00835968, 0.        , 0.00552764,
       0.0020334 , 0.89172471, 0.01447442, 0.01577649, 0.00115392])

In [210]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Train the decision tree regressor with the best hyperparameters

# Get the column names as a list
#feature_names = cleanup.drop_column(properties, 'price').columns.tolist()

# Visualize the decision tree
#plt.figure(figsize=(200, 150))
#plot_tree(regressor, feature_names=feature_names, filled=True, rounded=True)
#plt.show()