In [73]:
# Autoload module updates
%load_ext autoreload
%autoreload 2

# Import modules
import numpy as np
import pandas as pd
import sklearn
import sys
sys.path.insert(0, '../')
import src.cleanup as cleanup
properties = pd.read_csv('..\data\cleaned_output_modeling.csv', skip_blank_lines=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [74]:
# Define houses df
houses = properties[properties['type_of_property'] == 'HOUSE']

# Houses Linear

In [75]:
# Remove non-numerical data from houses
houses = cleanup.drop_column(houses, ['type_of_property', 'subtype_of_property', 'province', 'locality', 'postalCode'])
print("There are", houses.shape[0], "rows.")

cleanup.clean_outliers(houses)


There are 4625 rows.
There were 170 properties with a price value of more than 2000000 that have been removed.
There were 17 properties with a number_of_bedrooms value of more than 10 that have been removed.
There were 12 properties with a living_area value of more than 100 that have been removed.
There were 20 properties with a total_property_area value of more than 750 that have been removed.
There were 19 properties with a total_land_area value of more than 20000 that have been removed.
There were 7 properties with a terrace_area value of more than 250 that have been removed.
There were 3 properties with a number_of_facades value of more than 4 that have been removed.


Unnamed: 0,id,price,number_of_bedrooms,living_area,fully_equipped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,total_property_area,total_land_area,number_of_facades,swimming_pool,state_of_the_building
0,10559233.0,350000.0,3.0,0.0,1.0,0.0,1.0,1.0,25.0,1.0,0.0,140.0,2590.0,4.0,0.0,0
1,10673158.0,389000.0,3.0,0.0,1.0,0.0,0.0,1.0,40.0,1.0,260.0,160.0,570.0,4.0,0.0,1
2,10470833.0,389000.0,3.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,133.0,347.0,3.0,0.0,1
3,10560493.0,389000.0,6.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,119.0,210.0,220.0,2.0,0.0,0
4,10452028.0,389000.0,3.0,0.0,1.0,1.0,1.0,1.0,17.0,0.0,0.0,222.0,111.0,2.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6308,10665206.0,1470000.0,9.0,96.0,0.0,0.0,1.0,1.0,42.0,1.0,43.0,294.0,983.0,3.0,0.0,0
6309,10641711.0,985000.0,4.0,98.0,1.0,0.0,0.0,1.0,100.0,0.0,0.0,270.0,0.0,2.0,0.0,1
6311,10666826.0,795000.0,4.0,100.0,1.0,0.0,0.0,1.0,0.0,1.0,1172.0,445.0,1663.0,3.0,0.0,1
6312,9722809.0,1690000.0,8.0,100.0,1.0,0.0,0.0,1.0,0.0,1.0,120.0,700.0,307.0,3.0,0.0,1


In [76]:
# Convert houses to a numpy dataframe
X = cleanup.drop_column(houses, ['price']).to_numpy()
y = houses['price'].to_numpy()


# Import train_test_split
# Apply train_test_split to dataframe
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=133)

# Fit and score model

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
print("The score of the training set is", regressor.score(X_train, y_train))
print("The score of the testing set is", regressor.score(X_test, y_test))



The score of the training set is 0.5287206143063975
The score of the testing set is 0.5248218013923855


# Houses decision tree

In [79]:
# Import modules
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error



# Define houses df & subsets
houses = properties[properties['type_of_property'] == 'HOUSE']
print("There are", houses.shape[0], "rows.")
#houses = cleanup.clean_outliers(houses)
houses = cleanup.drop_column(houses, ['type_of_property', 'subtype_of_property', 'province', 'locality', 'postalCode'])
houses = cleanup.drop_column(houses, ['garden_area', 'terrace_area','number_of_facades','open_fire'])
houses = houses[houses['living_area'] != 0]

print("There are", houses.shape[0], "rows.\n")
X = cleanup.drop_column(houses, 'price').to_numpy()
y = houses['price'].to_numpy()

#Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)


# With GridsearchCV
from sklearn.model_selection import GridSearchCV
regressor = DecisionTreeRegressor()
params = {'max_depth': [ 4, 6, 8, 10, 12], 'min_weight_fraction_leaf': [0.002, 0.003, 0.0035, 0.004, 0.0045, 0.006]}
grid_search = GridSearchCV(regressor, params, cv=10)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
regressor = DecisionTreeRegressor(max_depth=best_params['max_depth'], min_weight_fraction_leaf=best_params['min_weight_fraction_leaf'])
print(best_params)
regressor.fit(X_train, y_train)
print("The score of the training set is", regressor.score(X_train, y_train))
print("The score of the testing set is", regressor.score(X_test, y_test))
houses.head()



There are 4625 rows.
There are 2920 rows.

{'max_depth': 10, 'min_weight_fraction_leaf': 0.006}
The score of the training set is 0.6941764901715921
The score of the testing set is 0.5948051111950129


Unnamed: 0,id,price,number_of_bedrooms,living_area,fully_equipped_kitchen,furnished,terrace,garden,total_property_area,total_land_area,swimming_pool,state_of_the_building
2397,10541158.0,750000.0,6.0,1.0,0.0,0.0,1.0,1.0,513.0,415.0,0.0,0
2402,10567894.0,144900.0,2.0,8.0,1.0,0.0,0.0,1.0,90.0,242.0,0.0,0
2403,10534619.0,143500.0,3.0,8.0,0.0,0.0,1.0,1.0,108.0,132.0,0.0,0
2405,10623016.0,150000.0,2.0,10.0,0.0,0.0,1.0,1.0,139.0,413.0,0.0,0
2406,10162175.0,159000.0,3.0,10.0,1.0,0.0,1.0,0.0,110.0,120.0,0.0,1


In [80]:
import xgboost
print(xgboost.__version__)

# Define houses df & subsets
houses = properties[properties['type_of_property'] == 'HOUSE']
print("There are", houses.shape[0], "rows.")
houses = cleanup.clean_outliers(houses)
houses = cleanup.drop_column(houses, ['type_of_property', 'subtype_of_property', 'province', 'locality', 'postalCode'])
houses = cleanup.drop_column(houses, ['garden_area', 'terrace_area', 'living_area'])
#houses = houses[houses['living area'] != 0]
print("There are", houses.shape[0], "rows.\n")
X = cleanup.drop_column(houses, 'price').to_numpy()
y = houses['price'].to_numpy()

regressor = xgboost.XGBRegressor()
#regressor.fit(X_train, y_train)
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
cv = RepeatedKFold(n_splits = 10, n_repeats = 3)
scores = cross_val_score(regressor, X_train, y_train, cv=10, n_jobs = 1)


scores = np.absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()))


1.7.6
There are 4625 rows.
There were 170 properties with a price value of more than 2000000 that have been removed.
There were 17 properties with a number_of_bedrooms value of more than 10 that have been removed.
There were 12 properties with a living_area value of more than 100 that have been removed.
There were 20 properties with a total_property_area value of more than 750 that have been removed.
There were 19 properties with a total_land_area value of more than 20000 that have been removed.
There were 7 properties with a terrace_area value of more than 250 that have been removed.
There were 3 properties with a number_of_facades value of more than 4 that have been removed.
There are 4377 rows.

Mean MAE: 0.518 (0.213)
