In [525]:
# Autoload module updates
%load_ext autoreload
%autoreload 2

# Import modules
import numpy as np
import pandas as pd
import sklearn
import sys
sys.path.insert(0, '../')
import src.cleanup as cleanup
import src.cleanupoutliers as cleanoutliers
# Load the cleaned listings from the data directory next to this notebook's folder.
# Forward slashes keep the relative path portable: the earlier backslash-separated
# literal relied on invalid string escape sequences and only resolved on Windows.
properties = pd.read_csv('../data/cleaned_output_2.csv', skip_blank_lines=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [526]:
# Keep only the listings whose 'type of property' is HOUSE
is_house = properties['type of property'] == 'HOUSE'
houses = properties[is_house]

# Houses: linear regression

In [527]:
# Drop the non-numerical columns from houses and report the row count
non_numeric_columns = ['type of property', 'subtype of property', 'province', 'locality', 'postalCode']
houses = cleanup.drop_column(houses, non_numeric_columns)
print("There are", len(houses), "rows.")

# Filter outliers with the project helper, then report how many rows remain
houses = cleanoutliers.clean_outliers(houses)
print("There are", len(houses), "rows.")
display(houses)


There are 4625 rows.
There are 170 properties with a value of more than 2,000,000€
There are 17 properties with more than 10 bedrooms.
There are 12 properties with a living area greater than 100sqm.
There are 20 properties with a total property area greater than 750sqm.
There are 19 properties with a total property land greater than 20000sqm.
There are 7 properties with a terrace area greater than 250sqm.
There are 3 properties with more than 4 facades.

248 outliers have been removed from the dataframe in total.
There are 4377 rows.


Unnamed: 0,price,number of bedrooms,living area,fully equipped kitchen,furnished,open fire,terrace,terrace area,garden,garden area,total property area,total land area,number of facades,swimming pool,state of the building
0,350000.0,3,0,1.0,0.0,1,1.0,25.0,1.0,0.0,140.0,2590.0,4.0,0,0
1,389000.0,3,0,1.0,0.0,0,1.0,40.0,1.0,260.0,160.0,570.0,4.0,0,1
2,389000.0,3,0,1.0,0.0,1,0.0,0.0,0.0,0.0,133.0,347.0,3.0,0,1
3,389000.0,6,0,1.0,0.0,0,1.0,0.0,1.0,119.0,210.0,220.0,2.0,0,0
4,389000.0,3,0,1.0,1.0,1,1.0,17.0,0.0,0.0,222.0,111.0,2.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6308,1470000.0,9,96,0.0,0.0,1,1.0,42.0,1.0,43.0,294.0,983.0,3.0,0,0
6309,985000.0,4,98,1.0,0.0,0,1.0,100.0,0.0,0.0,270.0,0.0,2.0,0,1
6311,795000.0,4,100,1.0,0.0,0,1.0,0.0,1.0,1172.0,445.0,1663.0,3.0,0,1
6312,1690000.0,8,100,1.0,0.0,0,1.0,0.0,1.0,120.0,700.0,307.0,3.0,0,1


In [528]:
# Modules used by this cell
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Split houses into numpy arrays: every column except price as the
# feature matrix X, and the price column as the target vector y
features = cleanup.drop_column(houses, ['price'])
X = features.to_numpy()
y = houses['price'].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=133,
)

# Fit the linear model on the training subset and score it on both subsets
regressor = LinearRegression().fit(X_train, y_train)
train_score = regressor.score(X_train, y_train)
print("The score of the training set is", train_score)
test_score = regressor.score(X_test, y_test)
print("The score of the testing set is", test_score)



The score of the training set is 0.5461274385102269
The score of the testing set is 0.559965619399208


# Houses: decision tree regression

In [533]:
# Import modules
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error



# Seed shared by the split and the tree so the reported scores are reproducible
# on every run (133 is the seed the linear-regression split uses as well).
RANDOM_STATE = 133

# Define houses df & subsets.
# The frame is rebuilt from `properties`, so this cell does not depend on
# whatever columns earlier cells already removed from `houses`.
houses = properties[properties['type of property'] == 'HOUSE']
print("There are", houses.shape[0], "rows.")
houses = cleanoutliers.clean_outliers(houses)
houses = cleanup.drop_column(houses, ['type of property', 'subtype of property', 'province', 'locality', 'postalCode'])
houses = cleanup.drop_column(houses, ['garden area', 'terrace area','number of facades','open fire'])
# Keep only rows whose living area is not recorded as 0
houses = houses[houses['living area'] != 0]

print("There are", houses.shape[0], "rows.\n")
# Features are every remaining column except price; the target is price.
# The column is passed as a list, as in the linear-regression cell.
X = cleanup.drop_column(houses, ['price']).to_numpy()
y = houses['price'].to_numpy()

# Split data: 20% held out for testing, seeded so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# The tree is seeded too: scikit-learn permutes the candidate features at each
# split, so an unseeded tree can differ between runs even on identical data.
regressor = DecisionTreeRegressor(max_depth=12, min_weight_fraction_leaf=0.004, random_state=RANDOM_STATE)
regressor.fit(X_train, y_train)
print("The score of the training set is", regressor.score(X_train, y_train))
print("The score of the testing set is", regressor.score(X_test, y_test))
houses.head()



There are 4625 rows.
There are 170 properties with a value of more than 2,000,000€
There are 17 properties with more than 10 bedrooms.
There are 12 properties with a living area greater than 100sqm.
There are 20 properties with a total property area greater than 750sqm.
There are 19 properties with a total property land greater than 20000sqm.
There are 7 properties with a terrace area greater than 250sqm.
There are 3 properties with more than 4 facades.

248 outliers have been removed from the dataframe in total.
There are 2784 rows.

The score of the training set is 0.7119470491924487
The score of the testing set is 0.4699935352461757


Unnamed: 0,price,number of bedrooms,living area,fully equipped kitchen,furnished,terrace,garden,total property area,total land area,swimming pool,state of the building
2397,750000.0,6,1,0.0,0.0,1.0,1.0,513.0,415.0,0,0
2402,144900.0,2,8,1.0,0.0,0.0,1.0,90.0,242.0,0,0
2403,143500.0,3,8,0.0,0.0,1.0,1.0,108.0,132.0,0,0
2405,150000.0,2,10,0.0,0.0,1.0,1.0,139.0,413.0,0,0
2406,159000.0,3,10,1.0,0.0,1.0,0.0,110.0,120.0,0,1
