In [1]:
# Autoload module updates
%load_ext autoreload
%autoreload 2

# Import modules
import numpy as np
import pandas as pd
import sklearn
import sys
sys.path.insert(0, '../')
import src.cleanup as cleanup
properties = pd.read_csv('..\data\cleaned_output_modeling.csv', skip_blank_lines=True)
properties.head()

Unnamed: 0,id,type_of_property,subtype_of_property,province,locality,postalCode,price,number_of_bedrooms,living_area,fully_equipped_kitchen,...,open_fire,terrace,terrace_area,garden,garden_area,total_property_area,total_land_area,number_of_facades,swimming_pool,state_of_the_building
0,10559233.0,HOUSE,HOUSE,Flemish Brabant,ZICHEM,3271.0,350000.0,3.0,0.0,1.0,...,1.0,1.0,25.0,1.0,0.0,140.0,2590.0,4.0,0.0,0
1,10673158.0,HOUSE,HOUSE,Antwerp,Nijlen,2560.0,389000.0,3.0,0.0,1.0,...,0.0,1.0,40.0,1.0,260.0,160.0,570.0,4.0,0.0,1
2,10470833.0,HOUSE,HOUSE,West Flanders,Oudenburg,8460.0,389000.0,3.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,133.0,347.0,3.0,0.0,1
3,10560493.0,HOUSE,HOUSE,West Flanders,Brugge,8000.0,389000.0,6.0,0.0,1.0,...,0.0,1.0,0.0,1.0,119.0,210.0,220.0,2.0,0.0,0
4,10452028.0,HOUSE,HOUSE,West Flanders,Blankenberge,8370.0,389000.0,3.0,0.0,1.0,...,1.0,1.0,17.0,0.0,0.0,222.0,111.0,2.0,0.0,1


In [2]:
# Convert properties to a numpy dataframe
properties = cleanup.drop_column(properties, ['type_of_property', 'subtype_of_property', 'province', 'locality', 'postalCode'])
X = properties.drop(columns=['price']).to_numpy()
y = properties['price'].to_numpy()
display(y)

array([ 350000.,  389000.,  389000., ...,  249900., 1400000.,  645000.])

In [3]:
# Import train_test_split
# Apply train_test_split to dataframe

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=133)

In [4]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
print("The score of the training set is", regressor.score(X_train, y_train))
print("The score of the testing set is", regressor.score(X_test, y_test))

The score of the training set is 0.49580437496172947
The score of the testing set is 0.5054518823436795


Below, I clean the data set of outliers.

In [5]:
# Manually clean outliers
# Set removal count to zero. Each clean will sum the amount of removed rows into removal_count


properties = cleanup.clean_outliers(properties)

There were 201 properties with a price value of more than 2000000 that have been removed.
There were 19 properties with a number_of_bedrooms value of more than 10 that have been removed.
There were 16 properties with a living_area value of more than 100 that have been removed.
There were 20 properties with a total_property_area value of more than 750 that have been removed.
There were 19 properties with a total_land_area value of more than 20000 that have been removed.
There were 7 properties with a terrace_area value of more than 250 that have been removed.
There were 3 properties with a number_of_facades value of more than 4 that have been removed.


In [6]:
# Convert properties to a numpy dataframe
X = properties.drop(columns=['price']).to_numpy()
y = properties['price'].to_numpy()
display(y)

array([ 350000.,  389000.,  389000., ...,  990000., 1590000., 1250000.])

In [7]:
# Import train_test_split
# Apply train_test_split to dataframe

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=133)

In [8]:
# Fit and score model

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
print("The score of the training set is", regressor.score(X_train, y_train))
print("The score of the testing set is", regressor.score(X_test, y_test))

The score of the training set is 0.5098442931343585
The score of the testing set is 0.4525606291381028


In [9]:
# Predictor
y_pred = regressor.predict(X_test)

In [10]:
# Check regressor coefficients
coefficients= regressor.coef_
print(coefficients)
# Check regressor intercept
intercept = regressor.intercept_
print(intercept)

[-1.46359776e-01  1.72944195e+04  1.01570732e+03  1.08028000e+05
  8.11050144e+04 -4.61754172e+04  4.12248368e+04  4.09198834e+02
 -4.55890674e+04 -1.56682110e+00  1.75814069e+03  8.98780408e+00
  7.60426910e+03  1.15650383e+05  4.65051373e+04]
1469050.6709826186


In [11]:
# Manually create first prediction
print((X_test[0])*coefficients.sum()+ intercept)

# Check model's first prediction to compare vs. previous output
print(y_pred[0])
properties.head()

[3.48116527e+12 2.45556436e+06 7.71697072e+06 1.79788857e+06
 1.46905067e+06 1.46905067e+06 1.79788857e+06 9.68999810e+06
 1.79788857e+06 1.46905067e+06 4.25737878e+07 9.68320408e+07
 2.12672647e+06 1.46905067e+06 1.46905067e+06]
342305.90976845566


Unnamed: 0,id,price,number_of_bedrooms,living_area,fully_equipped_kitchen,furnished,open_fire,terrace,terrace_area,garden,garden_area,total_property_area,total_land_area,number_of_facades,swimming_pool,state_of_the_building
0,10559233.0,350000.0,3.0,0.0,1.0,0.0,1.0,1.0,25.0,1.0,0.0,140.0,2590.0,4.0,0.0,0
1,10673158.0,389000.0,3.0,0.0,1.0,0.0,0.0,1.0,40.0,1.0,260.0,160.0,570.0,4.0,0.0,1
2,10470833.0,389000.0,3.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,133.0,347.0,3.0,0.0,1
3,10560493.0,389000.0,6.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,119.0,210.0,220.0,2.0,0.0,0
4,10452028.0,389000.0,3.0,0.0,1.0,1.0,1.0,1.0,17.0,0.0,0.0,222.0,111.0,2.0,0.0,1


In [12]:
# Drop 'living area' from dataframe
properties = cleanup.drop_column(properties, 'living_area')

In [13]:
# Redo and score the model
X = properties.drop(columns=['price']).to_numpy()
y = properties['price'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=133)
regressor.fit(X_train, y_train)
print("The score of the training set is", regressor.score(X_train, y_train))
print("The score of the testing set is", regressor.score(X_test, y_test))

The score of the training set is 0.5063590773629773
The score of the testing set is 0.4513471350686433
