In [1005]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


# Reading our Immoweb dataset in 'df'.
df = pd.read_csv('../data/merged_data.csv')

# Instantiating LinearRegression as 'reg'.
reg = LinearRegression()

# Filling NaN values with 0.
df['landplot'].fillna(0, inplace=True)
df['facades'].fillna(0, inplace=True)
df['Living area'].fillna(0, inplace=True)

# Reduces zip codes to 2 digits for broader scope.
df['Zip code'] = (df['Zip code']/100).astype(int)

# Creating dummy columns from categorical data.
df = pd.get_dummies(df, columns=['condition', 'province', 'subtype', 'Zip code'])

# Removing features that we won't be using.
df.drop(['city', 'Kitchen', 'Terrace', 'type'], axis=1, inplace=True)

# Because 'get_dummies()' creates boolean values, we re-define our dataframe to be integers only.
df = df.astype(int)

# Shows the 10 first rows of the cleaned dataframe.
#display(df.head(10))

# Defining 'X' and 'y' variables from our dataframe using purely features that contain numerical data.
X = df.drop(['price'], axis=1).to_numpy()
y = df['price'].to_numpy()

# Reshaping 'y' to be 2D array.
#y = y.reshape(-1, 1) # Depending on what Regressor is being used, this might need to be commented out.

# Setting up 'train_test_split' to get standardized training/testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
display(df.head(10))
# Training our model.
reg.fit(X_train, y_train)

# Displaying score of Training variables.
print("Training score:", reg.score(X_train, y_train)) 

# Predicting the 'y' target value (Price).
y_prediction = reg.predict(X_test)

# Displaying the score of Testing variables
features = X.shape[1]
print("Testing score:", reg.score(X_test, y_test))
print(f"Using {features} features, and 1 (price)target")


Unnamed: 0,price,bedrooms,Living area,landplot,facades,condition_as new,condition_good,condition_just renovated,condition_to be done up,condition_to renovate,...,Zip code_90,Zip code_91,Zip code_92,Zip code_93,Zip code_94,Zip code_95,Zip code_96,Zip code_97,Zip code_98,Zip code_99
0,335000,0,58,0,2,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1400000,2,220,0,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,289000,2,80,0,2,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,606000,3,220,0,3,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,580000,4,179,80,2,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,340000,4,150,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,330000,2,105,0,2,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1200000,3,568,196,2,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,495000,2,158,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,800000,8,420,0,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


Training score: 0.6036959252333232
Testing score: 0.5418008220878945
Using 124 features, and 1 (price)target


In [1021]:
from sklearn.tree import DecisionTreeRegressor


# Instatiate 'DecisionTreeRegressor()' and setting parameters.
dtr = DecisionTreeRegressor(max_depth=50, max_leaf_nodes=120)

dtr.fit(X_train, y_train)
print('DTR Training score:', dtr.score(X_train, y_train))
y_pred = dtr.predict(X_test)
print('DTR Testing score:', dtr.score(X_test, y_test))

DTR Training score: 0.8560562295569463
DTR Testing score: 0.656486999869303


In [1007]:
from xgboost import XGBRegressor

# Attempt at using XGBoost
xg = XGBRegressor()

xg.fit(X_train, y_train)
predictions = xg.predict(X_test)
print('XGBoost Training score:', xg.score(X_train, y_train))
print('XGBoost Testing score:', xg.score(X_test, y_test))

XGBoost Training score: 0.9436563901588909
XGBoost Testing score: 0.701387761130668


In [1008]:
from sklearn.ensemble import RandomForestRegressor

rforest = RandomForestRegressor(random_state=3)

rforest.fit(X_train, y_train)
prd = rforest.predict(X_test)
print('RandomForest Training score:', rforest.score(X_train, y_train))
print('RandomForest Testing score:', rforest.score(X_test, y_test))

RandomForest Training score: 0.9676078089132826
RandomForest Testing score: 0.7651368865400737
