In [526]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


# Reading our Immoweb dataset in 'df'.
df = pd.read_csv('../data/merged_data.csv')

# Instantiating LinearRegression as 'reg'.
reg = LinearRegression()

# Filling NaN values with 0.
df['landplot'].fillna(0, inplace=True)
df['facades'].fillna(0, inplace=True)
df['Living area'].fillna(0, inplace=True)

# Reduces zip codes to 2 digits for broader scope.
df['Zip code'] = (df['Zip code']/100).astype(int)

# Creating dummy columns from categorical data.
df = pd.get_dummies(df, columns=['condition', 'province', 'Zip code', 'subtype'])

# Removing features that we won't be using.
df.drop(['city', 'Kitchen', 'Terrace', 'type'], axis=1, inplace=True)

# Because 'get_dummies()' creates boolean values, we re-define our dataframe to be integers only.
df = df.astype(int)

# Shows the 10 first rows of the cleaned dataframe.
#display(df.head(10))

# Defining 'X' and 'y' variables from our dataframe using purely features that contain numerical data.
X = df.drop(['price'], axis=1).to_numpy()
y = df['price'].to_numpy()

# Reshaping 'y' to be 2D array.
y = y.reshape(-1, 1)

# Setting up 'train_test_split' to get standardized training/testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Training our model.
reg.fit(X_train, y_train)

# Displaying score of Training variables.
print("Training score:", reg.score(X_train, y_train)) 

# Predicting the 'y' target value (Price).
y_prediction = reg.predict(X_test)

# Displaying the score of Testing variables
features = X.shape[1]
print("Testing score:", reg.score(X_test, y_test))
print(f"Using {features} features, and 1 (price)target")


Training score: 0.6043052550623464
Testing score: 0.5339174600378664
Using 124 features, and 1 (price)target


In [869]:
from sklearn.tree import DecisionTreeRegressor


# Instatiate 'DecisionTreeRegressor()' and setting parameters.
dtr = DecisionTreeRegressor(max_depth=50, max_leaf_nodes=66, ccp_alpha=0.1)

dtr.fit(X_train, y_train)
print('DTR Training score:', dtr.score(X_train, y_train))
y_pred = dtr.predict(X_test)
print('DTR Testing score:', dtr.score(X_test, y_test))

DTR Training score: 0.8106913069516318
DTR Testing score: 0.6102132000228297


In [535]:
import xgboost as xgb
from sklearn.metrics import accuracy_score


# Attempt at using XGBoost
train = xgb.DMatrix(X_train, label=y_train)
test = xgb.DMatrix(X_test, label=y_test)

param = {
    'max_depth':4,
    'eta':0.3,
    'objective': 'multi:softmax',
    'num_class':len(np.unique(y))}
epochs = 10

model = xgb.train(param, train, num_boost_round=epochs)
predictions = model.predict(test)
accuracy_score(y_test, predictions)

XGBoostError: [15:33:13] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0fdc6d574b9c0d168-1\xgboost\xgboost-ci-windows\src\objective\multiclass_obj.cu:123: SoftmaxMultiClassObj: label must be in [0, num_class).