In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Reading our Immoweb dataset in 'df'.
df = pd.read_csv('../data/merged_data.csv')

# Instantiating LinearRegression as 'reg'.
reg = LinearRegression()

# Filling NaN values with 0.
# The fill goes through the frame itself (not `df[col].fillna(..., inplace=True)`):
# the chained in-place form acts on a temporary Series under pandas Copy-on-Write
# and would leave 'df' unchanged. 'facades' is not filled because it is dropped
# below before anything reads it.
df = df.fillna({'landplot': 0, 'Living area': 0})

# Creating dummy columns from categorical data.
df = pd.get_dummies(df, columns=['condition', 'subtype', 'province', 'Zip code'])

# Removing features that we won't be using.
df = df.drop(columns=['type', 'city', 'facades', 'Terrace', 'Kitchen'])

# Because 'get_dummies()' creates boolean values, we re-define our dataframe to be integers only.
# NOTE(review): this cast raises if any remaining column still holds NaN or
# non-numeric values - assumes only 'landplot' and 'Living area' had gaps; confirm.
df = df.astype(int)

# Shows the 10 first rows of the cleaned dataframe.
display(df.head(10))

# Feature matrix 'X': every column of the cleaned dataframe except the target.
X = df.drop(columns=['price']).to_numpy()

# Target 'y': the 'price' column, shaped as a 2D column vector (n_samples, 1).
y = df['price'].to_numpy().reshape(-1, 1)

# Holding out 20% of the rows for testing; the fixed 'random_state' makes the
# shuffle reproducible. No feature scaling is applied at this step.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Fitting the linear model on the training split.
reg.fit(X_train, y_train)

# Score of the fitted model on the data it was trained on.
train_score = reg.score(X_train, y_train)
print("Training score:", train_score)

# Predicted prices for the held-out rows.
y_prediction = reg.predict(X_test)

# Score on the held-out rows, plus the number of feature columns used.
features = X.shape[1]
test_score = reg.score(X_test, y_test)
print("Testing score:", test_score)
print(f"Using {features} features, and 1 (price)target")


Unnamed: 0,price,bedrooms,Living area,landplot,condition_as new,condition_good,condition_just renovated,condition_to be done up,condition_to renovate,condition_to restore,...,Zip code_9920,Zip code_9930,Zip code_9940,Zip code_9950,Zip code_9961,Zip code_9968,Zip code_9980,Zip code_9990,Zip code_9991,Zip code_9992
0,335000,0,58,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1400000,2,220,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,289000,2,80,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,606000,3,220,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,580000,4,179,80,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,340000,4,150,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,330000,2,105,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1200000,3,568,196,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,495000,2,158,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,800000,8,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


Training score: 0.6856981751650779
Testing score: 0.6520743076924211
Using 859 features, and 1 (price)target
