## Libraries

In [217]:
# Import libraries
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the csv file
df = pd.read_csv("../data/cleaned_properties.csv")

# Display the head
display(df.head())
df.shape
df.columns


## Filtering the data

In [225]:
# Filter the DataFrame for values APARTMENT and APARTMENT_BLOCK
df_house1 = df[df["property_type"] == "HOUSE"]
df_house2 = df_house1[df_house1['subproperty_type'] != 'APARTMENT_BLOCK']

df_house = df[(df["property_type"] == "HOUSE") & (df['subproperty_type'] != 'APARTMENT_BLOCK')]

df_house.head()
print(df_house.info())
print(df_house.shape)

df_house["subproperty_type"].unique()
print(df_house["locality"].unique())
df_house.isna().sum().sort_values(ascending=False)

<class 'pandas.core.frame.DataFrame'>
Index: 37211 entries, 1 to 75506
Data columns (total 27 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   price                           37211 non-null  float64
 1   property_type                   37211 non-null  object 
 2   subproperty_type                37211 non-null  object 
 3   region                          37211 non-null  object 
 4   province                        37211 non-null  object 
 5   locality                        37211 non-null  object 
 6   zip_code                        37211 non-null  int64  
 7   construction_year               19969 non-null  float64
 8   total_area_sqm                  32360 non-null  float64
 9   surface_land_sqm                37211 non-null  float64
 10  nbr_frontages                   29563 non-null  float64
 11  nbr_bedrooms                    37211 non-null  float64
 12  equipped_kitchen                20753

construction_year                 17242
cadastral_income                  17168
equipped_kitchen                  16458
heating_type                      13276
state_building                    11529
primary_energy_consumption_sqm    10070
terrace_sqm                        8818
epc                                8721
nbr_frontages                      7648
total_area_sqm                     4851
garden_sqm                         2361
fl_double_glazing                     0
fl_floodzone                          0
fl_swimming_pool                      0
fl_garden                             0
price                                 0
fl_terrace                            0
fl_open_fire                          0
property_type                         0
nbr_bedrooms                          0
surface_land_sqm                      0
zip_code                              0
locality                              0
province                              0
region                                0


## Data types of df_house

In [None]:
# List of df_house columns
column_list = df_house.columns.to_list()
print(column_list)

In [None]:
data_object = df_house.select_dtypes(include='object')
data_float = df_house.select_dtypes(include='float64')
data_int = df_house.select_dtypes(include='int64')

In [None]:
object_list = data_object.columns.to_list()
print("objects:", object_list)
int_columns = data_int.columns.to_list()
print("integers:", int_columns)
float = data_float.columns.to_list()
print("float:", float)

## Splitting the data

Creating variables X and y: define the target and the feature.

In [None]:
# Name X and y (specific columns=subset(houses))
X = df_house.drop(columns=['price', 'property_type', 'zip_code', 'locality', 'construction_year', 'cadastral_income'])
y = df_house['price']

# Print shape
print("X shape: ", X.shape)
print("y-shape: ", y.shape)

In [None]:
# Split the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

## Rescaling data

Convert categorical data to a numerical form.

Data to convert:  'region', 'province', 'equipped_kitchen', 'state_building', 'epc', 'heating_type'

In [232]:
X_train.info()
X_train['province'].unique()

<class 'pandas.core.frame.DataFrame'>
Index: 29768 entries, 8081 to 31954
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   subproperty_type                29768 non-null  object 
 1   region                          29768 non-null  object 
 2   province                        29768 non-null  object 
 3   total_area_sqm                  25900 non-null  float64
 4   surface_land_sqm                29768 non-null  float64
 5   nbr_frontages                   23670 non-null  float64
 6   nbr_bedrooms                    29768 non-null  float64
 7   equipped_kitchen                16580 non-null  object 
 8   fl_furnished                    29768 non-null  int64  
 9   fl_open_fire                    29768 non-null  int64  
 10  fl_terrace                      29768 non-null  int64  
 11  terrace_sqm                     22712 non-null  float64
 12  fl_garden                       29

array(['Flemish Brabant', 'West Flanders', 'Antwerp', 'East Flanders',
       'Luxembourg', 'Hainaut', 'Liège', 'Brussels', 'Walloon Brabant',
       'Namur', 'Limburg'], dtype=object)

In [None]:
# Initiationg the encoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
data = X_train[['province']]
dfx = X_train
print(type(data))
print(type(X_train))

In [None]:
# Fit the encoder to the data
encoder.fit(data)

In [None]:
# Transform the data
encoded_data = encoder.transform(data)
print(encoded_data)
print(type(encoded_data))

- encoder.fit_transform(X_train[['province']])
- .fit_transform == .fit & .transform

In [218]:
# Handle NaN values using SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')  # Fill NaN values with most frequent value
encoded_data_imputed = imputer.fit_transform(encoded_data)

In [219]:
# Convert the encoded data to a dataframe
encoded_df = pd.DataFrame(encoded_data_imputed, columns=encoder.get_feature_names_out(['province']))

In [220]:
# Concatenate the encoded dataframe with the original dataframe
df_encoded = pd.concat([dfx, encoded_df], axis=1)
df_encoded.drop('province', axis=1, inplace=True) 

In [224]:
print(df_encoded.info())

<class 'pandas.core.frame.DataFrame'>
Index: 47794 entries, 8081 to 29767
Data columns (total 25 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   total_area_sqm                  25900 non-null  float64
 1   surface_land_sqm                29768 non-null  float64
 2   nbr_frontages                   23670 non-null  float64
 3   nbr_bedrooms                    29768 non-null  float64
 4   fl_furnished                    29768 non-null  float64
 5   fl_open_fire                    29768 non-null  float64
 6   fl_terrace                      29768 non-null  float64
 7   terrace_sqm                     22712 non-null  float64
 8   fl_garden                       29768 non-null  float64
 9   garden_sqm                      27915 non-null  float64
 10  fl_swimming_pool                29768 non-null  float64
 11  fl_floodzone                    29768 non-null  float64
 12  primary_energy_consumption_sqm  21

## Exploratory Data Analysis

In [None]:
df_corr = df_encoded.drop(columns=['subproperty_type', 'region', 'equipped_kitchen', 'state_building', 'epc', 'heating_type'])
display(df_corr.corr())

In [None]:
# Create the correlation matrix and represent it as a heatmap.
plt.figure(figsize=(20, 15))
sns.heatmap(df_corr.corr(), annot = True, cmap = 'coolwarm', fmt=".2f")
plt.show()

## Imputing with mean

In [226]:
# Drop columns with dtype=object
object_cols = df_encoded.select_dtypes(include='object').columns
df_encoded.drop(object_cols, axis=1, inplace=True)
df_encoded.isna().sum().sort_values(ascending=False)

primary_energy_consumption_sqm    26120
terrace_sqm                       25082
nbr_frontages                     24124
total_area_sqm                    21894
garden_sqm                        19879
fl_furnished                      18026
province_Brussels                 18026
province_Walloon Brabant          18026
province_Namur                    18026
province_Luxembourg               18026
province_Liège                    18026
province_Limburg                  18026
province_Hainaut                  18026
province_Flemish Brabant          18026
province_East Flanders            18026
province_Antwerp                  18026
fl_open_fire                      18026
fl_double_glazing                 18026
surface_land_sqm                  18026
fl_floodzone                      18026
fl_swimming_pool                  18026
fl_garden                         18026
nbr_bedrooms                      18026
fl_terrace                        18026
province_West Flanders            18026


In [None]:
# See unique values in dataframe
for column in df_encoded.columns:
    unique_values = df_encoded[column].unique()
    print(f"Unique values in column '{column}' is : {unique_values}")

In [None]:
# Instantiate SimpleImputer with strategy='mean' to impute NaN values with the mean
imputer = SimpleImputer(strategy='median')

# Fit and transform the data using SimpleImputer
X_train['total_area_sqm'] = imputer.fit_transform(X_train[['total_area_sqm']])

print(X_train['total_area_sqm'].unique())


## Linear Regressor model

In [247]:
# Create regressor and instantiate LinearRegression class
reg = LinearRegression()
print(type(reg))

X_train_reshaped = X_train['total_area_sqm'] .values.reshape(-1,1)
# Train the model with X_train and  y_train
reg.fit(X_train_reshaped, y_train)

<class 'sklearn.linear_model._base.LinearRegression'>


In [245]:
# Display score of training model
score = reg.score(X_train_reshaped, y_train)
print(score*100, "%")

1.6493678686115176 %
