In [21]:
# Import libraries
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
#import matplotlib.pyplot as plt
#import seaborn as sns

In [9]:
# Read the csv file
df = pd.read_csv("../data/cleaned_properties.csv")

df_house = df[(df["property_type"] == "HOUSE") & (df['subproperty_type'] != 'APARTMENT_BLOCK')]

print(df_house.columns.to_list())

['price', 'property_type', 'subproperty_type', 'region', 'province', 'locality', 'zip_code', 'construction_year', 'total_area_sqm', 'surface_land_sqm', 'nbr_frontages', 'nbr_bedrooms', 'equipped_kitchen', 'fl_furnished', 'fl_open_fire', 'fl_terrace', 'terrace_sqm', 'fl_garden', 'garden_sqm', 'fl_swimming_pool', 'fl_floodzone', 'state_building', 'primary_energy_consumption_sqm', 'epc', 'heating_type', 'fl_double_glazing', 'cadastral_income']


In [10]:
# Name X and y
X = df_house.drop(columns=['price', 'subproperty_type', 'property_type', 'region', 'locality', 'construction_year', 'cadastral_income', 'nbr_frontages', 'fl_floodzone'])
y = df_house['price']

# Split the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [20]:
print(X_train.columns.tolist())
print(X_train.info())
display(df_house.describe().T)

['province', 'zip_code', 'total_area_sqm', 'surface_land_sqm', 'nbr_bedrooms', 'equipped_kitchen', 'fl_furnished', 'fl_open_fire', 'fl_terrace', 'terrace_sqm', 'fl_garden', 'garden_sqm', 'fl_swimming_pool', 'state_building', 'primary_energy_consumption_sqm', 'epc', 'heating_type', 'fl_double_glazing']
<class 'pandas.core.frame.DataFrame'>
Index: 29768 entries, 8081 to 31954
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   province                        29768 non-null  object 
 1   zip_code                        29768 non-null  int64  
 2   total_area_sqm                  25900 non-null  float64
 3   surface_land_sqm                29768 non-null  float64
 4   nbr_bedrooms                    29768 non-null  float64
 5   equipped_kitchen                16580 non-null  object 
 6   fl_furnished                    29768 non-null  int64  
 7   fl_open_fire                    2976

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,37211.0,459510.222569,498247.770532,76000.0,259000.0,363000.0,498000.0,22500000.0
zip_code,37211.0,5469.416866,2877.401608,1000.0,2610.0,5330.0,8370.0,9992.0
construction_year,19969.0,1969.94822,45.054128,1753.0,1945.0,1972.0,2012.0,2024.0
total_area_sqm,32360.0,216.498053,309.408879,7.0,141.0,176.0,230.0,15348.0
surface_land_sqm,37211.0,1196.208164,10381.255815,0.0,161.0,379.0,830.0,950774.0
nbr_frontages,29563.0,3.005953,0.894377,1.0,2.0,3.0,4.0,47.0
nbr_bedrooms,37211.0,3.453387,1.943435,0.0,3.0,3.0,4.0,200.0
fl_furnished,37211.0,0.011018,0.104389,0.0,0.0,0.0,0.0,1.0
fl_open_fire,37211.0,0.192766,0.394476,0.0,0.0,0.0,0.0,1.0
fl_terrace,37211.0,0.492381,0.499949,0.0,0.0,0.0,1.0,1.0


In [16]:
print(f"The features of X_train are:\n {', '.join([str(feature)for feature in X_train.columns])}.\n")

The features of X_train are:
 province, zip_code, total_area_sqm, surface_land_sqm, nbr_bedrooms, equipped_kitchen, fl_furnished, fl_open_fire, fl_terrace, terrace_sqm, fl_garden, garden_sqm, fl_swimming_pool, state_building, primary_energy_consumption_sqm, epc, heating_type, fl_double_glazing.



In [17]:
categorical_cols = X.select_dtypes(include='object').columns.tolist()
for column in categorical_cols:
    unique_values = X[column].unique()
    print(f"Unique values in '{column}' are : {unique_values}")

Unique values in 'province' are : ['East Flanders' 'Antwerp' 'Flemish Brabant' 'West Flanders' 'Hainaut'
 'Liège' 'Brussels' 'Luxembourg' 'Walloon Brabant' 'Namur' 'Limburg']
Unique values in 'equipped_kitchen' are : [nan 'HYPER_EQUIPPED' 'INSTALLED' 'USA_UNINSTALLED' 'SEMI_EQUIPPED'
 'USA_HYPER_EQUIPPED' 'NOT_INSTALLED' 'USA_INSTALLED' 'USA_SEMI_EQUIPPED']
Unique values in 'state_building' are : [nan 'AS_NEW' 'GOOD' 'TO_RENOVATE' 'JUST_RENOVATED' 'TO_BE_DONE_UP'
 'TO_RESTORE']
Unique values in 'epc' are : ['C' 'A' nan 'D' 'E' 'B' 'G' 'F' 'A++' 'A+']
Unique values in 'heating_type' are : [nan 'FUELOIL' 'GAS' 'PELLET' 'ELECTRIC' 'CARBON' 'WOOD' 'SOLAR']


In [21]:
num_cols = X.select_dtypes(include='int').columns.tolist()
for column in num_cols:
    unique_values = X[column].unique()
    print(f"Unique values in '{column}' are : {unique_values}")

Unique values in 'zip_code' are : [9185 2275 1700 ... 5021 9572 6533]
Unique values in 'fl_furnished' are : [0 1]
Unique values in 'fl_open_fire' are : [0 1]
Unique values in 'fl_terrace' are : [0 1]
Unique values in 'fl_garden' are : [0 1]
Unique values in 'fl_swimming_pool' are : [0 1]
Unique values in 'fl_double_glazing' are : [1 0]


In [18]:
numeric_cols = X.select_dtypes(include='float').columns.tolist()
for column in numeric_cols:
    max_value = X[column].max()
    min_value = X[column].min()
    print(f"Column '{column}': Max value = {max_value}, Min value = {min_value}")

Column 'total_area_sqm': Max value = 15348.0, Min value = 7.0
Column 'surface_land_sqm': Max value = 950774.0, Min value = 0.0
Column 'nbr_bedrooms': Max value = 200.0, Min value = 0.0
Column 'terrace_sqm': Max value = 3466.0, Min value = 0.0
Column 'garden_sqm': Max value = 150000.0, Min value = 0.0
Column 'primary_energy_consumption_sqm': Max value = 20231122.0, Min value = -99.0


In [20]:
max_value = X['zip_code'].max()
min_value = X['zip_code'].min()
print(f"Column '{'zip_code'}': Max value = {max_value}, Min value = {min_value}")

Column 'zip_code': Max value = 9992, Min value = 1000


In [12]:
column_name = 'zip_code'  # Specify the column name you want to check
if X_train[column_name].isnull().values.any():
    print(f"There are NaN values in column '{column_name}'")
else:
    print(f"No NaN values in column '{column_name}'")

No NaN values in column 'zip_code'


In [15]:
num_cols = X.select_dtypes(include='float').columns.tolist()
for column in num_cols:
    unique_values = sorted(X[column].unique())
    print(f"Unique values in '{column}' are : {unique_values}")

Unique values in 'total_area_sqm' are : [nan, 7.0, 13.0, 24.0, 25.0, 28.0, 29.0, 30.0, 32.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0, 128.0, 129.0, 130.0, 131.0, 132.0, 133.0, 134.0, 135.0, 136.0, 137.0, 138.0, 139.0, 140.0, 141.0, 142.0, 143.0, 144.0, 145.0, 146.0, 147.0, 148.0, 149.0, 150.0, 151.0, 152.0, 153.0, 154.0, 155.0, 156.0, 157.0, 158.0, 159.0, 160.0, 161.0, 162.0, 163.0, 164.0, 165.0, 166.0, 167.0, 168.0, 169.0, 170.0, 171.0, 172.0, 173.0,

In [None]:
# Define row

#row = np.array([province, zip_code, total_area_sqm, surface_land_sqm, 
#nbr_bedrooms, equipped_kitchen, fl_furnished, fl_open_fire, fl_terrace, terrace_sqm, 
#fl_garden, garden_sqm, fl_swimming_pool, state_building, 
#primary_energy_consumption_sqm, epc, heating_type, fl_double_glazing]) 

# Define columns
# columns = X.columns.tolist()

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import pickle
import gzip
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error

In [29]:

numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Define preprocessing steps for numerical and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Fit and transform the preprocessing steps on training data
preprocessed = preprocessor.fit(X_train)

X_train_processed = preprocessed.transform(X_train)
X_test_processed = preprocessed.transform(X_test)

print(type(X_train_processed))
print(type(X_test_processed))


<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [None]:
with gzip.open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessed, f)

In [31]:
regressor = RandomForestRegressor(random_state=42)
    # Train the model using the processed X_train and y_train
regressor.fit(X_train_processed, y_train)

In [None]:
# Save the model to a file
with gzip.open('random_forest_regressor.pkl', 'wb') as f:
    pickle.dump(regressor, f)

In [33]:
# Predict
regressor.predict(X_test_processed)

y_train_pred = regressor.predict(X_train_processed)
y_test_pred = regressor.predict(X_test_processed)

# Evaluation
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

In [34]:
print("Random Forest Regressor Model Evaluation:")
print("Training score: {:.2f} %".format(train_r2*100))
print("Testing score: {:.2f} %".format(test_r2*100))

Random Forest Regressor Model Evaluation:
Training score: 95.58 %
Testing score: 73.96 %


In [None]:

# Save the model to a file
with gzip.open(r"model\random_forest_regressor.pkl", 'wb') as f:
    pickle.dump(regressor, f)