In [2]:
# Import libraries
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
#import matplotlib.pyplot as plt
#import seaborn as sns

In [3]:
# Load the cleaned property listings from disk
df = pd.read_csv("../data/cleaned_properties.csv")

# Keep only rows typed as HOUSE whose subtype is not APARTMENT_BLOCK
df_house = df.loc[
    df["property_type"].eq("HOUSE")
    & df["subproperty_type"].ne("APARTMENT_BLOCK")
]

In [4]:
# Target: the listing price
y = df_house['price']

# Features: every remaining column except the target and the fields dropped below
X = df_house.drop(
    columns=[
        'price',
        'subproperty_type',
        'property_type',
        'region',
        'locality',
        'construction_year',
        'cadastral_income',
        'nbr_frontages',
        'fl_floodzone',
    ]
)

# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
# Show the training feature names as a plain Python list
print(list(X_train.columns))

['province', 'zip_code', 'total_area_sqm', 'surface_land_sqm', 'nbr_bedrooms', 'equipped_kitchen', 'fl_furnished', 'fl_open_fire', 'fl_terrace', 'terrace_sqm', 'fl_garden', 'garden_sqm', 'fl_swimming_pool', 'state_building', 'primary_energy_consumption_sqm', 'epc', 'heating_type', 'fl_double_glazing']


In [16]:
# Print the training feature names as one comma-separated sentence
print(f"The features of X_train are:\n {', '.join(map(str, X_train.columns))}.\n")

The features of X_train are:
 province, zip_code, total_area_sqm, surface_land_sqm, nbr_bedrooms, equipped_kitchen, fl_furnished, fl_open_fire, fl_terrace, terrace_sqm, fl_garden, garden_sqm, fl_swimming_pool, state_building, primary_energy_consumption_sqm, epc, heating_type, fl_double_glazing.



In [17]:
# Gather the object-dtype columns of X, then list the distinct values found in each
categorical_cols = list(X.select_dtypes(include=['object']).columns)
for column in categorical_cols:
    unique_values = X.loc[:, column].unique()
    print(f"Unique values in '{column}' are : {unique_values}")

Unique values in 'province' are : ['East Flanders' 'Antwerp' 'Flemish Brabant' 'West Flanders' 'Hainaut'
 'Liège' 'Brussels' 'Luxembourg' 'Walloon Brabant' 'Namur' 'Limburg']
Unique values in 'equipped_kitchen' are : [nan 'HYPER_EQUIPPED' 'INSTALLED' 'USA_UNINSTALLED' 'SEMI_EQUIPPED'
 'USA_HYPER_EQUIPPED' 'NOT_INSTALLED' 'USA_INSTALLED' 'USA_SEMI_EQUIPPED']
Unique values in 'state_building' are : [nan 'AS_NEW' 'GOOD' 'TO_RENOVATE' 'JUST_RENOVATED' 'TO_BE_DONE_UP'
 'TO_RESTORE']
Unique values in 'epc' are : ['C' 'A' nan 'D' 'E' 'B' 'G' 'F' 'A++' 'A+']
Unique values in 'heating_type' are : [nan 'FUELOIL' 'GAS' 'PELLET' 'ELECTRIC' 'CARBON' 'WOOD' 'SOLAR']


In [21]:
# Gather the integer-dtype columns of X, then list the distinct values found in each
num_cols = list(X.select_dtypes(include=['int']).columns)
for column in num_cols:
    unique_values = X.loc[:, column].unique()
    print(f"Unique values in '{column}' are : {unique_values}")

Unique values in 'zip_code' are : [9185 2275 1700 ... 5021 9572 6533]
Unique values in 'fl_furnished' are : [0 1]
Unique values in 'fl_open_fire' are : [0 1]
Unique values in 'fl_terrace' are : [0 1]
Unique values in 'fl_garden' are : [0 1]
Unique values in 'fl_swimming_pool' are : [0 1]
Unique values in 'fl_double_glazing' are : [1 0]


In [18]:
# Gather the float-dtype columns of X and report the value range of each one
numeric_cols = list(X.select_dtypes(include=['float']).columns)
for column in numeric_cols:
    max_value, min_value = X[column].max(), X[column].min()
    print(f"Column '{column}': Max value = {max_value}, Min value = {min_value}")

Column 'total_area_sqm': Max value = 15348.0, Min value = 7.0
Column 'surface_land_sqm': Max value = 950774.0, Min value = 0.0
Column 'nbr_bedrooms': Max value = 200.0, Min value = 0.0
Column 'terrace_sqm': Max value = 3466.0, Min value = 0.0
Column 'garden_sqm': Max value = 150000.0, Min value = 0.0
Column 'primary_energy_consumption_sqm': Max value = 20231122.0, Min value = -99.0


In [20]:
# Report the value range of zip_code separately (it is an integer column, so the float loop skips it)
max_value, min_value = X['zip_code'].max(), X['zip_code'].min()
print(f"Column '{'zip_code'}': Max value = {max_value}, Min value = {min_value}")

Column 'zip_code': Max value = 9992, Min value = 1000
