In [366]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score

In [367]:
df = pd.read_csv("data/properties.csv")

In [368]:
df.columns

Index(['id', 'price', 'property_type', 'subproperty_type', 'region',
       'province', 'locality', 'zip_code', 'latitude', 'longitude',
       'construction_year', 'total_area_sqm', 'surface_land_sqm',
       'nbr_frontages', 'nbr_bedrooms', 'equipped_kitchen', 'fl_furnished',
       'fl_open_fire', 'fl_terrace', 'terrace_sqm', 'fl_garden', 'garden_sqm',
       'fl_swimming_pool', 'fl_floodzone', 'state_building',
       'primary_energy_consumption_sqm', 'epc', 'heating_type',
       'fl_double_glazing', 'cadastral_income'],
      dtype='object')

In [369]:
selected_columns = ['price', 
        'latitude',
        'longitude',            
        'subproperty_type', 
        'locality',
        'total_area_sqm', 
        'nbr_bedrooms',              
        'construction_year', 
        'surface_land_sqm',
        'nbr_frontages', 
        'terrace_sqm', 
        'garden_sqm',
        'fl_swimming_pool', 
        'fl_floodzone',
        'epc',
        'state_building'
        ]
df = df[selected_columns]
df.columns

Index(['price', 'latitude', 'longitude', 'subproperty_type', 'locality',
       'total_area_sqm', 'nbr_bedrooms', 'construction_year',
       'surface_land_sqm', 'nbr_frontages', 'terrace_sqm', 'garden_sqm',
       'fl_swimming_pool', 'fl_floodzone', 'epc', 'state_building'],
      dtype='object')

In [370]:
one_hot_encoded = pd.get_dummies(df['subproperty_type'], prefix='type')
# Concatenate the one-hot encoded columns with the original DataFrame
df = pd.concat([df, one_hot_encoded], axis=1)
# Drop the original categorical column
df.drop('subproperty_type', axis=1, inplace=True)
df.columns

Index(['price', 'latitude', 'longitude', 'locality', 'total_area_sqm',
       'nbr_bedrooms', 'construction_year', 'surface_land_sqm',
       'nbr_frontages', 'terrace_sqm', 'garden_sqm', 'fl_swimming_pool',
       'fl_floodzone', 'epc', 'state_building', 'type_APARTMENT',
       'type_APARTMENT_BLOCK', 'type_BUNGALOW', 'type_CASTLE', 'type_CHALET',
       'type_COUNTRY_COTTAGE', 'type_DUPLEX', 'type_EXCEPTIONAL_PROPERTY',
       'type_FARMHOUSE', 'type_FLAT_STUDIO', 'type_GROUND_FLOOR', 'type_HOUSE',
       'type_KOT', 'type_LOFT', 'type_MANOR_HOUSE', 'type_MANSION',
       'type_MIXED_USE_BUILDING', 'type_OTHER_PROPERTY', 'type_PENTHOUSE',
       'type_SERVICE_FLAT', 'type_TOWN_HOUSE', 'type_TRIPLEX', 'type_VILLA'],
      dtype='object')

In [371]:
one_hot_encoded = pd.get_dummies(df['locality'], prefix='locality')
# Concatenate the one-hot encoded columns with the original DataFrame
df = pd.concat([df, one_hot_encoded], axis=1)
# Drop the original categorical column
df.drop('locality', axis=1, inplace=True)
df.columns

Index(['price', 'latitude', 'longitude', 'total_area_sqm', 'nbr_bedrooms',
       'construction_year', 'surface_land_sqm', 'nbr_frontages', 'terrace_sqm',
       'garden_sqm', 'fl_swimming_pool', 'fl_floodzone', 'epc',
       'state_building', 'type_APARTMENT', 'type_APARTMENT_BLOCK',
       'type_BUNGALOW', 'type_CASTLE', 'type_CHALET', 'type_COUNTRY_COTTAGE',
       'type_DUPLEX', 'type_EXCEPTIONAL_PROPERTY', 'type_FARMHOUSE',
       'type_FLAT_STUDIO', 'type_GROUND_FLOOR', 'type_HOUSE', 'type_KOT',
       'type_LOFT', 'type_MANOR_HOUSE', 'type_MANSION',
       'type_MIXED_USE_BUILDING', 'type_OTHER_PROPERTY', 'type_PENTHOUSE',
       'type_SERVICE_FLAT', 'type_TOWN_HOUSE', 'type_TRIPLEX', 'type_VILLA',
       'locality_Aalst', 'locality_Antwerp', 'locality_Arlon', 'locality_Ath',
       'locality_Bastogne', 'locality_Brugge', 'locality_Brussels',
       'locality_Charleroi', 'locality_Dendermonde', 'locality_Diksmuide',
       'locality_Dinant', 'locality_Eeklo', 'locality_Gent',
  

In [372]:
one_hot_encoded = pd.get_dummies(df['epc'], prefix='epc')
# Concatenate the one-hot encoded columns with the original DataFrame
df = pd.concat([df, one_hot_encoded], axis=1)
# Drop the original categorical column
df.drop('epc', axis=1, inplace=True)
df.columns

Index(['price', 'latitude', 'longitude', 'total_area_sqm', 'nbr_bedrooms',
       'construction_year', 'surface_land_sqm', 'nbr_frontages', 'terrace_sqm',
       'garden_sqm', 'fl_swimming_pool', 'fl_floodzone', 'state_building',
       'type_APARTMENT', 'type_APARTMENT_BLOCK', 'type_BUNGALOW',
       'type_CASTLE', 'type_CHALET', 'type_COUNTRY_COTTAGE', 'type_DUPLEX',
       'type_EXCEPTIONAL_PROPERTY', 'type_FARMHOUSE', 'type_FLAT_STUDIO',
       'type_GROUND_FLOOR', 'type_HOUSE', 'type_KOT', 'type_LOFT',
       'type_MANOR_HOUSE', 'type_MANSION', 'type_MIXED_USE_BUILDING',
       'type_OTHER_PROPERTY', 'type_PENTHOUSE', 'type_SERVICE_FLAT',
       'type_TOWN_HOUSE', 'type_TRIPLEX', 'type_VILLA', 'locality_Aalst',
       'locality_Antwerp', 'locality_Arlon', 'locality_Ath',
       'locality_Bastogne', 'locality_Brugge', 'locality_Brussels',
       'locality_Charleroi', 'locality_Dendermonde', 'locality_Diksmuide',
       'locality_Dinant', 'locality_Eeklo', 'locality_Gent',
       'l

In [373]:
one_hot_encoded = pd.get_dummies(df['state_building'], prefix='state')
# Concatenate the one-hot encoded columns with the original DataFrame
df = pd.concat([df, one_hot_encoded], axis=1)
# Drop the original categorical column
df.drop('state_building', axis=1, inplace=True)
df.columns

Index(['price', 'latitude', 'longitude', 'total_area_sqm', 'nbr_bedrooms',
       'construction_year', 'surface_land_sqm', 'nbr_frontages', 'terrace_sqm',
       'garden_sqm', 'fl_swimming_pool', 'fl_floodzone', 'type_APARTMENT',
       'type_APARTMENT_BLOCK', 'type_BUNGALOW', 'type_CASTLE', 'type_CHALET',
       'type_COUNTRY_COTTAGE', 'type_DUPLEX', 'type_EXCEPTIONAL_PROPERTY',
       'type_FARMHOUSE', 'type_FLAT_STUDIO', 'type_GROUND_FLOOR', 'type_HOUSE',
       'type_KOT', 'type_LOFT', 'type_MANOR_HOUSE', 'type_MANSION',
       'type_MIXED_USE_BUILDING', 'type_OTHER_PROPERTY', 'type_PENTHOUSE',
       'type_SERVICE_FLAT', 'type_TOWN_HOUSE', 'type_TRIPLEX', 'type_VILLA',
       'locality_Aalst', 'locality_Antwerp', 'locality_Arlon', 'locality_Ath',
       'locality_Bastogne', 'locality_Brugge', 'locality_Brussels',
       'locality_Charleroi', 'locality_Dendermonde', 'locality_Diksmuide',
       'locality_Dinant', 'locality_Eeklo', 'locality_Gent',
       'locality_Halle-Vilvoorde',

In [374]:
# df = df.select_dtypes(include=['float64', 'int64'])
correlation = df.corr()
target_correlation = correlation['price'].sort_values(ascending=False)
relevant_features = target_correlation[abs(target_correlation) > 0.5].index.tolist()
print("Relevant features based on correlation:", relevant_features)
print(target_correlation)

Relevant features based on correlation: ['price']
price                 1.000000
nbr_bedrooms          0.352447
locality_Brugge       0.235392
type_VILLA            0.212326
total_area_sqm        0.199650
                        ...   
type_FLAT_STUDIO     -0.063868
locality_Charleroi   -0.072550
locality_Liège       -0.072635
longitude            -0.091013
type_APARTMENT       -0.107107
Name: price, Length: 96, dtype: float64


In [375]:
# Instantiate SimpleImputer with 'mean' strategy
imputer = SimpleImputer(strategy='mean')

# Fit and transform the column with missing values
df['total_area_sqm'] = imputer.fit_transform(df[['total_area_sqm']])
df['surface_land_sqm'] = imputer.fit_transform(df[['surface_land_sqm']])
df['construction_year'] = imputer.fit_transform(df[['construction_year']])
df['latitude'] = imputer.fit_transform(df[['latitude']])
df['longitude'] = imputer.fit_transform(df[['longitude']])

In [376]:
df['nbr_frontages'] = df['nbr_frontages'].fillna(1)
df['nbr_bedrooms'] = df['nbr_bedrooms'].fillna(0)
df['garden_sqm'] = df['garden_sqm'].fillna(0)
df['terrace_sqm'] = df['terrace_sqm'].fillna(0)
df['fl_swimming_pool'] = df['fl_swimming_pool'].fillna(0)
df.columns

Index(['price', 'latitude', 'longitude', 'total_area_sqm', 'nbr_bedrooms',
       'construction_year', 'surface_land_sqm', 'nbr_frontages', 'terrace_sqm',
       'garden_sqm', 'fl_swimming_pool', 'fl_floodzone', 'type_APARTMENT',
       'type_APARTMENT_BLOCK', 'type_BUNGALOW', 'type_CASTLE', 'type_CHALET',
       'type_COUNTRY_COTTAGE', 'type_DUPLEX', 'type_EXCEPTIONAL_PROPERTY',
       'type_FARMHOUSE', 'type_FLAT_STUDIO', 'type_GROUND_FLOOR', 'type_HOUSE',
       'type_KOT', 'type_LOFT', 'type_MANOR_HOUSE', 'type_MANSION',
       'type_MIXED_USE_BUILDING', 'type_OTHER_PROPERTY', 'type_PENTHOUSE',
       'type_SERVICE_FLAT', 'type_TOWN_HOUSE', 'type_TRIPLEX', 'type_VILLA',
       'locality_Aalst', 'locality_Antwerp', 'locality_Arlon', 'locality_Ath',
       'locality_Bastogne', 'locality_Brugge', 'locality_Brussels',
       'locality_Charleroi', 'locality_Dendermonde', 'locality_Diksmuide',
       'locality_Dinant', 'locality_Eeklo', 'locality_Gent',
       'locality_Halle-Vilvoorde',

In [377]:
X = df.drop(columns=['price'])  # Scaled features
y = df['price']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=155)

model = LinearRegression()
model.fit(X_train, y_train)

In [378]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(mse, r2)

103868957373.36295 0.3863177860740905
