## Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the csv file
df = pd.read_csv("../data/cleaned_properties.csv")

# Display the head
display(df.head())
df.shape
df.columns

# Filter the DataFrame for values APARTMENT and APARTMENT_BLOCK
df_house1 = df[df["property_type"] == "HOUSE"]
df_house2 = df_house1[df_house1['subproperty_type'] != 'APARTMENT_BLOCK']

df_house = df[(df["property_type"] == "HOUSE") & (df['subproperty_type'] != 'APARTMENT_BLOCK')]

df_house.head()
print(df_house.info())
print(df_house.shape)

df_house["subproperty_type"].unique()
print(df_house["locality"].unique())
df_house.isna().sum().sort_values(ascending=False)

Unnamed: 0,price,property_type,subproperty_type,region,province,locality,zip_code,construction_year,total_area_sqm,surface_land_sqm,...,fl_garden,garden_sqm,fl_swimming_pool,fl_floodzone,state_building,primary_energy_consumption_sqm,epc,heating_type,fl_double_glazing,cadastral_income
0,225000.0,APARTMENT,APARTMENT,Flanders,Antwerp,Antwerp,2050,1963.0,100.0,,...,0,0.0,0,0,,231.0,C,GAS,1,922.0
1,449000.0,HOUSE,HOUSE,Flanders,East Flanders,Gent,9185,,,680.0,...,0,0.0,0,0,,221.0,C,,1,406.0
2,335000.0,APARTMENT,APARTMENT,Brussels-Capital,Brussels,Brussels,1070,,142.0,,...,0,0.0,0,1,AS_NEW,,,GAS,0,
3,501000.0,HOUSE,HOUSE,Flanders,Antwerp,Turnhout,2275,2024.0,187.0,505.0,...,0,0.0,0,1,,99.0,A,,0,
4,982700.0,APARTMENT,DUPLEX,Wallonia,Walloon Brabant,Nivelles,1410,2022.0,169.0,,...,1,142.0,0,0,AS_NEW,19.0,A+,GAS,0,


<class 'pandas.core.frame.DataFrame'>
Index: 37211 entries, 1 to 75506
Data columns (total 27 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   price                           37211 non-null  float64
 1   property_type                   37211 non-null  object 
 2   subproperty_type                37211 non-null  object 
 3   region                          37211 non-null  object 
 4   province                        37211 non-null  object 
 5   locality                        37211 non-null  object 
 6   zip_code                        37211 non-null  int64  
 7   construction_year               19969 non-null  float64
 8   total_area_sqm                  32360 non-null  float64
 9   surface_land_sqm                37211 non-null  float64
 10  nbr_frontages                   29563 non-null  float64
 11  nbr_bedrooms                    37211 non-null  float64
 12  equipped_kitchen                20753

construction_year                 17242
cadastral_income                  17168
equipped_kitchen                  16458
heating_type                      13276
state_building                    11529
primary_energy_consumption_sqm    10070
terrace_sqm                        8818
epc                                8721
nbr_frontages                      7648
total_area_sqm                     4851
garden_sqm                         2361
fl_double_glazing                     0
fl_floodzone                          0
fl_swimming_pool                      0
fl_garden                             0
price                                 0
fl_terrace                            0
fl_open_fire                          0
property_type                         0
nbr_bedrooms                          0
surface_land_sqm                      0
zip_code                              0
locality                              0
province                              0
region                                0


## Splitting the data

Creating variables X and y: define the target and the feature.

In [3]:
# Name X and y (specific columns=subset(houses))
X = df_house.drop(columns=['price', 'subproperty_type', 'property_type', 'zip_code', 'locality', 'construction_year', 'cadastral_income'])
y = df_house['price']

# Print shape
print("X shape: ", X.shape)
print("y-shape: ", y.shape)

# Split the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

X_train.info()

X shape:  (37211, 20)
y-shape:  (37211,)
<class 'pandas.core.frame.DataFrame'>
Index: 29768 entries, 8081 to 31954
Data columns (total 20 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   region                          29768 non-null  object 
 1   province                        29768 non-null  object 
 2   total_area_sqm                  25900 non-null  float64
 3   surface_land_sqm                29768 non-null  float64
 4   nbr_frontages                   23670 non-null  float64
 5   nbr_bedrooms                    29768 non-null  float64
 6   equipped_kitchen                16580 non-null  object 
 7   fl_furnished                    29768 non-null  int64  
 8   fl_open_fire                    29768 non-null  int64  
 9   fl_terrace                      29768 non-null  int64  
 10  terrace_sqm                     22712 non-null  float64
 11  fl_garden                       29768 non-null  int64 

## 1. Imputing missing values

Impute missing values:
- numerical: mean
- categorical: most frequent

Function: Imputing missing values

Impute missing values in both X_train and x_test

So, the correct order of preprocessing steps is:
- Impute missing values
- Encode categorical columns
- Rescale numeric features

In [4]:
from sklearn.impute import SimpleImputer
import pandas as pd

def impute_data(X_train):
    """
    Imputes missing values in both numerical and categorical columns of the input DataFrame using SimpleImputer.

    Parameters:
    -----------
    X_train : pandas DataFrame
        Input DataFrame containing columns with missing values.

    Returns:
    --------
    pandas DataFrame
        DataFrame with missing values imputed.
    """
    # Select columns with numerical and categorical data
    numeric_cols = X_train.select_dtypes(exclude='object').columns.tolist()
    categorical_cols = X_train.select_dtypes(include='object').columns.tolist()

    # Impute missing values for numerical columns
    numeric_imputer = SimpleImputer(strategy='mean')  
    X_train[numeric_cols] = numeric_imputer.fit_transform(X_train[numeric_cols])

    # Impute missing values for categorical columns
    categorical_imputer = SimpleImputer(strategy='most_frequent')  
    X_train[categorical_cols] = categorical_imputer.fit_transform(X_train[categorical_cols])

    return X_train

# Example usage:
#X_train_imputed = impute_data(X_train)
print(X_train.isna().sum().sort_values(ascending=False))

equipped_kitchen                  13188
heating_type                      10644
state_building                     9176
primary_energy_consumption_sqm     8094
terrace_sqm                        7056
epc                                7002
nbr_frontages                      6098
total_area_sqm                     3868
garden_sqm                         1853
fl_floodzone                          0
fl_swimming_pool                      0
region                                0
fl_garden                             0
province                              0
fl_terrace                            0
fl_open_fire                          0
fl_furnished                          0
nbr_bedrooms                          0
surface_land_sqm                      0
fl_double_glazing                     0
dtype: int64


In [5]:
from sklearn.impute import SimpleImputer
import pandas as pd

def impute_data(X_train, X_test):
    """
    Imputes missing values in both numerical and categorical columns of the input DataFrames using SimpleImputer.

    Parameters:
    -----------
    X_train : pandas DataFrame
        Training DataFrame containing columns with missing values.
    X_test : pandas DataFrame
        Testing DataFrame containing columns with missing values.

    Returns:
    --------
    pandas DataFrame
        Training and testing DataFrames with missing values imputed.
    """
    # Select columns with numerical and categorical data in training set
    numeric_cols_train = X_train.select_dtypes(exclude='object').columns.tolist()
    categorical_cols_train = X_train.select_dtypes(include='object').columns.tolist()

    # Select columns with numerical and categorical data in testing set
    numeric_cols_test = X_test.select_dtypes(exclude='object').columns.tolist()
    categorical_cols_test = X_test.select_dtypes(include='object').columns.tolist()

    # Impute missing values for numerical columns in training set
    numeric_imputer_train = SimpleImputer(strategy='mean')  
    X_train[numeric_cols_train] = numeric_imputer_train.fit_transform(X_train[numeric_cols_train])

    # Impute missing values for categorical columns in training set
    categorical_imputer_train = SimpleImputer(strategy='most_frequent')  
    X_train[categorical_cols_train] = categorical_imputer_train.fit_transform(X_train[categorical_cols_train])

    # Impute missing values for numerical columns in testing set
    numeric_imputer_test = SimpleImputer(strategy='mean')  
    X_test[numeric_cols_test] = numeric_imputer_test.fit_transform(X_test[numeric_cols_test])

    # Impute missing values for categorical columns in testing set
    categorical_imputer_test = SimpleImputer(strategy='most_frequent')  
    X_test[categorical_cols_test] = categorical_imputer_test.fit_transform(X_test[categorical_cols_test])

    return X_train, X_test

# Apply imputing to training and test data
X_train_encoded, X_test_encoded = impute_data(X_train.copy(), X_test.copy())

## 2. Encoding categorical data

Function: Encode_data

In [6]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def encode_data(X_train):
    """
    Encodes categorical columns in the input DataFrame using OneHotEncoder.

    Parameters:
    -----------
    X_train : pandas DataFrame
        Input DataFrame containing categorical columns to be encoded.

    Returns:
    --------
    pandas DataFrame
        DataFrame with categorical columns encoded using one-hot encoding.
    """
    # Select the columns with categorical values
    categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

    # Initialize the encoder
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_array = encoder.fit_transform(X_train[categorical_cols])

    # Convert the encoded array into a DataFrame
    encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(categorical_cols))

    # Concatenate the encoded DataFrame with the original DataFrame
    result_df = pd.concat([X_train, encoded_df], axis=1)

    # Drop the original categorical columns if needed
    result_df.drop(columns=categorical_cols, axis=1, inplace=True)

    return result_df

    print(result_df.isna().sum().sort_values(ascending=False)) 

# Example usage:
# X_train_encoded = encode_data(X_train)


In [7]:
from sklearn.preprocessing import OneHotEncoder

def one_hot_encode(X_train, X_test):
    """
    Encodes categorical columns in the input DataFrames using OneHotEncoder.

    Parameters:
    -----------
    X_train : pandas DataFrame
        Input training DataFrame.
    X_test : pandas DataFrame
        Input test DataFrame.

    Returns:
    --------
    Tuple of pandas DataFrames
        Encoded training and test DataFrames.
    """
    # Select the columns with categorical values
    categorical_cols = X_train.select_dtypes(include=['object']).columns

    # Initialize the encoder
    encoder = OneHotEncoder(handle_unknown='ignore')

    # Fit and transform on training data
    X_train_ohe = encoder.fit_transform(X_train[categorical_cols])

    # Transform test data
    X_test_ohe = encoder.transform(X_test[categorical_cols])

    # Convert the encoded data into DataFrames
    X_train_ohe_df = pd.DataFrame(X_train_ohe.toarray(), columns=encoder.get_feature_names_out(categorical_cols))
    X_test_ohe_df = pd.DataFrame(X_test_ohe.toarray(), columns=encoder.get_feature_names_out(categorical_cols))

    # Drop original categorical columns from both training and test data
    X_train.drop(columns=categorical_cols, inplace=True)
    X_test.drop(columns=categorical_cols, inplace=True)

    # Concatenate encoded data with remaining data
    X_train_ohe = pd.concat([X_train.reset_index(drop=True), X_train_ohe_df.reset_index(drop=True)], axis=1)
    X_test_ohe = pd.concat([X_test.reset_index(drop=True), X_test_ohe_df.reset_index(drop=True)], axis=1)

    return X_train_ohe, X_test_ohe

# Apply one-hot encoding to training and test data
X_train_ohe, X_test_ohe = one_hot_encode(X_train.copy(), X_test.copy())


In [8]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def encode_data(X_train, X_test):
    """
    Encodes categorical columns in the input DataFrames using OneHotEncoder.

    Parameters:
    -----------
    X_train : pandas DataFrame
        Training DataFrame containing categorical columns to be encoded.
    X_test : pandas DataFrame
        Testing DataFrame containing categorical columns to be encoded.

    Returns:
    --------
    pandas DataFrame, pandas DataFrame
        DataFrames with categorical columns encoded using one-hot encoding for both training and testing datasets.
    """
    # Select the columns with categorical values in training set
    categorical_cols_train = X_train.select_dtypes(include=['object']).columns.tolist()

    # Select the columns with categorical values in testing set
    categorical_cols_test = X_test.select_dtypes(include=['object']).columns.tolist()

    # Initialize the encoder
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

    # Fit and transform encoder on training data
    encoded_array_train = encoder.fit_transform(X_train[categorical_cols_train])
    encoded_array_test = encoder.transform(X_test[categorical_cols_test])

    # Convert the encoded arrays into DataFrames
    encoded_df_train = pd.DataFrame(encoded_array_train, columns=encoder.get_feature_names_out(categorical_cols_train))
    encoded_df_test = pd.DataFrame(encoded_array_test, columns=encoder.get_feature_names_out(categorical_cols_test))

    # Concatenate the encoded DataFrames with the original DataFrames for both training and testing sets
    result_df_train = pd.concat([X_train, encoded_df_train], axis=1)
    result_df_test = pd.concat([X_test, encoded_df_test], axis=1)

    # Drop the original categorical columns if needed
    result_df_train.drop(columns=categorical_cols_train, axis=1, inplace=True)
    result_df_test.drop(columns=categorical_cols_test, axis=1, inplace=True)

    return result_df_train, result_df_test


## 3. Rescaling numeric features with standardization

In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def scale_numeric_features(X_train_ohe, X_test_ohe):
    """
    Scale the numeric features in the training and test datasets using StandardScaler.

    Parameters:
    - X_train_ohe (DataFrame): DataFrame containing the training data with both numeric and non-numeric features.
    - X_test_ohe (DataFrame): DataFrame containing the test data with both numeric and non-numeric features.

    Returns:
    - X_train_stdv (DataFrame): DataFrame containing the scaled numeric features concatenated with the non-numeric features for training data.
    - X_test_stdv (DataFrame): DataFrame containing the scaled numeric features concatenated with the non-numeric features for test data.
    """
    # Initialize the StandardScaler
    scaler = StandardScaler()

    # Select numeric features for training data
    numeric_features_train = X_train_ohe.select_dtypes(include=['float64', 'int64'])

    # Fit and transform the numeric features in the training set
    scaled_features_train = scaler.fit_transform(numeric_features_train)

    # Convert the scaled training features array back to a DataFrame
    scaled_df_train = pd.DataFrame(scaled_features_train, columns=numeric_features_train.columns)

    # Select numeric features for test data
    numeric_features_test = X_test_ohe.select_dtypes(include=['float64', 'int64'])

    # Transform the numeric features in the test set
    scaled_features_test = scaler.transform(numeric_features_test)

    # Convert the scaled test features array back to a DataFrame
    scaled_df_test = pd.DataFrame(scaled_features_test, columns=numeric_features_test.columns)

    # Concatenate scaled numeric features with other columns in the original DataFrames
    X_train_stdv = pd.concat([X_train_ohe.drop(numeric_features_train.columns, axis=1), scaled_df_train], axis=1)
    X_test_stdv = pd.concat([X_test_ohe.drop(numeric_features_test.columns, axis=1), scaled_df_test], axis=1)

    return X_train_stdv, X_test_stdv

# Example usage:
X_train_stdv, X_test_stdv = scale_numeric_features(X_train_ohe, X_test_ohe)


## Pipeline: Preprocessing data

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

def preprocess_data(X_train, X_test):
    """
    Preprocesses training and test data including imputation, encoding, and scaling.

    Parameters:
    -----------
    X_train : pandas DataFrame
        Input training DataFrame.
    X_test : pandas DataFrame
        Input test DataFrame.

    Returns:
    --------
    Tuple of pandas DataFrames
        Preprocessed training and test DataFrames.
    """
    # Separate numerical and categorical columns
    numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X_train.select_dtypes(include=['object']).columns

    # Define preprocessing steps for numerical and categorical data
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Fit and transform the preprocessing steps on training data
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Convert the processed data into DataFrames
    X_train_processed = pd.DataFrame(X_train_processed, columns=numeric_cols.tolist() +
                                     preprocessor.named_transformers_['cat']
                                     .named_steps['onehot'].get_feature_names_out(categorical_cols).tolist())
    X_test_processed = pd.DataFrame(X_test_processed, columns=numeric_cols.tolist() +
                                    preprocessor.named_transformers_['cat']
                                    .named_steps['onehot'].get_feature_names_out(categorical_cols).tolist())

    return X_train_processed, X_test_processed

# Preprocess training and test data
X_train_processed, X_test_processed = preprocess_data(X_train, X_test)


In [11]:
def preprocess_data(X_train, X_test):
    """Preprocesses training and test data including imputation, encoding, and scaling.

    Parameters:
        X_train (DataFrame): Input training DataFrame.
        X_test (DataFrame): Input test DataFrame.

    Returns:
        tuple of pandas DataFrames: Preprocessed training and test DataFrames.
    """    
    # Separate numerical and categorical columns
    numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X_train.select_dtypes(include=['object']).columns

    # Define preprocessing steps for numerical and categorical data
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Fit and transform the preprocessing steps on training data
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Convert the processed data into DataFrames
    X_train_processed = pd.DataFrame(X_train_processed, columns=numeric_cols.tolist() +
                                     preprocessor.named_transformers_['cat']
                                     .named_steps['onehot'].get_feature_names_out(categorical_cols).tolist())
    X_test_processed = pd.DataFrame(X_test_processed, columns=numeric_cols.tolist() +
                                    preprocessor.named_transformers_['cat']
                                    .named_steps['onehot'].get_feature_names_out(categorical_cols).tolist())

    return X_train_processed, X_test_processed, preprocessor

#  Call preprocess_data
X_train_processed, X_test_processed = preprocess_data(X_train, X_test)

# Save the object to a file
with open('preprocess.pkl', 'wb') as file:
    pickle.dump(preprocess_data, file)

# Save the object to a file
with open('preprocessor.pkl', 'wb') as file:
    pickle.dump(preprocessor, file)

ValueError: too many values to unpack (expected 2)

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

def preprocess_data(X_train):
    """ Preprocesses training and test data including imputation, encoding, and scaling.

    Parameters:
    -----------
    X_train : pandas DataFrame
        Input training DataFrame.

    Returns:
    --------
    pandas DataFrame
        Preprocessed training DataFrame.
    """
    # Separate numerical and categorical columns
    numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X_train.select_dtypes(include=['object']).columns

    # Define preprocessing steps for numerical and categorical data
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Fit and transform the preprocessing steps on training data
    X_train_processed = preprocessor.fit_transform(X_train)

    # Convert the processed data into DataFrames
    X_train_processed = pd.DataFrame(X_train_processed, columns=numeric_cols.tolist() +
                                     preprocessor.named_transformers_['cat']
                                     .named_steps['onehot'].get_feature_names_out(categorical_cols).tolist())
    print(preprocessor)
    return X_train_processed, preprocessor


def preprocess_data_for_test(X_test, preprocessor):
    """ Preprocesses test data including imputation, encoding, and scaling.

    Parameters:
    -----------
    X_test : pandas DataFrame
        Input test DataFrame.
    preprocessor : sklearn ColumnTransformer
        Fitted preprocessor used to transform the test data.

    Returns:
    --------
    pandas DataFrame
        Preprocessed test DataFrame.
    """
    # Transform the test data using the fitted preprocessor
    X_test_processed = preprocessor.transform(X_test)

    # Get the column names for the transformed data
    numeric_cols = X_test.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X_test.select_dtypes(include=['object']).columns

    transformed_columns = numeric_cols.tolist() + \
                          preprocessor.named_transformers_['cat'].named_steps['onehot'] \
                          .get_feature_names_out(categorical_cols).tolist()

    # Convert the processed data into a DataFrame
    X_test_processed = pd.DataFrame(X_test_processed, columns=transformed_columns)

    return X_test_processed

# Preprocess training data and obtain the preprocessor object
X_train_processed, preprocessor = preprocess_data(X_train)

# Preprocess test data using the preprocessor object
X_test_processed = preprocess_data_for_test(X_test, preprocessor)


ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 Index(['total_area_sqm', 'surface_land_sqm', 'nbr_frontages', 'nbr_bedrooms',
       'fl_furnished', 'fl_open_fire', 'fl_terrace', 'terrace_sqm',
       'fl_garden', 'garden_sqm', 'fl_swimming_pool', 'fl_floodzone',
       'primary_energy_consumption_sqm', 'fl_double_glazing'],
      dtype='object')),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 Index(['region', 'province', 'equipped_kitchen', 'state_building', 'epc',
       'hea

In [17]:
print(X_test.info())
print(X_test.columns.to_list())

<class 'pandas.core.frame.DataFrame'>
Index: 7443 entries, 25920 to 74318
Data columns (total 20 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   region                          7443 non-null   object 
 1   province                        7443 non-null   object 
 2   total_area_sqm                  6460 non-null   float64
 3   surface_land_sqm                7443 non-null   float64
 4   nbr_frontages                   5893 non-null   float64
 5   nbr_bedrooms                    7443 non-null   float64
 6   equipped_kitchen                4173 non-null   object 
 7   fl_furnished                    7443 non-null   int64  
 8   fl_open_fire                    7443 non-null   int64  
 9   fl_terrace                      7443 non-null   int64  
 10  terrace_sqm                     5681 non-null   float64
 11  fl_garden                       7443 non-null   int64  
 12  garden_sqm                      69

In [None]:
import pickle

# Save the preprocessor to a file
with open('preprocessor.pkl', 'wb') as file:
    pickle.dump(preprocessor, file)

# Save the preprocess_data_for_test to a file
with open('preprocessing.pkl', 'wb') as file:
    pickle.dump(preprocess_data_for_test, file)

# New dataframe

In [None]:
import pandas as pd

# Sample data for the new DataFrame
data = {
    'region': ['Flanders', 'Flanders', 'Brussels-Capital', 'Wallonia'],
    'province': ['East Flanders', 'Antwerp', 'Brussels', 'Namur'],
    'total_area_sqm': [125.5, 185.3, 110.7, 150.2],
    'surface_land_sqm': [680, 180, 505, 710],
    'nbr_frontages': [3, 4, 2, 3],
    'nbr_bedrooms': [3, 4, 2, 3],
    'equipped_kitchen': ['nan', 'HYPER_EQUIPPED', 'INSTALLED', 'USA_UNINSTALLED'],
    'fl_furnished': [1, 0, 1, 0],
    'fl_open_fire': [1, 0, 1, 0],
    'fl_terrace': [1, 1, 0, 1],
    'terrace_sqm': [20, 25, None, 30],
    'fl_garden': [1, 0, 1, 0],
    'garden_sqm': [50, None, 60, None],
    'fl_swimming_pool': [1, 0, 1, 0],
    'fl_floodzone': [0, 1, 0, 1],
    'state_building': ['GOOD', 'AS_NEW', None, 'GOOD'],
    'primary_energy_consumption_sqm': [150, None, 280, None],
    'epc': ['A', 'B', 'C', 'D'],
    'heating_type': ['GAS', 'ELECTRIC', 'FUELOIL', None],
    'fl_double_glazing': [1, 0, 1, 0]
}

# Create a new DataFrame
new_data_df = pd.DataFrame(data)

# Display the new DataFrame
print(new_data_df)


In [19]:
columns = df_house[['region', 'province', 'total_area_sqm', 'surface_land_sqm', 'nbr_frontages', 'nbr_bedrooms', 'equipped_kitchen', 'fl_furnished', 'fl_open_fire', 'fl_terrace', 'terrace_sqm', 'fl_garden', 'garden_sqm', 'fl_swimming_pool', 'fl_floodzone', 'state_building', 'primary_energy_consumption_sqm', 'epc', 'heating_type', 'fl_double_glazing']]

# See unique values of multiple columns
for column in columns:
    multi_columns = df_house[column].unique().T
    print(f"Unique values in column '{column}': {multi_columns}")


Unique values in column 'region': ['Flanders' 'Wallonia' 'Brussels-Capital']
Unique values in column 'province': ['East Flanders' 'Antwerp' 'Flemish Brabant' 'West Flanders' 'Hainaut'
 'Liège' 'Brussels' 'Luxembourg' 'Walloon Brabant' 'Namur' 'Limburg']
Unique values in column 'total_area_sqm': [       nan 1.8700e+02 1.5500e+02 2.7700e+02 3.0900e+02 1.5000e+02
 5.8300e+02 1.8500e+02 1.6000e+02 1.2700e+02 1.0900e+02 1.8000e+02
 6.0000e+02 2.2000e+02 7.9000e+01 2.0400e+02 1.2000e+02 4.8000e+02
 1.3000e+02 1.0000e+02 2.0000e+02 1.8100e+02 1.6600e+02 1.4600e+02
 8.6000e+01 1.4800e+02 2.1300e+02 2.4300e+02 2.3500e+02 2.3100e+02
 2.2900e+02 4.3100e+02 1.6200e+02 2.5000e+02 1.5000e+03 1.4500e+02
 1.6800e+02 2.5900e+02 2.1800e+02 1.9100e+02 9.0000e+02 1.3300e+02
 1.4400e+02 1.6100e+02 1.7500e+02 1.8300e+02 1.8600e+02 2.0600e+02
 1.9300e+02 1.7000e+02 1.5600e+02 1.5800e+02 5.3700e+02 2.5500e+02
 2.0500e+02 1.8400e+02 9.5000e+01 3.3000e+02 1.4300e+02 2.5100e+02
 2.8100e+02 2.9000e+02 1.7900e+02 