## Libraries

In [19]:
# Import libraries
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
# Read the csv file
df = pd.read_csv("../data/cleaned_properties.csv")

# Display the head
display(df.head())
df.shape
df.columns

# Filter the DataFrame for values APARTMENT and APARTMENT_BLOCK
df_house1 = df[df["property_type"] == "HOUSE"]
df_house2 = df_house1[df_house1['subproperty_type'] != 'APARTMENT_BLOCK']

df_house = df[(df["property_type"] == "HOUSE") & (df['subproperty_type'] != 'APARTMENT_BLOCK')]

df_house.head()
print(df_house.info())
print(df_house.shape)

df_house["subproperty_type"].unique()
print(df_house["locality"].unique())
df_house.isna().sum().sort_values(ascending=False)

Unnamed: 0,price,property_type,subproperty_type,region,province,locality,zip_code,construction_year,total_area_sqm,surface_land_sqm,...,fl_garden,garden_sqm,fl_swimming_pool,fl_floodzone,state_building,primary_energy_consumption_sqm,epc,heating_type,fl_double_glazing,cadastral_income
0,225000.0,APARTMENT,APARTMENT,Flanders,Antwerp,Antwerp,2050,1963.0,100.0,,...,0,0.0,0,0,,231.0,C,GAS,1,922.0
1,449000.0,HOUSE,HOUSE,Flanders,East Flanders,Gent,9185,,,680.0,...,0,0.0,0,0,,221.0,C,,1,406.0
2,335000.0,APARTMENT,APARTMENT,Brussels-Capital,Brussels,Brussels,1070,,142.0,,...,0,0.0,0,1,AS_NEW,,,GAS,0,
3,501000.0,HOUSE,HOUSE,Flanders,Antwerp,Turnhout,2275,2024.0,187.0,505.0,...,0,0.0,0,1,,99.0,A,,0,
4,982700.0,APARTMENT,DUPLEX,Wallonia,Walloon Brabant,Nivelles,1410,2022.0,169.0,,...,1,142.0,0,0,AS_NEW,19.0,A+,GAS,0,


<class 'pandas.core.frame.DataFrame'>
Index: 37211 entries, 1 to 75506
Data columns (total 27 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   price                           37211 non-null  float64
 1   property_type                   37211 non-null  object 
 2   subproperty_type                37211 non-null  object 
 3   region                          37211 non-null  object 
 4   province                        37211 non-null  object 
 5   locality                        37211 non-null  object 
 6   zip_code                        37211 non-null  int64  
 7   construction_year               19969 non-null  float64
 8   total_area_sqm                  32360 non-null  float64
 9   surface_land_sqm                37211 non-null  float64
 10  nbr_frontages                   29563 non-null  float64
 11  nbr_bedrooms                    37211 non-null  float64
 12  equipped_kitchen                20753

construction_year                 17242
cadastral_income                  17168
equipped_kitchen                  16458
heating_type                      13276
state_building                    11529
primary_energy_consumption_sqm    10070
terrace_sqm                        8818
epc                                8721
nbr_frontages                      7648
total_area_sqm                     4851
garden_sqm                         2361
fl_double_glazing                     0
fl_floodzone                          0
fl_swimming_pool                      0
fl_garden                             0
price                                 0
fl_terrace                            0
fl_open_fire                          0
property_type                         0
nbr_bedrooms                          0
surface_land_sqm                      0
zip_code                              0
locality                              0
province                              0
region                                0


## Splitting the data

Creating variables X and y: define the target and the feature.

In [21]:
# Name X and y (specific columns=subset(houses))
X = df_house.drop(columns=['price', 'subproperty_type', 'property_type', 'zip_code', 'locality', 'construction_year', 'cadastral_income'])
y = df_house['price']

# Print shape
print("X shape: ", X.shape)
print("y-shape: ", y.shape)

# Split the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

X_train.info()

X shape:  (37211, 20)
y-shape:  (37211,)
<class 'pandas.core.frame.DataFrame'>
Index: 29768 entries, 8081 to 31954
Data columns (total 20 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   region                          29768 non-null  object 
 1   province                        29768 non-null  object 
 2   total_area_sqm                  25900 non-null  float64
 3   surface_land_sqm                29768 non-null  float64
 4   nbr_frontages                   23670 non-null  float64
 5   nbr_bedrooms                    29768 non-null  float64
 6   equipped_kitchen                16580 non-null  object 
 7   fl_furnished                    29768 non-null  int64  
 8   fl_open_fire                    29768 non-null  int64  
 9   fl_terrace                      29768 non-null  int64  
 10  terrace_sqm                     22712 non-null  float64
 11  fl_garden                       29768 non-null  int64 

## 1. Imputing missing values

Impute missing values:
- numerical: mean
- categorical: most frequent

Function: Imputing missing values

In [22]:
from sklearn.impute import SimpleImputer
import pandas as pd

def impute_data(X_train):
    """
    Imputes missing values in both numerical and categorical columns of the input DataFrame using SimpleImputer.

    Parameters:
    -----------
    X_train : pandas DataFrame
        Input DataFrame containing columns with missing values.

    Returns:
    --------
    pandas DataFrame
        DataFrame with missing values imputed.
    """
    # Select columns with numerical and categorical data
    numeric_cols = X_train.select_dtypes(exclude='object').columns.tolist()
    categorical_cols = X_train.select_dtypes(include='object').columns.tolist()

    # Impute missing values for numerical columns
    numeric_imputer = SimpleImputer(strategy='mean')  
    X_train[numeric_cols] = numeric_imputer.fit_transform(X_train[numeric_cols])

    # Impute missing values for categorical columns
    categorical_imputer = SimpleImputer(strategy='most_frequent')  
    X_train[categorical_cols] = categorical_imputer.fit_transform(X_train[categorical_cols])

    return X_train

# Example usage:
#X_train_imputed = impute_data(X_train)
print(X_train.isna().sum().sort_values(ascending=False))

equipped_kitchen                  13188
heating_type                      10644
state_building                     9176
primary_energy_consumption_sqm     8094
terrace_sqm                        7056
epc                                7002
nbr_frontages                      6098
total_area_sqm                     3868
garden_sqm                         1853
fl_floodzone                          0
fl_swimming_pool                      0
region                                0
fl_garden                             0
province                              0
fl_terrace                            0
fl_open_fire                          0
fl_furnished                          0
nbr_bedrooms                          0
surface_land_sqm                      0
fl_double_glazing                     0
dtype: int64


In [23]:
# Impute missing values in both X_train and x_test
X_train_imputed = impute_data(X_train)
X_test_imputed = impute_data(X_test)

## 2. Rescaling data

Function: Encode_data

In [24]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def encode_data(X_train):
    """
    Encodes categorical columns in the input DataFrame using OneHotEncoder.

    Parameters:
    -----------
    X_train : pandas DataFrame
        Input DataFrame containing categorical columns to be encoded.

    Returns:
    --------
    pandas DataFrame
        DataFrame with categorical columns encoded using one-hot encoding.
    """
    # Select the columns with categorical values
    categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

    # Initialize the encoder
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_array = encoder.fit_transform(X_train[categorical_cols])

    # Convert the encoded array into a DataFrame
    encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(categorical_cols))

    # Concatenate the encoded DataFrame with the original DataFrame
    result_df = pd.concat([X_train, encoded_df], axis=1)

    # Drop the original categorical columns if needed
    result_df.drop(columns=categorical_cols, axis=1, inplace=True)

    return result_df

    print(result_df.isna().sum().sort_values(ascending=False)) 

# Example usage:
# X_train_encoded = encode_data(X_train)


In [25]:
result_df.isna().sum().sort_values(ascending=False)

NameError: name 'result_df' is not defined


## Pipeline: Preprocessing data

In [26]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

def preprocess_data(X_train, X_test):
    """
    Preprocesses training and test data including imputation, encoding, and scaling.

    Parameters:
    -----------
    X_train : pandas DataFrame
        Input training DataFrame.
    X_test : pandas DataFrame
        Input test DataFrame.

    Returns:
    --------
    Tuple of pandas DataFrames
        Preprocessed training and test DataFrames.
    """
    # Separate numerical and categorical columns
    numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X_train.select_dtypes(include=['object']).columns

    # Define preprocessing steps for numerical and categorical data
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Fit and transform the preprocessing steps on training data
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Convert the processed data into DataFrames
    X_train_processed = pd.DataFrame(X_train_processed, columns=numeric_cols.tolist() +
                                     preprocessor.named_transformers_['cat']
                                     .named_steps['onehot'].get_feature_names_out(categorical_cols).tolist())
    X_test_processed = pd.DataFrame(X_test_processed, columns=numeric_cols.tolist() +
                                    preprocessor.named_transformers_['cat']
                                    .named_steps['onehot'].get_feature_names_out(categorical_cols).tolist())

    return X_train_processed, X_test_processed

# Preprocess training and test data
X_train_processed, X_test_processed = preprocess_data(X_train, X_test)


## 3. Rescaling numeric features (hint: standardization)

In [27]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# X_train_processed contains numeric features
numeric_features = X_train_processed.select_dtypes(include=['float64', 'int64'])

# Fit and transform the numeric features in the training set
scaled_features = scaler.fit_transform(numeric_features)

# Convert the scaled features array back to a DataFrame
scaled_df = pd.DataFrame(scaled_features, columns=numeric_features.columns)

# Concatenate scaled numeric features with other columns in the original DataFrame
# Assuming X_train_processed contains both categorical and numeric features
final_df = pd.concat([X_train_processed.drop(numeric_features.columns, axis=1), scaled_df], axis=1)

# Use final_df for your linear regression model

# Transform the numeric features in the test set using the parameters learned from the training set
X_test_scaled = scaler.transform(X_test.select_dtypes(include=['float64', 'int64']))


# Combine the scaled numeric features with the encoded categorical features
# You may need to concatenate these with the encoded categorical features from step 2
# Depending on how you've encoded the categorical features

# Now, X_train_scaled and X_test_scaled contain the rescaled numeric features
# You can use these in your linear regression model


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- epc_A
- epc_A+
- epc_A++
- epc_B
- epc_C
- ...


## Linear Regressor model

OneHotEncoder

In [28]:
from sklearn.preprocessing import OneHotEncoder

def one_hot_encode(X_train, X_test):
    """
    Encodes categorical columns in the input DataFrames using OneHotEncoder.

    Parameters:
    -----------
    X_train : pandas DataFrame
        Input training DataFrame.
    X_test : pandas DataFrame
        Input test DataFrame.

    Returns:
    --------
    Tuple of pandas DataFrames
        Encoded training and test DataFrames.
    """
    # Select the columns with categorical values
    categorical_cols = X_train.select_dtypes(include=['object']).columns

    # Initialize the encoder
    encoder = OneHotEncoder(handle_unknown='ignore')

    # Fit and transform on training data
    X_train_encoded = encoder.fit_transform(X_train[categorical_cols])

    # Transform test data
    X_test_encoded = encoder.transform(X_test[categorical_cols])

    # Convert the encoded data into DataFrames
    X_train_encoded_df = pd.DataFrame(X_train_encoded.toarray(), columns=encoder.get_feature_names_out(categorical_cols))
    X_test_encoded_df = pd.DataFrame(X_test_encoded.toarray(), columns=encoder.get_feature_names_out(categorical_cols))

    # Drop original categorical columns from both training and test data
    X_train.drop(columns=categorical_cols, inplace=True)
    X_test.drop(columns=categorical_cols, inplace=True)

    # Concatenate encoded data with remaining data
    X_train_final = pd.concat([X_train.reset_index(drop=True), X_train_encoded_df.reset_index(drop=True)], axis=1)
    X_test_final = pd.concat([X_test.reset_index(drop=True), X_test_encoded_df.reset_index(drop=True)], axis=1)

    return X_train_final, X_test_final

# Apply one-hot encoding to training and test data
X_train_encoded, X_test_encoded = one_hot_encode(X_train.copy(), X_test.copy())


Linear Regression: imputing NaN, OneHotEncoder

In [29]:
from sklearn.linear_model import LinearRegression

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model using the processed X_train and y_train
model.fit(X_train_processed, y_train)

# Display score of training model
training_score = model.score(X_train_processed, y_train)
print(training_score*100, "%")


# Once the model is trained, you can use it to make predictions on new data, 
# for example, the processed X_test
predictions = model.predict(X_test_processed)
print(type(predictions))
display(predictions)

# Display score of test model
testing_score = model.score(X_test_processed, y_test)
print(testing_score*100, "%")

print("Training R^2 score:", training_score*100, "%")
print("Testing R^2 score:", testing_score*100, "%")

32.56506221423302 %
<class 'numpy.ndarray'>


array([518912., 203520., 643072., ..., 781312., 360960., 343552.])

41.21152166944702 %
Training R^2 score: 32.56506221423302 %
Testing R^2 score: 41.21152166944702 %


Linear Regression: imputing NaN, OneHotEncoder, StandardScaler

In [None]:
from sklearn.linear_model import LinearRegression

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model using the processed X_train and y_train
model.fit(final_df, y_train)

# Display score of training model
training_score = model.score(final_df, y_train)
print(training_score*100, "%")


# Once the model is trained, you can use it to make predictions on new data, 
# for example, the processed X_test
predictions = model.predict(X_test_scaled)
print(type(predictions))
display(predictions)

# Display score of test model
testing_score = model.score(X_test_scaled, y_test)
print(testing_score*100, "%")

print("Training R^2 score:", training_score*100, "%")
print("Testing R^2 score:", testing_score*100, "%")

NameError: name 'final_df' is not defined

RandomForestRegressor