## Libraries

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the cleaned property listings.
df = pd.read_csv("../data/cleaned_properties.csv")

# Preview the raw data. Only the last expression of a cell is rendered
# automatically, so intermediate results are shown explicitly.
display(df.head())
print(df.shape)
print(df.columns.tolist())

# Keep houses only: rows whose property_type is HOUSE, excluding the
# APARTMENT_BLOCK subtype. The two-step variant is kept for reference and
# selects the same rows as the single combined mask below.
df_house1 = df[df["property_type"] == "HOUSE"]
df_house2 = df_house1[df_house1['subproperty_type'] != 'APARTMENT_BLOCK']

df_house = df[(df["property_type"] == "HOUSE") & (df['subproperty_type'] != 'APARTMENT_BLOCK')]

display(df_house.head())
# info() prints its report itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line to the output).
df_house.info()
print(df_house.shape)

print(df_house["subproperty_type"].unique())
print(df_house["locality"].unique())

# Missing values per column, most incomplete columns first (cell output).
df_house.isna().sum().sort_values(ascending=False)

## Splitting the data

Creating variables X and y: define the features (X) and the target (y, the price).

In [None]:
# Target: the price column of the house subset.
y = df_house['price']

# Features: the house subset without the target and without the columns
# that are left out of the model.
X = df_house.drop(
    columns=[
        'price',
        'subproperty_type',
        'property_type',
        'zip_code',
        'locality',
        'construction_year',
        'cadastral_income',
    ]
)

# Report the dimensions of both objects
print("X shape: ", X.shape)
print("y-shape: ", y.shape)

# Hold out 20% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Overview of the training features
X_train.info()
print(list(X_train.columns))

## 1. Imputing missing values

Impute missing values:
- numerical: mean
- categorical: most frequent

In [None]:
from sklearn.impute import SimpleImputer
import pandas as pd

# Work on copies of the splits so that X_train / X_test keep their original
# (un-imputed) content and this cell gives the same result on every re-run.
# NOTE: `df` is rebound here; from this point on it no longer refers to the
# frame that was read from the CSV file.
df = X_train.copy()
df2 = X_test.copy()

# Select columns with numerical and categorical data
numeric_cols = df.select_dtypes(exclude='object').columns.tolist()
categorical_cols = df.select_dtypes(include='object').columns.tolist()

# Numerical columns: fill with the column mean. The imputer is fitted on the
# training copy only and then applied unchanged to the test copy, so no
# statistics are derived from the test data.
numeric_imputer = SimpleImputer(strategy='mean')  # other options: 'median', 'most_frequent', 'constant'
df[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])
df2[numeric_cols] = numeric_imputer.transform(df2[numeric_cols])

# Categorical columns: fill with the most frequent value of the training copy.
categorical_imputer = SimpleImputer(strategy='most_frequent')  # other option: 'constant'
df[categorical_cols] = categorical_imputer.fit_transform(df[categorical_cols])
df2[categorical_cols] = categorical_imputer.transform(df2[categorical_cols])

# Display the first few rows of the training and testing sets after imputation
print("Training set after imputation:")
display(df.head())
print("\nTesting set after imputation:")
display(df2.head())


Function: Imputing missing values for X_train

In [None]:
from sklearn.impute import SimpleImputer
import pandas as pd

def impute_data(X_train):
    """
    Impute missing values in a DataFrame with SimpleImputer.

    Columns of dtype ``object`` are filled with their most frequent value;
    all other columns are filled with their mean.

    The input frame is modified in place (unchanged behaviour) and is now
    also returned, as the documented contract promises. Pass a copy if the
    original must stay untouched.

    Parameters:
    -----------
    X_train : pandas DataFrame
        Input DataFrame containing columns with missing values.

    Returns:
    --------
    pandas DataFrame
        The same DataFrame object, with missing values imputed.
    """
    # Select columns with numerical and categorical data
    numeric_cols = X_train.select_dtypes(exclude='object').columns.tolist()
    categorical_cols = X_train.select_dtypes(include='object').columns.tolist()

    # scikit-learn estimators reject input with zero features, so a column
    # group that is empty is skipped instead of being passed to the imputer.
    if numeric_cols:
        numeric_imputer = SimpleImputer(strategy='mean')
        X_train[numeric_cols] = numeric_imputer.fit_transform(X_train[numeric_cols])

    if categorical_cols:
        categorical_imputer = SimpleImputer(strategy='most_frequent')
        X_train[categorical_cols] = categorical_imputer.fit_transform(X_train[categorical_cols])

    return X_train

# Example usage:
#X_train_imputed = impute_data(X_train)
print(X_train.isna().sum().sort_values(ascending=False))

Function: Imputing missing values for X_train and X_test

In [None]:
from sklearn.impute import SimpleImputer
import pandas as pd

def impute_missing_values(X_train_imputed, X_test_imputed):
    """
    Imputes missing values in both numerical and categorical columns of training and testing DataFrames.

    Columns of dtype ``object`` are filled with the most frequent value, all
    other columns with the mean. Each imputer is fitted on the training
    frame only and then applied to the test frame. The frames passed in are
    not modified; the imputation is carried out on copies.

    Parameters:
    X_train_imputed (DataFrame): The training DataFrame containing missing values.
    X_test_imputed (DataFrame): The testing DataFrame containing missing values.

    Returns:
    X_train_imputed (DataFrame): Copy of the training DataFrame with missing values imputed.
    X_test_imputed (DataFrame): Copy of the testing DataFrame with missing values imputed.
    """
    # Copy first so the caller's frames are never mutated as a side effect.
    X_train_imputed = X_train_imputed.copy()
    X_test_imputed = X_test_imputed.copy()

    # Column groups are derived from the training frame and reused for the test frame.
    numeric_cols = X_train_imputed.select_dtypes(exclude='object').columns.tolist()
    categorical_cols = X_train_imputed.select_dtypes(include='object').columns.tolist()

    for cols, strategy in ((numeric_cols, 'mean'), (categorical_cols, 'most_frequent')):
        # scikit-learn estimators reject input with zero features; skip empty groups.
        if not cols:
            continue
        imputer = SimpleImputer(strategy=strategy)
        X_train_imputed[cols] = imputer.fit_transform(X_train_imputed[cols])
        X_test_imputed[cols] = imputer.transform(X_test_imputed[cols])

    return X_train_imputed, X_test_imputed

# Impute copies of both splits and keep the returned, imputed frames.
X_train_imputed, X_test_imputed = impute_missing_values(
    X_train_imputed=X_train.copy(),
    X_test_imputed=X_test.copy(),
)

# Preview the top rows of each imputed split
print("Training set after imputation:")
display(X_train_imputed.iloc[:5])
print("\nTesting set after imputation:")
display(X_test_imputed.iloc[:5])


## 2. Encoding categorical data

Convert categorical data to a numerical form.

Data to convert:  'region', 'province', 'equipped_kitchen', 'state_building', 'epc', 'heating_type'

In [None]:
# Encode the imputed training frame returned by impute_missing_values.
# Select the columns with categorical values
categorical_cols = X_train_imputed.select_dtypes(include=['object']).columns.tolist()
print("columns_to_encode =", categorical_cols)

# Initialize the encoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_array = encoder.fit_transform(X_train_imputed[categorical_cols])

# Convert the encoded array into a DataFrame. The training rows keep the
# shuffled index from train_test_split, so the encoded frame must carry the
# same index; with a fresh 0..n-1 index, concat(axis=1) would align on the
# labels and produce misaligned rows filled with NaN.
encoded_df = pd.DataFrame(
    encoded_array,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=X_train_imputed.index,
)

# Replace the original categorical columns by their one-hot encoded columns
result_df = pd.concat([X_train_imputed.drop(columns=categorical_cols), encoded_df], axis=1)

# info() prints its report itself and returns None, so it is not wrapped in print().
result_df.info()

# Missing values per column of the encoded frame (cell output)
result_df.isna().sum().sort_values(ascending=False)


Function: Encode_data for X_train

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def encode_data(X_train):
    """
    Encodes categorical columns in the input DataFrame using OneHotEncoder.

    Columns of dtype ``object`` are replaced by their one-hot encoded
    columns; all other columns are kept as they are. The input frame is not
    modified and the result keeps the row index of the input.

    Parameters:
    -----------
    X_train : pandas DataFrame
        Input DataFrame containing categorical columns to be encoded.

    Returns:
    --------
    pandas DataFrame
        DataFrame with categorical columns encoded using one-hot encoding.
    """
    # Select the columns with categorical values
    categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

    # Nothing to encode: scikit-learn estimators reject input with zero
    # features, so return an unchanged copy instead of calling the encoder.
    if not categorical_cols:
        return X_train.copy()

    # Initialize the encoder
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_array = encoder.fit_transform(X_train[categorical_cols])

    # Reuse the index of the input: concat(axis=1) aligns on index labels, so
    # a fresh 0..n-1 index would misalign the rows (and introduce NaN) when
    # the input index is not already 0..n-1, e.g. after train_test_split.
    encoded_df = pd.DataFrame(
        encoded_array,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=X_train.index,
    )

    # Remaining original columns followed by the encoded columns
    result_df = pd.concat([X_train.drop(columns=categorical_cols), encoded_df], axis=1)

    return result_df

# Example usage:
# X_train_encoded = encode_data(X_train_imputed)


Function: one_hot_encode for X_train, X_test

In [None]:
from sklearn.preprocessing import OneHotEncoder

def one_hot_encode(X_train, X_test):
    """
    Encodes categorical columns in the input DataFrames using OneHotEncoder.

    The encoder is fitted on the ``object`` columns of the training frame and
    applied to the same columns of the test frame; categories that only occur
    in the test frame are ignored (all-zero encoding). The frames passed in
    are not modified. Both results get a fresh 0..n-1 row index.

    Parameters:
    -----------
    X_train : pandas DataFrame
        Input training DataFrame.
    X_test : pandas DataFrame
        Input test DataFrame.

    Returns:
    --------
    Tuple of pandas DataFrames
        Encoded training and test DataFrames.
    """
    # Select the columns with categorical values (taken from the training frame)
    categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

    # Non-categorical part of each frame. drop() returns a new frame, so the
    # caller's DataFrames are left untouched.
    X_train_rest = X_train.drop(columns=categorical_cols).reset_index(drop=True)
    X_test_rest = X_test.drop(columns=categorical_cols).reset_index(drop=True)

    # Nothing to encode: scikit-learn estimators reject input with zero features.
    if not categorical_cols:
        return X_train_rest, X_test_rest

    # Dense output directly, instead of building a sparse matrix and
    # converting it with .toarray() afterwards.
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    # Fit on training data only, then apply the fitted encoder to the test data
    train_encoded = encoder.fit_transform(X_train[categorical_cols])
    test_encoded = encoder.transform(X_test[categorical_cols])

    encoded_names = encoder.get_feature_names_out(categorical_cols)
    X_train_ohe_df = pd.DataFrame(train_encoded, columns=encoded_names)
    X_test_ohe_df = pd.DataFrame(test_encoded, columns=encoded_names)

    # Both parts carry a 0..n-1 index, so the column-wise concat lines up row by row
    X_train_ohe = pd.concat([X_train_rest, X_train_ohe_df], axis=1)
    X_test_ohe = pd.concat([X_test_rest, X_test_ohe_df], axis=1)

    return X_train_ohe, X_test_ohe

# One-hot encode copies of the imputed training and test frames
X_train_ohe, X_test_ohe = one_hot_encode(
    X_train=X_train_imputed.copy(),
    X_test=X_test_imputed.copy(),
)

# Preview the top rows and the column names of each encoded split
print("Training set after encoding:")
display(X_train_ohe.iloc[:5])
print(list(X_train_ohe.columns))
print("\nTesting set after encoding:")
display(X_test_ohe.iloc[:5])
print(list(X_test_ohe.columns))


## 3. Rescaling numeric features with standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Columns of X_train_ohe with dtype float64 / int64.
# NOTE(review): if the one-hot columns are float64 as well, they are
# standardised together with the original numeric features - confirm that
# this is intended.
numeric_features = X_train_ohe.select_dtypes(include=['float64', 'int64'])

# Fit the scaler on the training set only and transform it
scaled_features = scaler.fit_transform(numeric_features)

# Convert the scaled array back to a DataFrame. The index of the input is
# reused so that the column-wise concat below lines up row by row.
scaled_df = pd.DataFrame(
    scaled_features,
    columns=numeric_features.columns,
    index=numeric_features.index,
)

# Remaining (non-scaled) columns of X_train_ohe followed by the scaled columns
final_df = pd.concat([X_train_ohe.drop(columns=numeric_features.columns), scaled_df], axis=1)

# Transform the test set with the parameters learned from the training set.
# The columns are selected by name (the ones the scaler was fitted on) rather
# than by dtype, so the column set and order always match the fitted scaler.
X_test_scaled = scaler.transform(X_test_ohe[numeric_features.columns])

# Result of this cell:
# - final_df: training DataFrame with the scaled numeric columns
# - X_test_scaled: array holding the scaled numeric columns of the test set;
#   it still has to be combined with any remaining test columns before it is
#   used in a model.

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def scale_numeric_features(X_train_ohe, X_test_ohe):
    """
    Scale the numeric features in the training and test datasets using StandardScaler.

    The columns to scale are the float64 / int64 columns of the training
    frame. The scaler is fitted on the training frame only and applied to the
    same columns (selected by name) of the test frame. The inputs are not
    modified and the results keep the row index of their respective input.

    Parameters:
    - X_train_ohe (DataFrame): DataFrame containing the training data with both numeric and non-numeric features.
    - X_test_ohe (DataFrame): DataFrame containing the test data with both numeric and non-numeric features.

    Returns:
    - X_train_stdv (DataFrame): Non-scaled columns of the training data followed by the scaled numeric columns.
    - X_test_stdv (DataFrame): Non-scaled columns of the test data followed by the scaled numeric columns.
    """
    # Columns to scale, decided on the training frame and reused for the test
    # frame so both are transformed with an identical column set and order.
    numeric_cols = X_train_ohe.select_dtypes(include=['float64', 'int64']).columns

    # Nothing to scale: scikit-learn estimators reject input with zero features.
    if len(numeric_cols) == 0:
        return X_train_ohe.copy(), X_test_ohe.copy()

    # Initialize the StandardScaler
    scaler = StandardScaler()

    # Fit on the training set, then apply the learned parameters to the test set
    scaled_features_train = scaler.fit_transform(X_train_ohe[numeric_cols])
    scaled_features_test = scaler.transform(X_test_ohe[numeric_cols])

    # Rebuild DataFrames with the index of the inputs: concat(axis=1) aligns
    # on index labels, so a fresh 0..n-1 index would misalign the rows
    # whenever the input index is not already 0..n-1.
    scaled_df_train = pd.DataFrame(scaled_features_train, columns=numeric_cols, index=X_train_ohe.index)
    scaled_df_test = pd.DataFrame(scaled_features_test, columns=numeric_cols, index=X_test_ohe.index)

    # Non-scaled columns first, scaled numeric columns after them
    X_train_stdv = pd.concat([X_train_ohe.drop(columns=numeric_cols), scaled_df_train], axis=1)
    X_test_stdv = pd.concat([X_test_ohe.drop(columns=numeric_cols), scaled_df_test], axis=1)

    return X_train_stdv, X_test_stdv

# Standardise the encoded training and test frames
X_train_stdv, X_test_stdv = scale_numeric_features(
    X_train_ohe=X_train_ohe,
    X_test_ohe=X_test_ohe,
)

# Preview the top rows and the column names of each rescaled split
print("Training set after rescaling:")
display(X_train_stdv.iloc[:5])
print(list(X_train_stdv.columns))
print("\nTesting set after rescaling:")
display(X_test_stdv.iloc[:5])
print(list(X_test_stdv.columns))



## Pipeline: Preprocessing data

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

def preprocess_data(X_train, X_test):
    """
    Preprocesses training and test data including imputation, encoding, and scaling.

    Numeric columns (int64 / float64) are mean-imputed and standardised;
    ``object`` columns are imputed with the most frequent value and one-hot
    encoded. All steps are fitted on the training frame only and then applied
    to the test frame. Columns of any other dtype are not part of the output.
    The returned frames have a fresh 0..n-1 row index.

    Parameters:
    -----------
    X_train : pandas DataFrame
        Input training DataFrame.
    X_test : pandas DataFrame
        Input test DataFrame.

    Returns:
    --------
    Tuple of pandas DataFrames
        Preprocessed training and test DataFrames.
    """
    # Separate numerical and categorical columns
    numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X_train.select_dtypes(include=['object']).columns

    # Define preprocessing steps for numerical and categorical data
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # Dense one-hot output: with a sparse encoder the ColumnTransformer may
    # return a SciPy sparse matrix (depending on the overall density), which
    # cannot be turned into a DataFrame with named columns below.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Fit on the training data, then apply the fitted steps to the test data
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Output column names: numeric columns first, then the one-hot columns,
    # matching the order of the transformers above. The encoder is only
    # queried when there were categorical columns to fit it on.
    if len(categorical_cols) > 0:
        onehot = preprocessor.named_transformers_['cat'].named_steps['onehot']
        onehot_names = onehot.get_feature_names_out(categorical_cols).tolist()
    else:
        onehot_names = []
    feature_names = numeric_cols.tolist() + onehot_names

    # Convert the processed data into DataFrames
    X_train_processed = pd.DataFrame(X_train_processed, columns=feature_names)
    X_test_processed = pd.DataFrame(X_test_processed, columns=feature_names)

    return X_train_processed, X_test_processed

# Preprocess training and test data
X_train_processed, X_test_processed = preprocess_data(X_train, X_test)


In [None]:
# Preview the top rows of each split produced by the preprocessing pipeline
print("Training set after processing:")
display(X_train_processed.iloc[:5])
print("\nTesting set after processing:")
display(X_test_processed.iloc[:5])

## RandomForestRegressor

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error
import pandas as pd

**X_train_ohe, X_test_ohe**

In [None]:
# Random forest with 10 trees and a fixed seed
regressor = RandomForestRegressor(random_state=0, n_estimators=10)

# Fit on the one-hot encoded training features; the fitted estimator is the cell output
regressor.fit(X=X_train_ohe, y=y_train)


In [None]:
# Predicted prices for the held-out rows
y_test_pred = regressor.predict(X_test_ohe)

# Score of the fitted regressor on the training and on the test split
training_score = regressor.score(X_train_ohe, y_train)
testing_score = regressor.score(X_test_ohe, y_test)

print("Training R^2 score:", training_score * 100, "%")
print("Testing R^2 score:", testing_score * 100, "%")

# Mean squared error of the test predictions
mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
print("Mean Squared Error:", mse)

**X_train_stdv, X_test_stdv**

In [None]:
# Random forest with 10 trees and a fixed seed
regressor = RandomForestRegressor(random_state=0, n_estimators=10)

# Fit on the standardised training features; the fitted estimator is the cell output
regressor.fit(X=X_train_stdv, y=y_train)


In [None]:
# Predicted prices for the held-out rows
yst_test_pred = regressor.predict(X_test_stdv)

# Score of the fitted regressor on the training and on the test split
training_score = regressor.score(X_train_stdv, y_train)
testing_score = regressor.score(X_test_stdv, y_test)

print("Training R^2 score:", training_score * 100, "%")
print("Testing R^2 score:", testing_score * 100, "%")

# Mean squared error of the test predictions
mse = mean_squared_error(y_true=y_test, y_pred=yst_test_pred)
print("Mean Squared Error:", mse)

**X_train_processed, X_test_processed**

In [None]:
# Random forest with 10 trees and a fixed seed
regressor = RandomForestRegressor(random_state=42, n_estimators=10)

# Fit on the pipeline-preprocessed training features; the fitted estimator is the cell output
regressor.fit(X=X_train_processed, y=y_train)

In [None]:
# Predicted prices for the held-out rows
ypr_test_pred = regressor.predict(X_test_processed)

# Score of the fitted regressor on the training and on the test split
training_score = regressor.score(X_train_processed, y_train)
testing_score = regressor.score(X_test_processed, y_test)

print("Training R^2 score:", training_score * 100, "%")
print("Testing R^2 score:", testing_score * 100, "%")

# Mean squared error of the test predictions
mse = mean_squared_error(y_true=y_test, y_pred=ypr_test_pred)
print("Mean Squared Error:", mse)


**Evaluation of RandomForestRegressor: X_train_processed, X_test_processed**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pandas as pd

In [None]:
# Random forest with 10 trees and a fixed seed (open question: is n_estimators = 10 the right choice?)
rafo = RandomForestRegressor(random_state=42, n_estimators=10)

# Fit on the pipeline-preprocessed training features; the fitted estimator is the cell output
rafo.fit(X=X_train_processed, y=y_train)

In [None]:
# Predict on the test set
ypi_pred = rafo.predict(X_test_processed)
print("Prediction for test set: {}".format(ypi_pred))

# Evaluate the model
mae = metrics.mean_absolute_error(y_test, ypi_pred)
mse = metrics.mean_squared_error(y_test, ypi_pred)
# RMSE is the square root of the MSE computed above. It was previously stored
# under the name `r2`, which wrongly suggested an R^2 score.
rmse = np.sqrt(mse)

print('Mean Absolute Error:', mae)
print('Mean Square Error:', mse)
print('Root Mean Square Error:', rmse)

# Actual value next to the predicted value for every test row (cell output)
reg_model_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': ypi_pred})
reg_model_diff

**Evaluation of RandomForestRegressor: X_train_processed, X_test_processed**

Without setting `n_estimators` (the scikit-learn default of 100 trees is used)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error

# Random forest with the default number of trees and a fixed seed,
# fitted on the pipeline-preprocessed training features
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train_processed, y_train)

# Score of the fitted regressor on each split
training_score = rf_model.score(X_train_processed, y_train)
testing_score = rf_model.score(X_test_processed, y_test)

# Predictions for both splits, reused by every metric below
y_train_pred = rf_model.predict(X_train_processed)
y_test_pred = rf_model.predict(X_test_processed)

# Training-split metrics: R^2, MAE, MSE, RMSE
train_r2 = r2_score(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = root_mean_squared_error(y_train, y_train_pred)

# Test-split metrics: R^2, MAE, MSE, RMSE
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = root_mean_squared_error(y_test, y_test_pred)

# Report, training value first and test value second for every metric
print("Random Forest Regressor Model Evaluation:")
print("Training score: ", round(training_score * 100, 2), "%")
print("Testing score: ", round(testing_score * 100, 2), "%")
print("Training R^2 score:", round(train_r2 * 100, 2), "%")
print("Testing R^2 score:", round(test_r2 * 100, 2), "%")
print("Training MAE:", train_mae)
print("Testing MAE:", test_mae)
print("Training MSE:", train_mse)
print("Testing MSE:", test_mse)
print("Training RMSE:", train_rmse)
print("Testing RMSE:", test_rmse)


1. Mean Absolute Error (MAE): This metric measures the average absolute difference between the predicted values and the actual values. It gives you an idea of how much the predictions deviate from the actual values, on average.

2. Mean Squared Error (MSE): MSE measures the average of the squares of the errors, i.e., the average squared difference between the predicted values and the actual values. It penalizes larger errors more heavily than smaller ones.

3. Root Mean Squared Error (RMSE): RMSE is simply the square root of MSE. It gives you an idea of the average magnitude of the errors in the predicted values. Since it's in the same units as the target variable, it's often easier to interpret than MSE.

4. R-squared (R^2) Score: R^2 measures the proportion of the variance in the target variable that is explained by the model. A value of 1 indicates a perfect fit, and 0 indicates that the model does no better than always predicting the mean of the target variable (a horizontal line). On held-out data R^2 can also be negative, which means the model performs worse than that mean baseline.

So, when you evaluate your Random Forest Regressor using these metrics, you're essentially checking how close its predictions are to the actual values and how well it explains the variance in the target variable. The goal is to have low values for MAE, MSE, and RMSE, and a high value for R^2.