## Model-training

Libraries

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid", {"axes.facecolor": ".9"})

In [None]:
# Read the csv file
df = pd.read_csv("../data/cleaned_properties.csv")


# Display the head
display(df.head())
df.shape
df.columns


## Filtering the data

In [None]:
# Filter the DataFrame for values APARTMENT and APARTMENT_BLOCK
df_house1 = df[df["property_type"] == "HOUSE"]
df_house2 = df_house1[df_house1['subproperty_type'] != 'APARTMENT_BLOCK']

df_house = df[(df["property_type"] == "HOUSE") & (df['subproperty_type'] != 'APARTMENT_BLOCK')]

df_house.head()
print(df_house.info())
print(df_house.shape)

df_house["subproperty_type"].unique()
print(df_house["locality"].unique())
df_house.isna().sum().sort_values(ascending=False)

## Splitting the data

Creating variables X and y: define the target and the feature.

In [None]:
# Name X and y (specific columns=subset(houses))

X = df_house.drop(columns=['price', 'property_type', 'subproperty_type', 'zip_code', 'locality', 'construction_year', 'cadastral_income']).to_numpy()
y = df_house.price.to_numpy().reshape(-1,1)

In [None]:
# print chape
print("X shape: ", X.shape)
print("y-shape: ", y.shape)

In [None]:
# Split the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

## Rescaling data

Convert categorical data to a numerical form.

Data to convert:  'region', 'province', 'equipped_kitchen', 'state_building', 'epc', 'heating_type'

In [None]:
columns = df_house[['region', 'province', 'equipped_kitchen', 'state_building', 'epc', 'heating_type']]

# See unique values of multiple columns
for column in columns:
    multi_columns = df_house[column].unique()
    print(f"Unique values in column '{column}': {multi_columns}")

class sklearn.preprocessing.OneHotEncoder(*, categories='auto', drop=None, sparse_output=True, dtype=<class 'numpy.float64'>, handle_unknown='error', min_frequency=None, max_categories=None, feature_name_combiner='concat')

## Encoding

Ohe on dataframe

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Assuming df_house contains the DataFrame and columns contains the relevant columns
columns_to_encode = ['region', 'province', 'equipped_kitchen', 'state_building', 'epc', 'heating_type']

# Extracting the columns to be encoded
data_to_encode = df_house[columns_to_encode]

# Creating the OneHotEncoder object
encoder = OneHotEncoder()

# Fitting and transforming the data
encoded_data = encoder.fit_transform(data_to_encode)

# Converting the encoded data to a DataFrame
encoded_df = pd.DataFrame(encoded_data.toarray(), columns=encoder.get_feature_names_out(input_features=columns_to_encode))

# Concatenating the encoded DataFrame with the original DataFrame
df_house_encoded = pd.concat([df_house.drop(columns=columns_to_encode), encoded_df], axis=1)


In [250]:
df_house.shape

(37211, 27)

## Imputing with mean

Features with NaN to imputate with mean
- surface_land_sqm                      36254
- primary_energy_consumption_sqm        26564
- nbr_frontages                         26344
- terrace_sqm                           13140
- total_area_sqm                        7615
- garden_sqm                            2939

In [258]:
import numpy as np
from sklearn.impute import SimpleImputer

# Assuming X_train is your NumPy array and columns_to_impute contains the relevant column names
columns_to_impute = ['surface_land_sqm', 'primary_energy_consumption_sqm', 'nbr_frontages', 'terrace_sqm', 'total_area_sqm', 'garden_sqm']
columns = df_house.drop(columns=['price', 'property_type', 'subproperty_type', 'zip_code', 'locality', 'construction_year', 'cadastral_income']).columns 

# Convert X_train to a DataFrame with appropriate column names
X_train_df = pd.DataFrame(X_train, columns=columns)

# Find the indices of the columns to impute
column_indices = [list(X_train_df.columns).index(col) for col in columns_to_impute]
print(column_indices)

# Create SimpleImputer object
imp_mean = SimpleImputer(strategy='mean')

# Fit the imputer to the selected columns in X_train
imp_mean.fit(X_train[:, column_indices])

# Transform the selected columns in X_train with the fitted imputer
X_train_imputed = X_train.copy()  # Create a copy of X_train
X_train_imputed[:, column_indices] = imp_mean.transform(X_train[:, column_indices])

# X_train_imputed now contains X_train with missing values imputed using the mean strategy for the selected columns

X_train_imputed.shape


[3, 16, 4, 10, 2, 12]


(29768, 20)

Ohe on np.array with imputer

In [269]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Assuming X_train_imputed is your numpy array and columns_to_encode is a list of column names to encode
columns_to_encode = ['region', 'province', 'equipped_kitchen', 'state_building', 'epc', 'heating_type']
columns = df_house.drop(columns=['price', 'property_type', 'subproperty_type', 'zip_code', 'locality', 'construction_year', 'cadastral_income']).columns
df2 = df_house.drop(columns=['price', 'property_type', 'subproperty_type', 'zip_code', 'locality', 'construction_year', 'cadastral_income'])
cols = df2.columns.to_list
print(cols)

# Convert X_train_imputed to a DataFrame with appropriate column names
X_train_df = pd.DataFrame(X_train_imputed, columns=columns)
print(X_train_df.shape)


# Handling NaN values
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')  # You can choose a strategy that fits your data
data_imputed = pd.DataFrame(imputer.fit_transform(X_train_df), columns=X_train_df.columns)
print(data_imputed.shape)

# Creating the OneHotEncoder object
encoder = OneHotEncoder()

# Fitting and transforming the data
encoded_data = encoder.fit_transform(data_imputed)

# Converting the encoded data to a DataFrame
encoded_df = pd.DataFrame(encoded_data.toarray())

# Selecting non-encode columns from the original X_train_imputed array
non_encoded_columns = np.delete(X_train_imputed, np.where(np.isin(X_train_df.columns, columns_to_encode)), axis=1)
print(non_encoded_columns.shape)
print(type(non_encoded_columns))

# Concatenating the non-encoded columns with the encoded array
X_train_encoded = np.concatenate([non_encoded_columns, encoded_array], axis=1)

X_train_encoded.shape

<bound method IndexOpsMixin.tolist of Index(['region', 'province', 'total_area_sqm', 'surface_land_sqm',
       'nbr_frontages', 'nbr_bedrooms', 'equipped_kitchen', 'fl_furnished',
       'fl_open_fire', 'fl_terrace', 'terrace_sqm', 'fl_garden', 'garden_sqm',
       'fl_swimming_pool', 'fl_floodzone', 'state_building',
       'primary_energy_consumption_sqm', 'epc', 'heating_type',
       'fl_double_glazing'],
      dtype='object')>
(29768, 20)
(29768, 20)
(29768, 14)
<class 'numpy.ndarray'>


(29768, 62)

In [None]:
X_train_df.isna().sum().sort_values(ascending=False)

## Relationship between variables

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D

sns.set(style = "darkgrid")

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

x = df_house['total_area_sqm']
y = df_house['primary_energy_consumption_sqm']
z = df_house['price']

ax.view_init(180, 180)
ax.scatter(x, y, z)

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2)
#plt.tight_layout()
axes[0].scatter(df_house['total_area_sqm'], df_house['price'])
axes[1].scatter(df_house['primary_energy_consumption_sqm'],df_house['price'])
plt.show()

Showing correlation coefficients.

In [None]:
df_house.select_dtypes(exclude='object').corr().T

In [None]:
# Make two figures so it is better visualized

plt.figure(figsize=(12,8))

X = df_house.drop(columns=["price"])
y = df["price"]

half = len(X.columns)//2


fig1 = sns.pairplot(df, x_vars=X.columns[half:], y_vars='price')
fig2 = sns.pairplot(df, x_vars=X.columns[:half], y_vars='price')




## Linear Regression model

Load and fit the model

In [249]:
# Import LinearRegression from sklearn
from sklearn.linear_model import LinearRegression

# Create regressor and instantiate LinearRegression class
reg = LinearRegression()
print(type(reg))

# Train the model with X_train and  y_train
reg.fit(X_train_imputed, y_train)

<class 'sklearn.linear_model._base.LinearRegression'>


ValueError: could not convert string to float: 'Flanders'

In [None]:
# Display score of training model
score = reg.score(X_train_encoded, y_train)
print(score*100)

In [None]:
# Use the model on the test dataset
reg.fit(X_test, y_test)

In [None]:
# Display score of test model
score = reg.score(X_test, y_test)
print(score*100)

In [None]:
# Function to flatten 2D lists so it can be used by plotly
def flatten(l):
    return [item for sublist in l for item in sublist]

# Set up and fit the linear regressor
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Flatten the prediction and expected lists
predicted = flatten(lin_reg.predict(X_test))
expected = flatten(y_test.values)

two functions:

train : has as input X train y train output the model already trained

evaluate: Takes (the trainedmodel, X test and y test,) and oututs the score