## Libraries

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the csv file
df = pd.read_csv("../data/cleaned_properties.csv")

# Display the head
display(df.head())
df.shape
df.columns

# Filter the DataFrame for values APARTMENT and APARTMENT_BLOCK
df_house1 = df[df["property_type"] == "HOUSE"]
df_house2 = df_house1[df_house1['subproperty_type'] != 'APARTMENT_BLOCK']

df_house = df[(df["property_type"] == "HOUSE") & (df['subproperty_type'] != 'APARTMENT_BLOCK')]

df_house.head()
print(df_house.info())
print(df_house.shape)

df_house["subproperty_type"].unique()
print(df_house["locality"].unique())
df_house.isna().sum().sort_values(ascending=False)

## Splitting the data

Creating variables X and y: define the target and the feature.

In [None]:
# Name X and y (specific columns=subset(houses))
X = df_house.drop(columns=['price', 'subproperty_type', 'property_type', 'zip_code', 'locality', 'construction_year', 'cadastral_income'])
y = df_house['price']

# Print shape
print("X shape: ", X.shape)
print("y-shape: ", y.shape)

# Split the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

X_train.info()

## Rescaling data

Convert categorical data to a numerical form.

Data to convert:  'region', 'province', 'equipped_kitchen', 'state_building', 'epc', 'heating_type'

In [17]:
# Select the columns with categorical values
object_cols = X_train.select_dtypes(include='object').columns
columns_to_encode = object_cols.tolist()
print(columns_to_encode)

# Initiationg the encoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

for col in object_cols:
    data = X_train[['col']]
    dfx = X_train
    print(type(data))
    print(type(X_train))

# Transform the data
encoded_data = encoder.fit_transform(data)
print(encoded_data)
print(type(encoded_data))



['region', 'province', 'equipped_kitchen', 'state_building', 'epc', 'heating_type']


KeyError: "None of [Index(['col'], dtype='object')] are in the [columns]"

In [None]:
for col in X_train.columns_to_encode:
    data = X_train[['col']]
    dfx = X_train
    print(type(data))
    print(type(X_train))

data = X_train[['province']]
dfx = X_train
print(type(data))
print(type(X_train))

# Transform the data
encoded_data = encoder.fit_transform(data)
print(encoded_data)
print(type(encoded_data))

# Handle NaN values using SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')  # Fill NaN values with most frequent value
encoded_data_imputed = imputer.fit_transform(encoded_data)

# Convert the encoded data to a dataframe
encoded_df = pd.DataFrame(encoded_data_imputed, columns=encoder.get_feature_names_out(['province']))

# Concatenate the encoded dataframe with the original dataframe
df_encoded = pd.concat([dfx, encoded_df], axis=1)
df_encoded.drop('province', axis=1, inplace=True) 

print(df_encoded.info())

## Imputing with mean

In [None]:
# Drop columns with dtype=object
object_cols = df_encoded.select_dtypes(include='object').columns
df_encoded.drop(object_cols, axis=1, inplace=True)
df_encoded.isna().sum().sort_values(ascending=False)

# Instantiate SimpleImputer with strategy='mean' to impute NaN values with the mean
imputer = SimpleImputer(strategy='median')

# Fit and transform the data using SimpleImputer
X_train['total_area_sqm'] = imputer.fit_transform(X_train[['total_area_sqm']])

