In [172]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Read data/properties.csv into a DataFrame; the path is relative to the
# notebook's working directory.
PROPERTIES_CSV = 'data/properties.csv'
df = pd.read_csv(PROPERTIES_CSV)

In [173]:
# Separate the regression target from the predictors.
# 'id' is dropped together with 'price' so that neither ends up in the feature matrix.
non_feature_columns = ['id', 'price']
y = df['price']
X = df.drop(columns=non_feature_columns)

In [174]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=155,
)

In [175]:
# Median-impute the numeric training columns that contain gaps and report, per
# column, the median used and how many values were filled in.
#
# Only X_train is touched here; X_test is left as-is and is imputed later by the
# SimpleImputer inside the modelling pipeline.
imputer = SimpleImputer(strategy='median')

# X_train was produced by train_test_split from X. Assigning columns into it
# directly can trigger pandas' SettingWithCopyWarning, so take an explicit copy
# first and write into that.
X_train = X_train.copy()

# A median only exists for numbers, so restrict the candidates to numeric
# columns. A text column with gaps would otherwise make the imputer fail.
numeric_columns = X_train.select_dtypes(include='number').columns
columns_with_missing_values = [
    column for column in numeric_columns if X_train[column].isnull().any()
]

# Count the number of missing values per column before imputation
missing_values_before = X_train[columns_with_missing_values].isnull().sum()

if columns_with_missing_values:
    # Fit on, and fill, only the columns that actually have gaps
    X_train[columns_with_missing_values] = imputer.fit_transform(
        X_train[columns_with_missing_values]
    )
    # One fitted median per selected column, in the same order
    median_values = imputer.statistics_
else:
    # Nothing left to impute (e.g. this cell was run a second time on the same
    # kernel). Skip the fit so the imputer is not handed a frame with zero
    # columns, and report an empty summary instead.
    median_values = []

# Count the number of missing values per column after imputation
missing_values_after = X_train[columns_with_missing_values].isnull().sum()

# Create a DataFrame to display median values and number of imputed values per column.
# 'Imputed Values' is a Series indexed by column name, so the summary frame is
# indexed by column name as well.
imputation_summary = pd.DataFrame({
    'Column': columns_with_missing_values,
    'Median': median_values,
    'Imputed Values': missing_values_before - missing_values_after
})

# Display the imputation summary
print("Imputation Summary:")
print(imputation_summary)



Imputation Summary:
                                                        Column       Median  \
latitude                                              latitude    50.902228   
longitude                                            longitude     4.377851   
construction_year                            construction_year  1993.000000   
total_area_sqm                                  total_area_sqm   127.000000   
surface_land_sqm                              surface_land_sqm   362.000000   
nbr_frontages                                    nbr_frontages     3.000000   
terrace_sqm                                        terrace_sqm     1.000000   
garden_sqm                                          garden_sqm     0.000000   
primary_energy_consumption_sqm  primary_energy_consumption_sqm   243.000000   
cadastral_income                              cadastral_income   850.000000   

                                Imputed Values  
latitude                                 11264  
longitude   

In [176]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical. Columns of any other dtype
# end up in neither list.
object_columns = X.select_dtypes(include=['object']).columns
number_columns = X.select_dtypes(include=['int64', 'float64']).columns

categorical_features = object_columns.tolist()
numerical_features = number_columns.tolist()

# Show both lists, categorical first
for feature_group in (categorical_features, numerical_features):
    print(feature_group)

['property_type', 'subproperty_type', 'region', 'province', 'locality', 'equipped_kitchen', 'state_building', 'epc', 'heating_type']
['zip_code', 'latitude', 'longitude', 'construction_year', 'total_area_sqm', 'surface_land_sqm', 'nbr_frontages', 'nbr_bedrooms', 'fl_furnished', 'fl_open_fire', 'fl_terrace', 'terrace_sqm', 'fl_garden', 'garden_sqm', 'fl_swimming_pool', 'fl_floodzone', 'primary_energy_consumption_sqm', 'fl_double_glazing', 'cadastral_income']


In [177]:
# Categorical branch: fill gaps with the constant 'MISSING', then one-hot encode.
# handle_unknown='ignore' keeps transform from raising on categories that were
# not present when the encoder was fitted.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='MISSING')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [178]:
# Numerical branch: fill gaps with the column median, then apply a
# PowerTransformer with its default settings (registered under the step
# name 'scaler').
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', PowerTransformer()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

In [179]:
# Route each column group through its own sub-pipeline: the categorical
# features through 'cat', the numerical features through 'num'.
column_branches = [
    ('cat', categorical_transformer, categorical_features),
    ('num', numerical_transformer, numerical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [180]:
# Define the model: a LinearRegression with default settings, used as the
# final estimator of the pipeline built below.
model = LinearRegression()

In [181]:
# Chain preprocessing and the regressor so that both are fitted and applied
# together through a single estimator.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
pipeline = Pipeline(steps=pipeline_steps)

In [182]:
# Fit the pipeline (preprocessing + model) on the training data.
# Both the preprocessing steps and the regressor are fitted on X_train / y_train
# only; X_test is not involved here. As the last expression of the cell, the
# fitted pipeline is also what the cell displays.
pipeline.fit(X_train, y_train)

In [183]:
# Make predictions on the testing data.
# X_test goes through the preprocessing fitted on the training data before
# reaching the regressor, so no separate transformation is needed here.
y_pred = pipeline.predict(X_test)

In [184]:
# Score the held-out predictions: mean squared error (in squared target units)
# and the coefficient of determination.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report each metric on its own line, MSE first
for label, score in (("Mean Squared Error:", mse), ("R-squared:", r2)):
    print(label, score)

Mean Squared Error: 99409099779.96405
R-squared: 0.4126676729981812
