In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the model inputs: the feature matrix and the log-price target.
# Both CSV files use their first column as the row index.
X = pd.read_csv(
    '../data/training/unioned-features-rent.csv',
    index_col=0,
)
y = pd.read_csv(
    '../data/training/unioned-targets-rent.csv',
    index_col=0,
).loc[:, 'logPrice']

In [3]:
# Preview the first rows of the feature matrix
X.head()

Unnamed: 0_level_0,size,rooms,bathrooms,municipality,latitude,longitude,hasLift,hasParkingSpace,propertyType_chalet,propertyType_duplex,propertyType_flat,propertyType_penthouse,propertyType_studio,status_good,status_renew
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
33894210,153.0,5,2,Arroios,38.731805,-9.14286,0,0,0,0,1,0,0,1,0
33513874,102.0,2,2,Estrela,38.710724,-9.171262,1,0,0,0,1,0,0,1,0
33893664,55.0,1,1,Misericórdia,38.709215,-9.145189,0,0,0,0,1,0,0,1,0
33893658,55.0,1,1,Misericórdia,38.71057,-9.145689,0,0,0,0,1,0,0,1,0
33893592,55.0,1,1,Misericórdia,38.708931,-9.144717,0,0,0,0,1,0,0,1,0


In [4]:
# Preview the first rows of the target Series
y.head()

propertyCode
33894210    7.390181
33513874    7.901007
33893664    6.907755
33893658    6.907755
33893592    6.907755
Name: logPrice, dtype: float64

In [5]:
def target_encode(train_features, test_features, train_target, col, smoothing_factor=10):
    """
    Performs median target encoding with smoothing on the training data and applies to the test data.

    Each category of `col` is replaced by a smoothed median of the target:

        (count * category_median + smoothing_factor * global_median) / (count + smoothing_factor)

    where `count` and `category_median` are computed per category on the training
    rows and `global_median` is the median of `train_target`. All statistics come
    from the training data only, so the test data never influences the encoding.
    Values of `col` with no training statistics fall back to `global_median`.

    Arguments:
    - train_features: Training features DataFrame. Its index must be named, because
      the index name is used as the join key to `train_target`.
    - test_features: Test features DataFrame, or None to encode only the training data
    - train_target: Training target Series. It must be named and share the index
      name of `train_features`.
    - col: Categorical column to encode
    - smoothing_factor: Alpha parameter for smoothing

    Returns:
    - (train, test) DataFrames with `col` replaced by `{col}_encoded` when
      `test_features` is given; otherwise only the encoded training DataFrame.
      The inputs are not modified.
    """
    # Merge the features and target; validate raises if the join keys are not unique
    train_data = train_features.merge(train_target, how='inner', on=train_features.index.name, validate='one_to_one')

    # Global median of the target, used both for smoothing and as the fallback value
    global_median = train_target.median()

    # Per-category statistics, shrunk towards the global median
    agg = train_data.groupby(col)[train_target.name].agg(['median', 'count'])
    agg['smoothed_median'] = (agg['count'] * agg['median'] + smoothing_factor * global_median) / (agg['count'] + smoothing_factor)

    def _encode(features):
        # Work on a copy so the caller's frame is left untouched
        encoded = features.copy()
        encoded[f"{col}_encoded"] = encoded[col].map(agg['smoothed_median']).fillna(global_median)
        return encoded.drop(columns=[col])

    train_encoded = _encode(train_features)

    # Only touch the test frame when one was supplied; the original code copied it
    # before the None check, which made the train-only path crash.
    if test_features is None:
        return train_encoded
    return train_encoded, _encode(test_features)

In [6]:
# 5-fold cross-validation of a linear regression, with the municipality
# column target-encoded separately inside every fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
errors, r2_scores = [], []

for train_idx, val_idx in kf.split(X):
    # Carve out this fold's training and validation rows
    X_train = X.iloc[train_idx]
    X_val = X.iloc[val_idx]
    y_train = y.iloc[train_idx]
    y_val = y.iloc[val_idx]

    # Encode municipality from the fold's training target only
    X_train, X_val = target_encode(
        train_features=X_train,
        test_features=X_val,
        train_target=y_train,
        col="municipality",
        smoothing_factor=10,
    )

    # Fit the linear model on the encoded training rows
    model = LinearRegression().fit(X_train, y_train)

    # Score the fold on its held-out rows
    y_pred = model.predict(X_val)
    r2 = r2_score(y_val, y_pred)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2_scores.append(r2)
    errors.append(rmse)

# Summarise performance as the mean over the folds
print(f"Cross-Validation RMSE: {np.mean(errors):.4f}")
print(f"Cross-Validation R^2: {np.mean(r2_scores):.4f}")

Cross-Validation RMSE: 0.2901
Cross-Validation R^2: 0.4090


In [7]:
# Inspect the encoded training features left over from the last cross-validation fold
X_train

Unnamed: 0_level_0,size,rooms,bathrooms,latitude,longitude,hasLift,hasParkingSpace,propertyType_chalet,propertyType_duplex,propertyType_flat,propertyType_penthouse,propertyType_studio,status_good,status_renew,municipality_encoded
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
33894210,153.0,5,2,38.731805,-9.142860,0,0,0,0,1,0,0,1,0,7.469699
33893664,55.0,1,1,38.709215,-9.145189,0,0,0,0,1,0,0,1,0,7.591897
33893658,55.0,1,1,38.710570,-9.145689,0,0,0,0,1,0,0,1,0,7.591897
33893592,55.0,1,1,38.708931,-9.144717,0,0,0,0,1,0,0,1,0,7.591897
33893469,55.0,1,1,38.708404,-9.147071,0,0,0,0,1,0,0,1,0,7.591897
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33897008,120.0,2,3,38.738030,-9.161398,1,0,0,0,1,0,0,1,0,7.586063
33896998,56.0,1,1,38.726483,-9.164154,0,0,0,0,1,0,0,1,0,7.586063
33896858,103.0,2,2,38.773993,-9.097650,1,1,0,0,1,0,0,1,0,7.621665
33896861,70.0,2,1,38.734024,-9.137637,0,0,0,0,1,0,0,1,0,7.469699
