In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Feature matrix: the first CSV column is used as the row index
features_csv = '../data/idealista/training/unioned-features-rent.csv'
X = pd.read_csv(features_csv, index_col=0)

# Target vector: the 'logPrice' column of the targets file (same index_col setting)
targets_csv = '../data/idealista/training/unioned-targets-rent.csv'
y = pd.read_csv(targets_csv, index_col=0)['logPrice']

In [None]:
# Preview the first five rows of the feature matrix
X.head(n=5)

In [None]:
# Preview the first five values of the target
y.head(n=5)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows. The split is stratified on the municipality column
# and seeded so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=X['municipality'],
)

In [None]:
# Municipality frequencies in the training split, most frequent first.
# The Series is passed as `x=`: recent seaborn releases interpret the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x=X_train['municipality'], order=X_train['municipality'].value_counts().index)

In [None]:
# Municipality frequencies in the test split, most frequent first.
# The Series is passed as `x=`: recent seaborn releases interpret the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x=X_test['municipality'], order=X_test['municipality'].value_counts().index)

In [6]:
def target_encode(train_features, test_features, train_target, col, smoothing_factor=10):
    """
    Performs median target encoding with smoothing on the training data and applies to the test data.

    For every category of `col` the encoded value is
        (count * category_median + smoothing_factor * global_median) / (count + smoothing_factor)
    where all statistics are computed from the training rows only. Categories
    with no training statistics (e.g. unseen in training) are encoded as the
    global training median.

    Arguments:
    - train_features: Training features DataFrame (must contain `col`)
    - test_features: Test features DataFrame, or None to encode the training data only
    - train_target: Training target Series, aligned with train_features through a unique index
    - col: Categorical column to encode
    - smoothing_factor: Alpha parameter for smoothing

    Returns:
    - (train, test) DataFrames in which `col` is replaced by `{col}_encoded`,
      or only the train DataFrame when test_features is None.
      The input DataFrames are not modified.

    Raises:
    - pandas.errors.MergeError: if the index of train_features or train_target
      contains duplicate labels (one-to-one validation fails)
    """
    # Pair each training row with its target by joining on the index itself.
    # Joining on the index (instead of on the index *name*) also works when the
    # index is unnamed; renaming the target avoids clashes with feature columns
    # and supports unnamed Series.
    target_col = "__target__"
    train_data = train_features[[col]].merge(
        train_target.rename(target_col),
        how='inner',
        left_index=True,
        right_index=True,
        validate='one_to_one',
    )

    # Global median of the target, used as the smoothing prior and as the fallback value
    global_median = train_target.median()

    # Per-category median and sample count, shrunk towards the global median
    agg = train_data.groupby(col)[target_col].agg(['median', 'count'])
    agg['smoothed_median'] = (
        agg['count'] * agg['median'] + smoothing_factor * global_median
    ) / (agg['count'] + smoothing_factor)

    encoded_col = f"{col}_encoded"

    def _encode(features):
        # Work on a copy so the caller's DataFrame is left untouched
        encoded = features.copy()
        encoded[encoded_col] = encoded[col].map(agg['smoothed_median']).fillna(global_median)
        return encoded.drop(columns=[col])

    train_encoded = _encode(train_features)

    # Only touch test_features after the None check, so train-only encoding works
    if test_features is None:
        return train_encoded
    return train_encoded, _encode(test_features)

In [None]:
# 5-fold shuffled cross-validation of a linear regression model
kf = KFold(n_splits=5, shuffle=True, random_state=42)
errors, r2_scores = [], []

for fold_train_pos, fold_val_pos in kf.split(X):
    # Positional row selection for this fold
    X_train = X.iloc[fold_train_pos]
    X_val = X.iloc[fold_val_pos]
    y_train = y.iloc[fold_train_pos]
    y_val = y.iloc[fold_val_pos]

    # Encoding statistics are computed from this fold's training rows only
    # and then mapped onto the validation rows
    X_train_encoded, X_val_encoded = target_encode(
        train_features=X_train,
        test_features=X_val,
        train_target=y_train,
        col="municipality",
        smoothing_factor=10,
    )

    # Fit on the encoded training rows, predict the encoded validation rows
    lin_model = LinearRegression()
    lin_model.fit(X_train_encoded, y_train)
    y_pred = lin_model.predict(X_val_encoded)

    # Per-fold RMSE and R^2
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    errors.append(rmse)
    r2 = r2_score(y_val, y_pred)
    r2_scores.append(r2)

# Averages over the folds
print(f"Cross-Validation RMSE: {np.mean(errors):.4f}")
print(f"Cross-Validation R^2: {np.mean(r2_scores):.4f}")

In [7]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Prepare for K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
errors = []
r2_scores = []

for train_idx, val_idx in kf.split(X):
    # Split data into training and validation sets
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Apply target encoding to the training and validation sets
    # (encoding statistics come from this fold's training rows only)
    X_train_encoded, X_val_encoded = target_encode(
        train_features=X_train,
        test_features=X_val,
        train_target=y_train,
        col="municipality",
        smoothing_factor=10,
    )

    # Train a random forest model; the seed is fixed so that re-running the
    # notebook reproduces the same forest and therefore the same scores
    rf_model = RandomForestRegressor(random_state=42)
    rf_model.fit(X_train_encoded, y_train)

    # Evaluate the model on the validation set
    y_pred = rf_model.predict(X_val_encoded)

    # Calculate the RMSE and R^2
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2 = r2_score(y_val, y_pred)
    errors.append(rmse)
    r2_scores.append(r2)

# Report Cross-Validation Performance (mean over the folds)
print(f"Cross-Validation RMSE: {np.mean(errors):.4f}")
print(f"Cross-Validation R^2: {np.mean(r2_scores):.4f}")

In [16]:
def _plot_actual_vs_predicted(ax, y_true, y_pred, var_range, color, label):
    """Draw one actual-vs-predicted scatter panel, annotated with MSE and R², on `ax`."""
    low, high = var_range

    ax.scatter(y_true, y_pred, color=color, alpha=0.5, label=label)
    # Dashed diagonal: where a perfect prediction would fall
    ax.plot([low, high], [low, high], 'k--', lw=2)

    # Same limits on both axes, padded by 0.5 around the requested range
    ax.set_xlabel('Actual price')
    ax.set_xlim(low - 0.5, high + 0.5)
    ax.set_ylabel('Predicted price')
    ax.set_ylim(low - 0.5, high + 0.5)

    # Metrics are written in the upper-left area of the panel
    ax.text(low, high - 0.5, "MSE = {:.3f}".format(mean_squared_error(y_true, y_pred)))
    ax.text(low, high - 1, "R² = {:.3f}".format(r2_score(y_true, y_pred)))
    ax.legend(loc="lower right")


def plot_performance(model, X_train, y_train, X_test, y_test, var_range=(6.3, 8.5)):
    """
    Plot actual vs. predicted target values for the train and test data side by side.

    Arguments:
    - model: Fitted estimator exposing `predict`
    - X_train, y_train: Features and true target values of the training data
    - X_test, y_test: Features and true target values of the test data
    - var_range: (low, high) range of the dependent variable; it sets the
      reference diagonal, the axis limits (padded by 0.5) and the position of
      the metric annotations. The default matches the previously hard-coded range.

    Returns:
    - None; the figure is displayed with `plt.show()`
    """
    # Predict on training and test data
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # One row, two panels: train on the left, test on the right
    fig, (train_ax, test_ax) = plt.subplots(1, 2, figsize=(12, 4))
    _plot_actual_vs_predicted(train_ax, y_train, y_train_pred, var_range, 'royalblue', 'Train data')
    _plot_actual_vs_predicted(test_ax, y_test, y_test_pred, var_range, 'lightskyblue', 'Test data')

    plt.show()

In [None]:
# Linear model: fit on the encoded training rows vs. the encoded validation rows
plot_performance(
    model=lin_model,
    X_train=X_train_encoded,
    y_train=y_train,
    X_test=X_val_encoded,
    y_test=y_val,
)

In [None]:
# Random forest: fit on the encoded training rows vs. the encoded validation rows
plot_performance(
    model=rf_model,
    X_train=X_train_encoded,
    y_train=y_train,
    X_test=X_val_encoded,
    y_test=y_val,
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define the model and parameter grid (3 * 3 * 3 * 3 = 81 candidates)
rf = RandomForestRegressor(random_state=42)
param_grid = {
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'n_estimators': [50, 100, 200],
}

# Perform grid search: 81 candidates x 3 folds = 243 fits, so they are run in
# parallel on all available cores (n_jobs=-1). The estimator's random_state is
# fixed, so the candidate scores do not depend on the degree of parallelism.
# NOTE(review): X_train_encoded was target-encoded with all of y_train, so the
# inner CV folds here see encodings derived from their own validation targets —
# confirm whether the encoding should move inside the search (e.g. a Pipeline).
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_encoded, y_train)

# Get the best model
best_rf = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)
# best_score_ is a negated MSE; convert it to the RMSE scale used by the
# cross-validation cells
print(f"Best CV RMSE: {np.sqrt(-grid_search.best_score_):.4f}")

In [None]:
# Tuned random forest: encoded training rows vs. encoded validation rows
plot_performance(
    model=best_rf,
    X_train=X_train_encoded,
    y_train=y_train,
    X_test=X_val_encoded,
    y_test=y_val,
)