## To-Do

* Entrenar con menos variables, ver la importancia de las variables en RL y RF.
* Entrenar por municipios y/o barrios.

In [1]:
# imports

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import numpy as np
import pandas
from sklearn.metrics import mean_absolute_error

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
folder = '/content/drive/MyDrive/MaestriaDataScience/Tesis/final_dataset_2025-09-08.xlsx'

In [4]:
import pandas as pd

data = pd.read_excel(folder)

In [5]:
data.head()

Unnamed: 0,url,precio,Baños,Superficie,Superficie Construida,Superficie Terreno,Garage,Dormitorio,barrio,municipio,precio_por_m2_construido,precio_por_m2_terreno,precio_por_m2
0,https://www.casasymas.com.uy/propiedad/183266-...,850000,3,429.0,256.0,429.0,4,4,carrasco,E,3320.3125,1981.351981,1981.351981
1,https://www.casasymas.com.uy/propiedad/940-cas...,380000,3,130.0,130.0,150.0,1,3,carrasco,E,2923.076923,2533.333333,2923.076923
2,https://www.casasymas.com.uy/propiedad/159162-...,119000,1,380.0,62.0,380.0,0,2,tres-cruces,B,1919.354839,313.157895,313.157895
3,https://www.casasymas.com.uy/propiedad/133140-...,1780000,4,263.0,263.0,1316.0,3,3,san-nicolas,E,6768.060837,1352.583587,6768.060837
4,https://www.casasymas.com.uy/propiedad/205399-...,370000,3,497.0,195.0,497.0,1,3,buceo,CH,1897.435897,744.466801,744.466801


## Funciones

Para automatizar entrenamiento de modelos y ahorrar tiempo. Luego invocaremos las funciones para entrenar.

In [6]:
def split_data_log(df, features: list, target: list):
  X = features
  y = np.log(target)
  X = X.fillna(0)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  return X_train, X_test, y_train, y_test

In [None]:
def split_data_log_one_hot(df, target: str, variables_numericas: list, variables_categoricas: list = None,
                           test_size: float = 0.2, random_state: int = 42):
    # Target en log
    y = np.log(df[target])

    # Si hay categóricas
    if variables_categoricas and len(variables_categoricas) > 0:
        X = df[variables_numericas + variables_categoricas].fillna(0)

        # OneHotEncoding
        encoder = OneHotEncoder(sparse_output=False, drop="first", handle_unknown="ignore")
        encoded = encoder.fit_transform(X[variables_categoricas])
        encoded_df = pd.DataFrame(
            encoded,
            columns=encoder.get_feature_names_out(variables_categoricas),
            index=X.index
        )

        # Unimos numéricas + categóricas codificadas
        X_encoded = pd.concat([X[variables_numericas], encoded_df], axis=1)

    else:
        # Solo numéricas
        X_encoded = df[variables_numericas].fillna(0)
        encoder = None

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_encoded, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test, encoder

In [None]:
def linear_regression_model(X_train, y_train):
  model_lr = LinearRegression()
  model_lr.fit(X_train, y_train)
  return model_lr

In [None]:
def lr_grid_search(X_train, y_train):

    # Definir el modelo base
    lr = LinearRegression()

    # Grilla de hiperparámetros
    param_grid = {
        'fit_intercept': [True, False],
        # 'normalize': [True, False]  # solo si usás sklearn < 1.0
    }

    # Configurar GridSearch
    grid_search = GridSearchCV(
        estimator=lr,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_absolute_error',
        n_jobs=-1
    )

    # Entrenar
    grid_search.fit(X_train, y_train)

    # Retornar el mejor modelo
    print("Mejores parámetros:", grid_search.best_params_)
    print("Mejor score (CV):", grid_search.best_score_)

    return grid_search.best_estimator_


In [None]:
def evaluate_model(model, X_test, y_test):
    y_test_pred_log = model.predict(X_test)

    y_test_pred_real = np.exp(y_test_pred_log)
    y_test_real = np.exp(y_test)

    mse_real = mean_squared_error(y_test_real, y_test_pred_real)
    r2_real = r2_score(y_test_real, y_test_pred_real)
    mae_real = mean_absolute_error(y_test_real, y_test_pred_real)
    medae_real = np.median(np.abs(y_test_real - y_test_pred_real))

    return mse_real, r2_real, mae_real, medae_real

## Train Test Split // Preprocessing de los datos

Todos los datos

In [None]:
X = data[["Baños", "Dormitorio", "Superficie", "Superficie Construida",
          "Superficie Terreno", "Garage", "barrio", "municipio"]]
y = np.log(data["precio"])

In [None]:
X = X.fillna(0)

In [None]:
variables_categoricas = ['barrio', 'municipio']
variables_numericas = ['Baños', 'Dormitorio', 'Superficie', 'Superficie Construida', 'Superficie Terreno', 'Garage']

In [None]:
# aplicamos one hot encoding a variables categoricas (barrio, municipio)

encoder = OneHotEncoder(sparse_output=False, drop="first")

encoded = encoder.fit_transform(X[variables_categoricas])

In [None]:
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(variables_categoricas))

In [None]:
encoded_df.head()

Unnamed: 0,barrio_aires-puros,barrio_atahualpa,barrio_barrio-sur,barrio_bella-vista,barrio_belvedere,barrio_bolivar,barrio_brazo-oriental,barrio_buceo,barrio_capurro,barrio_carrasco,...,barrio_villa-espanola,barrio_villa-garcia,barrio_villa-munoz,municipio_B,municipio_C,municipio_CH,municipio_D,municipio_E,municipio_F,municipio_G
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
X_encoded = pd.concat([X.drop(columns=variables_categoricas), encoded_df], axis=1)

In [None]:
X_encoded.head()

Unnamed: 0,Baños,Dormitorio,Superficie,Superficie Construida,Superficie Terreno,Garage,barrio_aires-puros,barrio_atahualpa,barrio_barrio-sur,barrio_bella-vista,...,barrio_villa-espanola,barrio_villa-garcia,barrio_villa-munoz,municipio_B,municipio_C,municipio_CH,municipio_D,municipio_E,municipio_F,municipio_G
0,3,4,429,256,429,4,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,3,3,130,130,150,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,2,380,62,380,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,4,3,263,263,1316,3,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,3,3,497,195,497,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

## REGRESIÓN LINEAL

In [None]:
model = LinearRegression()

In [None]:
model.fit(X_train, y_train)

In [None]:
y_test_pred_log = model.predict(X_test)

In [None]:
y_test_pred_real = np.exp(y_test_pred_log)
y_test_real = np.exp(y_test)

In [None]:
mse_real = mean_squared_error(y_test_real, y_test_pred_real)
r2_real = r2_score(y_test_real, y_test_pred_real)

print(f"Mean Squared Error: {mse_real}")
print(f"R^2 Score: {r2_real}")

Mean Squared Error: 36088369900.03688
R^2 Score: 0.7058368163580993


In [None]:
rmse_real = np.sqrt(mse_real)
print(f"Root Mean Squared Error (RMSE): {rmse_real}")

Root Mean Squared Error (RMSE): 189969.39200838876


In [None]:
from sklearn.metrics import mean_absolute_error

mae_real = mean_absolute_error(y_test_real, y_test_pred_real)
print(f"Mean Absolute Error (MAE): {mae_real}")


Mean Absolute Error (MAE): 104814.48363384065


## ARBOLES DE DECISIÓN

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV


In [None]:
tree_model = DecisionTreeRegressor(random_state=42)

In [None]:
param_grid = {
    'criterion': ['squared_error', 'absolute_error'],   # enfoquémonos en MSE y MAE
    'max_depth': [3, 5, 7, 10],                        # limitar profundidad
    'min_samples_split': [10, 20, 50],                 # forzar nodos con más datos
    'min_samples_leaf': [5, 10, 20, 50],               # hojas con más ejemplos
    'max_features': ['sqrt', 'log2', None]             # probar restricciones en features
}

In [None]:
grid_search = GridSearchCV(
    estimator=tree_model,
    param_grid=param_grid,
    cv=5,                   # validación cruzada de 5 folds
    n_jobs=-1,              # usa todos los núcleos
    scoring='neg_mean_absolute_error'  # métrica de evaluación
)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
best_model = grid_search.best_estimator_

In [None]:
y_test_pred_arboles = best_model.predict(X_test)

In [None]:
mse = mean_squared_error(y_test, y_test_pred_arboles)
r2 = r2_score(y_test, y_test_pred_arboles)
mae = mean_absolute_error(y_test, y_test_pred_arboles)

In [None]:
print(f"Mean Squared Error (Real): {mse}")
print(f"R^2 Score (Real): {r2}")
print(f"Mean Absolute Error (Real): {mae}")

Mean Squared Error (Real): 0.15651115466915475
R^2 Score (Real): 0.7321765078070769
Mean Absolute Error (Real): 0.2994036879627613


## XGBOOST

In [None]:
!pip install xgboost



In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV


In [None]:
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

In [None]:
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

In [None]:
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid,
                           scoring='neg_mean_squared_error', cv=3, verbose=1)

In [None]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [None]:
best_model = grid_search.best_estimator_

In [None]:
y_test_pred_log_xgboost = best_model.predict(X_test)


In [None]:
y_test_pred_real_xgboost = np.exp(y_test_pred_log_xgboost)
y_test_real_xgboost = np.exp(y_test)

In [None]:
mse_real = mean_squared_error(y_test_real_xgboost, y_test_pred_real_xgboost)
r2_real = r2_score(y_test_real_xgboost, y_test_pred_real_xgboost)
mae_real = mean_absolute_error(y_test_real_xgboost, y_test_pred_real_xgboost)

In [None]:
print(f"Mean Squared Error (Real): {mse_real}")
print(f"R^2 Score (Real): {r2_real}")
print(f"Mean Absolute Error (Real): {mae_real}")

Mean Squared Error (Real): 39121200353.3524
R^2 Score (Real): 0.6811156370955109
Mean Absolute Error (Real): 102212.10396773728


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

In [None]:
param_grid = {
    'n_estimators': [200, 400, 600],        # número de árboles
    'max_depth': [None, 10, 20],            # profundidad máxima
    'min_samples_split': [2, 10, 20],       # min de muestras para dividir nodo
    'min_samples_leaf': [1, 5, 10],         # min de muestras por hoja
    'max_features': ['sqrt', 'log2', 0.8]   # nº de features consideradas por split
}

In [None]:
# Configurar GridSearchCV
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,                        # 5-fold cross validation
    n_jobs=-1,
    scoring='neg_mean_absolute_error',  # MAE como métrica
    verbose=2
)

In [None]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


In [None]:
best_model = grid_search.best_estimator_
print("Mejores parámetros:", grid_search.best_params_)

Mejores parámetros: {'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 600}


In [None]:
y_pred = best_model.predict(X_test)

In [None]:
# Predicciones en log
y_pred_log_rf = best_model.predict(X_test)

# Volver a escala real (USD)
y_pred_rf = np.exp(y_pred_log_rf)
y_test_real = np.exp(y_test)

# Métricas en escala real
mae_rf = mean_absolute_error(y_test_real, y_pred_rf)
# rmse_rf = mean_squared_error(y_test_real, y_pred_rf, squared=False)
r2_rf = r2_score(y_test_real, y_pred_rf)

print(f"MAE RF: {mae_rf}")
# print(f"RMSE RF: {rmse_rf}")
print(f"R² RF: {r2_rf}")

MAE RF: 93136.71296232638
R² RF: 0.7142275539761331


## Target y Features

In [None]:
features_simple = ["Baños", "Dormitorio", "Superficie", "Superficie Construida", "Superficie Terreno", "Garage"]

features = ["Baños", "Dormitorio", "Superficie", "Superficie Construida", "Superficie Terreno", "Garage", "precio_por_m2_construido", "precio_por_m2_terreno", "precio_por_m2"]
target = "precio"

## Train Test Splits

In [None]:
X_train_all_municipios_simple, X_test_all_municipios_simple, y_train_all_municipios_simple, y_test_all_municipios_simple, encoder_all_municipios_simple = split_data_log_one_hot(
    data,
    target=target,
    variables_numericas=features_simple,
    variables_categoricas=[]
)

In [None]:
data_ch = data[data["municipio"] == "CH"].copy()

X_train_ch_simple, X_test_ch_simple, y_train_ch_simple, y_test_ch_simple, encoder_ch_simple = split_data_log_one_hot(
    data_ch,
    target=target,
    variables_numericas=features_simple,
    variables_categoricas=[]
)

In [None]:
data_e = data[data["municipio"] == "E"].copy()

X_train_e_simple, X_test_e_simple, y_train_e_simple, y_test_e_simple, encoder_e_simple = split_data_log_one_hot(
    data_e,
    target=target,
    variables_numericas=features_simple,
    variables_categoricas=[]
)

In [None]:
X_train_all_municipios_completo, X_test_all_municipios_completo, y_train_all_municipios_completo, y_test_all_municipios_completo, encoder_all_municipios_completo = split_data_log_one_hot(
    data,
    target=target,
    variables_numericas=features,
    variables_categoricas=[]
)

In [None]:
data_ch = data[data["municipio"] == "CH"].copy()

X_train_ch_completo, X_test_ch_completo, y_train_ch_completo, y_test_ch_completo, encoder_ch_completo= split_data_log_one_hot(
    data_ch,
    target=target,
    variables_numericas=features,
    variables_categoricas=[]
)

In [None]:
data_e = data[data["municipio"] == "E"].copy()

X_train_e_completo, X_test_e_completo, y_train_e_completo, y_test_e_completo, encoder_e_completo = split_data_log_one_hot(
    data_e,
    target=target,
    variables_numericas=features,
    variables_categoricas=[]
)

## Linear Regression all_municipios SIMPLE





In [None]:
# Entrenar modelo de regresión lineal
model_lr_simple = linear_regression_model(X_train_all_municipios_simple, y_train_all_municipios_simple)

print("Modelo entrenado:", model_lr_simple)

Modelo entrenado: LinearRegression()


In [None]:
# 4) Evaluar modelo en escala real (USD)
mse_lr_simple, r2_lr_simple, mae_lr_simple = evaluate_model(model_lr_simple, X_train_all_municipios_simple, y_train_all_municipios_simple)

print(f"Mean Squared Error (Real): {mse_lr_simple}")
print(f"R² Score (Real): {r2_lr_simple}")
print(f"Mean Absolute Error (Real): {mae_lr_simple}")

Mean Squared Error (Real): 50513899007.09271
R² Score (Real): 0.48824826606574256
Mean Absolute Error (Real): 139391.80690684664


## Linear Regression all_municipios features completas

In [None]:
# 3) Entrenar modelo de regresión lineal
model_lr = linear_regression_model(X_train_all_municipios_completo, y_train_all_municipios_completo)

print("Modelo entrenado:", model_lr)

Modelo entrenado: LinearRegression()


In [None]:
# 4) Evaluar modelo en escala real (USD)
mse_lr, r2_lr, mae_lr, medae_lr = evaluate_model(model_lr, X_train_all_municipios_completo, y_train_all_municipios_completo)

print(f"Mean Squared Error (Real): {mse_lr}")
print(f"R² Score (Real): {r2_lr}")
print(f"Mean Absolute Error (Real): {mae_lr}")
print(f"Median Absolute Error (Real): {medae_lr}")


Mean Squared Error (Real): 51177560283.292656
R² Score (Real): 0.48152477380883607
Mean Absolute Error (Real): 117055.24449635451
Median Absolute Error (Real): 61037.38525025919


## Random forest all_municipios SIMPLE

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
def train_random_forest(X_train, y_train):
    rf = RandomForestRegressor(random_state=42, n_jobs=-1)
    rf.fit(X_train, y_train)
    return rf

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

def random_forest_grid_search(X_train, y_train, cv: int = 5, n_jobs: int = -1, verbose: int = 2):

    rf = RandomForestRegressor(random_state=42, n_jobs=-1)

    param_grid = {
        'n_estimators': [200, 400, 600],        # número de árboles
        'max_depth': [None, 10, 20],            # profundidad máxima
        'min_samples_split': [2, 10, 20],       # min de muestras para dividir nodo
        'min_samples_leaf': [1, 5, 10],         # min de muestras por hoja
        'max_features': ['sqrt', 'log2', 0.8]   # nº de features consideradas por split
    }

    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=cv,
        n_jobs=n_jobs,
        scoring='neg_mean_absolute_error',
        verbose=verbose
    )

    grid_search.fit(X_train, y_train)

    print("Mejores parámetros:", grid_search.best_params_)
    print("Mejor score (CV MAE log):", grid_search.best_score_)

    return grid_search.best_estimator_

In [None]:
model_rf_simple = train_random_forest(X_train_all_municipios_simple, y_train_all_municipios_simple)

print("Modelo entrenado:", model_rf_simple)

Modelo entrenado: RandomForestRegressor(n_jobs=-1, random_state=42)


In [None]:
mse_rf_simple, r2_rf_simple, mae_rf_simple = evaluate_model(model_rf_simple, X_test_all_municipios_simple, y_test_all_municipios_simple)

print(f"Mean Squared Error (Real): {mse_rf_simple}")
print(f"R² Score (Real): {r2_rf_simple}")
print(f"Mean Absolute Error (Real): {mae_rf_simple}")

Mean Squared Error (Real): 62201298533.44668
R² Score (Real): 0.49298535639205276
Mean Absolute Error (Real): 139621.4077868757


## Test Random Forest Completo all_municipios

In [None]:
model_rf = train_random_forest(X_train_all_municipios_completo, y_train_all_municipios_completo)

print("Modelo entrenado:", model_rf)

Modelo entrenado: RandomForestRegressor(n_jobs=-1, random_state=42)


In [None]:
mse_rf, r2_rf, mae_rf = evaluate_model(model_rf, X_test_all_municipios_completo, y_test_all_municipios_completo)

print(f"Mean Squared Error (Real): {mse_rf}")
print(f"R² Score (Real): {r2_rf}")
print(f"Mean Absolute Error (Real): {mae_rf}")

Mean Squared Error (Real): 12896809431.429754
R² Score (Real): 0.894875647426558
Mean Absolute Error (Real): 36929.37189468203


## Random Forest GridSearchCV all_municipios SIMPLE

In [None]:
model_rf_gridsearch_simple = random_forest_grid_search(X_train_all_municipios_simple, y_train_all_municipios_simple)

print("Modelo entrenado:", model_rf_gridsearch_simple)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Mejores parámetros: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Mejor score (CV MAE log): -0.3546008669243099
Modelo entrenado: RandomForestRegressor(max_depth=10, max_features='log2', n_estimators=200,
                      n_jobs=-1, random_state=42)


In [None]:
mse_rf_gridsearch_simple, r2_rf_gridsearch_simple, mae_rf_gridsearch_simple = evaluate_model(model_rf_gridsearch_simple, X_test_all_municipios_simple, y_test_all_municipios_simple)

print(f"Mean Squared Error (Real): {mse_rf_gridsearch_simple}")
print(f"R² Score (Real): {r2_rf_gridsearch_simple}")
print(f"Mean Absolute Error (Real): {mae_rf_gridsearch_simple}")

Mean Squared Error (Real): 60366428544.98385
R² Score (Real): 0.5079417315032115
Mean Absolute Error (Real): 135315.74465744686


## RandomForest GridSearchCV all_municipios Completo

In [None]:
model_rf_gridsearch = random_forest_grid_search(X_train_all_municipios_completo, y_train_all_municipios_completo)

print("Modelo entrenado:", model_rf_gridsearch)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Mejores parámetros: {'max_depth': None, 'max_features': 0.8, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 600}
Mejor score (CV MAE log): -0.1218752698047892
Modelo entrenado: RandomForestRegressor(max_features=0.8, n_estimators=600, n_jobs=-1,
                      random_state=42)


In [None]:
mse_rf_gridsearch, r2_rf_gridsearch, mae_rf_gridsearch = evaluate_model(model_rf_gridsearch, X_test_all_municipios_completo, y_test_all_municipios_completo)

print(f"Mean Squared Error (Real): {mse_rf_gridsearch}")
print(f"R² Score (Real): {r2_rf_gridsearch}")
print(f"Mean Absolute Error (Real): {mae_rf_gridsearch}")

Mean Squared Error (Real): 14737116209.180815
R² Score (Real): 0.8798749560093359
Mean Absolute Error (Real): 41100.742382541124


## Analisis de resultados

In [None]:
def evaluation_table(model, X_test, y_test, original_df, extra_cols: list = None):

    # Predicciones
    y_pred_log = model.predict(X_test)
    y_pred_real = np.exp(y_pred_log)
    y_test_real = np.exp(y_test)

    # Crear DataFrame con errores
    results = pd.DataFrame({
        "Precio Real": y_test_real,
        "Precio Predicho": y_pred_real,
        "Error Absoluto": np.abs(y_test_real - y_pred_real),
        "Error %": np.abs(y_test_real - y_pred_real) / y_test_real * 100
    }, index=y_test.index)

    # Agregar columnas adicionales si existen en original_df
    if extra_cols:
        for col in extra_cols:
            if col in original_df.columns:
                results[col] = original_df.loc[y_test.index, col].values

    results = results.sort_values(by="Error %", ascending=False)

    return results.reset_index(drop=True)

In [None]:
extra_cols = ["url", "Superficie", "Baños", "Dormitorio", "barrio"]

data = data.reset_index(drop=True)
X_test_all_municipios_completo = X_test_all_municipios_completo.reset_index(drop=True)
y_test_all_municipios_completo = y_test_all_municipios_completo.reset_index(drop=True)

results = evaluation_table(model_rf_gridsearch, X_test_all_municipios_completo, y_test_all_municipios_completo, data, extra_cols)

results.head(20)

Unnamed: 0,Precio Real,Precio Predicho,Error Absoluto,Error %,url,Superficie,Baños,Dormitorio,barrio
0,55000,129752,74752,136,https://www.casasymas.com.uy/propiedad/209205-...,87.0,1,2,union
1,2000000,654866,1345134,67,https://www.casasymas.com.uy/propiedad/132760-...,250.0,3,3,carrasco
2,170000,263074,93074,55,https://www.casasymas.com.uy/propiedad/205375-...,389.0,4,4,carrasco
3,305000,146791,158209,52,https://www.casasymas.com.uy/propiedad/209282-...,100.0,1,2,cerrito
4,120000,181828,61828,52,https://www.casasymas.com.uy/propiedad/172471-...,156.0,3,3,carrasco-norte
5,157000,227824,70824,45,https://www.casasymas.com.uy/propiedad/159142-...,95.0,1,3,parque-batlle
6,70000,97234,27234,39,https://www.casasymas.com.uy/propiedad/160318-...,82.0,1,3,jacinto-vera
7,420000,258710,161290,38,https://www.casasymas.com.uy/propiedad/179321-...,,4,4,san-nicolas
8,850000,558270,291730,34,https://www.casasymas.com.uy/propiedad/159157-...,100.0,1,3,aguada
9,128000,170025,42025,33,https://www.casasymas.com.uy/propiedad/202350-...,59.0,2,2,malvin


## RF Municipio E SIMPLE

In [None]:
model_rf_municipio_e_simple = train_random_forest(X_train_e_simple, y_train_e_simple)

print("Modelo entrenado:", model_rf_municipio_e_simple)

Modelo entrenado: RandomForestRegressor(n_jobs=-1, random_state=42)


In [None]:
mse_municipio_e_simple, r2_municipio_e_simple, mae_municipio_e_simple = evaluate_model(model_rf_municipio_e_simple, X_test_e_simple, y_test_e_simple)

print(f"Mean Squared Error (Real): {mse_municipio_e_simple}")
print(f"R² Score (Real): {r2_municipio_e_simple}")
print(f"Mean Absolute Error (Real): {mae_municipio_e_simple}")

Mean Squared Error (Real): 60643780573.24016
R² Score (Real): 0.5120008830220814
Mean Absolute Error (Real): 142054.6011102912


In [None]:
pd.set_option("display.float_format", "{:,.0f}".format)

extra_cols = ["url", "Superficie", "Baños", "Dormitorio", "barrio"]

results = evaluation_table(model_rf_municipio_e_simple, X_test_e_simple, y_test_e_simple, data_e, extra_cols)

results.head(20)

Unnamed: 0,Precio Real,Precio Predicho,Error Absoluto,Error %,url,Superficie,Baños,Dormitorio,barrio
0,125000,1036038,911038,729,https://www.casasymas.com.uy/propiedad/119558-...,690.0,4,2,malvin-norte
1,99000,463255,364255,368,https://www.casasymas.com.uy/propiedad/79956-c...,315.0,3,3,malvin-norte
2,350000,929842,579842,166,https://www.casasymas.com.uy/propiedad/190292-...,,4,4,malvin-norte
3,145000,297468,152468,105,https://www.casasymas.com.uy/propiedad/183832-...,221.0,2,3,parque-rivera
4,350000,619417,269417,77,https://www.casasymas.com.uy/propiedad/208135-...,,3,4,malvin
5,1800000,652516,1147484,64,https://www.casasymas.com.uy/propiedad/63219-c...,603.0,3,4,punta-gorda
6,230000,364190,134190,58,https://www.casasymas.com.uy/propiedad/188240-...,114.0,2,3,malvin-norte
7,375000,582518,207518,55,https://www.casasymas.com.uy/propiedad/195652-...,217.0,3,4,punta-gorda
8,134000,200446,66446,50,https://www.casasymas.com.uy/propiedad/130556-...,280.0,1,3,malvin-norte
9,460000,232298,227702,50,https://www.casasymas.com.uy/propiedad/203811-...,90.0,3,4,punta-gorda


In [None]:
print(len(results))

70


## RF Municipio E Completo

In [None]:
model_rf_municipio_e = train_random_forest(X_train_e_completo, y_train_e_completo)

print("Modelo entrenado:", model_rf_municipio_e)

Modelo entrenado: RandomForestRegressor(n_jobs=-1, random_state=42)


In [None]:
mse_municipio_e, r2_municipio_e, mae_municipio_e = evaluate_model(model_rf_municipio_e, X_test_e_completo, y_test_e_completo)

print(f"Mean Squared Error (Real): {mse_municipio_e}")
print(f"R² Score (Real): {r2_municipio_e}")
print(f"Mean Absolute Error (Real): {mae_municipio_e}")

Mean Squared Error (Real): 8377518411.217138
R² Score (Real): 0.9325863007138411
Mean Absolute Error (Real): 46633.757455632454


## RF all_municipios SIMPLE

In [None]:
model_rf_all_municipios_simple = train_random_forest(X_train_all_municipios_simple, y_train_all_municipios_simple)

print("Modelo entrenado:", model_rf_all_municipios_simple)

Modelo entrenado: RandomForestRegressor(n_jobs=-1, random_state=42)


In [None]:
mse_rf_all_municipios_simple, r2_rf_all_municipios_simple, mae_rf_all_municipios_simple = evaluate_model(model_rf_all_municipios_simple, X_test_all_municipios_simple, y_test_all_municipios_simple)

print(f"Mean Squared Error (Real): {mse_rf_all_municipios_simple}")
print(f"R² Score (Real): {r2_rf_all_municipios_simple}")
print(f"Mean Absolute Error (Real): {mae_rf_all_municipios_simple}")

Mean Squared Error (Real): 62201298533.446686
R² Score (Real): 0.49298535639205265
Mean Absolute Error (Real): 139621.4077868757


In [None]:
pd.set_option("display.float_format", "{:,.0f}".format)

extra_cols = ["url", "Superficie", "Baños", "Dormitorio", "barrio"]

results = evaluation_table(model_rf_all_municipios_simple, X_test_all_municipios_simple, y_test_all_municipios_simple, data, extra_cols)

results.head(40)

Unnamed: 0,Precio Real,Precio Predicho,Error Absoluto,Error %,url,Superficie,Baños,Dormitorio,barrio
0,137900,555905,418005,303,https://www.casasymas.com.uy/propiedad/205556-...,454.0,3,5,bolivar
1,128000,511724,383724,300,https://www.casasymas.com.uy/propiedad/151589-...,720.0,3,5,colon
2,70000,276241,206241,295,https://www.casasymas.com.uy/propiedad/210321-...,75.0,1,3,conciliacion
3,120000,456205,336205,280,https://www.casasymas.com.uy/propiedad/199197-...,753.0,3,3,sayago
4,85000,300165,215165,253,https://www.casasymas.com.uy/propiedad/159205-...,435.0,2,3,conciliacion
5,64000,177288,113288,177,https://www.casasymas.com.uy/propiedad/127393-...,106.0,2,2,aguada
6,165000,455888,290888,176,https://www.casasymas.com.uy/propiedad/170043-...,1236.0,2,3,pajas-blancas
7,75000,202599,127599,170,https://www.casasymas.com.uy/propiedad/190700-...,100.0,2,4,malvin-norte
8,268000,673728,405728,151,https://www.casasymas.com.uy/propiedad/158828-...,333.0,3,3,union
9,59500,147704,88204,148,https://www.casasymas.com.uy/propiedad/118244-...,195.0,1,3,conciliacion


## RF all_municipios Completo

In [None]:
model_rf_all_municipios = train_random_forest(X_train_all_municipios_completo, y_train_all_municipios_completo)

print("Modelo entrenado:", model_rf_all_municipios)

Modelo entrenado: RandomForestRegressor(n_jobs=-1, random_state=42)


In [None]:
mse_rf_all_municipios, r2_rf_all_municipios, mae_rf_all_municipios = evaluate_model(model_rf_all_municipios, X_test_all_municipios_completo, y_test_all_municipios_completo)

print(f"Mean Squared Error (Real): {mse_rf_all_municipios}")
print(f"R² Score (Real): {r2_rf_all_municipios}")
print(f"Mean Absolute Error (Real): {mae_rf_all_municipios}")

Mean Squared Error (Real): 12896809431.429754
R² Score (Real): 0.894875647426558
Mean Absolute Error (Real): 36929.37189468204


## GridSearch RandomForest Municipio E SIMPLE

In [None]:
model_rf_gridsearch_municipio_e_simple = random_forest_grid_search(X_train_e_simple, y_train_e_simple)

print("Modelo entrenado:", model_rf_gridsearch_municipio_e_simple)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Mejores parámetros: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Mejor score (CV MAE log): -0.28011392976766536
Modelo entrenado: RandomForestRegressor(max_depth=10, max_features='log2', n_estimators=200,
                      n_jobs=-1, random_state=42)


In [None]:
mse_rf_gridsearch_municipio_e_simple, r2_rf_gridsearch_municipio_e_simple, mae_rf_gridsearch_municipio_e_simple = evaluate_model(model_rf_gridsearch_municipio_e_simple, X_test_e_simple, y_test_e_simple)

print(f"Mean Squared Error (Real): {mse_rf_gridsearch_municipio_e_simple}")
print(f"R² Score (Real): {r2_rf_gridsearch_municipio_e_simple}")
print(f"Mean Absolute Error (Real): {mae_rf_gridsearch_municipio_e_simple}")

Mean Squared Error (Real): 57889364181.27024
R² Score (Real): 0.5341656088087179
Mean Absolute Error (Real): 131736.47153064967


## RandomForest GridSearch Municipio E Completo

In [None]:
model_rf_gridsearch_municipio_e = random_forest_grid_search(X_train_e_completo, y_train_e_completo)

print("Modelo entrenado:", model_rf_gridsearch_municipio_e)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Mejores parámetros: {'max_depth': 20, 'max_features': 0.8, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
Mejor score (CV MAE log): -0.11658899678731971
Modelo entrenado: RandomForestRegressor(max_depth=20, max_features=0.8, n_estimators=400,
                      n_jobs=-1, random_state=42)


In [None]:
mse_rf_gridsearch_municipio_e, r2_rf_gridsearch_municipio_e, mae_rf_gridsearch_municipio_e = evaluate_model(model_rf_gridsearch_municipio_e, X_test_e_completo, y_test_e_completo)

print(f"Mean Squared Error (Real): {mse_rf_gridsearch_municipio_e}")
print(f"R² Score (Real): {r2_rf_gridsearch_municipio_e}")
print(f"Mean Absolute Error (Real): {mae_rf_gridsearch_municipio_e}")

Mean Squared Error (Real): 8566828401.6145735
R² Score (Real): 0.9310629275455493
Mean Absolute Error (Real): 46492.871401304525


## Random Forest GridSearch todos los municipios SIMPLE

In [None]:
model_rf_gridsearch_all_municipios_simple = random_forest_grid_search(X_train_all_municipios_simple, y_train_all_municipios_simple)

print("Modelo entrenado:", model_rf_gridsearch_all_municipios_simple)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Mejores parámetros: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Mejor score (CV MAE log): -0.3546008669243099
Modelo entrenado: RandomForestRegressor(max_depth=10, max_features='log2', n_estimators=200,
                      n_jobs=-1, random_state=42)


In [None]:
mse_rf_gridsearch_all_municipios_simple, r2_rf_gridsearch_all_municipios_simple, mae_rf_gridsearch_all_municipios_simple = evaluate_model(model_rf_gridsearch_all_municipios_simple, X_test_all_municipios_simple, y_test_all_municipios_simple)

print(f"Mean Squared Error (Real): {mse_rf_gridsearch_all_municipios_simple}")
print(f"R² Score (Real): {r2_rf_gridsearch_all_municipios_simple}")
print(f"Mean Absolute Error (Real): {mae_rf_gridsearch_all_municipios_simple}")

Mean Squared Error (Real): 60366428544.983765
R² Score (Real): 0.5079417315032122
Mean Absolute Error (Real): 135315.74465744683


## Random Forest GridSearch todos los municipios COMPLETO

In [None]:
model_rf_gridsearch_all_municipios = random_forest_grid_search(X_train_all_municipios_completo, y_train_all_municipios_completo)

print("Modelo entrenado:", model_rf_gridsearch_all_municipios)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Mejores parámetros: {'max_depth': None, 'max_features': 0.8, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 600}
Mejor score (CV MAE log): -0.12187526980478924
Modelo entrenado: RandomForestRegressor(max_features=0.8, n_estimators=600, n_jobs=-1,
                      random_state=42)


In [None]:
mse_rf_gridsearch_all_municipios, r2_rf_gridsearch_all_municipios, mae_rf_gridsearch_all_municipios = evaluate_model(model_rf_gridsearch_all_municipios, X_test_all_municipios_completo, y_test_all_municipios_completo)

print(f"Mean Squared Error (Real): {mse_rf_gridsearch_all_municipios}")
print(f"R² Score (Real): {r2_rf_gridsearch_all_municipios}")
print(f"Mean Absolute Error (Real): {mae_rf_gridsearch_all_municipios}")

Mean Squared Error (Real): 14737116209.18083
R² Score (Real): 0.8798749560093357
Mean Absolute Error (Real): 41100.74238254107


## Model Performance

* `model_lr_categoricas`: refiere a Linear Regression con target `precio` y con

In [None]:
def crear_tabla_resultados(nombres, mse_list, r2_list, mae_list):

    resultados = []

    for nombre, mae, mse, r2 in zip(nombres, mae_list, mse_list, r2_list):
        resultados.append({
            "Modelo": nombre,
            "MAE": mae,
            "MSE": mse,
            "R2": r2
        })

    df_resultados = pd.DataFrame(resultados)
    pd.options.display.float_format = '{:,.2f}'.format
    return df_resultados


In [None]:
all_models_simple = [model_lr_simple, model_rf_simple, model_rf_gridsearch_simple, model_rf_municipio_e_simple, model_rf_all_municipios_simple, model_rf_gridsearch_municipio_e_simple,model_rf_gridsearch_all_municipios_simple ]
mse_models_simple = [mse_lr_simple, mse_rf_simple, mse_rf_gridsearch_simple, mse_municipio_e_simple, mse_rf_all_municipios_simple, mse_rf_gridsearch_municipio_e_simple, mse_rf_gridsearch_all_municipios_simple]
r2_models_simple = [r2_lr_simple, r2_rf_simple, r2_rf_gridsearch_simple, r2_municipio_e_simple, r2_rf_all_municipios_simple, r2_rf_gridsearch_municipio_e_simple, r2_rf_gridsearch_all_municipios_simple]
mae_models_simple = [mae_lr_simple, mae_rf_simple, mae_rf_gridsearch_simple, mae_municipio_e_simple, mae_rf_all_municipios_simple, mae_rf_gridsearch_municipio_e_simple, mae_rf_gridsearch_all_municipios_simple]

In [None]:
all_models = [model_lr, model_rf, model_rf_gridsearch, model_rf_municipio_e, model_rf_all_municipios, model_rf_gridsearch_municipio_e]
mse_models = [mse_lr, mse_rf, mse_rf_gridsearch, mse_municipio_e, mse_rf_all_municipios, mse_rf_gridsearch_municipio_e]
r2_models = [r2_lr, r2_rf, r2_rf_gridsearch, r2_municipio_e, r2_rf_all_municipios, r2_rf_gridsearch_municipio_e]
mae_models = [mae_lr, mae_rf, mae_rf_gridsearch, mae_municipio_e, mae_rf_all_municipios, mae_rf_gridsearch_municipio_e]

In [None]:
# Nombres de los modelos (los mismos para simple y normal)
model_names = [
    "LR",
    "RF municipio_ch",
    "RF municipio_e",
    "RF all municipios",
    "RF GridSearch municipio_ch",
    "RF GridSearch municipio_e",
    "RF Gridsearch all_municipios"
]

# DataFrame para los modelos simples
tabla_simple = crear_tabla_resultados(model_names, mse_models_simple, r2_models_simple, mae_models_simple)

# DataFrame para los modelos completos
tabla = crear_tabla_resultados(model_names, mse_models, r2_models, mae_models)

print("Resultados modelos simples:")
tabla_simple.head(10)

Resultados modelos simples:


Unnamed: 0,Modelo,MAE,MSE,R2
0,LR,139391.81,50513899007.09,0.49
1,RF municipio_ch,139621.41,62201298533.45,0.49
2,RF municipio_e,135315.74,60366428544.98,0.51
3,RF all municipios,142054.6,60643780573.24,0.51
4,RF GridSearch municipio_ch,139621.41,62201298533.45,0.49
5,RF GridSearch municipio_e,131736.47,57889364181.27,0.53
6,RF Gridsearch all_municipios,135315.74,60366428544.98,0.51


In [None]:
print("Resultados modelos completos:")
tabla.head(10)

Resultados modelos completos:


Unnamed: 0,Modelo,MAE,MSE,R2
0,LR,117055.24,51177560283.29,0.48
1,RF municipio_ch,36929.37,12896809431.43,0.89
2,RF municipio_e,41100.74,14737116209.18,0.88
3,RF all municipios,46633.76,8377518411.22,0.93
4,RF GridSearch municipio_ch,36929.37,12896809431.43,0.89
5,RF GridSearch municipio_e,46492.87,8566828401.61,0.93
