In [1]:
# === Importaciones ===
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    mean_absolute_percentage_error, explained_variance_score,
    make_scorer
)

In [2]:
# === Definir scorers personalizados ===
def get_regression_scorers():
    return {
        'mae': make_scorer(mean_absolute_error, greater_is_better=False),
        'mse': make_scorer(mean_squared_error, greater_is_better=False),
        'rmse': make_scorer(lambda y, y_pred: np.sqrt(mean_squared_error(y, y_pred)), greater_is_better=False),
        'mape': make_scorer(mean_absolute_percentage_error, greater_is_better=False),
        'r2': make_scorer(r2_score),
        'evs': make_scorer(explained_variance_score)
    }

scoring = get_regression_scorers()

In [3]:
# === Carga de datos ===
df = pd.read_csv("../data/datos_limpios_eda.csv")
df.head

<bound method NDFrame.head of                 zona  PrecioActual  metros  habitaciones  tiene_ascensor  \
0      ciudad-lineal        355000      69           2.0               1   
1        carabanchel        149000      91           3.0               0   
2              usera        195000      58           1.0               1   
3             tetuan        715000     140           3.0               1   
4         arganzuela       1257000     135           3.0               1   
...              ...           ...     ...           ...             ...   
10383         centro        189000      36           2.0               0   
10384         centro       2600000     245           2.0               1   
10385         tetuan        219000      56           1.0               1   
10386    carabanchel        165000      74           3.0               0   
10387       san-blas        178000      54           2.0               1   

      localizacion  planta_num  baños  PrecioActual_log  

In [4]:
# === Separar variables explicativas y target ===
X = df.drop(columns=["PrecioActual"])
y = df["PrecioActual"]
# === Codificación one-hot para TODAS las variables categóricas ===
X_encoded = pd.get_dummies(X, drop_first=True)

In [5]:
# === División en train/test ===
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

In [6]:
# === 1. LinearRegression ===
lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])

lr_params = {
    'model__fit_intercept': [True, False]
}

lr_cv = GridSearchCV(
    estimator=lr_pipeline,
    param_grid=lr_params,
    scoring=scoring,
    refit='r2',
    cv=5,
    n_jobs=-1,
    return_train_score=True
)

lr_cv.fit(X_train, y_train)

In [7]:
# === 2. RandomForestRegressor ===
rf_pipeline = Pipeline([
    ('model', RandomForestRegressor(random_state=42))
])

rf_params = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [10, 20, None],
    'model__min_samples_split': [2, 5]
}

rf_cv = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_params,
    scoring=scoring,
    refit='r2',
    cv=5,
    n_jobs=-1,
    return_train_score=True
)

rf_cv.fit(X_train, y_train)

In [8]:
# === 3. XGBRegressor ===
xgb_pipeline = Pipeline([
    ('model', XGBRegressor(random_state=42, verbosity=0))
])

xgb_params = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.1, 0.05],
    'model__max_depth': [3, 6],
    'model__subsample': [0.8, 1.0]
}

xgb_cv = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgb_params,
    scoring=scoring,
    refit='r2',
    cv=5,
    n_jobs=-1,
    return_train_score=True
)

xgb_cv.fit(X_train, y_train)

In [9]:
# === Evaluación en test ===
models = {
    "Linear Regression": lr_cv,
    "Random Forest": rf_cv,
    "XGBoost": xgb_cv
}

for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"\n{name}")
    print(f"R2: {r2_score(y_test, y_pred):.4f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.2f}")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")


Linear Regression
R2: 0.7838
RMSE: 565339.20
MAE: 298553.35

Random Forest
R2: 0.9960
RMSE: 77072.27
MAE: 2959.24

XGBoost
R2: 0.9577
RMSE: 249978.16
MAE: 22135.57
