In [44]:
from time import sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

from sklearn.base import clone
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_absolute_error,
    root_mean_squared_error,
    r2_score,
)

import category_encoders as ce
from xgboost import XGBRegressor

# 💾 Data Importation

**CODE WITH THE DATA EXTRACTION**

BASE_URL = "https://www.properati.com.co/s/bogota-d-c-colombia/venta"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.google.com/",
}

PARAMS = {
    "propertyType": "studio,apartment,house,commercial,office"
}

# Hard upper bound for the crawl: the loop is range(1, MAX_PAGES), i.e. pages 1..9999.
MAX_PAGES = 10_000
# Stop after this many pages in a row that failed or contained no listings,
# instead of requesting every remaining page up to MAX_PAGES.
MAX_CONSECUTIVE_MISSES = 3
# Pause between requests; applied after failed requests as well.
REQUEST_DELAY_SECONDS = 1

# (output column, HTML tag, CSS class) for every field read from a listing card.
LISTING_FIELDS = [
    ("price", "div", "price"),
    ("location", "div", "location"),
    ("type", "a", "title"),
    ("bedrooms", "span", "properties__bedrooms"),
    ("bathrooms", "span", "properties__bathrooms"),
    ("area", "span", "properties__area"),
    ("parking", "span", "properties__amenity__car_park"),
]


def get_text(item, tag, class_name):
    """Return the stripped text of the first `tag` with class `class_name`
    inside `item`, or None when no such element exists."""
    el = item.find(tag, class_=class_name)
    return el.get_text(strip=True) if el is not None else None


def parse_listing(item):
    """Turn one <article> element into a dict keyed by the LISTING_FIELDS columns."""
    return {
        column: get_text(item, tag, class_name)
        for column, tag, class_name in LISTING_FIELDS
    }


session = requests.Session()
session.headers.update(HEADERS)

data = []
consecutive_misses = 0

for page in range(1, MAX_PAGES):
    url = f"{BASE_URL}/{page}"
    listings = []

    try:
        response = session.get(url, params=PARAMS, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        # Best effort: report the failure and treat the page as a miss.
        print(f"Page error({page}): {e}")
    else:
        soup = BeautifulSoup(response.text, "html.parser")
        listings = soup.find_all("article")

    if listings:
        consecutive_misses = 0
        data.extend(parse_listing(item) for item in listings)
    else:
        consecutive_misses += 1
        if consecutive_misses >= MAX_CONSECUTIVE_MISSES:
            print(
                f"Stopping at page {page}: "
                f"{consecutive_misses} consecutive pages without listings."
            )
            break

    # Runs on every path that continues the loop, so failed requests are
    # throttled too instead of being retried back-to-back.
    sleep(REQUEST_DELAY_SECONDS)

# Passing the column names keeps the CSV header intact even if nothing was scraped.
df = pd.DataFrame(data, columns=[column for column, _, _ in LISTING_FIELDS])
df.to_csv("properati.csv", index=False)

# Last expression of the cell, so the preview is actually displayed.
df.head()

In [45]:
# Reload the listings from the CSV written by the extraction code, so the rest
# of the notebook can run without repeating the crawl.
df = pd.read_csv("properati.csv")
df.head()

Unnamed: 0,price,location,type,bedrooms,bathrooms,area,parking
0,Desde $ 859.500.000,"Usaqu√©n, Zona Norte, Bogot√° D.C, Cundinamarca",ùêÉùêîùêÄùêã ùüèùüéùüè ùêáùêéùêîùêíùêÑ,2 - 4 habitaciones,3 - 4 ba√±os,Desde 85 m¬≤,
1,Desde $ 466.475.500,"Suba, Zona Noroccidental, Bogot√° D.C, Cundinam...",Hacienda Los Lagos Apartamentos,2 - 3 habitaciones,2 ba√±os,Desde 54 m¬≤,
2,$ 13.047.900.000,"Niza, Suba, Zona Noroccidental, Bogot√° D.C, Cu...",Oficina en Venta en Niza,,,2.538 m¬≤,Parqueadero
3,$ 740.000.000,"Fontib√≥n, Zona Occidental, Bogot√° D.C, Cundina...",Local comercial en Venta en Fontib√≥n,,,243 m¬≤,
4,$ 1.500.000.000,"Puente Aranda, Zona Centro, Bogot√° D.C, Cundin...",Apartamento en Venta en Puente Aranda,3 habitaciones,"4,5 ba√±os",205 m¬≤,Parqueadero


# 🔍 Initial Data Exploration

In [46]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(5002, 7)

In [47]:
# Column dtypes and non-null counts; every column is still raw text (object) here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5002 entries, 0 to 5001
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   price      5002 non-null   object
 1   location   5002 non-null   object
 2   type       5002 non-null   object
 3   bedrooms   3540 non-null   object
 4   bathrooms  4342 non-null   object
 5   area       4462 non-null   object
 6   parking    2684 non-null   object
dtypes: object(7)
memory usage: 273.7+ KB


In [48]:
# Number of missing values per column.
df.isnull().sum()

price           0
location        0
type            0
bedrooms     1462
bathrooms     660
area          540
parking      2318
dtype: int64

# ❌ Handling Missing Values

In [49]:
# Most frequent raw text value of each column, used as the fill value.
mode_bedroom = df["bedrooms"].mode().iloc[0]
mode_bathroom = df["bathrooms"].mode().iloc[0]

# Replace the missing entries of both columns with their respective mode.
for column, fill_value in (("bedrooms", mode_bedroom), ("bathrooms", mode_bathroom)):
    df[column] = df[column].fillna(fill_value)

# Remaining missing values per column (area and parking are handled later).
df.isna().sum()

price           0
location        0
type            0
bedrooms        0
bathrooms       0
area          540
parking      2318
dtype: int64

# 📊 Manipulating the data

In [50]:
def digits_to_int(column):
    """Strip every non-digit character from a text column and parse what is
    left as nullable Int64 (unparseable or empty entries become <NA>).

    NOTE(review): a decimal comma would be removed together with the thousands
    dots (e.g. "85,5" -> 855); confirm the scraped values never contain one.
    """
    return (
        column
        .str.replace(r"\D", "", regex=True)
        .pipe(pd.to_numeric, errors="coerce")
        .astype("Int64")
    )


# Drop the first two rows (ad entries, per the original note).
# This cell overwrites `df`; reload properati.csv before running it again.
df = df.iloc[2:].reset_index(drop=True)

# Keep only the digits of price and area.
df["price"] = digits_to_int(df["price"])
df["area"] = digits_to_int(df["area"])

# Fill missing areas with the rounded mean area.
mean_area = df["area"].mean().round()
df["area"] = df["area"].fillna(mean_area)

# Keep the first comma-separated part of location and the first word of
# type and bedrooms.
df["location"] = df["location"].str.split(",").str[0]
df["type"] = df["type"].str.split().str[0]
df["bedrooms"] = df["bedrooms"].str.split().str[0].astype(int)

# Bathrooms: first token, with a decimal comma converted to a dot ("4,5" -> 4.5).
df["bathrooms"] = (
    df["bathrooms"]
        .astype(str)
        .str.split().str[0]
        .str.replace(",", ".", regex=False)
        .astype(float)
)

# Parking becomes a 0/1 flag: 1 when the listing had a parking entry.
df["parking"] = df["parking"].notna().astype(int)

# Remove (not cap) rows at or above the 99th percentile of area, then of price.
df = df[df["area"] < df["area"].quantile(0.99)]
df = df[df["price"] < df["price"].quantile(0.99)]

# log() is only defined for positive values; drop anything else so no -inf/NaN
# reaches the models. Done after the quantile filters so their thresholds are
# unchanged. .copy() makes the result an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning.
df = df[(df["area"] > 0) & (df["price"] > 0)].copy()

# Log-transform price and area (cast to plain float64 first).
df["log_area"] = np.log(df["area"].astype("float64"))
df["log_price"] = np.log(df["price"].astype("float64"))

# The raw columns are no longer needed once their logs exist.
df = df.drop(columns=["price", "area"])

df.head()

Unnamed: 0,location,type,bedrooms,bathrooms,parking,log_area,log_price
1,Fontib√≥n,Local,3,2.0,0,5.493,20.422
2,Puente Aranda,Apartamento,3,4.5,1,5.323,21.129
3,Puente Aranda,Casa,4,4.0,1,5.624,21.64
4,Niza,Casa,4,4.0,0,5.892,21.717
5,El Retiro,Apartamento,3,5.0,0,5.553,22.084


# 💪 Training and testing models

In [51]:
# Separate the target (log_price) from the predictors, then hold out 20% of
# the rows as a test set with a fixed seed.
y = df["log_price"]
X = df.drop(columns="log_price")

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [52]:
# Column groups consumed by the preprocessors defined in the following cells.
# Numeric inputs (parking is a 0/1 flag but is handled with the numeric columns).
num_features = ["bedrooms", "bathrooms", "log_area", "parking"]
# Categorical column.
cat_features = ["type"]
# Column that is target-encoded.
target_encode_features = ["location"]

In [53]:
# Preprocessing for the linear model.

# Numeric columns: standardise, then add pairwise interaction terms
# (degree 2, interactions only, no bias column).
linear_numeric_steps = Pipeline([
    ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
])

# Location: smoothed target encoding.
linear_location_encoder = ce.TargetEncoder(
    cols=target_encode_features,
    smoothing=10,
    min_samples_leaf=20,
)

# Type: one-hot encoding; categories unseen during fit are ignored.
linear_type_encoder = OneHotEncoder(handle_unknown="ignore")

linear_preprocessor = ColumnTransformer(
    transformers=[
        ("num", linear_numeric_steps, num_features),
        ("location_te", linear_location_encoder, target_encode_features),
        ("cat", linear_type_encoder, cat_features),
    ]
)

In [54]:
# Preprocessing for the tree-based models.

# Location: same smoothed target encoding settings as the linear preprocessor.
tree_location_encoder = ce.TargetEncoder(
    cols=target_encode_features,
    smoothing=10,
    min_samples_leaf=20,
)

# Numeric columns pass through unscaled.
# NOTE(review): `type` is dropped here, so the tree models never see it,
# while the linear model receives it one-hot encoded — confirm this is intended.
tree_preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_features),
        ("location_te", tree_location_encoder, target_encode_features),
        ("cat", "drop", cat_features),
    ]
)

In [55]:
# Candidate estimators, each configured once and then paired with the
# preprocessor it should be trained behind.
ridge_regressor = Ridge(alpha=1.0)

random_forest_regressor = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=3,
    random_state=42,
    n_jobs=-1,
)

xgb_regressor = XGBRegressor(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1,
)

# Display name -> (preprocessor, estimator). Both tree models use tree_preprocessor.
models = {
    "Linear Regression": (linear_preprocessor, ridge_regressor),
    "Random Forest": (tree_preprocessor, random_forest_regressor),
    "XGBoost": (tree_preprocessor, xgb_regressor),
}


In [56]:
# Fit every candidate on the training split, score it on the held-out test
# split, and add a 5-fold cross-validated RMSE computed on the training split.
results = []

# Loop-invariant pieces, built once instead of on every iteration.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
y_test_price = np.exp(y_test)  # test target back on the price scale

for name, (preprocessor, model) in models.items():

    # Pipeline does not copy its steps. Cloning gives every pipeline its own
    # unfitted preprocessor and estimator, so the two tree models no longer
    # share (and refit) one preprocessor object and the entries of `models`
    # stay unfitted.
    pipeline = Pipeline(
        steps=[
            ("preprocessing", clone(preprocessor)),
            ("model", clone(model))
        ]
    )

    # Fit
    pipeline.fit(X_train, y_train)

    # Predict (log scale), then undo the log to get prices
    y_pred = pipeline.predict(X_test)
    y_pred_price = np.exp(y_pred)

    # Metrics: MAE and RMSE are on the price scale; R² is computed on the
    # log scale the model was trained on.
    mae_price = mean_absolute_error(y_test_price, y_pred_price)
    rmse_price = root_mean_squared_error(y_test_price, y_pred_price)
    r2 = r2_score(y_test, y_pred)

    # Cross-validation (log RMSE); the scorer returns negated RMSE, hence the sign flip
    cv_rmse = -cross_val_score(
        pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring="neg_root_mean_squared_error",
        n_jobs=-1
    ).mean()

    results.append({
        "Model": name,
        "MAE_price": mae_price,
        "RMSE_price": rmse_price,
        "R¬≤": r2,
        "CV RMSE (log)": cv_rmse
    })


In [57]:
# Show floats with thousands separators and three decimals.
pd.set_option("display.float_format", '{:,.3f}'.format)

# One row per model, ordered from lowest to highest price-scale RMSE.
results_df = pd.DataFrame(results).sort_values("RMSE_price", ignore_index=True)

results_df

Unnamed: 0,Model,MAE_price,RMSE_price,R¬≤,CV RMSE (log)
0,XGBoost,415845579.676,792663230.126,0.686,0.52
1,Random Forest,412990727.397,800769112.966,0.671,0.528
2,Linear Regression,481305208.057,912527366.462,0.608,0.598


The best model is **XGBoost**: it has the lowest RMSE, the highest R², and the lowest cross-validated log-RMSE. **Random Forest** is a close second and achieves the lowest MAE.