# ML Model for Madrid

This notebook builds a RandomForest model on the Madrid dataset. It removes non-target PRICE columns and uses Weights & Biases (wandb) for experiment tracking.

In [9]:
import sys
import os
import pandas as pd
from dotenv import load_dotenv

load_dotenv() # Load the environment variables from .env file

import importlib
# Añadir la ruta al directorio src para importar módulos personalizados
sys.path.append("../src")

# Unload the module if already loaded
import idealista18.loader as loader
import idealista18.enricher as enricher
import idealista18.util as util

os.makedirs("../models", exist_ok=True)

## Connect to Weights & Biases

In [10]:
import wandb

# 1) Grab the W&B token

wandb_token = os.getenv("WANDB_API_KEY")
if wandb_token is None:
    raise RuntimeError("WANDB_API_KEY not found in environment. Did you create a .env with that variable?")
# 2) Login with the token
wandb.login(key=wandb_token)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/davidreyblanco/.netrc


True

In [11]:
# Load and clean data for Madrid
# Load data and remove outliers (spatial)
CITY = "Madrid"

df = enricher.load_enriched_data(city_name=CITY, 
                                          deduplicate_by_adid=True,
                                          enrich_census_code=False, 
                                          enrich_idealista_areas=False)


print("Data loaded:")
print(df.head())

Data loaded:
                 ASSETID   PRICE    UNITPRICE ADTYPOLOGYID ADOPERATIONID  \
0  A15019136831406238029  126000  2680.851064         HOME          SALE   
1   A6677225905472065344  235000  4351.851852         HOME          SALE   
2  A13341979748618524775  373000  4973.333333         HOME          SALE   
3   A4775182175615276542  284000  5916.666667         HOME          SALE   
4   A2492087730711701973  228000  4560.000000         HOME          SALE   

   CONSTRUCTEDAREA  ROOMNUMBER  BATHNUMBER  HASTERRACE  HASLIFT  ...  \
0               47           1           1           0        1  ...   
1               54           1           1           0        0  ...   
2               75           2           1           0        0  ...   
3               48           1           1           0        1  ...   
4               50           0           1           0        0  ...   

   DISTANCE_TO_METRO  DISTANCE_TO_CASTELLANA  LONGITUDE   LATITUDE  CITYNAME  \
0           0.872

In [12]:
df.columns

Index(['ASSETID', 'PRICE', 'UNITPRICE', 'ADTYPOLOGYID', 'ADOPERATIONID',
       'CONSTRUCTEDAREA', 'ROOMNUMBER', 'BATHNUMBER', 'HASTERRACE', 'HASLIFT',
       'HASAIRCONDITIONING', 'AMENITYID', 'HASPARKINGSPACE',
       'ISPARKINGSPACEINCLUDEDINPRICE', 'PARKINGSPACEPRICE',
       'HASNORTHORIENTATION', 'HASSOUTHORIENTATION', 'HASEASTORIENTATION',
       'HASWESTORIENTATION', 'HASBOXROOM', 'HASWARDROBE', 'HASSWIMMINGPOOL',
       'HASDOORMAN', 'HASGARDEN', 'ISDUPLEX', 'ISSTUDIO', 'ISINTOPFLOOR',
       'CONSTRUCTIONYEAR', 'FLOORCLEAN', 'FLATLOCATIONID',
       'CADCONSTRUCTIONYEAR', 'CADMAXBUILDINGFLOOR', 'CADDWELLINGCOUNT',
       'CADASTRALQUALITYID', 'BUILTTYPEID_1', 'BUILTTYPEID_2', 'BUILTTYPEID_3',
       'DISTANCE_TO_CITY_CENTER', 'DISTANCE_TO_METRO',
       'DISTANCE_TO_CASTELLANA', 'LONGITUDE', 'LATITUDE', 'CITYNAME',
       'ADTYPOLOGY', 'ADOPERATION', 'LOCATIONID', 'LOCATIONNAME', 'CUSEC'],
      dtype='object')

In [13]:
TARGET_COLUMN = "UNITPRICE"
#
# 1) Remove all PRICE-related columns except 'UNITPRICE'
# 2) Remove unnecessary columns:  Typology, Operation, AssetID, Period
#
exclude_columns = ['ADTYPOLOGY', 'ADOPERATION', 'ASSETID', 'PERIOD', 
                   'ADTYPOLOGYID', 'ADOPERATIONID', 'CITYNAME',
                   'LOCATIONID', 'LOCATIONNAME', 'CUSEC']
if not df.empty:
    price_cols = [col for col in df.columns if col in exclude_columns
                                                       or (col.endswith('PRICE') and col != TARGET_COLUMN)]

    print("Dropping columns:", price_cols)
    df = df.drop(columns=price_cols)
else:
    print("Dataframe is empty")

Dropping columns: ['ASSETID', 'PRICE', 'ADTYPOLOGYID', 'ADOPERATIONID', 'ISPARKINGSPACEINCLUDEDINPRICE', 'PARKINGSPACEPRICE', 'CITYNAME', 'ADTYPOLOGY', 'ADOPERATION', 'LOCATIONID', 'LOCATIONNAME', 'CUSEC']


In [14]:
df.columns

Index(['UNITPRICE', 'CONSTRUCTEDAREA', 'ROOMNUMBER', 'BATHNUMBER',
       'HASTERRACE', 'HASLIFT', 'HASAIRCONDITIONING', 'AMENITYID',
       'HASPARKINGSPACE', 'HASNORTHORIENTATION', 'HASSOUTHORIENTATION',
       'HASEASTORIENTATION', 'HASWESTORIENTATION', 'HASBOXROOM', 'HASWARDROBE',
       'HASSWIMMINGPOOL', 'HASDOORMAN', 'HASGARDEN', 'ISDUPLEX', 'ISSTUDIO',
       'ISINTOPFLOOR', 'CONSTRUCTIONYEAR', 'FLOORCLEAN', 'FLATLOCATIONID',
       'CADCONSTRUCTIONYEAR', 'CADMAXBUILDINGFLOOR', 'CADDWELLINGCOUNT',
       'CADASTRALQUALITYID', 'BUILTTYPEID_1', 'BUILTTYPEID_2', 'BUILTTYPEID_3',
       'DISTANCE_TO_CITY_CENTER', 'DISTANCE_TO_METRO',
       'DISTANCE_TO_CASTELLANA', 'LONGITUDE', 'LATITUDE'],
      dtype='object')

Create a matrix (covariates) and a vector (target)

In [15]:
# Prepare features and target variable
if TARGET_COLUMN in df.columns:
    X = df.drop(columns=[TARGET_COLUMN])
    y = df[TARGET_COLUMN]
    print("Features and target prepared.")
else:
    X, y = df, None
    print("UNITPRICE column not found in dataframe.")

Features and target prepared.


# Dataset como "artefacto" del proyecto

## Subir el artefacto

En este caso creamos un proyecto aparte que tenga el dataset, lo usaremos en los distintos proyectos de modelado

In [16]:
from sklearn.model_selection import train_test_split

# Añadimos un dataset al registro de artefactos de W&B

TEAM_ENTITY = "mds9"  # Replace with your team entity
DATASET = "idealista18"
random_state = 42  # For reproducibility

DATASET_PROJECT = f"{DATASET}_{CITY.lower()}_dataset_raw_{TEAM_ENTITY.lower()}"

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Save train/test splits as CSV files
X_train_file = "X_train.csv.gz"
X_test_file = "X_test.csv.gz"
y_train_file = "y_train.csv.gz"
y_test_file = "y_test.csv.gz"

X_train.to_csv(X_train_file, index=False, compression='gzip')
X_test.to_csv(X_test_file, index=False, compression='gzip')
y_train.to_csv(y_train_file, index=False, compression='gzip')
y_test.to_csv(y_test_file, index=False, compression='gzip')

# Log the splits as resources in the artifact
run_dataset = wandb.init(project=f"{DATASET_PROJECT}", job_type="train-model")
artifact_ds = wandb.Artifact(name=f"{DATASET}_splits", type="dataset")
artifact_ds.add_file(X_train_file)
artifact_ds.add_file(X_test_file)
artifact_ds.add_file(y_train_file)
artifact_ds.add_file(y_test_file)
artifact_ds.save()
run_dataset.finish()

print("------------------------------------------")
print("Uploaded splits from W&B artifact:")
print("------------------------------------------")

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


------------------------------------------
Uploaded splits from W&B artifact:
------------------------------------------
X_train shape: (60642, 35)
X_test shape: (15161, 35)
y_train shape: (60642,)
y_test shape: (15161,)


## Descargar el artefacto

Ahora recuperamos los datasets subido como artefacto

In [17]:
import pandas as pd
import wandb

# Download the dataset splits artifact from W&B
artifact_name = f"{DATASET}_splits:latest"
with wandb.init(project=DATASET_PROJECT) as run:
    artifact = run.use_artifact(artifact_name, type="dataset")
    artifact_dir = artifact.download()

    # Load the splits from the artifact directory
    X_train = pd.read_csv(f"{artifact_dir}/X_train.csv.gz", compression='gzip')
    X_test = pd.read_csv(f"{artifact_dir}/X_test.csv.gz", compression='gzip')
    y_train = pd.read_csv(f"{artifact_dir}/y_train.csv.gz", compression='gzip').squeeze()
    y_test = pd.read_csv(f"{artifact_dir}/y_test.csv.gz", compression='gzip').squeeze()

print("------------------------------------------")
print("Loaded splits from W&B artifact:")
print("------------------------------------------")

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

[34m[1mwandb[0m:   4 of 4 files downloaded.  


------------------------------------------
Loaded splits from W&B artifact:
------------------------------------------
X_train shape: (60642, 35)
X_test shape: (15161, 35)
y_train shape: (60642,)
y_test shape: (15161,)


# Entrenamiento de modelos

## Random Forests

In [18]:
# Train/test split and model training
import unidecode as unidecode
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error, median_absolute_error

TEAM_ENTITY = "mds9"  # Replace with your team entity
ALGORITHM = "Random Forest"
DATASET = "idealista18"
algorithm_slug = unidecode.unidecode(ALGORITHM).lower().replace(" ", "")
PROJECT = f"{DATASET}_{CITY.lower()}_{algorithm_slug}_{TARGET_COLUMN}_{TEAM_ENTITY.lower()}"


# 1) Grab the W&B token
wandb_token = os.getenv("WANDB_API_KEY")
if wandb_token is None:
    raise RuntimeError("WANDB_API_KEY not found in environment. Did you create a .env with that variable?")

# 2) Login with the token
wandb.login(key=wandb_token)

for n_estimators in [10, 20, 50, 100]:
    for max_depth in [5, 10]:

        # 3) Train some models
        with wandb.init(project=PROJECT, config={"algorithm": ALGORITHM,
                                                "target": TARGET_COLUMN,
                                                "city": CITY,
                                                "dataset": DATASET }) as run:

            #
            # Create a grid search for hyperparameters
            #
                print(f"{n_estimators} estimators, max depth {max_depth}")
                model = RandomForestRegressor(n_estimators=n_estimators, max_depth = 10, random_state=random_state)
                model.fit(X_train, y_train)
                preds = model.predict(X_test)
                mse = mean_squared_error(y_test, preds)
                r2 = r2_score(y_test, preds)
                mae = mean_absolute_error(y_test, preds)
                mape = mean_absolute_percentage_error(y_test, preds)
                medape = median_absolute_error(y_test, preds) / y_test.median()

                print(f"MAPE: {mape:.2f} MedAPE: {medape:.4f} MSE:{mse}")

                run.log({
                    "mse": mse,
                    "r2": r2,
                    "mape": mape,
                    "medape": medape,
                    "mae": mae,
                    "random_state": random_state,
                    'n_train': len(X_train),
                    'n_test': len(X_test),
                    "n_estimators": n_estimators,
                    "max_depth": max_depth, 
                })

                # Create a model filename fingerprint
                
                model_fp = f"estimators-{n_estimators}_depth-{max_depth}"
                model_artifact_name = f"{DATASET}_{CITY.lower()}_{algorithm_slug}_{TARGET_COLUMN}_{model_fp}"

                model_filename = f"../models/{model_artifact_name}.pickle"
                joblib.dump(model, model_filename, compress=('gzip', 3))
                model_size = os.path.getsize(model_filename)
                print(f"Model saved to {model_filename} ({model_size / 1024:.2f} KB)")

                # Guardamos el modelo en el repositorio de modelos de W&B
                model_artifact_name = f'idealista18_madrid_{ALGORITHM.lower().replace(" ", "_")}'

                artifact = wandb.Artifact(name=model_artifact_name, 
                                        type="model", 
                                        description=f"{ALGORITHM} model for Madrid housing prices")
                artifact.metadata["tags"] = [model_fp, CITY, DATASET]
                artifact.metadata["aliases"] = [model_fp]
                artifact.add_file(local_path=model_filename)
                artifact.save()
                run.log_artifact(artifact, aliases=["latest", "production", model_fp])

                df = pd.DataFrame({
                    "actual_price": y_test,
                    "predicted_price": preds,
                    "error": abs(y_test - preds),            # e.g. point‐size or color
                })
                df = wandb.Table(dataframe=df)

                # 1) Simple scatter with axis labels and a color dimension (error)
                run.log({
                    "price_scatter": wandb.plot.scatter(
                        df,
                        x="actual_price",
                        y="predicted_price",
                        title="Predicted vs Actual Price",
                        #xname="Actual (EUR)",
                        #yname="Predicted (EUR)"
                    )
                })

                # 2) If you want to color‐code each point by “error”:
                run.log({
                    "price_scatter_colored": wandb.plot.scatter(
                        df,
                        x="actual_price",
                        y="predicted_price",
                        title="Predicted vs Actual Price (colored by error)",
                        #xname="Actual (EUR)",
                        #yname="Predicted (EUR)",
                        #color="error",    # column in df
                        #colorscale="Viridis"  # any Plotly colorscale name
                    )
                })



[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/davidreyblanco/.netrc


10 estimators, max depth 5
MAPE: 0.17 MedAPE: 0.1108 MSE:631121.6930521954
Model saved to ../models/idealista18_madrid_randomforest_UNITPRICE_estimators-10_depth-5.pickle (417.74 KB)


0,1
mae,▁
mape,▁
max_depth,▁
medape,▁
mse,▁
n_estimators,▁
n_test,▁
n_train,▁
r2,▁
random_state,▁

0,1
mae,549.74302
mape,0.16665
max_depth,5.0
medape,0.11081
mse,631121.69305
n_estimators,10.0
n_test,15161.0
n_train,60642.0
r2,0.7837
random_state,42.0


10 estimators, max depth 10
MAPE: 0.17 MedAPE: 0.1108 MSE:631121.6930521954
Model saved to ../models/idealista18_madrid_randomforest_UNITPRICE_estimators-10_depth-10.pickle (417.74 KB)


0,1
mae,▁
mape,▁
max_depth,▁
medape,▁
mse,▁
n_estimators,▁
n_test,▁
n_train,▁
r2,▁
random_state,▁

0,1
mae,549.74302
mape,0.16665
max_depth,10.0
medape,0.11081
mse,631121.69305
n_estimators,10.0
n_test,15161.0
n_train,60642.0
r2,0.7837
random_state,42.0


20 estimators, max depth 5
MAPE: 0.17 MedAPE: 0.1094 MSE:621594.5504005004
Model saved to ../models/idealista18_madrid_randomforest_UNITPRICE_estimators-20_depth-5.pickle (838.02 KB)


0,1
mae,▁
mape,▁
max_depth,▁
medape,▁
mse,▁
n_estimators,▁
n_test,▁
n_train,▁
r2,▁
random_state,▁

0,1
mae,544.87351
mape,0.16532
max_depth,5.0
medape,0.10942
mse,621594.5504
n_estimators,20.0
n_test,15161.0
n_train,60642.0
r2,0.78697
random_state,42.0


20 estimators, max depth 10
MAPE: 0.17 MedAPE: 0.1094 MSE:621594.5504005004
Model saved to ../models/idealista18_madrid_randomforest_UNITPRICE_estimators-20_depth-10.pickle (838.02 KB)


0,1
mae,▁
mape,▁
max_depth,▁
medape,▁
mse,▁
n_estimators,▁
n_test,▁
n_train,▁
r2,▁
random_state,▁

0,1
mae,544.87351
mape,0.16532
max_depth,10.0
medape,0.10942
mse,621594.5504
n_estimators,20.0
n_test,15161.0
n_train,60642.0
r2,0.78697
random_state,42.0


50 estimators, max depth 5
MAPE: 0.16 MedAPE: 0.1095 MSE:619515.1251445579
Model saved to ../models/idealista18_madrid_randomforest_UNITPRICE_estimators-50_depth-5.pickle (2095.98 KB)


0,1
mae,▁
mape,▁
max_depth,▁
medape,▁
mse,▁
n_estimators,▁
n_test,▁
n_train,▁
r2,▁
random_state,▁

0,1
mae,543.7475
mape,0.16493
max_depth,5.0
medape,0.10946
mse,619515.12514
n_estimators,50.0
n_test,15161.0
n_train,60642.0
r2,0.78768
random_state,42.0


50 estimators, max depth 10
MAPE: 0.16 MedAPE: 0.1095 MSE:619515.1251445579
Model saved to ../models/idealista18_madrid_randomforest_UNITPRICE_estimators-50_depth-10.pickle (2095.98 KB)


0,1
mae,▁
mape,▁
max_depth,▁
medape,▁
mse,▁
n_estimators,▁
n_test,▁
n_train,▁
r2,▁
random_state,▁

0,1
mae,543.7475
mape,0.16493
max_depth,10.0
medape,0.10946
mse,619515.12514
n_estimators,50.0
n_test,15161.0
n_train,60642.0
r2,0.78768
random_state,42.0


100 estimators, max depth 5
MAPE: 0.16 MedAPE: 0.1090 MSE:618601.940988909
Model saved to ../models/idealista18_madrid_randomforest_UNITPRICE_estimators-100_depth-5.pickle (4195.51 KB)


0,1
mae,▁
mape,▁
max_depth,▁
medape,▁
mse,▁
n_estimators,▁
n_test,▁
n_train,▁
r2,▁
random_state,▁

0,1
mae,543.34304
mape,0.16481
max_depth,5.0
medape,0.109
mse,618601.94099
n_estimators,100.0
n_test,15161.0
n_train,60642.0
r2,0.78799
random_state,42.0


100 estimators, max depth 10
MAPE: 0.16 MedAPE: 0.1090 MSE:618601.940988909
Model saved to ../models/idealista18_madrid_randomforest_UNITPRICE_estimators-100_depth-10.pickle (4195.51 KB)


0,1
mae,▁
mape,▁
max_depth,▁
medape,▁
mse,▁
n_estimators,▁
n_test,▁
n_train,▁
r2,▁
random_state,▁

0,1
mae,543.34304
mape,0.16481
max_depth,10.0
medape,0.109
mse,618601.94099
n_estimators,100.0
n_test,15161.0
n_train,60642.0
r2,0.78799
random_state,42.0


## Simple regression Tree

In [19]:
import unidecode as unidecode
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error, median_absolute_error
from sklearn.tree import DecisionTreeRegressor

DATASET = "idealista18"
TEAM_ENTITY = "mds9"  # Replace with your team entity
ALGORITHM = "Decision Tree"
algorithm_slug = unidecode.unidecode(ALGORITHM).lower().replace(" ", "")
PROJECT = f"{DATASET}_{CITY.lower()}_{algorithm_slug}_{TARGET_COLUMN}_{TEAM_ENTITY.lower()}"

for max_depth in [5, 10, 30]:

        # 3) Train some models
        with wandb.init(project=PROJECT, config={"algorithm": ALGORITHM,
                                                "target": TARGET_COLUMN,
                                                "city": CITY,
                                                "dataset": DATASET }) as run:

            #
            # Create a grid search for hyperparameters
            #
                random_state=42
                print(f"{n_estimators} estimators, max depth {max_depth}")
                model = DecisionTreeRegressor(max_depth=max_depth, random_state=random_state)
                model.fit(X_train, y_train)
                preds = model.predict(X_test)
                mse = mean_squared_error(y_test, preds)
                r2 = r2_score(y_test, preds)
                mae = mean_absolute_error(y_test, preds)
                mape = mean_absolute_percentage_error(y_test, preds)
                medape = median_absolute_error(y_test, preds) / y_test.median()

                print(f"MAPE: {mape:.2f} MedAPE: {medape:.4f} MSE:{mse}")

                run.log({
                    "mse": mse,
                    "r2": r2,
                    "mape": mape,
                    "medape": medape,
                    "mae": mae,
                    "random_state": random_state,
                    'n_train': len(X_train),
                    'n_test': len(X_test),
                    "n_estimators": n_estimators,
                    "max_depth": max_depth, 
                })

                # Create a model filename fingerprint
                
                model_fp = f"depth-{max_depth}"
                model_artifact_name = f"{DATASET}_{CITY.lower()}_{algorithm_slug}_{TARGET_COLUMN}_{model_fp}"

                model_filename = f"../models/{model_artifact_name}.pickle"
                joblib.dump(model, model_filename, compress=('gzip', 3))
                model_size = os.path.getsize(model_filename)
                print(f"Model saved to {model_filename} ({model_size / 1024:.2f} KB)")

                # Guardamos el modelo en el repositorio de modelos de W&B = idealista18_madrid_random_forest
                model_artifact_name = f'idealista18_madrid_{ALGORITHM.lower().replace(" ", "_")}'

                artifact = wandb.Artifact(name=model_artifact_name, 
                                        type="model", 
                                        description=f"{ALGORITHM} model for Madrid housing prices")

                artifact.metadata["tags"] = [model_fp, CITY, DATASET]
                artifact.metadata["aliases"] = [model_fp]
                artifact.add_file(local_path=model_filename)
                artifact.save()
                run.log_artifact(artifact, aliases=["latest", "production", model_fp])

                df = pd.DataFrame({
                    "actual_price": y_test,
                    "predicted_price": preds,
                    "error": abs(y_test - preds),            # e.g. point‐size or color
                })
                df = wandb.Table(dataframe=df)

                # 1) Simple scatter with axis labels and a color dimension (error)
                run.log({
                    "price_scatter": wandb.plot.scatter(
                        df,
                        x="actual_price",
                        y="predicted_price",
                        title="Predicted vs Actual Price",
                        #xname="Actual (EUR)",
                        #yname="Predicted (EUR)"
                    )
                })

                # 2) If you want to color‐code each point by “error”:
                run.log({
                    "price_scatter_colored": wandb.plot.scatter(
                        df,
                        x="actual_price",
                        y="predicted_price",
                        title="Predicted vs Actual Price (colored by error)",
                        #xname="Actual (EUR)",
                        #yname="Predicted (EUR)",
                        #color="error",    # column in df
                        #colorscale="Viridis"  # any Plotly colorscale name
                    )
                })


100 estimators, max depth 5
MAPE: 0.22 MedAPE: 0.1524 MSE:996174.9396154651
Model saved to ../models/idealista18_madrid_decisiontree_UNITPRICE_depth-5.pickle (3.05 KB)


0,1
mae,▁
mape,▁
max_depth,▁
medape,▁
mse,▁
n_estimators,▁
n_test,▁
n_train,▁
r2,▁
random_state,▁

0,1
mae,719.49142
mape,0.21843
max_depth,5.0
medape,0.1524
mse,996174.93962
n_estimators,100.0
n_test,15161.0
n_train,60642.0
r2,0.65859
random_state,42.0


100 estimators, max depth 10
MAPE: 0.18 MedAPE: 0.1181 MSE:719719.8749876076
Model saved to ../models/idealista18_madrid_decisiontree_UNITPRICE_depth-10.pickle (43.04 KB)


0,1
mae,▁
mape,▁
max_depth,▁
medape,▁
mse,▁
n_estimators,▁
n_test,▁
n_train,▁
r2,▁
random_state,▁

0,1
mae,589.6656
mape,0.17759
max_depth,10.0
medape,0.11808
mse,719719.87499
n_estimators,100.0
n_test,15161.0
n_train,60642.0
r2,0.75334
random_state,42.0


100 estimators, max depth 30
MAPE: 0.18 MedAPE: 0.1022 MSE:870278.5097891707
Model saved to ../models/idealista18_madrid_decisiontree_UNITPRICE_depth-30.pickle (1903.94 KB)


0,1
mae,▁
mape,▁
max_depth,▁
medape,▁
mse,▁
n_estimators,▁
n_test,▁
n_train,▁
r2,▁
random_state,▁

0,1
mae,600.72034
mape,0.17964
max_depth,30.0
medape,0.10219
mse,870278.50979
n_estimators,100.0
n_test,15161.0
n_train,60642.0
r2,0.70174
random_state,42.0


## Simple regression

In [20]:
import unidecode as unidecode
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error, median_absolute_error
from sklearn.linear_model import LinearRegression

TEAM_ENTITY = "mds9"  # Replace with your team entity
ALGORITHM = "Multiple Regression"
CITY = "Madrid"
DATASET = "idealista18"
algorithm_slug = unidecode.unidecode(ALGORITHM).lower().replace(" ", "")
PROJECT = f"{DATASET}_{CITY.lower()}_{algorithm_slug}_{TARGET_COLUMN}_{TEAM_ENTITY.lower()}"


# 1) Grab the W&B token
wandb_token = os.getenv("WANDB_API_KEY")
if wandb_token is None:
    raise RuntimeError("WANDB_API_KEY not found in environment. Did you create a .env with that variable?")

# 2) Login with the token
wandb.login(key=wandb_token)


    # 3) Train some models
with wandb.init(project=PROJECT, config={"algorithm": ALGORITHM,
                                                "target": TARGET_COLUMN,
                                                "city": CITY,
                                                "dataset": DATASET }) as run:

            #
            # Create a grid search for hyperparameters
            #
                random_state=42
                print(f"{n_estimators} estimators, max depth {max_depth}")
                model = LinearRegression()
                # Fill NA values in X_train and X_test with column means
                X_train_lr = X_train.fillna(X_train.mean())
                X_test_lr = X_test.fillna(X_train.mean())
                model.fit(X_train_lr, y_train)
                preds = model.predict(X_test_lr)
                mse = mean_squared_error(y_test, preds)
                r2 = r2_score(y_test, preds)
                mae = mean_absolute_error(y_test, preds)
                mape = mean_absolute_percentage_error(y_test, preds)
                medape = median_absolute_error(y_test, preds) / y_test.median()

                print(f"MAPE: {mape:.2f} MedAPE: {medape:.4f} MSE:{mse}")

                run.log({
                    "mse": mse,
                    "r2": r2,
                    "mape": mape,
                    "medape": medape,
                    "mae": mae,
                    "random_state": random_state,
                    'n_train': len(X_train),
                    'n_test': len(X_test),
                    "n_estimators": n_estimators,
                    "max_depth": max_depth, 
                })

                # Create a model filename fingerprint
                
                model_fp = f"plain"
                model_artifact_name = f"{DATASET}_{CITY.lower()}_{algorithm_slug}_{TARGET_COLUMN}_{model_fp}"

                model_filename = f"../models/{model_artifact_name}.pickle"
                joblib.dump(model, model_filename, compress=('gzip', 3))
                model_size = os.path.getsize(model_filename)
                print(f"Model saved to {model_filename} ({model_size / 1024:.2f} KB)")

                # Guardamos el modelo en el repositorio de modelos de W&B
                model_artifact_name = f'idealista18_madrid_{ALGORITHM.lower().replace(" ", "_")}'

                artifact = wandb.Artifact(name=model_artifact_name, 
                                        type="model", 
                                        description=f"{ALGORITHM} model for Madrid housing prices")
                artifact.metadata["tags"] = [model_fp, CITY, DATASET]
                artifact.metadata["aliases"] = [model_fp]
                artifact.add_file(local_path=model_filename)
                artifact.save()
                run.log_artifact(artifact, aliases=["latest", "production", model_fp])

                df = pd.DataFrame({
                    "actual_price": y_test,
                    "predicted_price": preds,
                    "error": abs(y_test - preds),            # e.g. point‐size or color
                })
                df = wandb.Table(dataframe=df)

                # 1) Simple scatter with axis labels and a color dimension (error)
                run.log({
                    "price_scatter": wandb.plot.scatter(
                        df,
                        x="actual_price",
                        y="predicted_price",
                        title="Predicted vs Actual Price",
                        #xname="Actual (EUR)",
                        #yname="Predicted (EUR)"
                    )
                })

                # 2) If you want to color‐code each point by “error”:
                run.log({
                    "price_scatter_colored": wandb.plot.scatter(
                        df,
                        x="actual_price",
                        y="predicted_price",
                        title="Predicted vs Actual Price (colored by error)",
                        #xname="Actual (EUR)",
                        #yname="Predicted (EUR)",
                        #color="error",    # column in df
                        #colorscale="Viridis"  # any Plotly colorscale name
                    )
                })

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/davidreyblanco/.netrc


100 estimators, max depth 30
MAPE: 0.26 MedAPE: 0.1846 MSE:1148602.965288902
Model saved to ../models/idealista18_madrid_multipleregression_UNITPRICE_plain.pickle (1.44 KB)


0,1
mae,▁
mape,▁
max_depth,▁
medape,▁
mse,▁
n_estimators,▁
n_test,▁
n_train,▁
r2,▁
random_state,▁

0,1
mae,798.64719
mape,0.26446
max_depth,30.0
medape,0.1846
mse,1148602.96529
n_estimators,100.0
n_test,15161.0
n_train,60642.0
r2,0.60635
random_state,42.0


# Uso de un "artefacto" de tipo modelo

Ahora descargamos el artefacto desde el proyecto, tomamos la última versión del artefacto

In [21]:
tag = "latest"  # or any specific tag you want to use

TEAM_ENTITY = "mds9"  # Replace with your team entity
ALGORITHM = "Random Forest"
DATASET = "idealista18"
algorithm_slug = unidecode.unidecode(ALGORITHM).lower().replace(" ", "")
PROJECT = f"{DATASET}_{CITY.lower()}_{algorithm_slug}_{TARGET_COLUMN}_{TEAM_ENTITY.lower()}"

#
#   Buscamos una configuración de modelo específica
#
target_configuration = "estimators-100_depth-5"

api = wandb.Api()

collections = api.artifact_collections(project_name = PROJECT, type_name = 'model')


target_model = None
# Get the list of collections
for collection in collections:
    print(f"📁 Collection: {collection.name} - Project: {PROJECT}")
    
    # Now iterate over artifacts in the collection
    for artifact in collection.artifacts():
        print(f"  🔹 Artifact: {artifact.name} - Version: {artifact.version}")
        print(f"  🔹 Tags: {artifact.metadata['tags']}")
        if target_configuration in artifact.metadata['tags']:
            target_model = artifact
            print(f"  🔸 Found Target configuration found: {target_configuration}")
            break


📁 Collection: idealista18_madrid_random_forest - Project: idealista18_madrid_randomforest_UNITPRICE_mds9
  🔹 Artifact: idealista18_madrid_random_forest:v12 - Version: v12
  🔹 Tags: ['estimators-50_depth-5', 'Madrid', 'idealista18']
  🔹 Artifact: idealista18_madrid_random_forest:v11 - Version: v11
  🔹 Tags: ['estimators-20_depth-10', 'Madrid', 'idealista18']
  🔹 Artifact: idealista18_madrid_random_forest:v10 - Version: v10
  🔹 Tags: ['estimators-20_depth-5', 'Madrid', 'idealista18']
  🔹 Artifact: idealista18_madrid_random_forest:v9 - Version: v9
  🔹 Tags: ['estimators-10_depth-10', 'Madrid', 'idealista18']
  🔹 Artifact: idealista18_madrid_random_forest:v8 - Version: v8
  🔹 Tags: ['estimators-10_depth-5', 'Madrid', 'idealista18']
  🔹 Artifact: idealista18_madrid_random_forest:v7 - Version: v7
  🔹 Tags: ['estimators-100_depth-10', 'Madrid', 'idealista18']
  🔹 Artifact: idealista18_madrid_random_forest:v6 - Version: v6
  🔹 Tags: ['estimators-100_depth-5', 'Madrid', 'idealista18']
  🔸 Found

In [22]:
if target_model is not None:
    artifact_dir = artifact.download()
    files = os.listdir(artifact_dir)
    model_filename = files[0]  
    print(f"Model file found: {model_filename}")

    loaded_model = joblib.load(os.path.join(artifact_dir, model_filename))

    # Run inference
    loaded_preds = loaded_model.predict(X_test)
    print("Inference complete. Example predictions:", loaded_preds[:5])
else:
    print(f"❌ Target configuration '{target_configuration}' not found in any artifacts.")


[34m[1mwandb[0m:   1 of 1 files downloaded.  


Model file found: idealista18_madrid_randomforest_UNITPRICE_estimators-100_depth-5.pickle
Inference complete. Example predictions: [7018.54468806 1864.30063768 1567.57233307 2380.41820296 2809.19881021]


Ya tenemos el mismo método en la librería

In [23]:
loaded_model = util.get_wandb_model(PROJECT, target_configuration, debug=False)

loaded_preds = loaded_model.predict(X_test)
print("Inference complete. Example predictions:", loaded_preds[:5])

[34m[1mwandb[0m:   1 of 1 files downloaded.  


Inference complete. Example predictions: [7018.54468806 1864.30063768 1567.57233307 2380.41820296 2809.19881021]


## Descarga de datos

Descargamos los datos desde la utilidad

In [24]:
X_train, X_test, y_train, y_test = util.get_dataset(f"{DATASET}_splits:latest", DATASET_PROJECT, debug=False)

[34m[1mwandb[0m:   4 of 4 files downloaded.  
