In [28]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [29]:
DATA_DIR = Path("../../data/model3_salaries/3.processed")
CSV_PATH = DATA_DIR / "salaries_unified.csv"

df = pd.read_csv(CSV_PATH)

In [30]:
# --------------------------------------------------------------------
# Definimos qué columnas usaremos como features
# --------------------------------------------------------------------
FEATURES_CATEG = [
    "role_label",
    "seniority",
    "country",
    "company_size",
    "employment_type",
    "source_dataset",   # para que el modelo aprenda diferencias entre fuentes
]

FEATURES_NUM = [
    "work_year",
    "remote_ratio",
]

TARGET = "salary_in_usd"

# Aseguramos tipos numéricos en las columnas numéricas
df["work_year"] = pd.to_numeric(df["work_year"], errors="coerce")
df["remote_ratio"] = pd.to_numeric(df["remote_ratio"], errors="coerce")

# Extraemos X e y
X = df[FEATURES_CATEG + FEATURES_NUM].copy()
y = df[TARGET].values

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (4309, 8)
y shape: (4309,)


In [31]:
# --------------------------------------------------------------------
# Train / Test split
# --------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

len(X_train), len(X_test)


(3447, 862)

In [32]:
# --------------------------------------------------------------------
# Preprocesador:
# - Categóricas: imputar valor más frecuente + OneHotEncoder
# - Numéricas: imputar mediana
# --------------------------------------------------------------------
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, FEATURES_CATEG),
        ("num", numeric_transformer, FEATURES_NUM),
    ]
)


In [33]:
# --------------------------------------------------------------------
# Modelo base: HistGradientBoostingRegressor
# (hiperparámetros razonables como punto de partida)
# --------------------------------------------------------------------
base_regressor = HistGradientBoostingRegressor(
    loss="squared_error",
    max_depth=6,
    learning_rate=0.05,
    max_iter=600,
    random_state=42,
)

# --------------------------------------------------------------------
# Wrap en TransformedTargetRegressor para trabajar en log-salario
# --------------------------------------------------------------------
tt_regressor = TransformedTargetRegressor(
    regressor=base_regressor,
    func=np.log1p,    # y -> log(1 + y)
    inverse_func=np.expm1,  # z -> exp(z) - 1
)

# --------------------------------------------------------------------
# Pipeline completo: preprocesado + modelo
# --------------------------------------------------------------------
salary_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", tt_regressor),
    ]
)


In [34]:
%%time
salary_model.fit(X_train, y_train)


CPU times: total: 22.6 s
Wall time: 3.99 s


In [35]:
# --------------------------------------------------------------------
# Evaluación: calculamos MAE, RMSE y R² en TRAIN y TEST
# --------------------------------------------------------------------
def regression_report(y_true, y_pred, label="TEST"):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"=== {label} ===")
    print(f"MAE : {mae:,.0f} USD")
    print(f"RMSE: {rmse:,.0f} USD")
    print(f"R²  : {r2:,.3f}")
    print()

# Predicciones
y_pred_train = salary_model.predict(X_train)
y_pred_test = salary_model.predict(X_test)

regression_report(y_train, y_pred_train, label="TRAIN")
regression_report(y_test, y_pred_test, label="TEST")


=== TRAIN ===
MAE : 35,444 USD
RMSE: 47,449 USD
R²  : 0.433

=== TEST ===
MAE : 36,970 USD
RMSE: 47,990 USD
R²  : 0.367



In [37]:
from pathlib import Path
import joblib

# 1) Mira tu directorio de trabajo real
print("CWD (current working directory):", Path.cwd())

# 2) Usa una ruta robusta basada en la CWD
MODEL_DIR = Path.cwd().parents[2] / "models" / "salaries"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

MODEL_PATH = MODEL_DIR / "salary_from_profile_hgb_v1.pkl"

print("Voy a guardar en:", MODEL_PATH)

# 3) Guarda el modelo y mira qué devuelve joblib
res = joblib.dump(salary_model, MODEL_PATH)
print("Joblib devuelve:", res)

# 4) Comprueba si el fichero existe
print("¿Existe el fichero?", MODEL_PATH.exists())


CWD (current working directory): c:\Users\Fiona A\Desktop\IAPython\proyectos\TechCareer\TechCareer\src\model3_salaries_training
Voy a guardar en: c:\Users\Fiona A\Desktop\IAPython\proyectos\TechCareer\models\salaries\salary_from_profile_hgb_v1.pkl
Joblib devuelve: ['c:\\Users\\Fiona A\\Desktop\\IAPython\\proyectos\\TechCareer\\models\\salaries\\salary_from_profile_hgb_v1.pkl']
¿Existe el fichero? True


In [38]:
from pathlib import Path
import joblib

# Raíz del repo (la TechCareer interna)
PROJECT_ROOT = Path.cwd().parents[1]
print("PROJECT_ROOT:", PROJECT_ROOT)

MODEL_DIR = PROJECT_ROOT / "models" / "salaries"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

MODEL_PATH = MODEL_DIR / "salary_from_profile_hgb_v1.pkl"

joblib.dump(salary_model, MODEL_PATH)

print("✅ Modelo guardado en:", MODEL_PATH)
print("¿Existe el fichero?", MODEL_PATH.exists())


PROJECT_ROOT: c:\Users\Fiona A\Desktop\IAPython\proyectos\TechCareer\TechCareer
✅ Modelo guardado en: c:\Users\Fiona A\Desktop\IAPython\proyectos\TechCareer\TechCareer\models\salaries\salary_from_profile_hgb_v1.pkl
¿Existe el fichero? True
