
# 🧪 Treino Comparativo (4–5 Modelos) — **Match Candidato × Vaga***

Notebook ajustado para:
- Ler **`data/processed/decision_consolidated.parquet`**;
- Harmonizar colunas com/sem prefixo (ex.: `prospect_status` ⇄ `prospect__prospect_status`);
- Criar **labels** de *match* de forma resiliente;
- Fazer **split robusto** (estratificado e por grupos quando possível, com *fallbacks*);
- Treinar **4–5 modelos** e salvar o **melhor** (`models/recommender.pkl` + `models/recommender_meta.json`).

> Use este notebook quando o merge/prefixos alterarem nomes de colunas e o split anterior quebrar.


In [10]:

from pathlib import Path
import sys

# Ajuste o caminho base se necessário (Windows)
BASE = Path(r"C:\Users\dphat\OneDrive\Documentos\Cursos\FIAP\PosTech_DataAnalytics\fase5\Datathon Decision")
DATA_PROCESSED = BASE / "data" / "processed"
MODELS_DIR = BASE / "models"
PARQUET_PATH = DATA_PROCESSED / "decision_consolidated.parquet"

# Permitir imports do pacote local src/
if str(BASE) not in sys.path:
    sys.path.insert(0, str(BASE))

# Garante pastas (não apaga nada)
for p in [DATA_PROCESSED, MODELS_DIR]:
    p.mkdir(parents=True, exist_ok=True)

print("BASE          :", BASE)
print("PARQUET_PATH  :", PARQUET_PATH)
print("MODELS_DIR    :", MODELS_DIR)


BASE          : C:\Users\dphat\OneDrive\Documentos\Cursos\FIAP\PosTech_DataAnalytics\fase5\Datathon Decision
PARQUET_PATH  : C:\Users\dphat\OneDrive\Documentos\Cursos\FIAP\PosTech_DataAnalytics\fase5\Datathon Decision\data\processed\decision_consolidated.parquet
MODELS_DIR    : C:\Users\dphat\OneDrive\Documentos\Cursos\FIAP\PosTech_DataAnalytics\fase5\Datathon Decision\models


In [11]:

import pandas as pd
import numpy as np

if not PARQUET_PATH.exists():
    raise FileNotFoundError(f"Parquet não encontrado em {PARQUET_PATH}")

df_raw = pd.read_parquet(PARQUET_PATH)
print("Shape bruto:", df_raw.shape)
df_raw.head(3)


Shape bruto: (53759, 114)


Unnamed: 0,pair_id,job_id,applicant_id,prospect__prospect_name,prospect__prospect_status,prospect__prospect_status_norm,prospect__candidatura_dt,prospect__atualizacao_dt,prospect__prospect_comment,prospect__prospect_comment_len,...,app__id_ibrati,app__email_corporativo,app__cargo_atual,app__projeto_atual,app__cliente,app__unidade,app__data_admissao,app__data_ultima_promocao,app__nome_superior_imediato,app__email_superior_imediato
0,4530::25632,4530,25632,José Vieira,Encaminhado ao Requisitante,encaminhado ao requisitante,2021-03-25,2021-03-25,"Encaminhado para - PJ R$ 72,00/hora",36,...,,,,,,,NaT,NaT,,
1,4530::25529,4530,25529,Srta. Isabela Cavalcante,Encaminhado ao Requisitante,encaminhado ao requisitante,2021-03-22,2021-03-23,"encaminhado para - R$ 6.000,00 – CLT Full , n...",67,...,,,,,,,NaT,NaT,,
2,4531::25364,4531,25364,Sra. Yasmin Fernandes,Contratado pela Decision,contratado pela decision,2021-03-17,2021-04-12,Data de Inicio: 12/04/2021,26,...,,,,,,,NaT,NaT,,


In [15]:
# Tenta importar módulos do projeto
try:
    from src.preprocessing import basic_preprocessing as _basic_preprocessing
except Exception:
    _basic_preprocessing = None

try:
    from src.feature_engineering import make_features as _make_features
except Exception:
    _make_features = None

try:
    from src.labeling import label_match as _label_match, label_engagement as _label_engagement
except Exception:
    _label_match, _label_engagement = None, None

import re
import pandas as pd
import numpy as np  # <- necessário em ensure_labels

def _norm(s):
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return ""
    return " ".join(str(s).strip().lower().split())

def _pick(df: pd.DataFrame, cands):
    for c in cands:
        if c in df.columns:
            return c
    return None

def ensure_preprocessing(df: pd.DataFrame) -> pd.DataFrame:
    X = df.copy()
    # 1) aplica basic_preprocessing se existir
    if _basic_preprocessing is not None:
        try:
            X = _basic_preprocessing(X)
        except Exception as e:
            print("basic_preprocessing falhou, seguindo com harmonização local:", e)

    # 2) harmonia de status/comentário (cria com/sem prefixo)
    stcol = _pick(X, ["prospect__prospect_status", "prospect_status"])
    if stcol:
        X["prospect__prospect_status_norm"] = X[stcol].apply(_norm)
        X["prospect_status_norm"] = X["prospect__prospect_status_norm"]

    cmt = _pick(X, ["prospect__prospect_comment", "prospect_comment"])
    if cmt:
        X["prospect__prospect_comment_len"] = X[cmt].apply(lambda v: len(v) if isinstance(v, str) else 0)
        X["prospect_comment_len"] = X["prospect__prospect_comment_len"]

    return X

def ensure_features(df: pd.DataFrame) -> pd.DataFrame:
    X = df.copy()
    if _make_features is not None:
        try:
            X = _make_features(X)
            return X
        except Exception as e:
            print("make_features falhou, seguindo sem features adicionais:", e)
    return X  # fallback: retorna df sem alterações se falhar

def ensure_labels(df: pd.DataFrame) -> pd.DataFrame:
    X = df.copy()

    # ----- target_match -----
    if _label_match is not None:
        try:
            X["target_match"] = _label_match(X)
        except Exception as e:
            print("label_match (src) falhou, aplicando local:", e)
            patt = re.compile(r"(contrat|admit|oferta aceita|aprovad)", re.IGNORECASE)
            col = _pick(X, ["prospect__prospect_status_norm", "prospect_status_norm"])
            if col:
                X["target_match"] = X[col].fillna("").apply(lambda s: int(bool(patt.search(str(s)))))
            else:
                X["target_match"] = pd.Series([np.nan] * len(X))
    else:
        patt = re.compile(r"(contrat|admit|oferta aceita|aprovad)", re.IGNORECASE)
        col = _pick(X, ["prospect__prospect_status_norm", "prospect_status_norm"])
        if col:
            X["target_match"] = X[col].fillna("").apply(lambda s: int(bool(patt.search(str(s)))))
        else:
            X["target_match"] = pd.Series([np.nan] * len(X))

    # ----- target_engagement -----
    if _label_engagement is not None:
        try:
            X["target_engagement"] = _label_engagement(X)
        except Exception as e:
            print("label_engagement (src) falhou, aplicando local:", e)
            col = _pick(X, ["prospect__prospect_comment_len", "prospect_comment_len"])
            if col:
                thr = X[col].median()
                X["target_engagement"] = (X[col] >= thr).astype(int)
            else:
                X["target_engagement"] = pd.Series([np.nan] * len(X))
    else:
        col = _pick(X, ["prospect__prospect_comment_len", "prospect_comment_len"])
        if col:
            thr = X[col].median()
            X["target_engagement"] = (X[col] >= thr).astype(int)
        else:
            X["target_engagement"] = pd.Series([np.nan] * len(X))

    return X
_


Unnamed: 0,pair_id,job_id,applicant_id,prospect__prospect_name,prospect__prospect_status,prospect__prospect_status_norm,prospect__candidatura_dt,prospect__atualizacao_dt,prospect__prospect_comment,prospect__prospect_comment_len,...,app__id_ibrati,app__email_corporativo,app__cargo_atual,app__projeto_atual,app__cliente,app__unidade,app__data_admissao,app__data_ultima_promocao,app__nome_superior_imediato,app__email_superior_imediato
0,4530::25632,4530,25632,José Vieira,Encaminhado ao Requisitante,encaminhado ao requisitante,2021-03-25,2021-03-25,"Encaminhado para - PJ R$ 72,00/hora",36,...,,,,,,,NaT,NaT,,
1,4530::25529,4530,25529,Srta. Isabela Cavalcante,Encaminhado ao Requisitante,encaminhado ao requisitante,2021-03-22,2021-03-23,"encaminhado para - R$ 6.000,00 – CLT Full , n...",67,...,,,,,,,NaT,NaT,,
2,4531::25364,4531,25364,Sra. Yasmin Fernandes,Contratado pela Decision,contratado pela decision,2021-03-17,2021-04-12,Data de Inicio: 12/04/2021,26,...,,,,,,,NaT,NaT,,


In [16]:

# Features candidatas (presentes ou criadas)
NUM_COLS = [c for c in ["prospect_comment_len","prospect__prospect_comment_len",
                        "feat_skill_overlap","feat_senioridade","feat_senioridade_gap","feat_ingles_match"]
            if c in df.columns]

CAT_COLS = [c for c in ["job__nivel_profissional","app__area"] if c in df.columns]

data = df.dropna(subset=["target_match"]).copy()
if "pair_id" in data.columns:
    data = data.drop_duplicates(subset=["pair_id"])

y = data["target_match"].astype(int) if "target_match" in data.columns else pd.Series([], dtype=int)
X = data[NUM_COLS + CAT_COLS].copy() if len(data) else pd.DataFrame(columns=NUM_COLS + CAT_COLS)

# Diagnóstico
print("Amostras após dropna(target):", len(X))
print("Distribuição do y:", y.value_counts(dropna=False).to_dict())

# Escolha da coluna de grupo (vaga)
def pick_group_col(df_):
    for c in ["job__id","job__vaga","job__codigo","job__cod","job__nome","job__titulo","app__area"]:
        if c in df_.columns:
            return c
    return None

group_col = pick_group_col(data)
groups = data[group_col] if group_col is not None else None
print("Coluna de grupos:", group_col)
if groups is not None:
    print("N grupos distintos:", pd.Series(groups).nunique())


Amostras após dropna(target): 53759
Distribuição do y: {0: 44485, 1: 9274}
Coluna de grupos: job__nome
N grupos distintos: 79


In [21]:

from sklearn.model_selection import train_test_split

def stratified_group_split(X, y, groups=None, test_size=0.2, random_state=42):
    X = X.reset_index(drop=True)
    y = pd.Series(y).reset_index(drop=True)
    groups = (pd.Series(groups).reset_index(drop=True)) if groups is not None else None

    if len(X) == 0:
        raise ValueError("X ficou vazio após filtros/merge. Verifique se o target foi criado e se as features existem.")
    if y.nunique() < 2:
        print("⚠️ Target tem uma única classe. Fazendo split sem estratificação.")
        return train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=None)

    # tenta StratifiedGroupKFold com n_splits seguro
    try:
        from sklearn.model_selection import StratifiedGroupKFold
        n_splits = max(2, int(1 / test_size))
        if groups is not None:
            n_splits = min(n_splits, int(pd.Series(groups).nunique()))
        n_splits = min(n_splits, len(X))
        if n_splits < 2:
            raise ValueError("n_splits insuficiente para SGKF.")
        sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        idx_tr, idx_te = next(sgkf.split(X, y, groups))
        return X.iloc[idx_tr], X.iloc[idx_te], y.iloc[idx_tr], y.iloc[idx_te]
    except Exception as e:
        print("StratifiedGroupKFold indisponível:", e)

    # fallback: GroupShuffleSplit
    if groups is not None:
        try:
            from sklearn.model_selection import GroupShuffleSplit
            gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
            idx_tr, idx_te = next(gss.split(X, y, groups))
            return X.iloc[idx_tr], X.iloc[idx_te], y.iloc[idx_tr], y.iloc[idx_te]
        except Exception as e:
            print("GroupShuffleSplit falhou:", e)

    # último recurso: split estratificado simples
    return train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)

Xtr, Xte, ytr, yte = stratified_group_split(X, y, groups, test_size=0.2, random_state=42)
print("Shapes -> Xtr:", Xtr.shape, "| Xte:", Xte.shape)


StratifiedGroupKFold indisponível: '<' not supported between instances of 'NoneType' and 'NoneType'
GroupShuffleSplit falhou: '<' not supported between instances of 'NoneType' and 'NoneType'
Shapes -> Xtr: (43007, 6) | Xte: (10752, 6)


In [25]:

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier

def make_preprocessor(num_cols, cat_cols):
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    pre = ColumnTransformer(
        transformers=[
            ("num", SimpleImputer(strategy="median"), num_cols),
            ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                              ("ohe", ohe)]), cat_cols),
        ],
        remainder="drop"
    )
    return pre

pre = make_preprocessor(NUM_COLS, CAT_COLS)

candidates = {
    "logreg": Pipeline([("pre", pre), ("clf", LogisticRegression(max_iter=2000, class_weight="balanced"))]),
    "rf"    : Pipeline([("pre", pre), ("clf", RandomForestClassifier(n_estimators=400, class_weight="balanced"))]),
    "gb"    : Pipeline([("pre", pre), ("clf", GradientBoostingClassifier())]),
    "hgb"   : Pipeline([("pre", pre), ("clf", HistGradientBoostingClassifier())]),
}

# XGBoost opcional
try:
    from xgboost import XGBClassifier
    candidates["xgb"] = Pipeline([("pre", pre), ("clf", XGBClassifier(
        n_estimators=400, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8,
        objective="binary:logistic", eval_metric="logloss", tree_method="hist"
    ))])
    print("XGBoost habilitado.")
except Exception as e:
    print("XGBoost indisponível:", e)


XGBoost habilitado.


In [26]:

from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve

results = {}
for name, pipe in candidates.items():
    pipe.fit(Xtr, ytr)
    proba = pipe.predict_proba(Xte)[:,1]
    pred05 = (proba >= 0.5).astype(int)
    precisions, recalls, thresholds = precision_recall_curve(yte, proba)
    f1_scores = 2*precisions*recalls/(precisions+recalls+1e-9)
    best_idx = int(np.argmax(f1_scores))
    best_thr = float(thresholds[best_idx]) if best_idx < len(thresholds) else 0.5
    pred_best = (proba >= best_thr).astype(int)

    results[name] = {
        "model": pipe,
        "AUC": float(np.round(roc_auc_score(yte, proba), 4)),
        "F1@0.5": float(np.round(f1_score(yte, pred05), 4)),
        "BestThreshold": best_thr,
        "F1@Best": float(np.round(f1_score(yte, pred_best), 4))
    }

import pandas as pd
df_results = pd.DataFrame([{**{"model":k}, **{m:v for m,v in v.items() if m!='model'}} for k,v in results.items()])
df_results.sort_values(["F1@Best","AUC"], ascending=False).reset_index(drop=True)


[WinError 2] O sistema não pode encontrar o arquivo especificado
  File "C:\Users\dphat\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executab

Unnamed: 0,model,AUC,F1@0.5,BestThreshold,F1@Best
0,xgb,0.7758,0.2933,0.255327,0.5187
1,hgb,0.7762,0.2871,0.301506,0.5184
2,gb,0.7745,0.2542,0.309529,0.5153
3,logreg,0.7598,0.4889,0.431332,0.509
4,rf,0.7601,0.5045,0.467681,0.5057


In [27]:

# Seleciona melhor por (F1@Best, AUC)
best_name = None
best_score = (-1.0, -1.0)
for name, res in results.items():
    score = (res["F1@Best"], res["AUC"])
    if score > best_score:
        best_score = score
        best_name = name

best = results[best_name]
print("Melhor modelo:", best_name, "| F1@Best:", best["F1@Best"], "| AUC:", best["AUC"], "| BestThreshold:", best["BestThreshold"])

# Descobrir coluna de título da vaga (útil para o app)
def pick_job_title_col(df_):
    for c in ["job__titulo","job__nome","job__descricao","job__descricao_vaga"]:
        if c in df_.columns:
            return c
    return None

job_title_col = pick_job_title_col(df)

# Salvar artefatos
import joblib, json
MODELS_DIR.mkdir(parents=True, exist_ok=True)
joblib.dump(best["model"], MODELS_DIR / "recommender.pkl")
with open(MODELS_DIR / "recommender_meta.json", "w", encoding="utf-8") as f:
    json.dump({
        "best_threshold": best["BestThreshold"],
        "metrics": {k: v for k,v in best.items() if k!="model"},
        "num_cols": [c for c in NUM_COLS],
        "cat_cols": [c for c in CAT_COLS],
        "group_col": group_col,
        "job_title_col": job_title_col
    }, f, indent=2, ensure_ascii=False)

print("Artefatos salvos em:", MODELS_DIR)


Melhor modelo: xgb | F1@Best: 0.5187 | AUC: 0.7758 | BestThreshold: 0.25532734394073486
Artefatos salvos em: C:\Users\dphat\OneDrive\Documentos\Cursos\FIAP\PosTech_DataAnalytics\fase5\Datathon Decision\models


In [28]:

print("\n📊 Resultados (ordenados):")
display(df_results.sort_values(["F1@Best","AUC"], ascending=False).reset_index(drop=True))

print("\nArquivos em models/:")
for p in MODELS_DIR.glob("*"):
    print("-", p.name)



📊 Resultados (ordenados):


Unnamed: 0,model,AUC,F1@0.5,BestThreshold,F1@Best
0,xgb,0.7758,0.2933,0.255327,0.5187
1,hgb,0.7762,0.2871,0.301506,0.5184
2,gb,0.7745,0.2542,0.309529,0.5153
3,logreg,0.7598,0.4889,0.431332,0.509
4,rf,0.7601,0.5045,0.467681,0.5057



Arquivos em models/:
- recommender.pkl
- recommender_meta.json
- tfidf_vectorizer.joblib
- train.py
- utils.py
