# Datathon Fase 5 — Notebook
# Diego Peres Hatanaka

## 1. Setup & Imports

In [1]:

# Install exact, pinned versions of the notebook's dependencies into the kernel's
# environment (quiet mode). Restart the kernel afterwards if packages were upgraded.
%pip -q install scikit-learn==1.5.0 pandas==2.2.2 joblib==1.4.2 scipy==1.13.1


Note: you may need to restart the kernel to use updated packages.


## 2. Normalização dos Dados

In [4]:

from pathlib import Path
import json, pandas as pd
import os  # imported here so this cell stays runnable on its own

# Project root directory. Defaults to the original author's local checkout; set the
# DATATHON_BASE_DIR environment variable to run the notebook on another machine
# without editing the code.
base = Path(
    os.environ.get(
        "DATATHON_BASE_DIR",
        "C:/Users/dphat/OneDrive/Documentos/Cursos/FIAP/PosTech_DataAnalytics/fase5/Datathon Decision",
    )
)
# Directory holding the raw JSON exports read below.
raw = base / "data" / "raw"
# Flatten helpers (simplified)
def normalize_text(s):
    """Return ``s`` unchanged when it is truthy, otherwise an empty string.

    Any falsy input (``None``, ``""``, ``0``, ...) is mapped to ``""``.
    """
    if s:
        return s
    return ""
def load_dict(p):
    """Parse the UTF-8 encoded JSON file at ``p`` and return the decoded object.

    Parameters
    ----------
    p : str or path-like
        Location of the JSON file.

    Returns
    -------
    The Python object produced by ``json.load`` for the file's content.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    # The context manager closes the handle deterministically, even when parsing fails.
    with open(p, "r", encoding="utf-8") as fh:
        return json.load(fh)
# Read both raw JSON exports from the raw-data directory (applicants first, then jobs).
applicants, vagas = (
    load_dict(raw / filename) for filename in ("applicants.json", "vagas.json")
)

# Show how many top-level entries each export contains.
print("Applicants:", len(applicants), "Vagas:", len(vagas))


Applicants: 42482 Vagas: 14081


## 3. Treino do Pipeline de Similaridade

In [5]:

from src.train import train
# Run the project's training entry point, passing the processed-data directory and
# the models directory, and keep its return value so the cell can display it.
# NOTE(review): presumably this fits the vectorizer and writes artifacts under
# models/ — confirm in src/train.py.
stats = train(base / "data" / "processed", base / "models")
stats


{'vocab_size': 20000}

## 4. Exemplo de Ranking para uma Vaga

In [10]:

import numpy as np
import pandas as pd
from joblib import load
from sklearn.metrics.pairwise import cosine_similarity
from src.utils import normalize_text

# 1) Load the processed tables.
# NOTE: this rebinds `vagas` (bound earlier to the raw JSON object) to a DataFrame.
apps = pd.read_csv(base / "data" / "processed" / "applicants.csv")
vagas = pd.read_csv(base / "data" / "processed" / "vagas.csv")

# 2) Load the project's current vectorizer.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code —
# only load artifacts produced by a trusted run.
vec_path = base / "models" / "tfidf_vectorizer.joblib"
vec = load(vec_path)

# (optional) sanity check: report the vocabulary size. Best-effort on purpose —
# a failure here only prints a warning and does not stop the notebook.
try:
    vocab_size = len(vec.get_feature_names_out())
    print("Vocab size:", vocab_size)
except Exception as e:
    print("Aviso: não consegui ler o vocabulário.", e)

# 3) Select one job posting (the first row) to rank applicants against.
vrow = vagas.iloc[0]

def _row_field_text(row, key):
    """Return ``row[key]`` as text, mapping a missing value (absent, None, NaN, NA) to ""."""
    value = row.get(key, "")
    # Without this guard a missing field would be formatted as the literal token
    # "nan"/"None" and leak into the job's query text.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value)


def compute_scores(vec, apps_df, vaga_row, batch=10000):
    """Score every applicant against one job posting by cosine similarity.

    The job text is built from the ``requisitos_texto`` and ``stack_desejada``
    fields of ``vaga_row``; each applicant text is built from the ``cv_text_pt``
    and ``stack`` columns of ``apps_df``. Both sides are passed through
    ``normalize_text`` and ``vec.transform`` before being compared.

    Parameters
    ----------
    vec : object
        Fitted vectorizer exposing ``transform(list_of_str)``.
    apps_df : pandas.DataFrame
        Applicants table; must contain the ``cv_text_pt`` and ``stack`` columns.
    vaga_row : mapping-like
        Job posting supporting ``.get(key, default)`` (e.g. a DataFrame row).
    batch : int, default 10000
        Number of applicants vectorized per iteration, to bound peak memory.

    Returns
    -------
    numpy.ndarray
        1-D array with one score per row of ``apps_df``, in row order. Empty
        when ``apps_df`` has no rows.

    Raises
    ------
    ValueError
        If ``batch`` is not a positive number.
    KeyError
        If ``apps_df`` lacks one of the required columns.
    """
    if batch < 1:
        raise ValueError(f"batch must be a positive integer, got {batch!r}")

    # --- applicant texts: missing values become "" so they add no tokens ---
    cv_vals = apps_df["cv_text_pt"].fillna("").astype(str).values
    stack_vals = apps_df["stack"].fillna("").astype(str).values
    n = len(apps_df)
    if n == 0:
        # np.concatenate rejects an empty list; an empty score vector is the natural answer.
        return np.zeros(0, dtype=float)

    # --- job text (a single string), with the same missing-value policy ---
    requisitos = _row_field_text(vaga_row, "requisitos_texto")
    stack_desejada = _row_field_text(vaga_row, "stack_desejada")
    job_text = [normalize_text(f"{requisitos} {stack_desejada}")]
    B = vec.transform(job_text)  # one row: the job vector

    scores_parts = []
    for start in range(0, n, batch):
        end = min(n, start + batch)
        # one normalized string per applicant in this batch
        app_text_batch = [
            normalize_text(cv + " " + stack)
            for cv, stack in zip(cv_vals[start:end], stack_vals[start:end])
        ]
        A = vec.transform(app_text_batch)  # one row per applicant in the batch
        scores_parts.append(cosine_similarity(A, B)[:, 0])

    scores = np.concatenate(scores_parts, axis=0)
    assert scores.shape[0] == n, f"scores len={scores.shape[0]} != n_apps={n}"
    return scores

# Score every applicant against the selected job posting.
scores = compute_scores(vec, apps, vrow, batch=10000)

print("scores shape:", scores.shape, "| n_apps:", len(apps))

# Keep the 20 applicants with the highest match score, best first.
out = (
    apps
    .assign(match_score=scores)
    .sort_values("match_score", ascending=False)
    .head(20)
)
out


Vocab size: 20000
scores shape: (42482,) | n_apps: 42482


Unnamed: 0,applicant_id,nome,senioridade,area_atuacao,pretensao_salarial,localidade,stack,idiomas,cv_text_pt,cv_text_en,cargo_atual,match_score
21462,27124,Eduardo Caldeira,,,,,,,service delivery management for infraestructur...,,,0.451839
41381,37788,Sra. Letícia Ramos,,,,,,,\n\n\ncontato\n\nprincipais competências re...,,,0.443031
30308,29441,Leonardo Carvalho,,,,,,,incident manager\n\nsummary of skills and prof...,,,0.441924
24100,20009,Bento Pinto,,,,,,,"sr it manager / mba\naddress são paulo, sp, 04...",,,0.42706
41003,40186,Bárbara Gomes,,,,,,,\n\n\n\n● professional with 18 years of experi...,,,0.426514
37426,36176,Srta. Liz da Conceição,,,,,,,42 years old – uberlandia – brazil\n\ncover le...,,,0.416064
37348,36098,Danilo Casa Grande,,,,,,,"isaque melo (71) 98110 - 6579 brazilian, divor...",,,0.415494
26157,39112,Dr. Ravi Jesus,,,,,,,"single, 34 y/o, available to travel;\n\n\ninte...",,,0.413141
39709,41394,João Vitor Pereira,,,,,,,"+55 11 99263 1457\n\nbpo director, digital ope...",,,0.409767
16905,46417,Srta. Allana Marques,,,,,,,citizenship: brazilian and italian\nmobile/ wh...,,,0.409241
