# Criando Estrutura de Pastas

In [1]:
# Bibliotecas

import json, re, unicodedata
import pandas as pd
from pathlib import Path
import numpy as np

In [2]:
# Criando estrutura de pastas do projeto:

# Caminho base do projeto
base = Path("C:/Users/dphat/OneDrive/Documentos/Cursos/FIAP/PosTech_DataAnalytics/fase5/Datathon_Decision")

# Estrutura de Pastas
for p in [
    "app", "src", "models", "data", "tests", "notebooks"
]:
    (base / p).mkdir(parents=True, exist_ok=True)

print(f"✅ Estrutura criada dentro de: {base}")

✅ Estrutura criada dentro de: C:\Users\dphat\OneDrive\Documentos\Cursos\FIAP\PosTech_DataAnalytics\fase5\Datathon_Decision


# Gerando os Módulos do Pipeline

In [3]:


# Caminho base do projeto (atenção ao espaço no nome da pasta!)
base = Path(r"C:\Users\dphat\OneDrive\Documentos\Cursos\FIAP\PosTech_DataAnalytics\fase5\Datathon Decision")

# Cria a pasta src dentro da base
src_dir = base / "src"
src_dir.mkdir(parents=True, exist_ok=True)

# Conteúdo do arquivo
data_io_content = r'''
from pathlib import Path
import pandas as pd

def load_consolidated(path: str = "data/processed/decision_consolidated.parquet") -> pd.DataFrame:
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"Base consolidada não encontrada em {p.resolve()}.")
    return pd.read_parquet(p)

def save_consolidated(df: pd.DataFrame, path: str = "data/processed/decision_consolidated.parquet"):
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(path, index=False)
'''

# Criar o arquivo
file_path = src_dir / "data_io.py"
file_path.write_text(data_io_content, encoding="utf-8")

print(f"✅ Arquivo criado em: {file_path.resolve()}")


✅ Arquivo criado em: C:\Users\dphat\OneDrive\Documentos\Cursos\FIAP\PosTech_DataAnalytics\fase5\Datathon Decision\src\data_io.py


In [5]:
from pathlib import Path

base = Path(r"C:\Users\dphat\OneDrive\Documentos\Cursos\FIAP\PosTech_DataAnalytics\fase5\Datathon Decision")
src_dir = base / "src"
src_dir.mkdir(parents=True, exist_ok=True)

preprocessing_content = r'''
import pandas as pd

TEXT_CANDS = [
    "prospect__prospect_status", "prospect_status",
    "job__nivel_profissional",
    "app__area"
]
COMMENT_CANDS = ["prospect__prospect_comment", "prospect_comment"]

def _norm(s):
    if s is None or (isinstance(s, float) and pd.isna(s)): 
        return ""
    return " ".join(str(s).strip().lower().split())

def _pick(df: pd.DataFrame, cands):
    for c in cands:
        if c in df.columns:
            return c
    return None

def basic_preprocessing(df: pd.DataFrame) -> pd.DataFrame:
    X = df.copy()

    # normaliza textos se existirem
    for c in TEXT_CANDS:
        if c in X.columns:
            X[c] = X[c].apply(_norm)

    # comprimento do comentário (cria versões com e sem prefixo)
    cmt = _pick(X, COMMENT_CANDS)
    if cmt:
        if "prospect__prospect_comment_len" not in X.columns:
            X["prospect__prospect_comment_len"] = X[cmt].apply(lambda x: len(x) if isinstance(x, str) else 0)
        # alias sem prefixo (para retrocompatibilidade)
        X["prospect_comment_len"] = X["prospect__prospect_comment_len"]

    # status normalizado (cria versões com e sem prefixo)
    stcol = _pick(X, ["prospect__prospect_status", "prospect_status"])
    if stcol:
        X["prospect__prospect_status_norm"] = X[stcol].apply(_norm)
        X["prospect_status_norm"] = X["prospect__prospect_status_norm"]

    return X
'''
(src_dir / "preprocessing.py").write_text(preprocessing_content, encoding="utf-8")
print("✅ src/preprocessing.py atualizado.")


✅ src/preprocessing.py atualizado.


In [6]:
# Caminho base do projeto
base = Path(r"C:\Users\dphat\OneDrive\Documentos\Cursos\FIAP\PosTech_DataAnalytics\fase5\Datathon Decision")

# Cria a pasta src dentro da base
src_dir = base / "src"
src_dir.mkdir(parents=True, exist_ok=True)

# Conteúdo do arquivo feature_engineering.py
feature_engineering_content = r'''
import pandas as pd
import numpy as np

def _map_sen(v):
    v = str(v).lower()
    if "jun" in v or "jr" in v: return 1
    if "pleno" in v or "mid" in v or "middle" in v: return 2
    if "sen" in v: return 3
    return np.nan

def _has_english(v): 
    s = str(v).lower()
    return ("ingl" in s) or ("english" in s)

def _overlap(a: str, b: str) -> float:
    if not a or not b: return 0.0
    A, B = set(str(a).lower().split()), set(str(b).lower().split())
    if not A or not B: return 0.0
    return len(A & B)/len(A | B)

def make_features(df: pd.DataFrame) -> pd.DataFrame:
    X = df.copy()

    # overlap de skills (tenta achar colunas prováveis)
    job_comp = next((c for c in X.columns if "competencia" in c or "competencias" in c or "competências" in c), None)
    app_skill = next((c for c in X.columns if "conhecimento" in c or "skill" in c), None)
    X["feat_skill_overlap"] = X.apply(
        lambda r: _overlap(r.get(job_comp, ""), r.get(app_skill, "")), axis=1
    )

    # senioridade e gap
    if "job__nivel_profissional" in X.columns: X["sen_job"] = X["job__nivel_profissional"].apply(_map_sen)
    if "nivel_profissional" in X.columns:      X["sen_app"] = X["nivel_profissional"].apply(_map_sen)
    X["feat_senioridade"] = X.get("sen_job", np.nan)
    X["feat_senioridade_gap"] = X.get("sen_app", np.nan) - X.get("sen_job", np.nan)

    # idioma
    if "job__nivel_ingles" in X.columns: X["feat_req_ingles"] = X["job__nivel_ingles"].apply(_has_english)
    if "nivel_ingles" in X.columns:      X["feat_app_ingles"] = X["nivel_ingles"].apply(_has_english)
    if "feat_req_ingles" in X.columns and "feat_app_ingles" in X.columns:
        X["feat_ingles_match"] = (X["feat_req_ingles"] & X["feat_app_ingles"]).astype(int)
    return X
'''

# Criar o arquivo
file_path = src_dir / "feature_engineering.py"
file_path.write_text(feature_engineering_content, encoding="utf-8")

print(f"✅ Arquivo criado em: {file_path.resolve()}")


✅ Arquivo criado em: C:\Users\dphat\OneDrive\Documentos\Cursos\FIAP\PosTech_DataAnalytics\fase5\Datathon Decision\src\feature_engineering.py


In [10]:
from pathlib import Path

base = Path(r"C:\Users\dphat\OneDrive\Documentos\Cursos\FIAP\PosTech_DataAnalytics\fase5\Datathon Decision")
src_dir = base / "src"

labeling_content = r'''
import pandas as pd
import numpy as np
import re

def _pick(df: pd.DataFrame, cols):
    for c in cols:
        if c in df.columns:
            return c
    return None

def label_match(df: pd.DataFrame) -> pd.Series:
    col = _pick(df, ["prospect__prospect_status_norm", "prospect_status_norm"])
    if col is None:
        return pd.Series([np.nan] * len(df))
    patt = re.compile(r"(contrat|admit|oferta aceita|aprovad)", re.IGNORECASE)
    return df[col].fillna("").apply(lambda s: int(bool(patt.search(str(s)))))

def label_engagement(df: pd.DataFrame) -> pd.Series:
    col = _pick(df, ["prospect__prospect_comment_len", "prospect_comment_len"])
    if col is None:
        return pd.Series([np.nan] * len(df))
    thr = df[col].median()
    return (df[col] >= thr).astype(int)
'''
(src_dir / "labeling.py").write_text(labeling_content, encoding="utf-8")
print("✅ src/labeling.py atualizado.")


✅ src/labeling.py atualizado.


In [8]:
from pathlib import Path

# Caminho base do projeto
base = Path(r"C:\Users\dphat\OneDrive\Documentos\Cursos\FIAP\PosTech_DataAnalytics\fase5\Datathon Decision")

# Cria a pasta src dentro da base
src_dir = base / "src"
src_dir.mkdir(parents=True, exist_ok=True)

# Conteúdo do arquivo model_utils.py
model_utils_content = r'''
import json
import joblib
import pandas as pd
from pathlib import Path

def load_model(model_path="models/recommender.pkl", meta_path="models/recommender_meta.json"):
    pipe = joblib.load(model_path)
    best_thr = 0.5
    p = Path(meta_path)
    if p.exists():
        best_thr = json.load(open(p))["best_threshold"]
    return pipe, best_thr

def predict_with_threshold(pipe, X: pd.DataFrame, thr: float):
    proba = pipe.predict_proba(X)[:,1]
    pred = (proba >= thr).astype(int)
    return proba, pred
'''

# Criar o arquivo
file_path = src_dir / "model_utils.py"
file_path.write_text(model_utils_content, encoding="utf-8")

print(f"✅ Arquivo criado em: {file_path.resolve()}")


✅ Arquivo criado em: C:\Users\dphat\OneDrive\Documentos\Cursos\FIAP\PosTech_DataAnalytics\fase5\Datathon Decision\src\model_utils.py


In [11]:
from pathlib import Path

# Caminho base do projeto
base = Path(r"C:\Users\dphat\OneDrive\Documentos\Cursos\FIAP\PosTech_DataAnalytics\fase5\Datathon Decision")

# Cria a pasta src dentro da base
src_dir = base / "src"
src_dir.mkdir(parents=True, exist_ok=True)

# Conteúdo do arquivo train.py
train_content = r'''
import json
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve
import joblib

from src.data_io import load_consolidated
from src.preprocessing import basic_preprocessing
from src.feature_engineering import make_features
from src.labeling import label_match, label_engagement

def make_ohe_dense():
    return OneHotEncoder(handle_unknown="ignore", sparse_output=False)

def train_and_save(
    in_path="data/processed/decision_consolidated.parquet",
    model_path="models/recommender.pkl",
    meta_path="models/recommender_meta.json"
):
    df = load_consolidated(in_path)
    df = basic_preprocessing(df)
    df = make_features(df)

    df["target_match"] = label_match(df)
    df["target_engagement"] = label_engagement(df)

    num_cols = [c for c in ["prospect_comment_len","feat_skill_overlap","feat_senioridade","feat_senioridade_gap","feat_ingles_match"] if c in df.columns]
    cat_cols = [c for c in ["job__nivel_profissional","app__area"] if c in df.columns]

    data = df.dropna(subset=["target_match"]).copy()
    if "pair_id" in data.columns:
        data = data.drop_duplicates(subset=["pair_id"])
    y = data["target_match"].astype(int)
    X = data[num_cols + cat_cols].copy()
                                    
    ohe = make_ohe_dense()                                

    pre = ColumnTransformer(
        transformers=[
            ("num", SimpleImputer(strategy="median"), num_cols),
            ("cat", Pipeline([
                ("imp", SimpleImputer(strategy="most_frequent")),
                ("ohe", ohe)
            ]), cat_cols),
        ],
        remainder="drop"
    )

    pipe = Pipeline([
        ("pre", pre),
        ("clf", LogisticRegression(max_iter=1000, class_weight="balanced", solver="lbfgs"))
    ])

    if y.nunique() >= 2 and y.value_counts().min() >= 2:
        Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    else:
        Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42)

    pipe.fit(Xtr, ytr)

    proba = pipe.predict_proba(Xte)[:,1]
    pred05 = pipe.predict(Xte)

    precisions, recalls, thresholds = precision_recall_curve(yte, proba)
    f1_scores = 2*precisions*recalls/(precisions+recalls+1e-9)
    best_idx = int(np.argmax(f1_scores))
    best_thr = float(thresholds[best_idx]) if best_idx < len(thresholds) else 0.5
    pred_best = (proba >= best_thr).astype(int)

    metrics = {
        "AUC": float(np.round(roc_auc_score(yte, proba), 4)),
        "F1@0.5": float(np.round(f1_score(yte, pred05), 4)),
        "BestThreshold": best_thr,
        "F1@Best": float(np.round(f1_score(yte, pred_best), 4)),
        "yte_pos_ratio": float(np.round(yte.mean(), 4)),
        "num_cols": num_cols,
        "cat_cols": cat_cols
    }

    Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(pipe, model_path)
    with open(meta_path, "w") as f:
        json.dump({"best_threshold": best_thr, "metrics": metrics}, f, indent=2)

    return metrics
'''

# Criar o arquivo
file_path = src_dir / "train.py"
file_path.write_text(train_content, encoding="utf-8")

print(f"✅ Arquivo criado em: {file_path.resolve()}")


✅ Arquivo criado em: C:\Users\dphat\OneDrive\Documentos\Cursos\FIAP\PosTech_DataAnalytics\fase5\Datathon Decision\src\train.py
