In [19]:
import os

# When the kernel's working directory is named `notebooks`, move to `../src`.
# NOTE(review): presumably this is what lets the project-local imports below
# (config, log_config, feature_engineering) resolve — confirm.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('../src')

from config import load_config, get_abs_path
from log_config import logger
from feature_engineering import transform_new_data
import pandas as pd
import sys

logger.remove()  # remove the default logger that was imported from log_config
# Re-attach a stderr sink that only lets CRITICAL messages through.
logger.add(sys.stderr, level='CRITICAL')

# Load the project configuration and keep its `paths` section at hand.
config = load_config()

paths = config['paths']

# All data entering the application goes through the cleaning pipeline that builds the Data Lake layers,
# so we read straight from the silver layer to save time.
df_prospects = pd.read_parquet(get_abs_path(paths['prospects_silver']))
df_vagas = pd.read_parquet(get_abs_path(paths['vagas_silver']))

## Para começar, vamos carregar os dados que já temos (camada silver) e extrair uma pessoa aleatória para simular uma predição

In [20]:
# Draw a single prospect at random to simulate an incoming prediction request.
# No random_state is passed, so a different row may be drawn on every run.
um_prospect_aleatorio = df_prospects.sample(1)
um_prospect_aleatorio

Unnamed: 0,cod_vaga,titulo,nome,codigo,situacao_candidado,data_candidatura,ultima_atualizacao,comentario,recrutador
30142,13534,consultor tax expert jde cgemjp00268243,Dr. Diego Barbosa,48969,prospect,2024-10-15,15-10-2024,desconhecido,Laura Pacheco


## Desse prospect aleatório, nós vamos fazer as limpezas necessárias para chegar na tabela de predição

In [21]:
def clean_features_data(df: pd.DataFrame, expected_features=None) -> pd.DataFrame:
    """Reduce a transformed prospect/vacancy frame to its numeric feature columns.

    The function drops a fixed list of identifier, date and raw-text columns,
    then keeps every remaining column that is numeric and is not named
    ``'target'``. The input frame is not modified.

    Args:
        df: Frame to clean. Columns from the drop list that are absent are
            simply skipped, so a frame that carries only part of the raw
            schema can still be cleaned.
        expected_features: Optional sequence of column names. When given, the
            result is re-indexed to exactly these columns, in this order:
            columns not present are created filled with NaN and columns not
            listed are discarded. When ``None`` (default) the numeric columns
            are returned in their original order.

    Returns:
        A DataFrame holding only the selected feature columns.
    """
    colunas_remover = [
        'analista_responsavel',
        'cidade',
        'cliente',
        'cod_vaga',
        'codigo',
        'data_candidatura',
        'data_final',
        'data_inicial',
        'data_requicisao',
        'empresa_divisao',
        'estado',
        'limite_esperado_para_contratacao',
        'local_trabalho',
        'nome',
        'recrutador',
        'regiao',
        'requisitante',
        'situacao_candidado',
        'solicitante_cliente',
        'ultima_atualizacao',
        'tipo_contratacao',
        'nivel_academico',
        'titulo',
        'comentario',
        'titulo_vaga',
        'prazo_contratacao',
        'prioridade_vaga',
        'nivel profissional',
        'nivel_ingles',
        'nivel_espanhol',
        'areas_atuacao',
        'principais_atividades',
        'competencia_tecnicas_e_comportamentais',
        'demais_observacoes',
        'equipamentos_necessarios',
        'habilidades_comportamentais_necessarias',
        'valor_venda',
        'valor_compra_1',
    ]

    # errors='ignore': a column that is already absent cannot leak into the
    # features, so its absence should not abort the cleaning with a KeyError.
    df = df.drop(columns=colunas_remover, errors='ignore')

    features = [
        col
        for col in df.columns
        if col != 'target' and pd.api.types.is_numeric_dtype(df[col])
    ]
    X = df[features]

    if expected_features is not None:
        # Pin both the set and the order of columns to what the caller expects
        # (e.g. the feature list stored alongside a trained model).
        X = X.reindex(columns=list(expected_features))

    return X

In [22]:
# Attach the vacancy data to the sampled prospect. Vacancy-side columns whose
# names clash with prospect columns receive the '_vaga' suffix.
caso_com_vaga = um_prospect_aleatorio.merge(
    df_vagas, on='cod_vaga', how='left', suffixes=('', '_vaga')
)

# Directory holding the fitted encoders. The ENCODERS_PATH environment variable
# overrides the default so the notebook is not tied to one developer's machine.
encoders_path = os.environ.get(
    'ENCODERS_PATH',
    '/home/felip/projetos/datathon-mle/Datathon Decision/4_gold/encoders',
)

caso_transformado = transform_new_data(caso_com_vaga, encoders_path=encoders_path)

# Final single-row frame of numeric features used by the cells below.
um_caso_aleatorio = clean_features_data(caso_transformado)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[String Similarity]: 100%|██████████| 1/1 [00:00<00:00, 227.22it/s]


[Embeddings] Gerando embeddings dos títulos...


Processing batches: 100%|██████████| 1/1 [00:00<00:00, 12.92it/s]


In [26]:
um_caso_aleatorio.columns

Index(['nivel_acad_doutorado completo', 'nivel_acad_doutorado cursando',
       'nivel_acad_ensino fundamental completo',
       'nivel_acad_ensino medio completo',
       'nivel_acad_ensino medio incompleto',
       'nivel_acad_ensino superior completo',
       'nivel_acad_ensino superior cursando',
       'nivel_acad_ensino superior incompleto',
       'nivel_acad_ensino tecnico completo',
       'nivel_acad_ensino tecnico cursando',
       'nivel_acad_ensino tecnico incompleto', 'nivel_acad_mestrado completo',
       'nivel_acad_mestrado cursando', 'nivel_acad_pos graduacao completo',
       'nivel_acad_pos graduacao cursando',
       'nivel_acad_pos graduacao incompleto', 'nivel_acad_None',
       'tipo_contr_candidato podera escolher hunting pj autonomo',
       'tipo_contr_candidato podera escolher pj autonomo',
       'tipo_contr_clt cotas', 'tipo_contr_clt cotas clt full',
       'tipo_contr_clt cotas cooperado',
       'tipo_contr_clt cotas cooperado estagiario pj autonomo',
 

In [24]:
um_caso_aleatorio.shape

(1, 84)

## Teste com a inferência na API

In [4]:
import pandas as pd
from config import load_config, get_abs_path

# Reload the project configuration and its `paths` section for this part of the notebook.
config = load_config()
paths = config['paths']

# Read the parquet file referenced by the `dataset_features` path entry and display it.
df = pd.read_parquet(get_abs_path(paths['dataset_features']))
df

Unnamed: 0,tipo_contratacao,nivel_academico,target,nivel_acad_doutorado completo,nivel_acad_doutorado cursando,nivel_acad_ensino fundamental completo,nivel_acad_ensino medio completo,nivel_acad_ensino medio incompleto,nivel_acad_ensino superior completo,nivel_acad_ensino superior cursando,...,demais_observacoes_emb_min,demais_observacoes_emb_max,comentario_nchar,comentario_nwords,comentario_emb_mean,comentario_emb_std,comentario_emb_min,comentario_emb_max,titulo_sim_ratio,sim_titulo_vs_vaga
0,pj autonomo,ensino superior completo,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.161850,0.117181,26,5,-0.000235,0.051030,-0.176741,0.204723,1.000000,1.000000
1,pj autonomo,ensino superior completo,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.161850,0.117181,48,8,0.000284,0.051030,-0.155262,0.165039,1.000000,1.000000
2,pj autonomo,ensino medio completo,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.185271,0.210202,22,5,0.000357,0.051030,-0.165430,0.155121,1.000000,1.000000
3,pj autonomo,ensino medio completo,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.185271,0.210202,12,1,0.000110,0.051031,-0.177052,0.497576,1.000000,1.000000
4,pj autonomo,ensino tecnico completo,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.153048,0.165421,12,1,0.000110,0.051031,-0.177052,0.497576,0.066667,0.188192
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56697,clt full pj autonomo,ensino superior completo,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.129788,0.161187,12,1,0.000110,0.051031,-0.177052,0.497576,0.250000,0.124157
56698,clt full pj autonomo,ensino superior completo,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.177052,0.497576,58,7,0.000314,0.051030,-0.126359,0.117563,1.000000,1.000000
56699,clt full pj autonomo,ensino superior completo,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.177052,0.497576,12,1,0.000110,0.051031,-0.177052,0.497576,1.000000,1.000000
56700,clt full pj autonomo,ensino superior completo,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.177052,0.497576,12,1,0.000110,0.051031,-0.177052,0.497576,1.000000,1.000000


In [5]:
# Remove the label column so the rows look like unseen inference input.
# errors='ignore' keeps this cell re-runnable: `df` is rebound here, so on a
# second execution 'target' is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['target'], errors='ignore')

# Display one randomly drawn row (unseeded, so it changes between runs).
caso_de_uma_pessoa_aleatoria = df.sample(1)
caso_de_uma_pessoa_aleatoria

Unnamed: 0,tipo_contratacao,nivel_academico,nivel_acad_doutorado completo,nivel_acad_doutorado cursando,nivel_acad_ensino fundamental completo,nivel_acad_ensino medio completo,nivel_acad_ensino medio incompleto,nivel_acad_ensino superior completo,nivel_acad_ensino superior cursando,nivel_acad_ensino superior incompleto,...,demais_observacoes_emb_min,demais_observacoes_emb_max,comentario_nchar,comentario_nwords,comentario_emb_mean,comentario_emb_std,comentario_emb_min,comentario_emb_max,titulo_sim_ratio,sim_titulo_vs_vaga
19638,clt full pj autonomo,ensino superior completo,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-0.133502,0.211642,12,1,0.00011,0.051031,-0.177052,0.497576,1.0,1.0


In [6]:
# Draw one random row and flatten it into a {column name: value} dict;
# this dict is what gets posted to the prediction endpoint further down.
random_row_df = df.sample(n=1)
payload_dict = random_row_df.iloc[0].to_dict()

In [7]:
import json

json.dumps(payload_dict)

'{"tipo_contratacao": "clt full", "nivel_academico": "ensino superior cursando", "nivel_acad_doutorado completo": 0.0, "nivel_acad_doutorado cursando": 0.0, "nivel_acad_ensino fundamental completo": 0.0, "nivel_acad_ensino medio completo": 0.0, "nivel_acad_ensino medio incompleto": 0.0, "nivel_acad_ensino superior completo": 0.0, "nivel_acad_ensino superior cursando": 1.0, "nivel_acad_ensino superior incompleto": 0.0, "nivel_acad_ensino tecnico completo": 0.0, "nivel_acad_ensino tecnico cursando": 0.0, "nivel_acad_ensino tecnico incompleto": 0.0, "nivel_acad_mestrado completo": 0.0, "nivel_acad_mestrado cursando": 0.0, "nivel_acad_pos graduacao completo": 0.0, "nivel_acad_pos graduacao cursando": 0.0, "nivel_acad_pos graduacao incompleto": 0.0, "nivel_acad_None": 0.0, "tipo_contr_candidato podera escolher hunting pj autonomo": 0.0, "tipo_contr_candidato podera escolher pj autonomo": 0.0, "tipo_contr_clt cotas": 0.0, "tipo_contr_clt cotas clt full": 0.0, "tipo_contr_clt cotas cooperado"

In [8]:
payload_dict

{'tipo_contratacao': 'clt full',
 'nivel_academico': 'ensino superior cursando',
 'nivel_acad_doutorado completo': 0.0,
 'nivel_acad_doutorado cursando': 0.0,
 'nivel_acad_ensino fundamental completo': 0.0,
 'nivel_acad_ensino medio completo': 0.0,
 'nivel_acad_ensino medio incompleto': 0.0,
 'nivel_acad_ensino superior completo': 0.0,
 'nivel_acad_ensino superior cursando': 1.0,
 'nivel_acad_ensino superior incompleto': 0.0,
 'nivel_acad_ensino tecnico completo': 0.0,
 'nivel_acad_ensino tecnico cursando': 0.0,
 'nivel_acad_ensino tecnico incompleto': 0.0,
 'nivel_acad_mestrado completo': 0.0,
 'nivel_acad_mestrado cursando': 0.0,
 'nivel_acad_pos graduacao completo': 0.0,
 'nivel_acad_pos graduacao cursando': 0.0,
 'nivel_acad_pos graduacao incompleto': 0.0,
 'nivel_acad_None': 0.0,
 'tipo_contr_candidato podera escolher hunting pj autonomo': 0.0,
 'tipo_contr_candidato podera escolher pj autonomo': 0.0,
 'tipo_contr_clt cotas': 0.0,
 'tipo_contr_clt cotas clt full': 0.0,
 'tipo_cont

In [7]:
import requests

api_url = 'http://127.0.0.1:3000/predict'  # Endpoint defined in service.py

# (connect, read) timeouts in seconds. Without `timeout`, requests can wait
# indefinitely on an unresponsive service and hang the kernel. The read timeout
# is above 60 s so a server-side "not able to process in 60 seconds" reply is
# still received instead of being cut off by the client.
API_TIMEOUT = (5, 90)

response = requests.post(
    api_url,
    headers={'Content-Type': 'application/json'},
    json=payload_dict,  # 'requests' serialises the dict to JSON automatically
    timeout=API_TIMEOUT,
)

# Check whether the request succeeded (2xx status code)
# response.raise_for_status() # Raises an error for bad HTTP status codes (4xx or 5xx)

In [9]:
response.content

b'{"error":"Not able to process the request in 60 seconds"}'

In [None]:
# Imprimir a resposta da API
print(f'Status Code: {response.status_code}')
print('Resposta da API:')
print(response.json())  # Converte a resposta JSON da API em um dict Python

In [11]:
import pickle

# Resolve the model artefact through get_abs_path, like every other `paths`
# entry in this notebook, instead of a '../' string that only works from one
# particular working directory.
# NOTE: pickle.load can execute arbitrary code; only load artefacts you trust.
with open(get_abs_path(paths['modelo_treinado']), 'rb') as f:
    dados_carregados = pickle.load(f)

# The pickled object is indexed like a dict holding the estimator, its
# preprocessing steps and a feature list.
modelo = dados_carregados['model']
imputer = dados_carregados['imputer']
scaler = dados_carregados['scaler']
features = dados_carregados['features']

In [14]:
# Keep only the numeric columns (excluding 'target') as model inputs.
# NOTE(review): this rebinds `features`, which the model-loading cell also
# assigns from the pickled artefact — confirm the two lists agree in content
# and order, since the imputer/scaler/model are fed positionally below.
features = [
        col
        for col in df.columns
        if col != 'target' and pd.api.types.is_numeric_dtype(df[col])
    ]
# `df` is rebound to the numeric-only frame from here on.
df = df[features]

# Display one randomly drawn row of the numeric-only frame (unseeded).
caso_de_uma_pessoa_aleatoria = df.sample(1)
caso_de_uma_pessoa_aleatoria

Unnamed: 0,nivel_acad_doutorado completo,nivel_acad_doutorado cursando,nivel_acad_ensino fundamental completo,nivel_acad_ensino medio completo,nivel_acad_ensino medio incompleto,nivel_acad_ensino superior completo,nivel_acad_ensino superior cursando,nivel_acad_ensino superior incompleto,nivel_acad_ensino tecnico completo,nivel_acad_ensino tecnico cursando,...,demais_observacoes_emb_min,demais_observacoes_emb_max,comentario_nchar,comentario_nwords,comentario_emb_mean,comentario_emb_std,comentario_emb_min,comentario_emb_max,titulo_sim_ratio,sim_titulo_vs_vaga
9678,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.150535,0.166965,12,1,0.00011,0.051031,-0.177052,0.497576,1.0,1.0


In [15]:
caso_de_uma_pessoa_aleatoria.values[0]

array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  

In [16]:
# Push the sampled row through the loaded stages in order:
# imputer -> scaler -> classifier. Every stage receives a one-row batch, and
# the single resulting row is unpacked back into a 1-D array.
linha_original = caso_de_uma_pessoa_aleatoria.values[0]

(caso_imputer,) = imputer.transform([linha_original])
(caso_scaler,) = scaler.transform([caso_imputer])

# Class probabilities for the single case.
modelo.predict_proba([caso_scaler])



array([[0.65071429, 0.34928571]])

In [18]:
type(modelo)

sklearn.ensemble._forest.RandomForestClassifier

In [None]:
modelo.predict_log_proba

In [17]:
modelo

In [33]:
pd.DataFrame(caso_scaler).to_dict(orient='split')

{'index': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84],
 'columns': [0],
 'data': [[-0.0536109596622732],
  [0.0],
  [0.0],
  [-0.011501726586678177],
  [-0.34871656695933395],
  [-0.0269819722534416],
  [0.5493574952857494],
  [-0.190343703073752],
  [-0.10758239089608226],
  [-0.2658824563999154],
  [-0.020470406369719855],
  [-0.026569716472705562],
  [-0.011501726586678179],
  [-0.013281342715995811],
  [-0.06604453909502439],
  [-0.034842032732843226],
  [-0.01626697374183083],
  [0.0],
  [-0.011501726586678179],
  [-0.0115017265866781

In [29]:
caso_scaler

array([-0.05361096,  0.        ,  0.        , -0.01150173, -0.34871657,
       -0.02698197,  0.5493575 , -0.1903437 , -0.10758239, -0.26588246,
       -0.02047041, -0.02656972, -0.01150173, -0.01328134, -0.06604454,
       -0.03484203, -0.01626697,  0.        , -0.01150173, -0.01150173,
       -0.15174522, -0.02698197, -0.01626697, -0.03903137, -0.02529278,
       -0.0322057 , -0.46855697, -0.02047041, -0.00813268, -0.02348272,
       -0.00939091, -0.03354965, -0.05150205, -0.02047041, -0.08402333,
       -0.01242342,  2.66298875, -0.23447938, -0.00813268, -0.01626697,
       -0.11742895, -0.19593198, -0.01484932, -0.35212691, -0.00664023,
       -0.01328134, -0.00664023, -0.01150173, -0.71257375, -0.05319579,
       -0.02895559, -0.31454936, -0.10199157, -0.02615098, -0.00664023,
       -0.03484203, -0.01049948, -0.01408715,  0.        , -0.23281937,
       -0.1821328 , -0.67181586,  0.05823488,  0.20194439,  0.14673566,
       -0.70014354, -0.69343643,  0.32598417,  0.18023588,  0.38