# Feature Engineering

### Imports

In [49]:
import pandas as pd

### Load Dataset

In [50]:
# Read the processed dataset (path is relative to the notebook's working directory)
# and preview its first rows.
df = pd.read_csv("../data/processed/df.csv")
df.head()

Unnamed: 0,vacancy_contract_type,vacancy_sap,vacancy_region,vacancy_pcd,vacancy_professional_level,vacancy_education_level,vacancy_english_level,vacancy_spanish_level,prospect_candidate_status,prospect_application_date,candidate_ddd_mobile,candidate_pcd,candidate_certifications,candidate_academic_level,candidate_english_level,candidate_spanish_level
0,CLT,Não,São Paulo,Não,Analista,Ensino Superior Incompleto,Técnico,Nenhum,Em processo seletivo,07-12-2018,11,Nao informado,0,Nao informado,Nao informado,Nao informado
1,CLT,Não,São Paulo,Não,Analista,Ensino Superior Incompleto,Técnico,Nenhum,Em processo seletivo,07-12-2018,11,Nao informado,0,Nao informado,Nao informado,Nao informado
2,CLT,Não,São Paulo,Não,Analista,Ensino Superior Incompleto,Técnico,Nenhum,Em processo seletivo,07-12-2018,11,Nao informado,0,Nao informado,Nao informado,Nao informado
3,CLT,Não,São Paulo,Não,Analista,Ensino Superior Incompleto,Técnico,Nenhum,Em processo seletivo,07-12-2018,11,Nao informado,0,Nao informado,Nao informado,Nao informado
4,CLT,Não,São Paulo,Não,Analista,Ensino Superior Incompleto,Técnico,Nenhum,Em processo seletivo,07-12-2018,11,Nao informado,0,Nao informado,Nao informado,Nao informado


In [51]:
# Number of rows per candidate status.
df["prospect_candidate_status"].value_counts()

prospect_candidate_status
Em processo seletivo    34779
Reprovado                5202
Aprovado                 2620
Desistiu                 2470
Name: count, dtype: int64

### Attribute matching

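Binary (0/1) features that flag direct matches between the candidate's profile and the vacancy's requirements: education level, English level, Spanish level, and PCD (flagged only when both sides are `Sim`).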

In [52]:
def match_exact(a, b):
    """Compare two values as text, ignoring case and surrounding whitespace.

    Parameters
    ----------
    a, b : scalar
        Values to compare. Each is converted with ``str`` before comparison.

    Returns
    -------
    int
        1 when neither value is missing (per ``pd.isna``) and the normalised
        texts are equal; 0 otherwise.
    """
    # A missing value on either side never counts as a match.
    if any(pd.isna(value) for value in (a, b)):
        return 0
    left, right = (str(value).strip().lower() for value in (a, b))
    return 1 if left == right else 0

# Exact-match flags: feature name -> (vacancy requirement column, candidate attribute column).
exact_match_pairs = {
    'match_education_level': ('vacancy_education_level', 'candidate_academic_level'),
    'match_english_level': ('vacancy_english_level', 'candidate_english_level'),
    'match_spanish_level': ('vacancy_spanish_level', 'candidate_spanish_level'),
}

for feature_name, (vacancy_col, candidate_col) in exact_match_pairs.items():
    df[feature_name] = df.apply(
        lambda row: match_exact(row[vacancy_col], row[candidate_col]),
        axis=1,
    )

# PCD flag is 1 only when both the vacancy and the candidate are marked 'Sim'.
df['match_pcd'] = df.apply(
    lambda row: 1 if (row['vacancy_pcd'] == 'Sim' and row['candidate_pcd'] == 'Sim') else 0,
    axis=1,
)


### Geographic Proximity

For vacancies in the São Paulo region, we check whether the candidate's mobile area code (DDD) is one of the São Paulo codes (11–19).

In [53]:
# Area codes (DDD) treated as belonging to the São Paulo region.
sp_ddds = ['11', '12', '13', '14', '15', '16', '17', '18', '19']

# Vacancy is in São Paulo; compared ignoring case and surrounding whitespace.
is_sp_vacancy = df['vacancy_region'].astype(str).str.strip().str.lower() == 'são paulo'

# Normalise the area code to plain digit text before the lookup. If pandas parsed the
# column as float (it does so for integer columns that contain missing values), 11 is
# rendered as "11.0" and would never be found in sp_ddds, so a trailing ".0" is removed.
# Missing values become the text "nan", which simply does not match.
candidate_ddd = (
    df['candidate_ddd_mobile']
    .astype(str)
    .str.strip()
    .str.replace(r'\.0$', '', regex=True)
)

# 1 when the vacancy is in São Paulo and the candidate's DDD is a São Paulo code, else 0.
df['mobile_region_match'] = (is_sp_vacancy & candidate_ddd.isin(sp_ddds)).astype('int64')

### Encoding and Final Feature Preview

In [54]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Start from a new frame without the columns that are not carried into the encoded
# feature set. `drop` returns a new DataFrame, so `df` itself is left untouched.
df_encoded = df.drop(columns=['prospect_application_date', 'candidate_ddd_mobile'])

# Categorical columns to encode. Despite the name, columns with at most two distinct
# values are label-encoded instead of one-hot encoded (see the loop below).
relevant_cols_for_onehot = [
    'vacancy_contract_type', 'vacancy_sap', 'vacancy_region', 'vacancy_english_level',
    'vacancy_professional_level', 'vacancy_education_level', 'vacancy_spanish_level',
    'vacancy_pcd', 'prospect_candidate_status', 'candidate_academic_level',
    'candidate_english_level', 'candidate_spanish_level', 'candidate_pcd'
]

label_encoders = {}  # column name -> fitted LabelEncoder, kept so codes can be mapped back
one_hot_cols = []    # columns deferred to pd.get_dummies

for col in relevant_cols_for_onehot:
    if col not in df_encoded.columns:
        print(f"Coluna '{col}' não encontrada no dataframe.")
        continue

    # nunique() ignores missing values when counting categories.
    n_unique = df_encoded[col].nunique()
    if n_unique > 2:
        one_hot_cols.append(col)
        print(f"OneHotEncode: {col} ({n_unique} categories)")
    else:
        le = LabelEncoder()
        # Cast to str so the encoder sees a single, sortable type.
        df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))
        label_encoders[col] = le
        print(f"LabelEncoded: {col} ({n_unique} categories)")

# Ask get_dummies for integer indicators directly instead of casting afterwards, and
# identify the dummy columns as exactly the columns it added. The previous substring
# test (`prefix in col`) could also pick up unrelated columns whose names merely
# contain one of the encoded column names.
columns_before = set(df_encoded.columns)
df_encoded = pd.get_dummies(df_encoded, columns=one_hot_cols, dtype=int)
dummy_cols = [col for col in df_encoded.columns if col not in columns_before]

print("Encoding completed!")
display(df_encoded.head())


OneHotEncode: vacancy_contract_type (6 categories)
LabelEncoded: vacancy_sap (2 categories)
OneHotEncode: vacancy_region (26 categories)
OneHotEncode: vacancy_english_level (6 categories)
OneHotEncode: vacancy_professional_level (13 categories)
OneHotEncode: vacancy_education_level (15 categories)
OneHotEncode: vacancy_spanish_level (7 categories)
OneHotEncode: vacancy_pcd (3 categories)
OneHotEncode: prospect_candidate_status (4 categories)
OneHotEncode: candidate_academic_level (22 categories)
OneHotEncode: candidate_english_level (6 categories)
OneHotEncode: candidate_spanish_level (6 categories)
OneHotEncode: candidate_pcd (3 categories)
Encoding completed!


Unnamed: 0,vacancy_sap,candidate_certifications,match_education_level,match_english_level,match_spanish_level,match_pcd,mobile_region_match,vacancy_contract_type_CLT,vacancy_contract_type_Cooperado,vacancy_contract_type_Estagiário,...,candidate_english_level_Nenhum,candidate_spanish_level_Avançado,candidate_spanish_level_Básico,candidate_spanish_level_Fluente,candidate_spanish_level_Intermediário,candidate_spanish_level_Nao informado,candidate_spanish_level_Nenhum,candidate_pcd_Nao informado,candidate_pcd_Não,candidate_pcd_Sim
0,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,1,0,1,0,0
1,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,1,0,1,0,0
2,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,1,0,1,0,0
3,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,1,0,1,0,0
4,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,1,0,1,0,0


### Saving Features Data

In [55]:
from pathlib import Path


# Output folder for the encoded features; created (with parents) if it does not exist.
PROCESSED_DIR = Path("../data/encoded/")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Write the encoded frame without the index column.
encoded_csv_path = PROCESSED_DIR / "df_encoded.csv"
df_encoded.to_csv(encoded_csv_path, index=False)

print("Encoded files saved in:", PROCESSED_DIR)

Encoded files saved in: ..\data\encoded
