# Preprocessing
This notebook shows an example of how to use the preprocessing APIs of the `findhr` package on the sample data loaded by the `Example_InputDataSources` notebook.

In [1]:
# Load and join raw data sources and their metadata by executing the
# Example_InputDataSources notebook.
# NOTE(review): the cells below use `df_all` and `md_all`, which are not
# defined anywhere in this notebook -- presumably they are created by this
# run; confirm against Example_InputDataSources.ipynb.
%run Example_InputDataSources.ipynb

In [2]:
# Joined DataFrame: preview of the first rows. Candidate attributes carry the
# `_c` suffix, job attributes the `_j` suffix; `score`, `ranking` and
# `shortlisted` are the remaining columns.
df_all.head()

Unnamed: 0,id_c,education_background_c,professional_experience_c,skills_c,gender_c,agg_perceived_foreign_c,id_j,education_reqs_j,experience_reqs_role_j,experience_reqs_duration_j,skills_j,gender_j,agg_perceived_foreign_j,score,ranking,shortlisted
0,5,[{'institution': 'Complutense University Of Ma...,"[{'institution': 'Stylo Milano', 'start_date':...","[Communications, Social Integration, Microsoft...",Man,No,5,"[Law Bachelor, Degree In Law, Higher Degree In...",[Consultant],12,"[Punctuality, Organization, Accounting, Englis...",Man,No,0.0,138,0
1,6,[{'institution': 'Coronel Rosales Agricultural...,"[{'institution': 'Securitas Direct', 'start_da...","[Refinancing, Economy, Microsoft Excel, Collec...",Man,No,3,[],"[Sales Assistant, Saleswoman, Commercial Advisor]",12,"[English, Spanish, Communications, Communicati...",Man,No,0.0,89,0
2,10,[{'institution': 'Complutense University Of Ma...,"[{'institution': 'Carrefour Express', 'start_d...","[Entrepreneurship, Literacy, Web Design, Adobe...",Woman,No,5,"[Law Bachelor, Degree In Law, Higher Degree In...",[Consultant],12,"[Punctuality, Organization, Accounting, Englis...",Man,No,0.492754,55,0
3,11,"[{'institution': 'Les Ribera De Los Molinos', ...","[{'institution': 'Decimas Sl', 'start_date': '...","[Consulting, Sap Crm, Collections, Automation,...",Woman,No,3,[],"[Sales Assistant, Saleswoman, Commercial Advisor]",12,"[English, Spanish, Communications, Communicati...",Man,No,0.492754,35,0
4,15,[{'institution': 'Escuela Politcnica Superior ...,"[{'institution': 'Reintegrate', 'start_date': ...","[Microsoft Word, Biofuels, English, Entreprene...",Man,No,3,[],"[Sales Assistant, Saleswoman, Commercial Advisor]",12,"[English, Spanish, Communications, Communicati...",Man,No,0.453089,49,0


In [3]:
# Joined metadata: a mapping keyed by column name. Each entry is displayed
# with its SCHEMA, ATTR_TYPE, ATTR_USAGE and KNOWLEDGE_BASE fields.
md_all

{'id_c': 
 	SCHEMA = {'type': 'number'}
 	ATTR_TYPE = object
 	ATTR_USAGE = default
 	KNOWLEDGE_BASE = None,
 'education_background_c': 
 	SCHEMA = {'type': 'array', 'items': {'type': 'object', 'properties': {'institution': {'type': 'string'}, 'end_date': {'type': 'string'}, 'degree': {'type': 'string'}, 'duration': {'type': 'string'}}}}
 	ATTR_TYPE = object
 	ATTR_USAGE = default
 	KNOWLEDGE_BASE = None,
 'professional_experience_c': 
 	SCHEMA = {'type': 'array', 'items': {'type': 'object', 'properties': {'institution': {'type': 'string'}, 'end_date': {'type': 'string'}, 'role': {'type': 'string'}, 'duration': {'type': 'string'}}}}
 	ATTR_TYPE = object
 	ATTR_USAGE = default
 	KNOWLEDGE_BASE = None,
 'skills_c': 
 	SCHEMA = {'type': 'array', 'items': {'type': 'string'}}
 	ATTR_TYPE = object
 	ATTR_USAGE = default
 	KNOWLEDGE_BASE = None,
 'gender_c': 
 	SCHEMA = {'enum': ['Man', 'Woman', 'Any']}
 	ATTR_TYPE = category
 	ATTR_USAGE = sensitive
 	KNOWLEDGE_BASE = None,
 'agg_perceived_f

In [4]:
# Collect the columns whose metadata declares them as categorical ...
cat_cols = [
    col_name
    for col_name, col_md in md_all.items()
    if col_md.attr_type == 'category'
]
# ... and cast exactly those columns of the joined frame to the pandas
# 'category' dtype.
df_all[cat_cols] = df_all[cat_cols].astype('category')
# Show the resulting dtypes and non-null counts.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1643 entries, 0 to 1642
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   id_c                        1643 non-null   int64   
 1   education_background_c      1643 non-null   object  
 2   professional_experience_c   1643 non-null   object  
 3   skills_c                    1643 non-null   object  
 4   gender_c                    1643 non-null   category
 5   agg_perceived_foreign_c     1643 non-null   category
 6   id_j                        1643 non-null   int64   
 7   education_reqs_j            1643 non-null   object  
 8   experience_reqs_role_j      1643 non-null   object  
 9   experience_reqs_duration_j  1643 non-null   int64   
 10  skills_j                    1643 non-null   object  
 11  gender_j                    1643 non-null   category
 12  agg_perceived_foreign_j     1643 non-null   category
 13  score             

In [5]:
# Split the columns of the joined frame into three roles.
# Identifiers of the candidate and of the job:
id_cols = [
    'id_c',
    'id_j',
]
# Target feature(s):
target_cols = [
    'score',
    'ranking',
    'shortlisted',
]
# Predictive features: every remaining column. Index.difference returns the
# labels sorted, so the order differs from the column order of df_all.
pred_cols = df_all.columns.difference(target_cols + id_cols)

In [6]:
# Display the ids together with score, ranking and shortlisted.
# NOTE(review): `df_ADS` is not defined in any cell visible in this notebook,
# and the execution counter jumps from In [6] to In [12] right after this
# cell. Presumably the name comes from the Example_InputDataSources run --
# confirm that this cell survives "Restart Kernel -> Run All".
df_ADS

Unnamed: 0,id_c,id_j,score,ranking,shortlisted
0,6,3,0.000000,89,0
1,11,3,0.492754,35,0
3,15,3,0.453089,49,0
4,17,3,0.049689,81,0
5,19,3,0.080268,78,0
...,...,...,...,...,...
1537,2204,1,0.550725,44,0
1538,2211,1,0.550725,44,0
1539,2213,1,0.478261,123,0
1540,2214,1,0.478261,123,0


In [12]:
# Derived-column transformations provided by the findhr example mappings.
from findhr.preprocess.example_mappings import (
    ExtractListOfProperty,
    ExtractMonthDurationJob,
    MatchBinary,
    MatchFeatureAtLeastInList,
    MatchFeatureSet,
    MatchOrdinal,
    RelevantExperienceForRole,
)

# Every mapping dictionary has the shape
#     (input_columns, output_columns) -> transformation
# where both key members are tuples of column names.

# Step 1: calculated feature built directly from the raw columns.
maps_derived_1 = {
    (
        ('professional_experience_c', 'experience_reqs_role_j'),
        ('relevant_exp_role_c',),
    ): RelevantExperienceForRole(),
}

# Step 2: calculated features; the first one consumes the output of step 1.
# NOTE(review): 'degree_list_c' is derived here but no mapping in this cell
# takes it as input -- 'fitness_education' below reads 'education_background_c'
# instead. Confirm whether that is intended.
maps_derived_2 = {
    (
        ('relevant_exp_role_c',),
        ('role_duration_months_c',),
    ): ExtractMonthDurationJob(duration_key='duration_months'),
    (
        ('education_background_c',),
        ('degree_list_c',),
    ): ExtractListOfProperty(property_key='degree'),
}

# Step 3: fitness features, each matching a job requirement (`_j`) against the
# corresponding candidate feature (`_c`).
maps_matching = {
    (
        ('experience_reqs_duration_j', 'role_duration_months_c'),
        ('fitness_experience',),
    ): MatchOrdinal(),
    (
        ('education_reqs_j', 'education_background_c'),
        ('fitness_education',),
    ): MatchFeatureAtLeastInList(),
    (
        ('skills_j', 'skills_c'),
        ('fitness_skills',),
    ): MatchFeatureSet(),
    (
        ('gender_j', 'gender_c'),
        ('fitness_gender',),
    ): MatchBinary(),
    (
        ('agg_perceived_foreign_j', 'agg_perceived_foreign_c'),
        ('fitness_foreign',),
    ): MatchBinary(),
}

# Helper variable: names of the fitness features, i.e. the output columns of
# maps_matching in declaration order.
list_cols_fitness = [
    out_col
    for _in_cols, out_cols in maps_matching
    for out_col in out_cols
]
maps_matching

{(('experience_reqs_duration_j', 'role_duration_months_c'),
  ('fitness_experience',)): MatchOrdinal(),
 (('education_reqs_j', 'education_background_c'),
  ('fitness_education',)): MatchFeatureAtLeastInList(),
 (('skills_j', 'skills_c'), ('fitness_skills',)): MatchFeatureSet(),
 (('gender_j', 'gender_c'), ('fitness_gender',)): MatchBinary(),
 (('agg_perceived_foreign_j', 'agg_perceived_foreign_c'),
  ('fitness_foreign',)): MatchBinary()}

In [13]:
# Standard scikit-learn preprocessing for the numeric and categorical inputs.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Numeric inputs are the fitness features; categorical inputs are the two
# candidate attributes listed below.
numeric_features = list_cols_fitness
categorical_features = ['gender_c', 'agg_perceived_foreign_c']

# Numeric branch: median imputation followed by standardisation.
# - The imputer is not needed for the dataset used here.
# - The scaler is not needed for a decision tree; both are kept for the sake
#   of generality.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: most-frequent imputation (again only for generality)
# followed by one-hot encoding.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder()),
])

# Route each group of columns through its own branch. Columns not listed in
# either group are dropped (ColumnTransformer's default `remainder`).
column_preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

In [14]:
from findhr.preprocess.mapping import AttachMetadata, DerivedColumn, DetachMetadata

# The preprocessing is composed of two phases.
#
# Phase 1 -- preprocessing with metadata (findhr package): attach the
# metadata, apply the three groups of derived-column mappings in order, then
# detach the metadata again.
pipeline_derived = Pipeline(
    steps=[
        ("init", AttachMetadata(md_all)),
        ("mapping_1", DerivedColumn(maps_derived_1, verbose=True)),
        ("mapping_2", DerivedColumn(maps_derived_2, verbose=True)),
        ("matching", DerivedColumn(maps_matching, verbose=True)),
        # Disabled step, kept for reference (neither name is defined in the
        # cells shown here):
        # ("fitness", GroundTruthLinearWeightedScorer(gt_weights_fair)),
        ("end", DetachMetadata()),
    ]
)
# Phase 2 -- standard scikit-learn preprocessing that prepares the data for
# the model; this is covered by `column_preprocessor`.


In [15]:
# Seed for the regressor. scikit-learn permutes the features at every split,
# so when several splits give an identical improvement the chosen one can vary
# between runs unless random_state is fixed. Pinning it keeps the fitted tree,
# and therefore the predictions shown below, reproducible.
RANDOM_STATE = 42

# Complete preprocessing pipeline.
pipeline_pre = Pipeline(
    steps=[
        # first phase: preprocessing with metadata (findhr)
        ("fitness_value", pipeline_derived),
        # second phase: preprocessing without metadata (standard scikit-learn)
        ("column_preprocessor", column_preprocessor),
    ]
)

# Pipeline for the regression model on the target feature "score".
pipeline_regr = Pipeline(
    steps=[
        # both preprocessing phases
        ("preprocessing", pipeline_pre),
        # model inference: a shallow tree with a fixed seed
        ("regressor", DecisionTreeRegressor(max_depth=3, random_state=RANDOM_STATE)),
    ]
)

In [16]:
# Model fit: train the regression pipeline on the predictive columns, with the
# "score" column as target. With verbose=True the DerivedColumn steps log each
# mapping they fit.
pipeline_regr.fit(df_all[pred_cols], df_all['score'])

Fitting mapping RelevantExperienceForRole for input columns ('professional_experience_c', 'experience_reqs_role_j') and output columns ('relevant_exp_role_c',)
X.columns = Index(['agg_perceived_foreign_c', 'agg_perceived_foreign_j',
       'education_background_c', 'education_reqs_j',
       'experience_reqs_duration_j', 'experience_reqs_role_j', 'gender_c',
       'gender_j', 'professional_experience_c', 'skills_c', 'skills_j'],
      dtype='object')
Fitting mapping ExtractMonthDurationJob for input columns ('relevant_exp_role_c',) and output columns ('role_duration_months_c',)
X.columns = Index(['agg_perceived_foreign_c', 'agg_perceived_foreign_j',
       'education_background_c', 'education_reqs_j',
       'experience_reqs_duration_j', 'experience_reqs_role_j', 'gender_c',
       'gender_j', 'professional_experience_c', 'skills_c', 'skills_j',
       'relevant_exp_role_c'],
      dtype='object')
Fitting mapping ExtractListOfProperty for input columns ('education_background_c',) and 

In [17]:
# pandas is imported explicitly: this notebook has no import of its own for
# `pd`, so the cell otherwise only works if the name leaks from the notebook
# executed with %run.
import pandas as pd

# Run only the preprocessing part (both phases, no regressor) on the
# predictive columns. Note that this re-fits `pipeline_pre`, the same object
# that is nested inside `pipeline_regr`.
transformed_data = pipeline_pre.fit_transform(df_all.loc[:, pred_cols])
# Summary statistics of the model inputs. Columns are positional: the numeric
# (scaled) features come first, followed by the one-hot encoded categorical
# ones, in the order declared in `column_preprocessor`.
pd.DataFrame(transformed_data).describe()

Fitting mapping RelevantExperienceForRole for input columns ('professional_experience_c', 'experience_reqs_role_j') and output columns ('relevant_exp_role_c',)
X.columns = Index(['agg_perceived_foreign_c', 'agg_perceived_foreign_j',
       'education_background_c', 'education_reqs_j',
       'experience_reqs_duration_j', 'experience_reqs_role_j', 'gender_c',
       'gender_j', 'professional_experience_c', 'skills_c', 'skills_j'],
      dtype='object')
Fitting mapping ExtractMonthDurationJob for input columns ('relevant_exp_role_c',) and output columns ('role_duration_months_c',)
X.columns = Index(['agg_perceived_foreign_c', 'agg_perceived_foreign_j',
       'education_background_c', 'education_reqs_j',
       'experience_reqs_duration_j', 'experience_reqs_role_j', 'gender_c',
       'gender_j', 'professional_experience_c', 'skills_c', 'skills_j',
       'relevant_exp_role_c'],
      dtype='object')
Fitting mapping ExtractListOfProperty for input columns ('education_background_c',) and 

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,1643.0,1643.0,1643.0,1643.0,1643.0,1643.0,1643.0,1643.0,1643.0
mean,6.270767e-17,4.757133e-17,3.4597330000000004e-17,7.7844e-17,2.162333e-18,0.480828,0.519172,0.877663,0.122337
std,1.000304,1.000304,1.000304,1.000304,1.000304,0.499784,0.499784,0.327775,0.327775
min,-0.358377,-0.2435499,-1.2232,-0.9623632,-2.678457,0.0,0.0,0.0,0.0
25%,-0.358377,-0.2435499,-0.7784247,-0.9623632,0.3733493,0.0,0.0,1.0,0.0
50%,-0.358377,-0.2435499,-0.3336493,-0.9623632,0.3733493,0.0,1.0,1.0,0.0
75%,-0.358377,-0.2435499,0.5559016,1.039109,0.3733493,1.0,1.0,1.0,0.0
max,2.790358,4.105934,3.224554,1.039109,0.3733493,1.0,1.0,1.0,1.0


In [18]:
# Model prediction: score the same rows the pipeline was fitted on.
# The captured output shows `list_dict = ...` lines before the returned array;
# presumably these are debug prints from one of the findhr mappings (TODO
# confirm which one).
pipeline_regr.predict(df_all[pred_cols])

list_dict = [{'institution': 'Complutense University Of Madrid', 'start_date': 'January 2023', 'end_date': 'Ongoing', 'degree': 'Degree In Law'}]
list_dict = [{'institution': 'Coronel Rosales Agricultural School', 'start_date': 'January 1988', 'end_date': 'December 1993', 'degree': 'Agricultural Technical Engineer'}]
list_dict = [{'institution': 'Complutense University Of Madrid', 'start_date': 'January 2023', 'end_date': 'Ongoing', 'degree': 'Degree In Law'}]
list_dict = [{'institution': 'Les Ribera De Los Molinos', 'start_date': 'January 2016', 'end_date': 'December 2020', 'degree': 'Degree In Aesthetics And Beauty'}]
list_dict = [{'institution': 'Escuela Politcnica Superior De Lugo', 'start_date': 'January 1988', 'end_date': 'December 1993', 'degree': 'Agricultural Technical Engineer'}]
list_dict = [{'institution': 'Complutense University Of Madrid', 'start_date': 'January 2023', 'end_date': 'Ongoing', 'degree': 'Degree In Law'}]
list_dict = [{'institution': 'Complutense University 



array([0.45328752, 0.44748211, 0.45328752, ..., 0.45328752, 0.46784556,
       0.42921962])

In [19]:
# Metadata at the end of the first phase of the pipeline.
# See the metadata generated for the derived columns (last three ones).